Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph...
Linus Torvalds [Fri, 19 Mar 2010 16:43:06 +0000 (09:43 -0700)]
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (205 commits)
  ceph: update for write_inode API change
  ceph: reset osd after relevant messages timed out
  ceph: fix flush_dirty_caps race with caps migration
  ceph: include migrating caps in issued set
  ceph: fix osdmap decoding when pools include (removed) snaps
  ceph: return EBADF if waiting for caps on closed file
  ceph: set osd request message front length correctly
  ceph: reset front len on return to msgpool; BUG on mismatched front iov
  ceph: fix snaptrace decoding on cap migration between mds
  ceph: use single osd op reply msg
  ceph: reset bits on connection close
  ceph: remove bogus mds forward warning
  ceph: remove fragile __map_osds optimization
  ceph: fix connection fault STANDBY check
  ceph: invalidate_authorizer without con->mutex held
  ceph: don't clobber write return value when using O_SYNC
  ceph: fix client_request_forward decoding
  ceph: drop messages on unregistered mds sessions; cleanup
  ceph: fix comments, locking in destroy_inode
  ceph: move dereference after NULL test
  ...

Fix trivial conflicts in Documentation/ioctl/ioctl-number.txt

67 files changed:
Documentation/filesystems/ceph.txt [new file with mode: 0644]
Documentation/ioctl/ioctl-number.txt
MAINTAINERS
fs/Kconfig
fs/Makefile
fs/ceph/Kconfig [new file with mode: 0644]
fs/ceph/Makefile [new file with mode: 0644]
fs/ceph/README [new file with mode: 0644]
fs/ceph/addr.c [new file with mode: 0644]
fs/ceph/armor.c [new file with mode: 0644]
fs/ceph/auth.c [new file with mode: 0644]
fs/ceph/auth.h [new file with mode: 0644]
fs/ceph/auth_none.c [new file with mode: 0644]
fs/ceph/auth_none.h [new file with mode: 0644]
fs/ceph/auth_x.c [new file with mode: 0644]
fs/ceph/auth_x.h [new file with mode: 0644]
fs/ceph/auth_x_protocol.h [new file with mode: 0644]
fs/ceph/buffer.c [new file with mode: 0644]
fs/ceph/buffer.h [new file with mode: 0644]
fs/ceph/caps.c [new file with mode: 0644]
fs/ceph/ceph_debug.h [new file with mode: 0644]
fs/ceph/ceph_frag.c [new file with mode: 0644]
fs/ceph/ceph_frag.h [new file with mode: 0644]
fs/ceph/ceph_fs.c [new file with mode: 0644]
fs/ceph/ceph_fs.h [new file with mode: 0644]
fs/ceph/ceph_hash.c [new file with mode: 0644]
fs/ceph/ceph_hash.h [new file with mode: 0644]
fs/ceph/ceph_strings.c [new file with mode: 0644]
fs/ceph/crush/crush.c [new file with mode: 0644]
fs/ceph/crush/crush.h [new file with mode: 0644]
fs/ceph/crush/hash.c [new file with mode: 0644]
fs/ceph/crush/hash.h [new file with mode: 0644]
fs/ceph/crush/mapper.c [new file with mode: 0644]
fs/ceph/crush/mapper.h [new file with mode: 0644]
fs/ceph/crypto.c [new file with mode: 0644]
fs/ceph/crypto.h [new file with mode: 0644]
fs/ceph/debugfs.c [new file with mode: 0644]
fs/ceph/decode.h [new file with mode: 0644]
fs/ceph/dir.c [new file with mode: 0644]
fs/ceph/export.c [new file with mode: 0644]
fs/ceph/file.c [new file with mode: 0644]
fs/ceph/inode.c [new file with mode: 0644]
fs/ceph/ioctl.c [new file with mode: 0644]
fs/ceph/ioctl.h [new file with mode: 0644]
fs/ceph/mds_client.c [new file with mode: 0644]
fs/ceph/mds_client.h [new file with mode: 0644]
fs/ceph/mdsmap.c [new file with mode: 0644]
fs/ceph/mdsmap.h [new file with mode: 0644]
fs/ceph/messenger.c [new file with mode: 0644]
fs/ceph/messenger.h [new file with mode: 0644]
fs/ceph/mon_client.c [new file with mode: 0644]
fs/ceph/mon_client.h [new file with mode: 0644]
fs/ceph/msgpool.c [new file with mode: 0644]
fs/ceph/msgpool.h [new file with mode: 0644]
fs/ceph/msgr.h [new file with mode: 0644]
fs/ceph/osd_client.c [new file with mode: 0644]
fs/ceph/osd_client.h [new file with mode: 0644]
fs/ceph/osdmap.c [new file with mode: 0644]
fs/ceph/osdmap.h [new file with mode: 0644]
fs/ceph/pagelist.c [new file with mode: 0644]
fs/ceph/pagelist.h [new file with mode: 0644]
fs/ceph/rados.h [new file with mode: 0644]
fs/ceph/snap.c [new file with mode: 0644]
fs/ceph/super.c [new file with mode: 0644]
fs/ceph/super.h [new file with mode: 0644]
fs/ceph/types.h [new file with mode: 0644]
fs/ceph/xattr.c [new file with mode: 0644]

diff --git a/Documentation/filesystems/ceph.txt b/Documentation/filesystems/ceph.txt
new file mode 100644 (file)
index 0000000..6e03917
--- /dev/null
@@ -0,0 +1,139 @@
+Ceph Distributed File System
+============================
+
+Ceph is a distributed network file system designed to provide good
+performance, reliability, and scalability.
+
+Basic features include:
+
+ * POSIX semantics
+ * Seamless scaling from 1 to many thousands of nodes
+ * High availability and reliability.  No single points of failure.
+ * N-way replication of data across storage nodes
+ * Fast recovery from node failures
+ * Automatic rebalancing of data on node addition/removal
+ * Easy deployment: most FS components are userspace daemons
+
+Also,
+ * Flexible snapshots (on any directory)
+ * Recursive accounting (nested files, directories, bytes)
+
+In contrast to cluster filesystems like GFS, OCFS2, and GPFS that rely
+on symmetric access by all clients to shared block devices, Ceph
+separates data and metadata management into independent server
+clusters, similar to Lustre.  Unlike Lustre, however, metadata and
+storage nodes run entirely as user space daemons.  Storage nodes
+utilize btrfs to store data objects, leveraging its advanced features
+(checksumming, metadata replication, etc.).  File data is striped
+across storage nodes in large chunks to distribute workload and
+facilitate high throughputs.  When storage nodes fail, data is
+re-replicated in a distributed fashion by the storage nodes themselves
+(with some minimal coordination from a cluster monitor), making the
+system extremely efficient and scalable.
+
+Metadata servers effectively form a large, consistent, distributed
+in-memory cache above the file namespace that is extremely scalable,
+dynamically redistributes metadata in response to workload changes,
+and can tolerate arbitrary (well, non-Byzantine) node failures.  The
+metadata server takes a somewhat unconventional approach to metadata
+storage to significantly improve performance for common workloads.  In
+particular, inodes with only a single link are embedded in
+directories, allowing entire directories of dentries and inodes to be
+loaded into its cache with a single I/O operation.  The contents of
+extremely large directories can be fragmented and managed by
+independent metadata servers, allowing scalable concurrent access.
+
+The system offers automatic data rebalancing/migration when scaling
+from a small cluster of just a few nodes to many hundreds, without
+requiring an administrator to carve the data set into static volumes or
+go through the tedious process of migrating data between servers.
+When the file system approaches full, new nodes can be easily added
+and things will "just work."
+
+Ceph includes a flexible snapshot mechanism that allows a user to create
+a snapshot on any subdirectory (and its nested contents) in the
+system.  Snapshot creation and deletion are as simple as 'mkdir
+.snap/foo' and 'rmdir .snap/foo'.
+
+Ceph also provides some recursive accounting on directories for nested
+files and bytes.  That is, a 'getfattr -d foo' on any directory in the
+system will reveal the total number of nested regular files and
+subdirectories, and a summation of all nested file sizes.  This makes
+the identification of large disk space consumers relatively quick, as
+no 'du' or similar recursive scan of the file system is required.
+
+
+Mount Syntax
+============
+
+The basic mount syntax is:
+
+ # mount -t ceph monip[:port][,monip2[:port]...]:/[subdir] mnt
+
+You only need to specify a single monitor, as the client will get the
+full list when it connects.  (However, if the monitor you specify
+happens to be down, the mount won't succeed.)  The port can be left
+off if the monitor is using the default.  So if the monitor is at
+1.2.3.4,
+
+ # mount -t ceph 1.2.3.4:/ /mnt/ceph
+
+is sufficient.  If /sbin/mount.ceph is installed, a hostname can be
+used instead of an IP address.
+
+
+
+Mount Options
+=============
+
+  ip=A.B.C.D[:N]
+       Specify the IP and/or port the client should bind to locally.
+       There is normally not much reason to do this.  If the IP is not
+       specified, the client's IP address is determined by looking at the
+       address its connection to the monitor originates from.
+
+  wsize=X
+       Specify the maximum write size in bytes.  By default there is no
+       maximum.  Ceph will normally size writes based on the file stripe
+       size.
+
+  rsize=X
+       Specify the maximum readahead.
+
+  mount_timeout=X
+       Specify the timeout value for mount (in seconds), in the case
+       of a non-responsive Ceph file system.  The default is 30
+       seconds.
+
+  rbytes
+       When stat() is called on a directory, set st_size to 'rbytes',
+       the summation of file sizes over all files nested beneath that
+       directory.  This is the default.
+
+  norbytes
+       When stat() is called on a directory, set st_size to the
+       number of entries in that directory.
+
+  nocrc
+       Disable CRC32C calculation for data writes.  If set, the OSD
+       must rely on TCP's error correction to detect data corruption
+       in the data payload.
+
+  noasyncreaddir
+       Disable the client's use of its local cache to satisfy readdir
+       requests.  (This does not change correctness; the client uses
+       cached metadata only when a lease or capability ensures it is
+       valid.)
+
+
+More Information
+================
+
+For more information on Ceph, see the home page at
+       http://ceph.newdream.net/
+
+The Linux kernel client source tree is available at
+       git://ceph.newdream.net/linux-ceph-client.git
+
+and the source for the full system is at
+       git://ceph.newdream.net/ceph.git
index 35c9b51..dd5806f 100644 (file)
@@ -291,6 +291,7 @@ Code  Seq#(hex)     Include File            Comments
 0x92   00-0F   drivers/usb/mon/mon_bin.c
 0x93   60-7F   linux/auto_fs.h
 0x94   all     fs/btrfs/ioctl.h
+0x97   00-7F   fs/ceph/ioctl.h         Ceph file system
 0x99   00-0F                           537-Addinboard driver
                                        <mailto:buk@buks.ipn.de>
 0xA0   all     linux/sdp/sdp.h         Industrial Device Project
index 382eaa4..449d444 100644 (file)
@@ -1441,6 +1441,15 @@ F:       arch/powerpc/include/asm/spu*.h
 F:     arch/powerpc/oprofile/*cell*
 F:     arch/powerpc/platforms/cell/
 
+CEPH DISTRIBUTED FILE SYSTEM CLIENT
+M:     Sage Weil <sage@newdream.net>
+L:     ceph-devel@lists.sourceforge.net
+W:     http://ceph.newdream.net/
+T:     git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git
+S:     Supported
+F:     Documentation/filesystems/ceph.txt
+F:     fs/ceph
+
 CERTIFIED WIRELESS USB (WUSB) SUBSYSTEM:
 M:     David Vrabel <david.vrabel@csr.com>
 L:     linux-usb@vger.kernel.org
index 7405f07..5f85b59 100644 (file)
@@ -235,6 +235,7 @@ config NFS_COMMON
 
 source "net/sunrpc/Kconfig"
 source "fs/smbfs/Kconfig"
+source "fs/ceph/Kconfig"
 source "fs/cifs/Kconfig"
 source "fs/ncpfs/Kconfig"
 source "fs/coda/Kconfig"
index c3633aa..97f340f 100644 (file)
@@ -125,3 +125,4 @@ obj-$(CONFIG_OCFS2_FS)              += ocfs2/
 obj-$(CONFIG_BTRFS_FS)         += btrfs/
 obj-$(CONFIG_GFS2_FS)           += gfs2/
 obj-$(CONFIG_EXOFS_FS)          += exofs/
+obj-$(CONFIG_CEPH_FS)          += ceph/
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
new file mode 100644 (file)
index 0000000..04b8280
--- /dev/null
@@ -0,0 +1,27 @@
+config CEPH_FS
+       tristate "Ceph distributed file system (EXPERIMENTAL)"
+       depends on INET && EXPERIMENTAL
+       select LIBCRC32C
+       # 'select' takes the bare Kconfig symbol name; with the CONFIG_
+       # prefix this selected a nonexistent symbol, so AES support was
+       # silently never enabled.
+       select CRYPTO_AES
+       help
+         Choose Y or M here to include support for mounting the
+         experimental Ceph distributed file system.  Ceph is an extremely
+         scalable file system designed to provide high performance,
+         reliable access to petabytes of storage.
+
+         More information at http://ceph.newdream.net/.
+
+         If unsure, say N.
+
+config CEPH_FS_PRETTYDEBUG
+       bool "Include file:line in ceph debug output"
+       depends on CEPH_FS
+       default n
+       help
+         If you say Y here, debug output will include a filename and
+         line to aid debugging.  This increases kernel size and slows
+         execution slightly when debug call sites are enabled (e.g.,
+         via CONFIG_DYNAMIC_DEBUG).
+
+         If unsure, say N.
+
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
new file mode 100644 (file)
index 0000000..6a660e6
--- /dev/null
@@ -0,0 +1,39 @@
+#
+# Makefile for CEPH filesystem.
+#
+
+# When invoked by kbuild, KERNELRELEASE is set: just declare the objects.
+ifneq ($(KERNELRELEASE),)
+
+obj-$(CONFIG_CEPH_FS) += ceph.o
+
+ceph-objs := super.o inode.o dir.o file.o addr.o ioctl.o \
+       export.o caps.o snap.o xattr.o \
+       messenger.o msgpool.o buffer.o pagelist.o \
+       mds_client.o mdsmap.o \
+       mon_client.o \
+       osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
+       debugfs.o \
+       auth.o auth_none.o \
+       crypto.o armor.o \
+       auth_x.o \
+       ceph_fs.o ceph_strings.o ceph_hash.o ceph_frag.o
+
+else
+# Otherwise we were called directly from the command
+# line; invoke the kernel build system.
+
+KERNELDIR ?= /lib/modules/$(shell uname -r)/build
+PWD := $(shell pwd)
+
+default: all
+
+all:
+       $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules
+
+modules_install:
+       $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules_install
+
+clean:
+       $(MAKE) -C $(KERNELDIR) M=$(PWD) clean
+
+endif
diff --git a/fs/ceph/README b/fs/ceph/README
new file mode 100644 (file)
index 0000000..18352fa
--- /dev/null
@@ -0,0 +1,20 @@
+#
+# The following files are shared by (and manually synchronized
+# between) the Ceph userland and kernel client.
+#
+# userland                  kernel
+src/include/ceph_fs.h      fs/ceph/ceph_fs.h
+src/include/ceph_fs.cc     fs/ceph/ceph_fs.c
+src/include/msgr.h         fs/ceph/msgr.h
+src/include/rados.h        fs/ceph/rados.h
+src/include/ceph_strings.cc fs/ceph/ceph_strings.c
+src/include/ceph_frag.h            fs/ceph/ceph_frag.h
+src/include/ceph_frag.cc    fs/ceph/ceph_frag.c
+src/include/ceph_hash.h            fs/ceph/ceph_hash.h
+src/include/ceph_hash.cc    fs/ceph/ceph_hash.c
+src/crush/crush.c          fs/ceph/crush/crush.c
+src/crush/crush.h          fs/ceph/crush/crush.h
+src/crush/mapper.c         fs/ceph/crush/mapper.c
+src/crush/mapper.h         fs/ceph/crush/mapper.h
+src/crush/hash.h           fs/ceph/crush/hash.h
+src/crush/hash.c           fs/ceph/crush/hash.c
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
new file mode 100644 (file)
index 0000000..23bb0ce
--- /dev/null
@@ -0,0 +1,1188 @@
+#include "ceph_debug.h"
+
+#include <linux/backing-dev.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>   /* generic_writepages */
+#include <linux/pagevec.h>
+#include <linux/task_io_accounting_ops.h>
+
+#include "super.h"
+#include "osd_client.h"
+
+/*
+ * Ceph address space ops.
+ *
+ * There are a few funny things going on here.
+ *
+ * The page->private field is used to reference a struct
+ * ceph_snap_context for _every_ dirty page.  This indicates which
+ * snapshot the page was logically dirtied in, and thus which snap
+ * context needs to be associated with the osd write during writeback.
+ *
+ * Similarly, struct ceph_inode_info maintains a set of counters to
+ * count dirty pages on the inode.  In the absence of snapshots,
+ * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count.
+ *
+ * When a snapshot is taken (that is, when the client receives
+ * notification that a snapshot was taken), each inode with caps and
+ * with dirty pages (dirty pages implies there is a cap) gets a new
+ * ceph_cap_snap in the i_cap_snaps list (which is sorted in ascending
+ * order, new snaps go to the tail).  The i_wrbuffer_ref_head count is
+ * moved to capsnap->dirty. (Unless a sync write is currently in
+ * progress.  In that case, the capsnap is said to be "pending", new
+ * writes cannot start, and the capsnap isn't "finalized" until the
+ * write completes (or fails) and a final size/mtime for the inode for
+ * that snap can be settled upon.)  i_wrbuffer_ref_head is reset to 0.
+ *
+ * On writeback, we must submit writes to the osd IN SNAP ORDER.  So,
+ * we look for the first capsnap in i_cap_snaps and write out pages in
+ * that snap context _only_.  Then we move on to the next capsnap,
+ * eventually reaching the "live" or "head" context (i.e., pages that
+ * are not yet snapped) and are writing the most recently dirtied
+ * pages.
+ *
+ * Invalidate and so forth must take care to ensure the dirty page
+ * accounting is preserved.
+ */
+
+#define CONGESTION_ON_THRESH(congestion_kb) (congestion_kb >> (PAGE_SHIFT-10))
+#define CONGESTION_OFF_THRESH(congestion_kb)                           \
+       (CONGESTION_ON_THRESH(congestion_kb) -                          \
+        (CONGESTION_ON_THRESH(congestion_kb) >> 2))
+
+
+
+/*
+ * Dirty a page.  Optimistically adjust accounting, on the assumption
+ * that we won't race with invalidate.  If we do, readjust.
+ */
+static int ceph_set_page_dirty(struct page *page)
+{
+       struct address_space *mapping = page->mapping;
+       struct inode *inode;
+       struct ceph_inode_info *ci;
+       int undo = 0;
+       struct ceph_snap_context *snapc;
+
+       /* no mapping?  nothing to account against; just set the bit */
+       if (unlikely(!mapping))
+               return !TestSetPageDirty(page);
+
+       if (TestSetPageDirty(page)) {
+               dout("%p set_page_dirty %p idx %lu -- already dirty\n",
+                    mapping->host, page, page->index);
+               return 0;
+       }
+
+       inode = mapping->host;
+       ci = ceph_inode(inode);
+
+       /*
+        * Note that we're grabbing a snapc ref here without holding
+        * any locks!
+        */
+       snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
+
+       /* dirty the head */
+       spin_lock(&inode->i_lock);
+       if (ci->i_wrbuffer_ref_head == 0)
+               ci->i_head_snapc = ceph_get_snap_context(snapc);
+       ++ci->i_wrbuffer_ref_head;
+       /* first dirty page on this inode: take an inode reference */
+       if (ci->i_wrbuffer_ref == 0)
+               igrab(inode);
+       ++ci->i_wrbuffer_ref;
+       dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d "
+            "snapc %p seq %lld (%d snaps)\n",
+            mapping->host, page, page->index,
+            ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
+            ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
+            snapc, snapc->seq, snapc->num_snaps);
+       spin_unlock(&inode->i_lock);
+
+       /* now adjust page */
+       spin_lock_irq(&mapping->tree_lock);
+       if (page->mapping) {    /* Race with truncate? */
+               WARN_ON_ONCE(!PageUptodate(page));
+
+               if (mapping_cap_account_dirty(mapping)) {
+                       __inc_zone_page_state(page, NR_FILE_DIRTY);
+                       __inc_bdi_stat(mapping->backing_dev_info,
+                                       BDI_RECLAIMABLE);
+                       task_io_account_write(PAGE_CACHE_SIZE);
+               }
+               radix_tree_tag_set(&mapping->page_tree,
+                               page_index(page), PAGECACHE_TAG_DIRTY);
+
+               /*
+                * Reference snap context in page->private.  Also set
+                * PagePrivate so that we get invalidatepage callback.
+                */
+               page->private = (unsigned long)snapc;
+               SetPagePrivate(page);
+       } else {
+               dout("ANON set_page_dirty %p (raced truncate?)\n", page);
+               undo = 1;
+       }
+
+       spin_unlock_irq(&mapping->tree_lock);
+
+       if (undo)
+               /* whoops, we failed to dirty the page */
+               ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
+
+       /* let the VFS know the inode now has dirty pages */
+       __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+
+       BUG_ON(!PageDirty(page));
+       return 1;
+}
+
+/*
+ * If we are truncating the full page (i.e. offset == 0), adjust the
+ * dirty page counters appropriately.  Only called if there is private
+ * data on the page.
+ */
+static void ceph_invalidatepage(struct page *page, unsigned long offset)
+{
+       struct inode *inode;
+       struct ceph_inode_info *ci;
+       struct ceph_snap_context *snapc = (void *)page->private;
+
+       BUG_ON(!PageLocked(page));
+       BUG_ON(!page->private);
+       BUG_ON(!PagePrivate(page));
+       BUG_ON(!page->mapping);
+
+       inode = page->mapping->host;
+
+       /*
+        * We can get non-dirty pages here due to races between
+        * set_page_dirty and truncate_complete_page; just spit out a
+        * warning, in case we end up with accounting problems later.
+        */
+       if (!PageDirty(page))
+               pr_err("%p invalidatepage %p page not dirty\n", inode, page);
+
+       if (offset == 0)
+               ClearPageChecked(page);
+
+       ci = ceph_inode(inode);
+       if (offset == 0) {
+               dout("%p invalidatepage %p idx %lu full dirty page %lu\n",
+                    inode, page, page->index, offset);
+               /* full-page invalidate: drop the dirty-page accounting and
+                * the snap context ref taken in ceph_set_page_dirty */
+               ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
+               ceph_put_snap_context(snapc);
+               page->private = 0;
+               ClearPagePrivate(page);
+       } else {
+               /* partial truncate: keep accounting; page is still dirty */
+               dout("%p invalidatepage %p idx %lu partial dirty page\n",
+                    inode, page, page->index);
+       }
+}
+
+/* just a sanity check */
+static int ceph_releasepage(struct page *page, gfp_t g)
+{
+       struct inode *inode = page->mapping ? page->mapping->host : NULL;
+       dout("%p releasepage %p idx %lu\n", inode, page, page->index);
+       WARN_ON(PageDirty(page));
+       WARN_ON(page->private);
+       WARN_ON(PagePrivate(page));
+       /* always returns 0: we never agree to release the page here */
+       return 0;
+}
+
+/*
+ * read a single page, without unlocking it.
+ */
+static int readpage_nounlock(struct file *filp, struct page *page)
+{
+       struct inode *inode = filp->f_dentry->d_inode;
+       struct ceph_inode_info *ci = ceph_inode(inode);
+       struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
+       int err = 0;
+       u64 len = PAGE_CACHE_SIZE;
+
+       dout("readpage inode %p file %p page %p index %lu\n",
+            inode, filp, page, page->index);
+       /*
+        * Cast before shifting: page->index is unsigned long, so on
+        * 32-bit the shift would overflow for file offsets >= 4GB.
+        */
+       err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
+                                 (u64)page->index << PAGE_CACHE_SHIFT, &len,
+                                 ci->i_truncate_seq, ci->i_truncate_size,
+                                 &page, 1);
+       if (err == -ENOENT)
+               err = 0;        /* missing object == hole; serve zeros */
+       if (err < 0) {
+               SetPageError(page);
+               goto out;
+       } else if (err < PAGE_CACHE_SIZE) {
+               /* zero fill remainder of page */
+               zero_user_segment(page, err, PAGE_CACHE_SIZE);
+       }
+       SetPageUptodate(page);
+
+out:
+       return err < 0 ? err : 0;
+}
+
+/*
+ * ->readpage: read a single page, then unlock it.
+ */
+static int ceph_readpage(struct file *filp, struct page *page)
+{
+       int r = readpage_nounlock(filp, page);
+       unlock_page(page);
+       return r;
+}
+
+/*
+ * Build a vector of contiguous pages from the provided page list.
+ */
+static struct page **page_vector_from_list(struct list_head *page_list,
+                                          unsigned *nr_pages)
+{
+       struct page **pages;
+       struct page *page;
+       pgoff_t next_index;     /* was int: truncated large page indices */
+       int contig_pages = 0;
+
+       /* build page vector */
+       pages = kmalloc(sizeof(*pages) * *nr_pages, GFP_NOFS);
+       if (!pages)
+               return ERR_PTR(-ENOMEM);
+
+       BUG_ON(list_empty(page_list));
+       next_index = list_entry(page_list->prev, struct page, lru)->index;
+       list_for_each_entry_reverse(page, page_list, lru) {
+               /* stop at the first gap; we only read one contiguous run */
+               if (page->index == next_index) {
+                       dout("readpages page %d %p\n", contig_pages, page);
+                       pages[contig_pages] = page;
+                       contig_pages++;
+                       next_index++;
+               } else {
+                       break;
+               }
+       }
+       /* tell the caller how many contiguous pages we actually found */
+       *nr_pages = contig_pages;
+       return pages;
+}
+
+/*
+ * Read multiple pages.  Leave pages we don't read + unlock in page_list;
+ * the caller (VM) cleans them up.
+ */
+static int ceph_readpages(struct file *file, struct address_space *mapping,
+                         struct list_head *page_list, unsigned nr_pages)
+{
+       struct inode *inode = file->f_dentry->d_inode;
+       struct ceph_inode_info *ci = ceph_inode(inode);
+       struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
+       int rc = 0;
+       struct page **pages;
+       struct pagevec pvec;
+       loff_t offset;
+       u64 len;
+
+       dout("readpages %p file %p nr_pages %d\n",
+            inode, file, nr_pages);
+
+       pages = page_vector_from_list(page_list, &nr_pages);
+       if (IS_ERR(pages))
+               return PTR_ERR(pages);
+
+       /*
+        * guess read extent.  Cast before shifting: page->index is
+        * unsigned long, so on 32-bit the shift would overflow for
+        * file offsets >= 4GB.
+        */
+       offset = (u64)pages[0]->index << PAGE_CACHE_SHIFT;
+       len = (u64)nr_pages << PAGE_CACHE_SHIFT;
+       rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
+                                offset, &len,
+                                ci->i_truncate_seq, ci->i_truncate_size,
+                                pages, nr_pages);
+       if (rc == -ENOENT)
+               rc = 0;         /* missing object == hole; serve zeros */
+       if (rc < 0)
+               goto out;
+
+       /* set uptodate and add to lru in pagevec-sized chunks */
+       pagevec_init(&pvec, 0);
+       for (; !list_empty(page_list) && len > 0;
+            rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) {
+               struct page *page =
+                       list_entry(page_list->prev, struct page, lru);
+
+               list_del(&page->lru);
+
+               if (rc < (int)PAGE_CACHE_SIZE) {
+                       /* zero (remainder of) page */
+                       int s = rc < 0 ? 0 : rc;
+                       zero_user_segment(page, s, PAGE_CACHE_SIZE);
+               }
+
+               if (add_to_page_cache(page, mapping, page->index, GFP_NOFS)) {
+                       page_cache_release(page);
+                       dout("readpages %p add_to_page_cache failed %p\n",
+                            inode, page);
+                       continue;
+               }
+               dout("readpages %p adding %p idx %lu\n", inode, page,
+                    page->index);
+               flush_dcache_page(page);
+               SetPageUptodate(page);
+               unlock_page(page);
+               if (pagevec_add(&pvec, page) == 0)
+                       pagevec_lru_add_file(&pvec);   /* add to lru */
+       }
+       pagevec_lru_add_file(&pvec);
+       rc = 0;
+
+out:
+       kfree(pages);
+       return rc;
+}
+
+/*
+ * Get ref for the oldest snapc for an inode with dirty data... that is, the
+ * only snap context we are allowed to write back.
+ *
+ * Caller holds i_lock.
+ */
+static struct ceph_snap_context *__get_oldest_context(struct inode *inode,
+                                                     u64 *snap_size)
+{
+       struct ceph_inode_info *ci = ceph_inode(inode);
+       struct ceph_snap_context *snapc = NULL;
+       struct ceph_cap_snap *capsnap = NULL;
+
+       /* i_cap_snaps is sorted oldest-first (see top-of-file comment),
+        * so the first cap_snap with dirty pages is the one we must
+        * write back next */
+       list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
+               dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap,
+                    capsnap->context, capsnap->dirty_pages);
+               if (capsnap->dirty_pages) {
+                       snapc = ceph_get_snap_context(capsnap->context);
+                       if (snap_size)
+                               *snap_size = capsnap->size;
+                       break;
+               }
+       }
+       if (!snapc && ci->i_snap_realm) {
+               /* no cap_snap with dirty pages: fall back to the live
+                * "head" context */
+               snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
+               dout(" head snapc %p has %d dirty pages\n",
+                    snapc, ci->i_wrbuffer_ref_head);
+       }
+       return snapc;
+}
+
+/* like __get_oldest_context, but takes and drops i_lock for the caller */
+static struct ceph_snap_context *get_oldest_context(struct inode *inode,
+                                                   u64 *snap_size)
+{
+       struct ceph_snap_context *snapc = NULL;
+
+       spin_lock(&inode->i_lock);
+       snapc = __get_oldest_context(inode, snap_size);
+       spin_unlock(&inode->i_lock);
+       return snapc;
+}
+
+/*
+ * Write a single page, but leave the page locked.
+ *
+ * If we get a write error, set the page error bit, but still adjust the
+ * dirty page accounting (i.e., page is no longer dirty).
+ */
+static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
+{
+       struct inode *inode;
+       struct ceph_inode_info *ci;
+       struct ceph_client *client;
+       struct ceph_osd_client *osdc;
+       /* cast before shifting: page->index is unsigned long, so on
+        * 32-bit the shift would overflow for file offsets >= 4GB */
+       loff_t page_off = (loff_t)page->index << PAGE_CACHE_SHIFT;
+       int len = PAGE_CACHE_SIZE;
+       loff_t i_size;
+       int err = 0;
+       struct ceph_snap_context *snapc;
+       u64 snap_size = 0;
+       long writeback_stat;
+
+       dout("writepage %p idx %lu\n", page, page->index);
+
+       if (!page->mapping || !page->mapping->host) {
+               dout("writepage %p - no mapping\n", page);
+               return -EFAULT;
+       }
+       inode = page->mapping->host;
+       ci = ceph_inode(inode);
+       client = ceph_inode_to_client(inode);
+       osdc = &client->osdc;
+
+       /* verify this is a writeable snap context */
+       snapc = (void *)page->private;
+       if (snapc == NULL) {
+               dout("writepage %p page %p not dirty?\n", inode, page);
+               goto out;
+       }
+       if (snapc != get_oldest_context(inode, &snap_size)) {
+               dout("writepage %p page %p snapc %p not writeable - noop\n",
+                    inode, page, (void *)page->private);
+               /* we should only noop if called by kswapd */
+               WARN_ON((current->flags & PF_MEMALLOC) == 0);
+               goto out;
+       }
+
+       /* is this a partial page at end of file? */
+       if (snap_size)
+               i_size = snap_size;
+       else
+               i_size = i_size_read(inode);
+       /* NOTE(review): if page_off >= i_size (e.g. a truncate race),
+        * len goes negative here -- confirm such pages cannot reach us,
+        * or bail out before issuing the write */
+       if (i_size < page_off + len)
+               len = i_size - page_off;
+
+       dout("writepage %p page %p index %lu on %llu~%u\n",
+            inode, page, page->index, page_off, len);
+
+       /* throttle writeback via the per-client congestion threshold */
+       writeback_stat = atomic_long_inc_return(&client->writeback_count);
+       if (writeback_stat >
+           CONGESTION_ON_THRESH(client->mount_args->congestion_kb))
+               set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
+
+       set_page_writeback(page);
+       err = ceph_osdc_writepages(osdc, ceph_vino(inode),
+                                  &ci->i_layout, snapc,
+                                  page_off, len,
+                                  ci->i_truncate_seq, ci->i_truncate_size,
+                                  &inode->i_mtime,
+                                  &page, 1, 0, 0, true);
+       if (err < 0) {
+               dout("writepage setting page/mapping error %d %p\n", err, page);
+               SetPageError(page);
+               mapping_set_error(&inode->i_data, err);
+               if (wbc)
+                       wbc->pages_skipped++;
+       } else {
+               dout("writepage cleaned page %p\n", page);
+               err = 0;  /* vfs expects us to return 0 */
+       }
+       /* drop the dirty accounting and the snapc ref taken at dirty time */
+       page->private = 0;
+       ClearPagePrivate(page);
+       end_page_writeback(page);
+       ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
+       ceph_put_snap_context(snapc);
+out:
+       return err;
+}
+
+/*
+ * ->writepage: write the page, then unlock it.  The igrab/iput pair
+ * keeps the inode pinned for the duration of the write.
+ */
+static int ceph_writepage(struct page *page, struct writeback_control *wbc)
+{
+       int err;
+       struct inode *inode = page->mapping->host;
+       BUG_ON(!inode);
+       igrab(inode);
+       err = writepage_nounlock(page, wbc);
+       unlock_page(page);
+       iput(inode);
+       return err;
+}
+
+
+/*
+ * lame release_pages helper.  release_pages() isn't exported to
+ * modules.
+ */
+static void ceph_release_pages(struct page **pages, int num)
+{
+       struct pagevec pvec;
+       int i;
+
+       pagevec_init(&pvec, 0);
+       for (i = 0; i < num; i++) {
+               if (pagevec_add(&pvec, pages[i]) == 0)
+                       pagevec_release(&pvec);
+       }
+       pagevec_release(&pvec);
+}
+
+
+/*
+ * async writeback completion handler.
+ *
+ * If we get an error, set the mapping error bit, but not the individual
+ * page error bits.
+ *
+ * Each page in the request carries one reference on the snap context
+ * (dropped per-page below) plus a wrbuffer cap reference (dropped in
+ * one batch at the end).
+ */
+static void writepages_finish(struct ceph_osd_request *req,
+                             struct ceph_msg *msg)
+{
+       struct inode *inode = req->r_inode;
+       struct ceph_osd_reply_head *replyhead;
+       struct ceph_osd_op *op;
+       struct ceph_inode_info *ci = ceph_inode(inode);
+       unsigned wrote;
+       struct page *page;
+       int i;
+       struct ceph_snap_context *snapc = req->r_snapc;
+       struct address_space *mapping = inode->i_mapping;
+       struct writeback_control *wbc = req->r_wbc;
+       __s32 rc = -EIO;
+       u64 bytes = 0;
+       struct ceph_client *client = ceph_inode_to_client(inode);
+       long writeback_stat;
+       unsigned issued = __ceph_caps_issued(ci, NULL);
+
+       /* parse reply: result code and the length of the first op */
+       replyhead = msg->front.iov_base;
+       WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
+       op = (void *)(replyhead + 1);
+       rc = le32_to_cpu(replyhead->result);
+       bytes = le64_to_cpu(op->extent.length);
+
+       if (rc >= 0) {
+               /*
+                * Assume we wrote the pages we originally sent.  The
+                * osd might reply with fewer pages if our writeback
+                * raced with a truncation and was adjusted at the osd,
+                * so don't believe the reply.
+                */
+               wrote = req->r_num_pages;
+       } else {
+               wrote = 0;
+               mapping_set_error(mapping, rc);
+       }
+       dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)\n",
+            inode, rc, bytes, wrote);
+
+       /* clean all pages */
+       for (i = 0; i < req->r_num_pages; i++) {
+               page = req->r_pages[i];
+               BUG_ON(!page);
+               WARN_ON(!PageUptodate(page));
+
+               /* writeback accounting; maybe un-congest the bdi */
+               writeback_stat =
+                       atomic_long_dec_return(&client->writeback_count);
+               if (writeback_stat <
+                   CONGESTION_OFF_THRESH(client->mount_args->congestion_kb))
+                       clear_bdi_congested(&client->backing_dev_info,
+                                           BLK_RW_ASYNC);
+
+               /* pages beyond 'wrote' were not written; tell the vfs */
+               if (i >= wrote) {
+                       dout("inode %p skipping page %p\n", inode, page);
+                       wbc->pages_skipped++;
+               }
+               page->private = 0;
+               ClearPagePrivate(page);
+               ceph_put_snap_context(snapc);
+               dout("unlocking %d %p\n", i, page);
+               end_page_writeback(page);
+
+               /*
+                * We lost the cache cap, need to truncate the page before
+                * it is unlocked, otherwise we'd truncate it later in the
+                * page truncation thread, possibly losing some data that
+                * raced its way in
+                */
+               if ((issued & CEPH_CAP_FILE_CACHE) == 0)
+                       generic_error_remove_page(inode->i_mapping, page);
+
+               unlock_page(page);
+       }
+       dout("%p wrote+cleaned %d pages\n", inode, wrote);
+       ceph_put_wrbuffer_cap_refs(ci, req->r_num_pages, snapc);
+
+       /* free the page vector; alloc_page_vec may have used the mempool */
+       ceph_release_pages(req->r_pages, req->r_num_pages);
+       if (req->r_pages_from_pool)
+               mempool_free(req->r_pages,
+                            ceph_client(inode->i_sb)->wb_pagevec_pool);
+       else
+               kfree(req->r_pages);
+       ceph_osdc_put_request(req);
+}
+
+/*
+ * allocate a page vec, either directly, or if necessary, via a the
+ * mempool.  we avoid the mempool if we can because req->r_num_pages
+ * may be less than the maximum write size.
+ */
+static void alloc_page_vec(struct ceph_client *client,
+                          struct ceph_osd_request *req)
+{
+       req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages,
+                              GFP_NOFS);
+       if (!req->r_pages) {
+               req->r_pages = mempool_alloc(client->wb_pagevec_pool, GFP_NOFS);
+               req->r_pages_from_pool = 1;
+               WARN_ON(!req->r_pages);
+       }
+}
+
+/*
+ * initiate async writeback
+ *
+ * Walk the mapping for dirty pages tagged with the oldest snap context
+ * that still has dirty data, batch consecutive pages into a single OSD
+ * write request, and submit it; writepages_finish() cleans up when the
+ * reply arrives.
+ */
+static int ceph_writepages_start(struct address_space *mapping,
+                                struct writeback_control *wbc)
+{
+       struct inode *inode = mapping->host;
+       struct backing_dev_info *bdi = mapping->backing_dev_info;
+       struct ceph_inode_info *ci = ceph_inode(inode);
+       struct ceph_client *client;
+       pgoff_t index, start, end;
+       int range_whole = 0;
+       int should_loop = 1;
+       pgoff_t max_pages = 0, max_pages_ever = 0;
+       struct ceph_snap_context *snapc = NULL, *last_snapc = NULL;
+       struct pagevec pvec;
+       int done = 0;
+       int rc = 0;
+       unsigned wsize = 1 << inode->i_blkbits;
+       struct ceph_osd_request *req = NULL;
+       int do_sync;
+       u64 snap_size = 0;
+
+       /*
+        * Include a 'sync' in the OSD request if this is a data
+        * integrity write (e.g., O_SYNC write or fsync()), or if our
+        * cap is being revoked.
+        */
+       do_sync = wbc->sync_mode == WB_SYNC_ALL;
+       if (ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER))
+               do_sync = 1;
+       dout("writepages_start %p dosync=%d (mode=%s)\n",
+            inode, do_sync,
+            wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
+            (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
+
+       client = ceph_inode_to_client(inode);
+       if (client->mount_state == CEPH_MOUNT_SHUTDOWN) {
+               pr_warning("writepage_start %p on forced umount\n", inode);
+               return -EIO; /* we're in a forced umount, don't write! */
+       }
+       /* clamp write size to the mount option, but at least one page */
+       if (client->mount_args->wsize && client->mount_args->wsize < wsize)
+               wsize = client->mount_args->wsize;
+       if (wsize < PAGE_CACHE_SIZE)
+               wsize = PAGE_CACHE_SIZE;
+       max_pages_ever = wsize >> PAGE_CACHE_SHIFT;
+
+       pagevec_init(&pvec, 0);
+
+       /* ?? */
+       if (wbc->nonblocking && bdi_write_congested(bdi)) {
+               dout(" writepages congested\n");
+               wbc->encountered_congestion = 1;
+               goto out_final;
+       }
+
+       /* where to start/end? */
+       if (wbc->range_cyclic) {
+               start = mapping->writeback_index; /* Start from prev offset */
+               end = -1;
+               dout(" cyclic, start at %lu\n", start);
+       } else {
+               start = wbc->range_start >> PAGE_CACHE_SHIFT;
+               end = wbc->range_end >> PAGE_CACHE_SHIFT;
+               if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+                       range_whole = 1;
+               should_loop = 0;
+               dout(" not cyclic, %lu to %lu\n", start, end);
+       }
+       index = start;
+
+retry:
+       /* find oldest snap context with dirty data */
+       ceph_put_snap_context(snapc);
+       snapc = get_oldest_context(inode, &snap_size);
+       if (!snapc) {
+               /* hmm, why does writepages get called when there
+                  is no dirty data? */
+               dout(" no snap context with dirty data?\n");
+               goto out;
+       }
+       dout(" oldest snapc is %p seq %lld (%d snaps)\n",
+            snapc, snapc->seq, snapc->num_snaps);
+       if (last_snapc && snapc != last_snapc) {
+               /* if we switched to a newer snapc, restart our scan at the
+                * start of the original file range. */
+               dout("  snapc differs from last pass, restarting at %lu\n",
+                    index);
+               index = start;
+       }
+       last_snapc = snapc;
+
+       while (!done && index <= end) {
+               unsigned i;
+               int first;
+               pgoff_t next;
+               int pvec_pages, locked_pages;
+               struct page *page;
+               int want;
+               u64 offset, len;
+               struct ceph_osd_request_head *reqhead;
+               struct ceph_osd_op *op;
+               long writeback_stat;
+
+               next = 0;
+               locked_pages = 0;
+               max_pages = max_pages_ever;
+
+get_more_pages:
+               first = -1;
+               want = min(end - index,
+                          min((pgoff_t)PAGEVEC_SIZE,
+                              max_pages - (pgoff_t)locked_pages) - 1)
+                       + 1;
+               pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+                                               PAGECACHE_TAG_DIRTY,
+                                               want);
+               dout("pagevec_lookup_tag got %d\n", pvec_pages);
+               if (!pvec_pages && !locked_pages)
+                       break;
+               for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) {
+                       page = pvec.pages[i];
+                       dout("? %p idx %lu\n", page, page->index);
+                       if (locked_pages == 0)
+                               lock_page(page);  /* first page */
+                       else if (!trylock_page(page))
+                               break;
+
+                       /* only dirty pages, or our accounting breaks */
+                       if (unlikely(!PageDirty(page)) ||
+                           unlikely(page->mapping != mapping)) {
+                               dout("!dirty or !mapping %p\n", page);
+                               unlock_page(page);
+                               break;
+                       }
+                       if (!wbc->range_cyclic && page->index > end) {
+                               dout("end of range %p\n", page);
+                               done = 1;
+                               unlock_page(page);
+                               break;
+                       }
+                       if (next && (page->index != next)) {
+                               dout("not consecutive %p\n", page);
+                               unlock_page(page);
+                               break;
+                       }
+                       if (wbc->sync_mode != WB_SYNC_NONE) {
+                               dout("waiting on writeback %p\n", page);
+                               wait_on_page_writeback(page);
+                       }
+                       if ((snap_size && page_offset(page) > snap_size) ||
+                           (!snap_size &&
+                            page_offset(page) > i_size_read(inode))) {
+                               dout("%p page eof %llu\n", page, snap_size ?
+                                    snap_size : i_size_read(inode));
+                               done = 1;
+                               unlock_page(page);
+                               break;
+                       }
+                       if (PageWriteback(page)) {
+                               dout("%p under writeback\n", page);
+                               unlock_page(page);
+                               break;
+                       }
+
+                       /* only if matching snap context */
+                       if (snapc != (void *)page->private) {
+                               dout("page snapc %p != oldest %p\n",
+                                    (void *)page->private, snapc);
+                               unlock_page(page);
+                               if (!locked_pages)
+                                       continue; /* keep looking for snap */
+                               break;
+                       }
+
+                       if (!clear_page_dirty_for_io(page)) {
+                               dout("%p !clear_page_dirty_for_io\n", page);
+                               unlock_page(page);
+                               break;
+                       }
+
+                       /* ok */
+                       if (locked_pages == 0) {
+                               /* prepare async write request */
+                               /* NOTE(review): index << PAGE_CACHE_SHIFT can
+                                * truncate on 32-bit; page_offset() would be
+                                * loff_t-safe here -- confirm. */
+                               offset = page->index << PAGE_CACHE_SHIFT;
+                               len = wsize;
+                               req = ceph_osdc_new_request(&client->osdc,
+                                           &ci->i_layout,
+                                           ceph_vino(inode),
+                                           offset, &len,
+                                           CEPH_OSD_OP_WRITE,
+                                           CEPH_OSD_FLAG_WRITE |
+                                                   CEPH_OSD_FLAG_ONDISK,
+                                           snapc, do_sync,
+                                           ci->i_truncate_seq,
+                                           ci->i_truncate_size,
+                                           &inode->i_mtime, true, 1);
+                               /* NOTE(review): req is dereferenced without a
+                                * NULL/IS_ERR check -- confirm that
+                                * ceph_osdc_new_request cannot fail here. */
+                               max_pages = req->r_num_pages;
+
+                               alloc_page_vec(client, req);
+                               req->r_callback = writepages_finish;
+                               req->r_inode = inode;
+                               req->r_wbc = wbc;
+                       }
+
+                       /* note position of first page in pvec */
+                       if (first < 0)
+                               first = i;
+                       dout("%p will write page %p idx %lu\n",
+                            inode, page, page->index);
+
+                       writeback_stat = atomic_long_inc_return(&client->writeback_count);
+                       if (writeback_stat > CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) {
+                               set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
+                       }
+
+                       set_page_writeback(page);
+                       req->r_pages[locked_pages] = page;
+                       locked_pages++;
+                       next = page->index + 1;
+               }
+
+               /* did we get anything? */
+               if (!locked_pages)
+                       goto release_pvec_pages;
+               if (i) {
+                       int j;
+                       BUG_ON(!locked_pages || first < 0);
+
+                       if (pvec_pages && i == pvec_pages &&
+                           locked_pages < max_pages) {
+                               dout("reached end pvec, trying for more\n");
+                               pagevec_reinit(&pvec);
+                               goto get_more_pages;
+                       }
+
+                       /* shift unused pages over in the pvec...  we
+                        * will need to release them below. */
+                       for (j = i; j < pvec_pages; j++) {
+                               dout(" pvec leftover page %p\n",
+                                    pvec.pages[j]);
+                               pvec.pages[j-i+first] = pvec.pages[j];
+                       }
+                       pvec.nr -= i-first;
+               }
+
+               /* submit the write */
+               offset = req->r_pages[0]->index << PAGE_CACHE_SHIFT;
+               len = min((snap_size ? snap_size : i_size_read(inode)) - offset,
+                         (u64)locked_pages << PAGE_CACHE_SHIFT);
+               dout("writepages got %d pages at %llu~%llu\n",
+                    locked_pages, offset, len);
+
+               /* revise final length, page count */
+               req->r_num_pages = locked_pages;
+               reqhead = req->r_request->front.iov_base;
+               op = (void *)(reqhead + 1);
+               op->extent.length = cpu_to_le64(len);
+               op->payload_len = cpu_to_le32(len);
+               req->r_request->hdr.data_len = cpu_to_le32(len);
+
+               ceph_osdc_start_request(&client->osdc, req, true);
+               req = NULL;
+
+               /* continue? */
+               index = next;
+               wbc->nr_to_write -= locked_pages;
+               if (wbc->nr_to_write <= 0)
+                       done = 1;
+
+release_pvec_pages:
+               dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr,
+                    pvec.nr ? pvec.pages[0] : NULL);
+               pagevec_release(&pvec);
+
+               if (locked_pages && !done)
+                       goto retry;
+       }
+
+       if (should_loop && !done) {
+               /* more to do; loop back to beginning of file */
+               dout("writepages looping back to beginning of file\n");
+               should_loop = 0;
+               index = 0;
+               goto retry;
+       }
+
+       if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+               mapping->writeback_index = index;
+
+out:
+       if (req)
+               ceph_osdc_put_request(req);
+       if (rc > 0)
+               rc = 0;  /* vfs expects us to return 0 */
+       ceph_put_snap_context(snapc);
+       dout("writepages done, rc = %d\n", rc);
+out_final:
+       return rc;
+}
+
+
+
+/*
+ * See if a given @snapc is either writeable, or already written.
+ */
+static int context_is_writeable_or_written(struct inode *inode,
+                                          struct ceph_snap_context *snapc)
+{
+       struct ceph_snap_context *oldest = get_oldest_context(inode, NULL);
+       return !oldest || snapc->seq <= oldest->seq;
+}
+
+/*
+ * We are only allowed to write into/dirty the page if the page is
+ * clean, or already dirty within the same snap context.
+ *
+ * Called with the page locked.  On success (return 0) the page is
+ * still locked and mdsc->snap_rwsem is held for read; the caller
+ * (ceph_write_end or ceph_page_mkwrite) is responsible for dropping
+ * it.  On failure the page has been unlocked and the rwsem released.
+ * -EAGAIN means the page was dirty in an older snap context and we
+ * blocked waiting for it; the caller should restart.
+ */
+static int ceph_update_writeable_page(struct file *file,
+                           loff_t pos, unsigned len,
+                           struct page *page)
+{
+       struct inode *inode = file->f_dentry->d_inode;
+       struct ceph_inode_info *ci = ceph_inode(inode);
+       struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+       loff_t page_off = pos & PAGE_CACHE_MASK;
+       int pos_in_page = pos & ~PAGE_CACHE_MASK;
+       int end_in_page = pos_in_page + len;
+       loff_t i_size;
+       struct ceph_snap_context *snapc;
+       int r;
+
+retry_locked:
+       /* writepages currently holds page lock, but if we change that later, */
+       wait_on_page_writeback(page);
+
+       /* check snap context */
+       BUG_ON(!ci->i_snap_realm);
+       down_read(&mdsc->snap_rwsem);
+       BUG_ON(!ci->i_snap_realm->cached_context);
+       if (page->private &&
+           (void *)page->private != ci->i_snap_realm->cached_context) {
+               /*
+                * this page is already dirty in another (older) snap
+                * context!  is it writeable now?
+                */
+               snapc = get_oldest_context(inode, NULL);
+               up_read(&mdsc->snap_rwsem);
+
+               if (snapc != (void *)page->private) {
+                       dout(" page %p snapc %p not current or oldest\n",
+                            page, (void *)page->private);
+                       /*
+                        * queue for writeback, and wait for snapc to
+                        * be writeable or written
+                        */
+                       snapc = ceph_get_snap_context((void *)page->private);
+                       unlock_page(page);
+                       ceph_queue_writeback(inode);
+                       wait_event_interruptible(ci->i_cap_wq,
+                              context_is_writeable_or_written(inode, snapc));
+                       ceph_put_snap_context(snapc);
+                       return -EAGAIN;
+               }
+
+               /* yay, writeable, do it now (without dropping page lock) */
+               dout(" page %p snapc %p not current, but oldest\n",
+                    page, snapc);
+               if (!clear_page_dirty_for_io(page))
+                       goto retry_locked;
+               r = writepage_nounlock(page, NULL);
+               if (r < 0)
+                       goto fail_nosnap;
+               goto retry_locked;
+       }
+
+       if (PageUptodate(page)) {
+               dout(" page %p already uptodate\n", page);
+               return 0;
+       }
+
+       /* full page? */
+       if (pos_in_page == 0 && len == PAGE_CACHE_SIZE)
+               return 0;
+
+       /* past end of file? */
+       i_size = inode->i_size;   /* caller holds i_mutex */
+
+       if (i_size + len > inode->i_sb->s_maxbytes) {
+               /* file is too big */
+               r = -EINVAL;
+               goto fail;
+       }
+
+       /* beyond EOF (or extending to EOF with a partial page): zero the
+        * parts we won't copy into rather than reading from the osd */
+       if (page_off >= i_size ||
+           (pos_in_page == 0 && (pos+len) >= i_size &&
+            end_in_page - pos_in_page != PAGE_CACHE_SIZE)) {
+               dout(" zeroing %p 0 - %d and %d - %d\n",
+                    page, pos_in_page, end_in_page, (int)PAGE_CACHE_SIZE);
+               zero_user_segments(page,
+                                  0, pos_in_page,
+                                  end_in_page, PAGE_CACHE_SIZE);
+               return 0;
+       }
+
+       /* we need to read it. */
+       up_read(&mdsc->snap_rwsem);
+       r = readpage_nounlock(file, page);
+       if (r < 0)
+               goto fail_nosnap;
+       goto retry_locked;
+
+fail:
+       up_read(&mdsc->snap_rwsem);
+fail_nosnap:
+       unlock_page(page);
+       return r;
+}
+
+/*
+ * We are only allowed to write into/dirty the page if the page is
+ * clean, or already dirty within the same snap context.
+ */
+static int ceph_write_begin(struct file *file, struct address_space *mapping,
+                           loff_t pos, unsigned len, unsigned flags,
+                           struct page **pagep, void **fsdata)
+{
+       struct inode *inode = file->f_dentry->d_inode;
+       struct page *page;
+       pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+       int r;
+
+       do {
+               /* get a page*/
+               page = grab_cache_page_write_begin(mapping, index, 0);
+               if (!page)
+                       return -ENOMEM;
+               *pagep = page;
+
+               dout("write_begin file %p inode %p page %p %d~%d\n", file,
+               inode, page, (int)pos, (int)len);
+
+               r = ceph_update_writeable_page(file, pos, len, page);
+       } while (r == -EAGAIN);
+
+       return r;
+}
+
+/*
+ * we don't do anything in here that simple_write_end doesn't do
+ * except adjust dirty page accounting and drop read lock on
+ * mdsc->snap_rwsem.
+ *
+ * The snap_rwsem read lock was taken by ceph_write_begin (via
+ * ceph_update_writeable_page) and is released here.
+ */
+static int ceph_write_end(struct file *file, struct address_space *mapping,
+                         loff_t pos, unsigned len, unsigned copied,
+                         struct page *page, void *fsdata)
+{
+       struct inode *inode = file->f_dentry->d_inode;
+       struct ceph_client *client = ceph_inode_to_client(inode);
+       struct ceph_mds_client *mdsc = &client->mdsc;
+       unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+       int check_cap = 0;
+
+       dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
+            inode, page, (int)pos, (int)copied, (int)len);
+
+       /* zero the stale part of the page if we did a short copy */
+       /* NOTE(review): the end offset passed is 'len', not 'from+len';
+        * presumably safe only because a not-uptodate page here implies
+        * a full-page write (from == 0) -- confirm against
+        * ceph_update_writeable_page. */
+       if (copied < len)
+               zero_user_segment(page, from+copied, len);
+
+       /* did file size increase? */
+       /* (no need for i_size_read(); we caller holds i_mutex */
+       if (pos+copied > inode->i_size)
+               check_cap = ceph_inode_set_size(inode, pos+copied);
+
+       if (!PageUptodate(page))
+               SetPageUptodate(page);
+
+       set_page_dirty(page);
+
+       unlock_page(page);
+       up_read(&mdsc->snap_rwsem);
+       page_cache_release(page);
+
+       /* check_cap was set above when the file size grew */
+       if (check_cap)
+               ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);
+
+       return copied;
+}
+
+/*
+ * we set .direct_IO to indicate direct io is supported, but since we
+ * intercept O_DIRECT reads and writes early, this function should
+ * never get called.
+ */
+static ssize_t ceph_direct_io(int rw, struct kiocb *iocb,
+                             const struct iovec *iov,
+                             loff_t pos, unsigned long nr_segs)
+{
+       WARN_ON(1);
+       return -EINVAL;
+}
+
+/*
+ * address_space operations for regular ceph files.  .direct_IO is a
+ * stub (see ceph_direct_io above); O_DIRECT is intercepted earlier.
+ */
+const struct address_space_operations ceph_aops = {
+       .readpage = ceph_readpage,
+       .readpages = ceph_readpages,
+       .writepage = ceph_writepage,
+       .writepages = ceph_writepages_start,
+       .write_begin = ceph_write_begin,
+       .write_end = ceph_write_end,
+       .set_page_dirty = ceph_set_page_dirty,
+       .invalidatepage = ceph_invalidatepage,
+       .releasepage = ceph_releasepage,
+       .direct_IO = ceph_direct_io,
+};
+
+
+/*
+ * vm ops
+ */
+
+/*
+ * Reuse write_begin here for simplicity.
+ */
+static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+       struct inode *inode = vma->vm_file->f_dentry->d_inode;
+       struct page *page = vmf->page;
+       struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+       loff_t off = page->index << PAGE_CACHE_SHIFT;
+       loff_t size, len;
+       int ret;
+
+       size = i_size_read(inode);
+       if (off + PAGE_CACHE_SIZE <= size)
+               len = PAGE_CACHE_SIZE;
+       else
+               len = size & ~PAGE_CACHE_MASK;
+
+       dout("page_mkwrite %p %llu~%llu page %p idx %lu\n", inode,
+            off, len, page, page->index);
+
+       lock_page(page);
+
+       ret = VM_FAULT_NOPAGE;
+       if ((off > size) ||
+           (page->mapping != inode->i_mapping))
+               goto out;
+
+       ret = ceph_update_writeable_page(vma->vm_file, off, len, page);
+       if (ret == 0) {
+               /* success.  we'll keep the page locked. */
+               set_page_dirty(page);
+               up_read(&mdsc->snap_rwsem);
+               ret = VM_FAULT_LOCKED;
+       } else {
+               if (ret == -ENOMEM)
+                       ret = VM_FAULT_OOM;
+               else
+                       ret = VM_FAULT_SIGBUS;
+       }
+out:
+       dout("page_mkwrite %p %llu~%llu = %d\n", inode, off, len, ret);
+       if (ret != VM_FAULT_LOCKED)
+               unlock_page(page);
+       return ret;
+}
+
+static struct vm_operations_struct ceph_vmops = {
+       .fault          = filemap_fault,
+       .page_mkwrite   = ceph_page_mkwrite,
+};
+
+int ceph_mmap(struct file *file, struct vm_area_struct *vma)
+{
+       struct address_space *mapping = file->f_mapping;
+
+       if (!mapping->a_ops->readpage)
+               return -ENOEXEC;
+       file_accessed(file);
+       vma->vm_ops = &ceph_vmops;
+       vma->vm_flags |= VM_CAN_NONLINEAR;
+       return 0;
+}
diff --git a/fs/ceph/armor.c b/fs/ceph/armor.c
new file mode 100644 (file)
index 0000000..67b2c03
--- /dev/null
@@ -0,0 +1,99 @@
+
+#include <linux/errno.h>
+
+/*
+ * base64 encode/decode.
+ */
+
+const char *pem_key = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
+/* map a 6-bit value (0..63) to its base64 alphabet character */
+static int encode_bits(int c)
+{
+       return pem_key[c];
+}
+
/*
 * Map a base64 character back to its 6-bit value.  The '=' padding
 * character decodes to 0 (any non-negative value will do; the caller
 * checks for '=' explicitly).  Anything outside the base64 alphabet
 * yields -EINVAL.
 */
static int decode_bits(char c)
{
        if (c >= 'A' && c <= 'Z')
                return c - 'A';
        if (c >= 'a' && c <= 'z')
                return 26 + (c - 'a');
        if (c >= '0' && c <= '9')
                return 52 + (c - '0');
        switch (c) {
        case '+':
                return 62;
        case '/':
                return 63;
        case '=':
                return 0; /* just non-negative, please */
        }
        return -EINVAL;
}
+
/*
 * Base64-encode the bytes in [src, end) into dst, inserting a newline
 * after every 64 output characters.  Returns the number of characters
 * written.  dst must have room for 4 output bytes per 3 input bytes,
 * rounded up, plus the newlines.
 */
int ceph_armor(char *dst, const char *src, const char *end)
{
        int written = 0;
        int line_len = 0;

        while (src < end) {
                long remain = end - src;
                unsigned char in0, in1, in2;

                in0 = *src++;
                *dst++ = encode_bits(in0 >> 2);
                if (remain >= 2) {
                        in1 = *src++;
                        *dst++ = encode_bits(((in0 & 3) << 4) | (in1 >> 4));
                        if (remain >= 3) {
                                in2 = *src++;
                                *dst++ = encode_bits(((in1 & 15) << 2) |
                                                     (in2 >> 6));
                                *dst++ = encode_bits(in2 & 63);
                        } else {
                                /* two input bytes: one '=' pad */
                                *dst++ = encode_bits((in1 & 15) << 2);
                                *dst++ = '=';
                        }
                } else {
                        /* one input byte: two '=' pads */
                        *dst++ = encode_bits((in0 & 3) << 4);
                        *dst++ = '=';
                        *dst++ = '=';
                }
                written += 4;
                line_len += 4;
                if (line_len == 64) {
                        line_len = 0;
                        *dst++ = '\n';
                        written++;
                }
        }
        return written;
}
+
/*
 * Base64-decode [src, end) into dst, tolerating a single newline
 * before each 4-character group.  Returns the number of bytes
 * produced, or -EINVAL on malformed or truncated input.
 */
int ceph_unarmor(char *dst, const char *src, const char *end)
{
        int produced = 0;

        while (src < end) {
                int v0, v1, v2, v3;

                /* skip a line break between groups, if present */
                if (src < end && src[0] == '\n')
                        src++;
                /* a full 4-character group must remain */
                if (src + 4 > end)
                        return -EINVAL;
                v0 = decode_bits(src[0]);
                v1 = decode_bits(src[1]);
                v2 = decode_bits(src[2]);
                v3 = decode_bits(src[3]);
                /* any -EINVAL sets the sign bit of the OR */
                if ((v0 | v1 | v2 | v3) < 0)
                        return -EINVAL;

                *dst++ = (v0 << 2) | (v1 >> 4);
                if (src[2] == '=')
                        return produced + 1;
                *dst++ = ((v1 & 15) << 4) | (v2 >> 2);
                if (src[3] == '=')
                        return produced + 2;
                *dst++ = ((v2 & 3) << 6) | v3;
                produced += 3;
                src += 4;
        }
        return produced;
}
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
new file mode 100644 (file)
index 0000000..abb204f
--- /dev/null
@@ -0,0 +1,257 @@
+#include "ceph_debug.h"
+
+#include <linux/module.h>
+#include <linux/err.h>
+
+#include "types.h"
+#include "auth_none.h"
+#include "auth_x.h"
+#include "decode.h"
+#include "super.h"
+
+#include "messenger.h"
+
+/*
+ * Auth protocols we advertise to the monitor during the initial
+ * hello, in order of declaration (see ceph_auth_build_hello()).
+ */
+static u32 supported_protocols[] = {
+       CEPH_AUTH_NONE,
+       CEPH_AUTH_CEPHX
+};
+
+/*
+ * Instantiate the handler for the protocol the monitor selected.
+ * Returns 0 on success, -ENOENT for an unknown protocol, or the
+ * protocol-specific init error.
+ */
+int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)
+{
+       switch (protocol) {
+       case CEPH_AUTH_NONE:
+               return ceph_auth_none_init(ac);
+       case CEPH_AUTH_CEPHX:
+               return ceph_x_init(ac);
+       default:
+               return -ENOENT;
+       }
+}
+
+/*
+ * setup, teardown.
+ */
+/*
+ * Allocate and set up an auth client handle.
+ *
+ * @name: entity name, or NULL to use CEPH_AUTH_NAME_DEFAULT.  Not
+ *        copied; the caller must keep it alive.
+ * @secret: base64 secret key; not copied, caller keeps it alive.
+ *
+ * Returns the new handle or ERR_PTR(-ENOMEM).
+ */
+struct ceph_auth_client *ceph_auth_init(const char *name, const char *secret)
+{
+       struct ceph_auth_client *ac;
+       int ret;
+
+       /* never log the secret key, even in debug output */
+       dout("auth_init name '%s'\n", name);
+
+       ret = -ENOMEM;
+       ac = kzalloc(sizeof(*ac), GFP_NOFS);
+       if (!ac)
+               goto out;
+
+       ac->negotiating = true;
+       if (name)
+               ac->name = name;
+       else
+               ac->name = CEPH_AUTH_NAME_DEFAULT;
+       dout("auth_init name %s\n", ac->name);
+       ac->secret = secret;
+       return ac;
+
+out:
+       return ERR_PTR(ret);
+}
+
+/*
+ * Tear down an auth client: let the active protocol handler release
+ * its state (if one was negotiated), then free the handle itself.
+ */
+void ceph_auth_destroy(struct ceph_auth_client *ac)
+{
+       dout("auth_destroy %p\n", ac);
+       if (ac->ops)
+               ac->ops->destroy(ac);
+       kfree(ac);
+}
+
+/*
+ * Reset occurs when reconnecting to the monitor.  The protocol
+ * handler (if any) drops its session state and we re-enter the
+ * negotiation phase; the next message out will be a hello.
+ */
+void ceph_auth_reset(struct ceph_auth_client *ac)
+{
+       dout("auth_reset %p\n", ac);
+       /* skip the reset callback if we never finished negotiating */
+       if (ac->ops && !ac->negotiating)
+               ac->ops->reset(ac);
+       ac->negotiating = true;
+}
+
+/*
+ * Encode our entity name (type CLIENT + length-prefixed string) into
+ * the buffer at *p, advancing *p.  Returns 0, or -ERANGE if the
+ * encoding would run past end.
+ */
+int ceph_entity_name_encode(const char *name, void **p, void *end)
+{
+       int len = strlen(name);
+
+       if (*p + 2*sizeof(u32) + len > end)
+               return -ERANGE;
+       ceph_encode_32(p, CEPH_ENTITY_TYPE_CLIENT);
+       ceph_encode_32(p, len);
+       ceph_encode_copy(p, name, len);
+       return 0;
+}
+
+/*
+ * Initiate protocol negotiation with monitor.  Include entity name
+ * and list supported protocols.
+ *
+ * Builds the hello into buf (len bytes) and returns the number of
+ * bytes written, or -ERANGE if it does not fit.
+ */
+int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
+{
+       struct ceph_mon_request_header *monhdr = buf;
+       void *p = monhdr + 1, *end = buf + len, *lenp;
+       int i, num;
+       int ret;
+
+       dout("auth_build_hello\n");
+       monhdr->have_version = 0;
+       monhdr->session_mon = cpu_to_le16(-1);
+       monhdr->session_mon_tid = 0;
+
+       ceph_encode_32(&p, 0);  /* no protocol, yet */
+
+       /* remember where the payload length goes; filled in at the end */
+       lenp = p;
+       p += sizeof(u32);
+
+       /* ceph_decode_need is (ab)used here purely as a "this much
+        * space remains before end" check on the encode side */
+       ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
+       ceph_encode_8(&p, 1);
+       num = ARRAY_SIZE(supported_protocols);
+       ceph_encode_32(&p, num);
+       ceph_decode_need(&p, end, num * sizeof(u32), bad);
+       for (i = 0; i < num; i++)
+               ceph_encode_32(&p, supported_protocols[i]);
+
+       ret = ceph_entity_name_encode(ac->name, &p, end);
+       if (ret < 0)
+               return ret;
+       ceph_decode_need(&p, end, sizeof(u64), bad);
+       ceph_encode_64(&p, ac->global_id);
+
+       /* back-patch the payload length now that we know it */
+       ceph_encode_32(&lenp, p - lenp - sizeof(u32));
+       return p - buf;
+
+bad:
+       return -ERANGE;
+}
+
+/*
+ * Build a protocol-specific auth request into msg_buf.  Assumes a
+ * protocol has been negotiated (ac->ops is non-NULL; callers go
+ * through ceph_build_auth()/ceph_handle_auth_reply() which ensure
+ * this).  Returns the total message length or a negative error.
+ */
+int ceph_build_auth_request(struct ceph_auth_client *ac,
+                          void *msg_buf, size_t msg_len)
+{
+       struct ceph_mon_request_header *monhdr = msg_buf;
+       void *p = monhdr + 1;
+       void *end = msg_buf + msg_len;
+       int ret;
+
+       monhdr->have_version = 0;
+       monhdr->session_mon = cpu_to_le16(-1);
+       monhdr->session_mon_tid = 0;
+
+       ceph_encode_32(&p, ac->protocol);
+
+       /* payload goes after a u32 length field, written below */
+       ret = ac->ops->build_request(ac, p + sizeof(u32), end);
+       if (ret < 0) {
+               pr_err("error %d building request\n", ret);
+               return ret;
+       }
+       dout(" built request %d bytes\n", ret);
+       ceph_encode_32(&p, ret);
+       return p + ret - msg_buf;
+}
+
+/*
+ * Handle auth message from monitor.
+ *
+ * Decodes the reply, adopts any new global_id, switches to the
+ * protocol the monitor selected (while negotiating), and hands the
+ * payload to the protocol handler.  If the handler needs another
+ * round (-EAGAIN) the next request is built into reply_buf and its
+ * length returned; 0 means done, negative means error.
+ */
+int ceph_handle_auth_reply(struct ceph_auth_client *ac,
+                          void *buf, size_t len,
+                          void *reply_buf, size_t reply_len)
+{
+       void *p = buf;
+       void *end = buf + len;
+       int protocol;
+       s32 result;
+       u64 global_id;
+       void *payload, *payload_end;
+       int payload_len;
+       char *result_msg;
+       int result_msg_len;
+       int ret = -EINVAL;
+
+       dout("handle_auth_reply %p %p\n", p, end);
+       ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad);
+       protocol = ceph_decode_32(&p);
+       result = ceph_decode_32(&p);
+       global_id = ceph_decode_64(&p);
+       payload_len = ceph_decode_32(&p);
+       payload = p;
+       /* NOTE(review): p is advanced by the wire-supplied payload_len
+        * before being re-checked; the following decode_need and the
+        * p != end test catch overruns, assuming the addition does not
+        * wrap -- confirm payload_len is bounded by the message size. */
+       p += payload_len;
+       ceph_decode_need(&p, end, sizeof(u32), bad);
+       result_msg_len = ceph_decode_32(&p);
+       result_msg = p;
+       p += result_msg_len;
+       if (p != end)
+               goto bad;
+
+       dout(" result %d '%.*s' gid %llu len %d\n", result, result_msg_len,
+            result_msg, global_id, payload_len);
+
+       payload_end = payload + payload_len;
+
+       /* monitor may assign (or reassign) our global id */
+       if (global_id && ac->global_id != global_id) {
+               dout(" set global_id %lld -> %lld\n", ac->global_id, global_id);
+               ac->global_id = global_id;
+       }
+
+       if (ac->negotiating) {
+               /* server does not support our protocols? */
+               if (!protocol && result < 0) {
+                       ret = result;
+                       goto out;
+               }
+               /* set up (new) protocol handler? */
+               if (ac->protocol && ac->protocol != protocol) {
+                       ac->ops->destroy(ac);
+                       ac->protocol = 0;
+                       ac->ops = NULL;
+               }
+               if (ac->protocol != protocol) {
+                       ret = ceph_auth_init_protocol(ac, protocol);
+                       if (ret) {
+                               pr_err("error %d on auth protocol %d init\n",
+                                      ret, protocol);
+                               goto out;
+                       }
+               }
+
+               ac->negotiating = false;
+       }
+
+       ret = ac->ops->handle_reply(ac, result, payload, payload_end);
+       if (ret == -EAGAIN) {
+               /* another handshake round: hand back the next request */
+               return ceph_build_auth_request(ac, reply_buf, reply_len);
+       } else if (ret) {
+               pr_err("authentication error %d\n", ret);
+               return ret;
+       }
+       return 0;
+
+bad:
+       pr_err("failed to decode auth msg\n");
+out:
+       return ret;
+}
+
+/*
+ * Build whatever auth message is currently needed: a hello if no
+ * protocol is negotiated yet, a protocol request if not yet
+ * authenticated, or nothing (returns 0) if we are done.
+ */
+int ceph_build_auth(struct ceph_auth_client *ac,
+                   void *msg_buf, size_t msg_len)
+{
+       if (!ac->protocol)
+               return ceph_auth_build_hello(ac, msg_buf, msg_len);
+       BUG_ON(!ac->ops);  /* ops is set whenever protocol != 0 */
+       if (!ac->ops->is_authenticated(ac))
+               return ceph_build_auth_request(ac, msg_buf, msg_len);
+       return 0;
+}
+
+/*
+ * True (non-zero) once the negotiated protocol reports that the
+ * handshake completed; false while no protocol is set up yet.
+ */
+int ceph_auth_is_authenticated(struct ceph_auth_client *ac)
+{
+       if (!ac->ops)
+               return 0;
+       return ac->ops->is_authenticated(ac);
+}
diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h
new file mode 100644 (file)
index 0000000..ca4f57c
--- /dev/null
@@ -0,0 +1,84 @@
+#ifndef _FS_CEPH_AUTH_H
+#define _FS_CEPH_AUTH_H
+
+#include "types.h"
+#include "buffer.h"
+
+/*
+ * Abstract interface for communicating with the authenticate module.
+ * There is some handshake that takes place between us and the monitor
+ * to acquire the necessary keys.  These are used to generate an
+ * 'authorizer' that we use when connecting to a service (mds, osd).
+ */
+
+struct ceph_auth_client;
+struct ceph_authorizer;
+
+/* per-protocol implementation hooks (see auth_none.c, auth_x.c) */
+struct ceph_auth_client_ops {
+       /*
+        * true if we are authenticated and can connect to
+        * services.
+        */
+       int (*is_authenticated)(struct ceph_auth_client *ac);
+
+       /*
+        * build requests and process replies during monitor
+        * handshake.  if handle_reply returns -EAGAIN, we build
+        * another request.
+        */
+       int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end);
+       int (*handle_reply)(struct ceph_auth_client *ac, int result,
+                           void *buf, void *end);
+
+       /*
+        * Create authorizer for connecting to a service, and verify
+        * the response to authenticate the service.
+        */
+       int (*create_authorizer)(struct ceph_auth_client *ac, int peer_type,
+                                struct ceph_authorizer **a,
+                                void **buf, size_t *len,
+                                void **reply_buf, size_t *reply_len);
+       int (*verify_authorizer_reply)(struct ceph_auth_client *ac,
+                                      struct ceph_authorizer *a, size_t len);
+       void (*destroy_authorizer)(struct ceph_auth_client *ac,
+                                  struct ceph_authorizer *a);
+       void (*invalidate_authorizer)(struct ceph_auth_client *ac,
+                                     int peer_type);
+
+       /* reset when we (re)connect to a monitor */
+       void (*reset)(struct ceph_auth_client *ac);
+
+       void (*destroy)(struct ceph_auth_client *ac);
+};
+
+struct ceph_auth_client {
+       u32 protocol;           /* CEPH_AUTH_* */
+       void *private;          /* for use by protocol implementation */
+       const struct ceph_auth_client_ops *ops;  /* null iff protocol==0 */
+
+       bool negotiating;       /* true if negotiating protocol */
+       const char *name;       /* entity name */
+       u64 global_id;          /* our unique id in system */
+       const char *secret;     /* our secret key */
+       unsigned want_keys;     /* which services we want */
+};
+
+/* lifecycle: init returns ERR_PTR on failure; destroy frees everything */
+extern struct ceph_auth_client *ceph_auth_init(const char *name,
+                                              const char *secret);
+extern void ceph_auth_destroy(struct ceph_auth_client *ac);
+
+extern void ceph_auth_reset(struct ceph_auth_client *ac);
+
+/* monitor handshake: build hello / process reply (may build next request) */
+extern int ceph_auth_build_hello(struct ceph_auth_client *ac,
+                                void *buf, size_t len);
+extern int ceph_handle_auth_reply(struct ceph_auth_client *ac,
+                                 void *buf, size_t len,
+                                 void *reply_buf, size_t reply_len);
+extern int ceph_entity_name_encode(const char *name, void **p, void *end);
+
+/* build whichever message is needed next (hello or protocol request) */
+extern int ceph_build_auth(struct ceph_auth_client *ac,
+                   void *msg_buf, size_t msg_len);
+
+extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac);
+
+#endif
diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c
new file mode 100644 (file)
index 0000000..b4ef6f0
--- /dev/null
@@ -0,0 +1,121 @@
+
+#include "ceph_debug.h"
+
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/random.h>
+
+#include "auth_none.h"
+#include "auth.h"
+#include "decode.h"
+
+/* forget any progress; the next exchange starts from scratch */
+static void reset(struct ceph_auth_client *ac)
+{
+       struct ceph_auth_none_info *xi = ac->private;
+
+       xi->starting = true;
+       xi->built_authorizer = false;
+}
+
+/* free the protocol-private state allocated by ceph_auth_none_init() */
+static void destroy(struct ceph_auth_client *ac)
+{
+       kfree(ac->private);
+       ac->private = NULL;
+}
+
+/* authenticated as soon as the first (hello) exchange completes */
+static int is_authenticated(struct ceph_auth_client *ac)
+{
+       struct ceph_auth_none_info *xi = ac->private;
+
+       return !xi->starting;
+}
+
+/*
+ * The generic auth code decodes the global_id, and we carry no actual
+ * authentication state, so nothing happens here beyond leaving the
+ * "starting" phase; the monitor's result code is passed through.
+ */
+static int handle_reply(struct ceph_auth_client *ac, int result,
+                       void *buf, void *end)
+{
+       struct ceph_auth_none_info *xi = ac->private;
+
+       xi->starting = false;
+       return result;
+}
+
+/*
+ * build an 'authorizer' with our entity_name and global_id.  we can
+ * reuse a single static copy since it is identical for all services
+ * we connect to.
+ *
+ * Returns 0 and fills in *a/*buf/*len/*reply_buf/*reply_len, or a
+ * negative error if the name + id do not fit in the fixed buffer.
+ */
+static int ceph_auth_none_create_authorizer(
+       struct ceph_auth_client *ac, int peer_type,
+       struct ceph_authorizer **a,
+       void **buf, size_t *len,
+       void **reply_buf, size_t *reply_len)
+{
+       struct ceph_auth_none_info *ai = ac->private;
+       struct ceph_none_authorizer *au = &ai->au;
+       void *p, *end;
+       int ret;
+
+       /* lazily build once; cached copy is reused for every peer */
+       if (!ai->built_authorizer) {
+               p = au->buf;
+               end = p + sizeof(au->buf);
+               ceph_encode_8(&p, 1);
+               /* end - 8 reserves room for the trailing global_id u64 */
+               ret = ceph_entity_name_encode(ac->name, &p, end - 8);
+               if (ret < 0)
+                       goto bad;
+               ceph_decode_need(&p, end, sizeof(u64), bad2);
+               ceph_encode_64(&p, ac->global_id);
+               au->buf_len = p - (void *)au->buf;
+               ai->built_authorizer = true;
+               dout("built authorizer len %d\n", au->buf_len);
+       }
+
+       *a = (struct ceph_authorizer *)au;
+       *buf = au->buf;
+       *len = au->buf_len;
+       *reply_buf = au->reply_buf;
+       *reply_len = sizeof(au->reply_buf);
+       return 0;
+
+bad2:
+       ret = -ERANGE;
+bad:
+       return ret;
+}
+
+/* the authorizer lives inside ceph_auth_none_info, so no freeing here */
+static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
+                                     struct ceph_authorizer *a)
+{
+       /* nothing to do */
+}
+
+/* ops table for the null security mode (no verify/invalidate hooks) */
+static const struct ceph_auth_client_ops ceph_auth_none_ops = {
+       .reset = reset,
+       .destroy = destroy,
+       .is_authenticated = is_authenticated,
+       .handle_reply = handle_reply,
+       .create_authorizer = ceph_auth_none_create_authorizer,
+       .destroy_authorizer = ceph_auth_none_destroy_authorizer,
+};
+
+/*
+ * Set up the CEPH_AUTH_NONE protocol on an auth client: allocate the
+ * private state and install the ops table.  Returns 0 or -ENOMEM.
+ */
+int ceph_auth_none_init(struct ceph_auth_client *ac)
+{
+       struct ceph_auth_none_info *xi;
+
+       dout("ceph_auth_none_init %p\n", ac);
+       xi = kzalloc(sizeof(*xi), GFP_NOFS);
+       if (!xi)
+               return -ENOMEM;
+
+       xi->starting = true;
+       xi->built_authorizer = false;
+
+       ac->protocol = CEPH_AUTH_NONE;
+       ac->private = xi;
+       ac->ops = &ceph_auth_none_ops;
+       return 0;
+}
+
diff --git a/fs/ceph/auth_none.h b/fs/ceph/auth_none.h
new file mode 100644 (file)
index 0000000..56c0553
--- /dev/null
@@ -0,0 +1,28 @@
+#ifndef _FS_CEPH_AUTH_NONE_H
+#define _FS_CEPH_AUTH_NONE_H
+
+#include "auth.h"
+
+/*
+ * null security mode.
+ *
+ * we use a single static authorizer that simply encodes our entity name
+ * and global id.
+ */
+
+struct ceph_none_authorizer {
+       char buf[128];          /* encoded name + global_id */
+       int buf_len;            /* bytes of buf actually used */
+       char reply_buf[0];      /* zero-length: no reply payload expected */
+};
+
+struct ceph_auth_none_info {
+       bool starting;          /* still waiting for first monitor reply */
+       bool built_authorizer;  /* au has been filled in */
+       struct ceph_none_authorizer au;   /* we only need one; it's static */
+};
+
+extern int ceph_auth_none_init(struct ceph_auth_client *ac);
+
+#endif
+
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
new file mode 100644 (file)
index 0000000..f031842
--- /dev/null
@@ -0,0 +1,656 @@
+
+#include "ceph_debug.h"
+
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/random.h>
+
+#include "auth_x.h"
+#include "auth_x_protocol.h"
+#include "crypto.h"
+#include "auth.h"
+#include "decode.h"
+
+/* slab cache for scratch ticket buffers; created/destroyed by module
+ * init/exit code outside this file -- TODO confirm owner */
+struct kmem_cache *ceph_x_ticketbuf_cachep;
+
+#define TEMP_TICKET_BUF_LEN    256
+
+static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed);
+
+/*
+ * Authenticated when we hold a valid ticket for every service we
+ * want; validation also expires stale tickets as a side effect.
+ */
+static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
+{
+       struct ceph_x_info *xi = ac->private;
+       int need;
+
+       ceph_x_validate_tickets(ac, &need);
+       dout("ceph_x_is_authenticated want=%d need=%d have=%d\n",
+            ac->want_keys, need, xi->have_keys);
+       return (ac->want_keys & xi->have_keys) == ac->want_keys;
+}
+
+/*
+ * Encrypt ibuf/ilen with the cephx header prepended, writing a
+ * length-prefixed ciphertext blob into obuf (capacity olen).
+ * Returns the total bytes written (length u32 + ciphertext) or a
+ * negative error.
+ */
+static int ceph_x_encrypt(struct ceph_crypto_key *secret,
+                         void *ibuf, int ilen, void *obuf, size_t olen)
+{
+       struct ceph_x_encrypt_header head = {
+               .struct_v = 1,
+               .magic = cpu_to_le64(CEPHX_ENC_MAGIC)
+       };
+       size_t len = olen - sizeof(u32);
+       int ret;
+
+       /* ciphertext goes after the u32 length slot written below */
+       ret = ceph_encrypt2(secret, obuf + sizeof(u32), &len,
+                           &head, sizeof(head), ibuf, ilen);
+       if (ret)
+               return ret;
+       ceph_encode_32(&obuf, len);
+       return len + sizeof(u32);
+}
+
+/*
+ * Decrypt a length-prefixed cephx blob at *p (advancing *p past it)
+ * into obuf.  Verifies the embedded header version and magic.
+ * Returns the plaintext length, -EINVAL on truncation, -EPERM on a
+ * bad header, or a decrypt error.
+ */
+static int ceph_x_decrypt(struct ceph_crypto_key *secret,
+                         void **p, void *end, void *obuf, size_t olen)
+{
+       struct ceph_x_encrypt_header head;
+       size_t head_len = sizeof(head);
+       int len, ret;
+
+       len = ceph_decode_32(p);
+       if (*p + len > end)
+               return -EINVAL;
+
+       dout("ceph_x_decrypt len %d\n", len);
+       /* header is split off into head; payload lands in obuf */
+       ret = ceph_decrypt2(secret, &head, &head_len, obuf, &olen,
+                           *p, len);
+       if (ret)
+               return ret;
+       if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC)
+               return -EPERM;
+       *p += len;
+       return olen;
+}
+
+/*
+ * get existing (or insert new) ticket handler
+ *
+ * Returns the handler for the given service, creating (zeroed) state
+ * on first use.  Returns ERR_PTR(-ENOMEM) on allocation failure --
+ * NEVER NULL, so callers must test with IS_ERR(), not !th.
+ */
+struct ceph_x_ticket_handler *get_ticket_handler(struct ceph_auth_client *ac,
+                                                int service)
+{
+       struct ceph_x_ticket_handler *th;
+       struct ceph_x_info *xi = ac->private;
+       struct rb_node *parent = NULL, **p = &xi->ticket_handlers.rb_node;
+
+       /* standard rbtree walk keyed on service id */
+       while (*p) {
+               parent = *p;
+               th = rb_entry(parent, struct ceph_x_ticket_handler, node);
+               if (service < th->service)
+                       p = &(*p)->rb_left;
+               else if (service > th->service)
+                       p = &(*p)->rb_right;
+               else
+                       return th;
+       }
+
+       /* add it */
+       th = kzalloc(sizeof(*th), GFP_NOFS);
+       if (!th)
+               return ERR_PTR(-ENOMEM);
+       th->service = service;
+       rb_link_node(&th->node, parent, p);
+       rb_insert_color(&th->node, &xi->ticket_handlers);
+       return th;
+}
+
+/* unlink a handler from the tree and release its key and blob */
+static void remove_ticket_handler(struct ceph_auth_client *ac,
+                                 struct ceph_x_ticket_handler *th)
+{
+       struct ceph_x_info *xi = ac->private;
+
+       dout("remove_ticket_handler %p %d\n", th, th->service);
+       rb_erase(&th->node, &xi->ticket_handlers);
+       ceph_crypto_key_destroy(&th->session_key);
+       if (th->ticket_blob)
+               ceph_buffer_put(th->ticket_blob);
+       kfree(th);
+}
+
+/*
+ * Decode a ticket-grant reply: a versioned list of tickets, each with
+ * a session-key blob encrypted with @secret and a service ticket blob
+ * (possibly encrypted with the previous session key).  Updates the
+ * per-service handlers and records held keys in xi->have_keys.
+ *
+ * Returns 0 on success, -EINVAL on decode errors, -ENOMEM on
+ * allocation failure, or a decrypt error.
+ */
+static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
+                                   struct ceph_crypto_key *secret,
+                                   void *buf, void *end)
+{
+       struct ceph_x_info *xi = ac->private;
+       int num;
+       void *p = buf;
+       int ret;
+       char *dbuf;
+       char *ticket_buf;
+       u8 struct_v;
+
+       /* GFP_NOFS, not GFP_NOFS|GFP_ATOMIC: the combination is
+        * contradictory (GFP_ATOMIC forbids sleeping, GFP_NOFS allows
+        * it) and every other allocation in the auth code uses plain
+        * GFP_NOFS */
+       dbuf = kmem_cache_alloc(ceph_x_ticketbuf_cachep, GFP_NOFS);
+       if (!dbuf)
+               return -ENOMEM;
+
+       ret = -ENOMEM;
+       ticket_buf = kmem_cache_alloc(ceph_x_ticketbuf_cachep, GFP_NOFS);
+       if (!ticket_buf)
+               goto out_dbuf;
+
+       ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
+       struct_v = ceph_decode_8(&p);
+       if (struct_v != 1)
+               goto bad;
+       num = ceph_decode_32(&p);
+       dout("%d tickets\n", num);
+       while (num--) {
+               int type;
+               struct ceph_x_ticket_handler *th;
+               void *dp, *dend;
+               int dlen;
+               char is_enc;
+               struct timespec validity;
+               struct ceph_crypto_key old_key;
+               void *tp, *tpend;
+
+               ceph_decode_need(&p, end, sizeof(u32) + 1, bad);
+
+               type = ceph_decode_32(&p);
+               dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
+
+               /* reuse the outer struct_v; the old inner declaration
+                * shadowed it to no purpose */
+               struct_v = ceph_decode_8(&p);
+               if (struct_v != 1)
+                       goto bad;
+
+               th = get_ticket_handler(ac, type);
+               if (IS_ERR(th)) {
+                       ret = PTR_ERR(th);
+                       goto out;
+               }
+
+               /* blob for me */
+               dlen = ceph_x_decrypt(secret, &p, end, dbuf,
+                                     TEMP_TICKET_BUF_LEN);
+               if (dlen <= 0) {
+                       ret = dlen;
+                       goto out;
+               }
+               dout(" decrypted %d bytes\n", dlen);
+               dend = dbuf + dlen;
+               dp = dbuf;
+
+               struct_v = ceph_decode_8(&dp);
+               if (struct_v != 1)
+                       goto bad;
+
+               /* keep the old session key: the service blob below may
+                * be encrypted with it */
+               memcpy(&old_key, &th->session_key, sizeof(old_key));
+               ret = ceph_crypto_key_decode(&th->session_key, &dp, dend);
+               if (ret)
+                       goto out;
+
+               ceph_decode_copy(&dp, &th->validity, sizeof(th->validity));
+               ceph_decode_timespec(&validity, &th->validity);
+               th->expires = get_seconds() + validity.tv_sec;
+               th->renew_after = th->expires - (validity.tv_sec / 4);
+               dout(" expires=%lu renew_after=%lu\n", th->expires,
+                    th->renew_after);
+
+               /* ticket blob for service */
+               ceph_decode_8_safe(&p, end, is_enc, bad);
+               tp = ticket_buf;
+               if (is_enc) {
+                       /* encrypted */
+                       dout(" encrypted ticket\n");
+                       dlen = ceph_x_decrypt(&old_key, &p, end, ticket_buf,
+                                             TEMP_TICKET_BUF_LEN);
+                       if (dlen < 0) {
+                               ret = dlen;
+                               goto out;
+                       }
+                       dlen = ceph_decode_32(&tp);
+               } else {
+                       /* unencrypted */
+                       ceph_decode_32_safe(&p, end, dlen, bad);
+                       ceph_decode_need(&p, end, dlen, bad);
+                       ceph_decode_copy(&p, ticket_buf, dlen);
+               }
+               tpend = tp + dlen;
+               dout(" ticket blob is %d bytes\n", dlen);
+               ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
+               struct_v = ceph_decode_8(&tp);
+               th->secret_id = ceph_decode_64(&tp);
+               ret = ceph_decode_buffer(&th->ticket_blob, &tp, tpend);
+               if (ret)
+                       goto out;
+               dout(" got ticket service %d (%s) secret_id %lld len %d\n",
+                    type, ceph_entity_type_name(type), th->secret_id,
+                    (int)th->ticket_blob->vec.iov_len);
+               xi->have_keys |= th->service;
+       }
+
+       ret = 0;
+out:
+       kmem_cache_free(ceph_x_ticketbuf_cachep, ticket_buf);
+out_dbuf:
+       kmem_cache_free(ceph_x_ticketbuf_cachep, dbuf);
+       return ret;
+
+bad:
+       ret = -EINVAL;
+       goto out;
+}
+
+/*
+ * Build (or rebuild) the authorizer for a service: our id, the
+ * service ticket blob, and an encrypted random nonce used later by
+ * ceph_x_verify_authorizer_reply().  Reuses au->buf when it is big
+ * enough.  Returns 0 or a negative error.
+ */
+static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
+                                  struct ceph_x_ticket_handler *th,
+                                  struct ceph_x_authorizer *au)
+{
+       int len;
+       struct ceph_x_authorize_a *msg_a;
+       struct ceph_x_authorize_b msg_b;
+       void *p, *end;
+       int ret;
+       int ticket_blob_len =
+               (th->ticket_blob ? th->ticket_blob->vec.iov_len : 0);
+
+       dout("build_authorizer for %s %p\n",
+            ceph_entity_type_name(th->service), au);
+
+       /* +16 leaves headroom for the encryption header/padding */
+       len = sizeof(*msg_a) + sizeof(msg_b) + sizeof(u32) +
+               ticket_blob_len + 16;
+       dout("  need len %d\n", len);
+       if (au->buf && au->buf->alloc_len < len) {
+               ceph_buffer_put(au->buf);
+               au->buf = NULL;
+       }
+       if (!au->buf) {
+               au->buf = ceph_buffer_new(len, GFP_NOFS);
+               if (!au->buf)
+                       return -ENOMEM;
+       }
+       au->service = th->service;
+
+       msg_a = au->buf->vec.iov_base;
+       msg_a->struct_v = 1;
+       msg_a->global_id = cpu_to_le64(ac->global_id);
+       msg_a->service_id = cpu_to_le32(th->service);
+       msg_a->ticket_blob.struct_v = 1;
+       msg_a->ticket_blob.secret_id = cpu_to_le64(th->secret_id);
+       msg_a->ticket_blob.blob_len = cpu_to_le32(ticket_blob_len);
+       if (ticket_blob_len) {
+               memcpy(msg_a->ticket_blob.blob, th->ticket_blob->vec.iov_base,
+                      th->ticket_blob->vec.iov_len);
+       }
+       dout(" th %p secret_id %lld %lld\n", th, th->secret_id,
+            le64_to_cpu(msg_a->ticket_blob.secret_id));
+
+       /* msg_b (the encrypted nonce) follows msg_a and its blob */
+       p = msg_a + 1;
+       p += ticket_blob_len;
+       end = au->buf->vec.iov_base + au->buf->vec.iov_len;
+
+       get_random_bytes(&au->nonce, sizeof(au->nonce));
+       msg_b.struct_v = 1;
+       msg_b.nonce = cpu_to_le64(au->nonce);
+       ret = ceph_x_encrypt(&th->session_key, &msg_b, sizeof(msg_b),
+                            p, end - p);
+       if (ret < 0)
+               goto out_buf;
+       p += ret;
+       au->buf->vec.iov_len = p - au->buf->vec.iov_base;
+       dout(" built authorizer nonce %llx len %d\n", au->nonce,
+            (int)au->buf->vec.iov_len);
+       return 0;
+
+out_buf:
+       ceph_buffer_put(au->buf);
+       au->buf = NULL;
+       return ret;
+}
+
+/*
+ * Encode a handler's ticket (secret_id + blob, or an empty blob) at
+ * *p, advancing *p.  Returns 0 or -ERANGE if it does not fit.  Note
+ * that ceph_decode_need here only checks remaining space on the
+ * encode side.
+ */
+static int ceph_x_encode_ticket(struct ceph_x_ticket_handler *th,
+                               void **p, void *end)
+{
+       ceph_decode_need(p, end, 1 + sizeof(u64), bad);
+       ceph_encode_8(p, 1);
+       ceph_encode_64(p, th->secret_id);
+       if (th->ticket_blob) {
+               const char *buf = th->ticket_blob->vec.iov_base;
+               u32 len = th->ticket_blob->vec.iov_len;
+
+               ceph_encode_32_safe(p, end, len, bad);
+               ceph_encode_copy_safe(p, end, buf, len, bad);
+       } else {
+               ceph_encode_32_safe(p, end, 0, bad);
+       }
+
+       return 0;
+bad:
+       return -ERANGE;
+}
+
+/*
+ * Recompute which service keys we still need: keys we never had,
+ * keys whose tickets should be renewed, and keys whose tickets have
+ * expired (those are also dropped from have_keys).  Result in *pneed.
+ */
+static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
+{
+       int want = ac->want_keys;
+       struct ceph_x_info *xi = ac->private;
+       int service;
+
+       *pneed = ac->want_keys & ~(xi->have_keys);
+
+       /* service types are single bits; walk each wanted bit */
+       for (service = 1; service <= want; service <<= 1) {
+               struct ceph_x_ticket_handler *th;
+
+               if (!(ac->want_keys & service))
+                       continue;
+
+               if (*pneed & service)
+                       continue;
+
+               th = get_ticket_handler(ac, service);
+
+               /* get_ticket_handler() returns ERR_PTR() on failure,
+                * never NULL, so the old !th test could never fire */
+               if (IS_ERR(th)) {
+                       *pneed |= service;
+                       continue;
+               }
+
+               if (get_seconds() >= th->renew_after)
+                       *pneed |= service;
+               if (get_seconds() >= th->expires)
+                       xi->have_keys &= ~service;
+       }
+}
+
+
+/*
+ * Build the next cephx request: either GET_AUTH_SESSION_KEY (prove
+ * knowledge of the shared secret via the server challenge) or
+ * GET_PRINCIPAL_SESSION_KEY (fetch tickets for the services we still
+ * need).  Returns the number of bytes written, 0 if nothing is
+ * needed, or a negative error.
+ */
+static int ceph_x_build_request(struct ceph_auth_client *ac,
+                               void *buf, void *end)
+{
+       struct ceph_x_info *xi = ac->private;
+       int need;
+       struct ceph_x_request_header *head = buf;
+       int ret;
+       struct ceph_x_ticket_handler *th =
+               get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
+
+       /* get_ticket_handler() returns ERR_PTR(), never NULL; the old
+        * BUG_ON(!th) below could not catch an allocation failure and
+        * the ERR_PTR would have been dereferenced */
+       if (IS_ERR(th))
+               return PTR_ERR(th);
+
+       ceph_x_validate_tickets(ac, &need);
+
+       dout("build_request want %x have %x need %x\n",
+            ac->want_keys, xi->have_keys, need);
+
+       if (need & CEPH_ENTITY_TYPE_AUTH) {
+               struct ceph_x_authenticate *auth = (void *)(head + 1);
+               void *p = auth + 1;
+               struct ceph_x_challenge_blob tmp;
+               char tmp_enc[40];
+               u64 *u;
+
+               if (p > end)
+                       return -ERANGE;
+
+               dout(" get_auth_session_key\n");
+               head->op = cpu_to_le16(CEPHX_GET_AUTH_SESSION_KEY);
+
+               /* encrypt and hash */
+               get_random_bytes(&auth->client_challenge, sizeof(u64));
+               tmp.client_challenge = auth->client_challenge;
+               tmp.server_challenge = cpu_to_le64(xi->server_challenge);
+               ret = ceph_x_encrypt(&xi->secret, &tmp, sizeof(tmp),
+                                    tmp_enc, sizeof(tmp_enc));
+               if (ret < 0)
+                       return ret;
+
+               auth->struct_v = 1;
+               auth->key = 0;
+               /* key = xor of the u64 words of the encrypted blob */
+               for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++)
+                       auth->key ^= *u;
+               dout(" server_challenge %llx client_challenge %llx key %llx\n",
+                    xi->server_challenge, le64_to_cpu(auth->client_challenge),
+                    le64_to_cpu(auth->key));
+
+               /* now encode the old ticket if exists */
+               ret = ceph_x_encode_ticket(th, &p, end);
+               if (ret < 0)
+                       return ret;
+
+               return p - buf;
+       }
+
+       if (need) {
+               void *p = head + 1;
+               struct ceph_x_service_ticket_request *req;
+
+               if (p > end)
+                       return -ERANGE;
+               head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
+
+               ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
+               if (ret)
+                       return ret;
+               ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base,
+                                xi->auth_authorizer.buf->vec.iov_len);
+
+               req = p;
+               req->keys = cpu_to_le32(need);
+               p += sizeof(*req);
+               return p - buf;
+       }
+
+       return 0;
+}
+
+/*
+ * Process a cephx reply from the monitor: the initial server
+ * challenge (while starting), or a response to one of the
+ * GET_*_SESSION_KEY ops.  Returns 0 when all wanted keys are held,
+ * -EAGAIN when another round is needed, or a negative error.
+ */
+static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
+                              void *buf, void *end)
+{
+       struct ceph_x_info *xi = ac->private;
+       struct ceph_x_reply_header *head = buf;
+       struct ceph_x_ticket_handler *th;
+       int len = end - buf;
+       int op;
+       int ret;
+
+       if (result)
+               return result;  /* XXX hmm? */
+
+       if (xi->starting) {
+               /* it's a hello */
+               struct ceph_x_server_challenge *sc = buf;
+
+               if (len != sizeof(*sc))
+                       return -EINVAL;
+               xi->server_challenge = le64_to_cpu(sc->server_challenge);
+               dout("handle_reply got server challenge %llx\n",
+                    xi->server_challenge);
+               xi->starting = false;
+               xi->have_keys &= ~CEPH_ENTITY_TYPE_AUTH;
+               return -EAGAIN;
+       }
+
+       op = le32_to_cpu(head->op);
+       result = le32_to_cpu(head->result);
+       dout("handle_reply op %d result %d\n", op, result);
+       switch (op) {
+       case CEPHX_GET_AUTH_SESSION_KEY:
+               /* verify auth key */
+               ret = ceph_x_proc_ticket_reply(ac, &xi->secret,
+                                              buf + sizeof(*head), end);
+               break;
+
+       case CEPHX_GET_PRINCIPAL_SESSION_KEY:
+               th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
+               /* get_ticket_handler() returns ERR_PTR(), never NULL;
+                * the old BUG_ON(!th) could not catch a failure */
+               if (IS_ERR(th))
+                       return PTR_ERR(th);
+               ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
+                                              buf + sizeof(*head), end);
+               break;
+
+       default:
+               return -EINVAL;
+       }
+       if (ret)
+               return ret;
+       if (ac->want_keys == xi->have_keys)
+               return 0;
+       return -EAGAIN;
+}
+
+/*
+ * Allocate and build an authorizer for connecting to the given
+ * service.  Fills in *a (caller frees via destroy_authorizer) and the
+ * buf/reply_buf descriptors.  Returns 0 or a negative error.
+ */
+static int ceph_x_create_authorizer(
+       struct ceph_auth_client *ac, int peer_type,
+       struct ceph_authorizer **a,
+       void **buf, size_t *len,
+       void **reply_buf, size_t *reply_len)
+{
+       struct ceph_x_authorizer *au;
+       struct ceph_x_ticket_handler *th;
+       int ret;
+
+       th = get_ticket_handler(ac, peer_type);
+       if (IS_ERR(th))
+               return PTR_ERR(th);
+
+       au = kzalloc(sizeof(*au), GFP_NOFS);
+       if (!au)
+               return -ENOMEM;
+
+       ret = ceph_x_build_authorizer(ac, th, au);
+       if (ret) {
+               kfree(au);
+               return ret;
+       }
+
+       *a = (struct ceph_authorizer *)au;
+       *buf = au->buf->vec.iov_base;
+       *len = au->buf->vec.iov_len;
+       *reply_buf = au->reply_buf;
+       *reply_len = sizeof(au->reply_buf);
+       return 0;
+}
+
+/*
+ * Verify the encrypted reply the peer sent back for an authorizer we
+ * created with ceph_x_create_authorizer().  The reply is decrypted
+ * with the service ticket's session key and must contain our
+ * nonce + 1, proving the peer holds that key.
+ *
+ * Returns 0 on success, -EPERM on a decrypt/nonce mismatch, or a
+ * negative errno if the ticket handler could not be obtained.
+ */
+static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
+                                         struct ceph_authorizer *a, size_t len)
+{
+       struct ceph_x_authorizer *au = (void *)a;
+       struct ceph_x_ticket_handler *th;
+       int ret = 0;
+       struct ceph_x_authorize_reply reply;
+       void *p = au->reply_buf;
+       void *end = p + sizeof(au->reply_buf);
+
+       /*
+        * get_ticket_handler() reports failure via ERR_PTR(), as the
+        * IS_ERR() checks in ceph_x_create_authorizer() and
+        * ceph_x_invalidate_authorizer() show; a plain NULL test here
+        * would let an ERR_PTR value through and oops on dereference.
+        */
+       th = get_ticket_handler(ac, au->service);
+       if (IS_ERR(th))
+               return PTR_ERR(th);
+       ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply));
+       if (ret < 0)
+               return ret;
+       if (ret != sizeof(reply))
+               return -EPERM;
+
+       if (au->nonce + 1 != le64_to_cpu(reply.nonce_plus_one))
+               ret = -EPERM;
+       else
+               ret = 0;
+       dout("verify_authorizer_reply nonce %llx got %llx ret %d\n",
+            au->nonce, le64_to_cpu(reply.nonce_plus_one), ret);
+       return ret;
+}
+
+/*
+ * Free an authorizer from ceph_x_create_authorizer(): drop our
+ * reference on the encoded buffer and free the wrapper itself.
+ */
+static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac,
+                                     struct ceph_authorizer *a)
+{
+       struct ceph_x_authorizer *au = (void *)a;
+
+       ceph_buffer_put(au->buf);
+       kfree(au);
+}
+
+
+/*
+ * Restart the cephx handshake from scratch: forget the server
+ * challenge and return to the initial "waiting for hello" state.
+ */
+static void ceph_x_reset(struct ceph_auth_client *ac)
+{
+       struct ceph_x_info *xi = ac->private;
+
+       dout("reset\n");
+       xi->starting = true;
+       xi->server_challenge = 0;
+}
+
+/*
+ * Tear down all cephx state for this client: the unarmored secret,
+ * every per-service ticket handler, the ticket buffer cache, and the
+ * private info struct itself.
+ */
+static void ceph_x_destroy(struct ceph_auth_client *ac)
+{
+       struct ceph_x_info *xi = ac->private;
+       struct rb_node *p;
+
+       dout("ceph_x_destroy %p\n", ac);
+       ceph_crypto_key_destroy(&xi->secret);
+
+       /* remove_ticket_handler() unlinks each node, so repeatedly
+        * taking rb_first() drains the tree */
+       while ((p = rb_first(&xi->ticket_handlers)) != NULL) {
+               struct ceph_x_ticket_handler *th =
+                       rb_entry(p, struct ceph_x_ticket_handler, node);
+               remove_ticket_handler(ac, th);
+       }
+
+       kmem_cache_destroy(ceph_x_ticketbuf_cachep);
+
+       kfree(ac->private);
+       ac->private = NULL;
+}
+
+/*
+ * Drop the ticket for the given service, forcing a fresh ticket to be
+ * fetched before the next authorizer for that peer type is built.
+ */
+static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
+                                  int peer_type)
+{
+       struct ceph_x_ticket_handler *th;
+
+       th = get_ticket_handler(ac, peer_type);
+       if (th && !IS_ERR(th))
+               remove_ticket_handler(ac, th);
+}
+
+
+/* cephx protocol method table, installed by ceph_x_init() */
+static const struct ceph_auth_client_ops ceph_x_ops = {
+       .is_authenticated = ceph_x_is_authenticated,
+       .build_request = ceph_x_build_request,
+       .handle_reply = ceph_x_handle_reply,
+       .create_authorizer = ceph_x_create_authorizer,
+       .verify_authorizer_reply = ceph_x_verify_authorizer_reply,
+       .destroy_authorizer = ceph_x_destroy_authorizer,
+       .invalidate_authorizer = ceph_x_invalidate_authorizer,
+       .reset =  ceph_x_reset,
+       .destroy = ceph_x_destroy,
+};
+
+
+/*
+ * Initialize the cephx protocol for @ac: allocate per-client state,
+ * create the ticket buffer cache, unarmor the configured secret, and
+ * install the ceph_x_ops method table.
+ *
+ * Returns 0 on success, -ENOMEM, -EINVAL (no secret configured), or
+ * whatever ceph_crypto_key_unarmor() reports for a malformed secret.
+ */
+int ceph_x_init(struct ceph_auth_client *ac)
+{
+       struct ceph_x_info *xi;
+       int ret;
+
+       dout("ceph_x_init %p\n", ac);
+       xi = kzalloc(sizeof(*xi), GFP_NOFS);
+       if (!xi)
+               return -ENOMEM;
+
+       ret = -ENOMEM;
+       ceph_x_ticketbuf_cachep = kmem_cache_create("ceph_x_ticketbuf",
+                                     TEMP_TICKET_BUF_LEN, 8,
+                                     (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
+                                     NULL);
+       if (!ceph_x_ticketbuf_cachep)
+               goto done_nomem;
+       ret = -EINVAL;
+       if (!ac->secret) {
+               pr_err("no secret set (for auth_x protocol)\n");
+               goto done_nomem;
+       }
+
+       ret = ceph_crypto_key_unarmor(&xi->secret, ac->secret);
+       if (ret)
+               goto done_nomem;
+
+       xi->starting = true;
+       xi->ticket_handlers = RB_ROOT;
+
+       ac->protocol = CEPH_AUTH_CEPHX;
+       ac->private = xi;
+       ac->ops = &ceph_x_ops;
+       return 0;
+
+done_nomem:
+       /* shared error path: despite the label name it also handles the
+        * -EINVAL and unarmor-failure cases above */
+       kfree(xi);
+       if (ceph_x_ticketbuf_cachep)
+               kmem_cache_destroy(ceph_x_ticketbuf_cachep);
+       return ret;
+}
+
+
diff --git a/fs/ceph/auth_x.h b/fs/ceph/auth_x.h
new file mode 100644 (file)
index 0000000..ff6f818
--- /dev/null
@@ -0,0 +1,49 @@
+#ifndef _FS_CEPH_AUTH_X_H
+#define _FS_CEPH_AUTH_X_H
+
+#include <linux/rbtree.h>
+
+#include "crypto.h"
+#include "auth.h"
+#include "auth_x_protocol.h"
+
+/*
+ * Handle ticket for a single service.
+ */
+struct ceph_x_ticket_handler {
+       struct rb_node node;    /* linked into ceph_x_info.ticket_handlers */
+       unsigned service;       /* CEPH_ENTITY_TYPE_* this ticket is for */
+
+       struct ceph_crypto_key session_key;
+       struct ceph_timespec validity;
+
+       u64 secret_id;
+       struct ceph_buffer *ticket_blob;
+
+       /* renewal/expiry times -- presumably in jiffies; TODO confirm
+        * against where they are set */
+       unsigned long renew_after, expires;
+};
+
+
+/* state for one authorizer exchange with a service */
+struct ceph_x_authorizer {
+       struct ceph_buffer *buf;   /* encoded authorizer we send */
+       unsigned service;
+       u64 nonce;                 /* peer must echo back nonce + 1 */
+       char reply_buf[128];  /* big enough for encrypted blob */
+};
+
+/* per-client cephx state, hung off ceph_auth_client->private */
+struct ceph_x_info {
+       struct ceph_crypto_key secret;   /* unarmored shared secret */
+
+       bool starting;          /* still waiting for the server hello? */
+       u64 server_challenge;
+
+       unsigned have_keys;     /* mask of services we hold tickets for */
+       struct rb_root ticket_handlers;  /* ceph_x_ticket_handler tree */
+
+       struct ceph_x_authorizer auth_authorizer;
+};
+
+extern int ceph_x_init(struct ceph_auth_client *ac);
+
+#endif
+
diff --git a/fs/ceph/auth_x_protocol.h b/fs/ceph/auth_x_protocol.h
new file mode 100644 (file)
index 0000000..671d305
--- /dev/null
@@ -0,0 +1,90 @@
+#ifndef __FS_CEPH_AUTH_X_PROTOCOL
+#define __FS_CEPH_AUTH_X_PROTOCOL
+
+#define CEPHX_GET_AUTH_SESSION_KEY      0x0100
+#define CEPHX_GET_PRINCIPAL_SESSION_KEY 0x0200
+#define CEPHX_GET_ROTATING_KEY          0x0400
+
+/* common bits */
+/* opaque ticket: blob_len bytes of blob[] follow the header */
+struct ceph_x_ticket_blob {
+       __u8 struct_v;          /* encoding version */
+       __le64 secret_id;
+       __le32 blob_len;
+       char blob[];
+} __attribute__ ((packed));
+
+
+/* common request/reply headers */
+struct ceph_x_request_header {
+       __le16 op;              /* CEPHX_GET_* opcode */
+} __attribute__ ((packed));
+
+struct ceph_x_reply_header {
+       __le16 op;              /* opcode this reply answers */
+       __le32 result;          /* server-side result code */
+} __attribute__ ((packed));
+
+
+/* authenticate handshake */
+
+/* initial hello (no reply header) */
+/* server -> client: initial challenge, sent without a reply header */
+struct ceph_x_server_challenge {
+       __u8 struct_v;
+       __le64 server_challenge;
+} __attribute__ ((packed));
+
+/* client -> server: authenticate request; a ticket blob follows */
+struct ceph_x_authenticate {
+       __u8 struct_v;
+       __le64 client_challenge;
+       __le64 key;
+       /* ticket blob */
+} __attribute__ ((packed));
+
+/* client -> server: bitmask of service tickets being requested */
+struct ceph_x_service_ticket_request {
+       __u8 struct_v;
+       __le32 keys;
+} __attribute__ ((packed));
+
+/* both challenge values bundled together */
+struct ceph_x_challenge_blob {
+       __le64 server_challenge;
+       __le64 client_challenge;
+} __attribute__ ((packed));
+
+
+
+/* authorize handshake */
+
+/*
+ * The authorizer consists of two pieces:
+ *  a - service id, ticket blob
+ *  b - encrypted with session key
+ */
+struct ceph_x_authorize_a {
+       __u8 struct_v;
+       __le64 global_id;
+       __le32 service_id;
+       struct ceph_x_ticket_blob ticket_blob;
+} __attribute__ ((packed));
+
+/* part b: encrypted with the session key */
+struct ceph_x_authorize_b {
+       __u8 struct_v;
+       __le64 nonce;
+} __attribute__ ((packed));
+
+/* server reply: carries our nonce + 1, proving it decrypted part b */
+struct ceph_x_authorize_reply {
+       __u8 struct_v;
+       __le64 nonce_plus_one;
+} __attribute__ ((packed));
+
+
+/*
+ * encryption bundle
+ */
+#define CEPHX_ENC_MAGIC 0xff009cad8826aa55ull
+
+/* header inside an encrypted payload; the CEPHX_ENC_MAGIC value lets a
+ * decryption be sanity-checked */
+struct ceph_x_encrypt_header {
+       __u8 struct_v;
+       __le64 magic;
+} __attribute__ ((packed));
+
+#endif
diff --git a/fs/ceph/buffer.c b/fs/ceph/buffer.c
new file mode 100644 (file)
index 0000000..b98086c
--- /dev/null
@@ -0,0 +1,78 @@
+
+#include "ceph_debug.h"
+#include "buffer.h"
+#include "decode.h"
+
+/*
+ * Allocate a new reference-counted buffer of @len bytes.  Try kmalloc
+ * first (quietly, via __GFP_NOWARN) and fall back to vmalloc for
+ * sizes the slab allocator can't satisfy.  Returns NULL on failure.
+ */
+struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
+{
+       struct ceph_buffer *b;
+
+       b = kmalloc(sizeof(*b), gfp);
+       if (!b)
+               return NULL;
+
+       b->is_vmalloc = false;
+       b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
+       if (!b->vec.iov_base) {
+               /* kmalloc failed; try vmalloc instead */
+               b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
+               if (!b->vec.iov_base) {
+                       kfree(b);
+                       return NULL;
+               }
+               b->is_vmalloc = true;
+       }
+
+       kref_init(&b->kref);
+       b->alloc_len = len;
+       b->vec.iov_len = len;
+       dout("buffer_new %p\n", b);
+       return b;
+}
+
+/*
+ * kref release callback: free the data area (with vfree or kfree,
+ * per b->is_vmalloc) and then the ceph_buffer itself.
+ */
+void ceph_buffer_release(struct kref *kref)
+{
+       struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref);
+
+       dout("buffer_release %p\n", b);
+       /* both kfree() and vfree() accept NULL, so no guard is needed */
+       if (b->is_vmalloc)
+               vfree(b->vec.iov_base);
+       else
+               kfree(b->vec.iov_base);
+       kfree(b);
+}
+
+/*
+ * (Re)allocate the data area of an existing ceph_buffer, kmalloc
+ * first with a vmalloc fallback (same policy as ceph_buffer_new).
+ * Returns 0 or -ENOMEM.
+ */
+int ceph_buffer_alloc(struct ceph_buffer *b, int len, gfp_t gfp)
+{
+       b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
+       if (b->vec.iov_base) {
+               b->is_vmalloc = false;
+       } else {
+               b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
+               /* only flag vmalloc after it actually succeeded, so a
+                * failed allocation doesn't leave a stale is_vmalloc */
+               if (!b->vec.iov_base)
+                       return -ENOMEM;
+               b->is_vmalloc = true;
+       }
+       b->alloc_len = len;
+       b->vec.iov_len = len;
+       return 0;
+}
+
+/*
+ * Decode a u32-length-prefixed byte string at *p (bounded by @end)
+ * into a freshly allocated ceph_buffer; the ceph_decode_* helpers
+ * advance *p past the consumed bytes.  Returns 0, -EINVAL on short
+ * input, or -ENOMEM.
+ */
+int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end)
+{
+       size_t len;
+
+       ceph_decode_need(p, end, sizeof(u32), bad);
+       len = ceph_decode_32(p);
+       dout("decode_buffer len %d\n", (int)len);
+       ceph_decode_need(p, end, len, bad);
+       *b = ceph_buffer_new(len, GFP_NOFS);
+       if (!*b)
+               return -ENOMEM;
+       ceph_decode_copy(p, (*b)->vec.iov_base, len);
+       return 0;
+bad:
+       return -EINVAL;
+}
diff --git a/fs/ceph/buffer.h b/fs/ceph/buffer.h
new file mode 100644 (file)
index 0000000..58d1901
--- /dev/null
@@ -0,0 +1,39 @@
+#ifndef __FS_CEPH_BUFFER_H
+#define __FS_CEPH_BUFFER_H
+
+#include <linux/kref.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/types.h>
+#include <linux/uio.h>
+
+/*
+ * a simple reference counted buffer.
+ *
+ * use kmalloc for small sizes (<= one page), vmalloc for larger
+ * sizes.
+ */
+struct ceph_buffer {
+       struct kref kref;       /* refcount; freed via ceph_buffer_release */
+       struct kvec vec;        /* data pointer + current length */
+       size_t alloc_len;       /* bytes actually allocated */
+       bool is_vmalloc;        /* free with vfree instead of kfree */
+};
+
+extern struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp);
+extern void ceph_buffer_release(struct kref *kref);
+
+/* take an additional reference; returns @b for caller convenience */
+static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b)
+{
+       kref_get(&b->kref);
+       return b;
+}
+
+/* drop a reference; last put frees via ceph_buffer_release() */
+static inline void ceph_buffer_put(struct ceph_buffer *b)
+{
+       kref_put(&b->kref, ceph_buffer_release);
+}
+
+extern int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end);
+
+#endif
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
new file mode 100644 (file)
index 0000000..db122bb
--- /dev/null
@@ -0,0 +1,2927 @@
+#include "ceph_debug.h"
+
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/vmalloc.h>
+#include <linux/wait.h>
+#include <linux/writeback.h>
+
+#include "super.h"
+#include "decode.h"
+#include "messenger.h"
+
+/*
+ * Capability management
+ *
+ * The Ceph metadata servers control client access to inode metadata
+ * and file data by issuing capabilities, granting clients permission
+ * to read and/or write both inode field and file data to OSDs
+ * (storage nodes).  Each capability consists of a set of bits
+ * indicating which operations are allowed.
+ *
+ * If the client holds a *_SHARED cap, the client has a coherent value
+ * that can be safely read from the cached inode.
+ *
+ * In the case of a *_EXCL (exclusive) or FILE_WR capabilities, the
+ * client is allowed to change inode attributes (e.g., file size,
+ * mtime), note its dirty state in the ceph_cap, and asynchronously
+ * flush that metadata change to the MDS.
+ *
+ * In the event of a conflicting operation (perhaps by another
+ * client), the MDS will revoke the conflicting client capabilities.
+ *
+ * In order for a client to cache an inode, it must hold a capability
+ * with at least one MDS server.  When inodes are released, release
+ * notifications are batched and periodically sent en masse to the MDS
+ * cluster to release server state.
+ */
+
+
+/*
+ * Generate readable cap strings for debugging output.
+ */
+#define MAX_CAP_STR 20
+/* small rotating pool of static result buffers for ceph_cap_string();
+ * cap_str_lock protects only the slot index (last_cap_str) */
+static char cap_str[MAX_CAP_STR][40];
+static DEFINE_SPINLOCK(cap_str_lock);
+static int last_cap_str;
+
+/*
+ * Append one character per generic cap bit set in @c to the string at
+ * @s and return the advanced pointer.  No terminating NUL is written.
+ */
+static char *gcap_string(char *s, int c)
+{
+       static const struct {
+               int bit;
+               char code;
+       } gcap_bits[] = {
+               { CEPH_CAP_GSHARED, 's' },
+               { CEPH_CAP_GEXCL,   'x' },
+               { CEPH_CAP_GCACHE,  'c' },
+               { CEPH_CAP_GRD,     'r' },
+               { CEPH_CAP_GWR,     'w' },
+               { CEPH_CAP_GBUFFER, 'b' },
+               { CEPH_CAP_GLAZYIO, 'l' },
+       };
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(gcap_bits); i++)
+               if (c & gcap_bits[i].bit)
+                       *s++ = gcap_bits[i].code;
+       return s;
+}
+
+/*
+ * Render a cap bitmask as a short human-readable string (e.g. "pAsFrw")
+ * for debug output.
+ *
+ * Results live in the rotating cap_str[] pool: the returned pointer is
+ * only valid until MAX_CAP_STR further calls, which is fine for the
+ * immediate print-and-forget usage this is meant for.
+ */
+const char *ceph_cap_string(int caps)
+{
+       int i;
+       char *s;
+       int c;
+
+       /* only the slot index is taken under the lock; concurrent
+        * callers thus format into distinct buffers */
+       spin_lock(&cap_str_lock);
+       i = last_cap_str++;
+       if (last_cap_str == MAX_CAP_STR)
+               last_cap_str = 0;
+       spin_unlock(&cap_str_lock);
+
+       s = cap_str[i];
+
+       if (caps & CEPH_CAP_PIN)
+               *s++ = 'p';
+
+       c = (caps >> CEPH_CAP_SAUTH) & 3;
+       if (c) {
+               *s++ = 'A';
+               s = gcap_string(s, c);
+       }
+
+       c = (caps >> CEPH_CAP_SLINK) & 3;
+       if (c) {
+               *s++ = 'L';
+               s = gcap_string(s, c);
+       }
+
+       c = (caps >> CEPH_CAP_SXATTR) & 3;
+       if (c) {
+               *s++ = 'X';
+               s = gcap_string(s, c);
+       }
+
+       c = caps >> CEPH_CAP_SFILE;
+       if (c) {
+               *s++ = 'F';
+               s = gcap_string(s, c);
+       }
+
+       /* no bits at all: render as "-" */
+       if (s == cap_str[i])
+               *s++ = '-';
+       *s = 0;
+       return cap_str[i];
+}
+
+/*
+ * Cap reservations
+ *
+ * Maintain a global pool of preallocated struct ceph_caps, referenced
+ * by struct ceph_caps_reservations.  This ensures that we preallocate
+ * memory needed to successfully process an MDS response.  (If an MDS
+ * sends us cap information and we fail to process it, we will have
+ * problems due to the client and MDS being out of sync.)
+ *
+ * Reservations are 'owned' by a ceph_cap_reservation context.
+ */
+/* caps_list_lock protects the pool list and all of the counters below */
+static spinlock_t caps_list_lock;
+static struct list_head caps_list;  /* unused (reserved or unreserved) */
+static int caps_total_count;        /* total caps allocated */
+static int caps_use_count;          /* in use */
+static int caps_reserve_count;      /* unused, reserved */
+static int caps_avail_count;        /* unused, unreserved */
+static int caps_min_count;          /* keep at least this many (unreserved) */
+
+/* set up the (initially empty) global cap pool */
+void __init ceph_caps_init(void)
+{
+       INIT_LIST_HEAD(&caps_list);
+       spin_lock_init(&caps_list_lock);
+}
+
+/*
+ * Drain the global cap pool and reset all accounting.  Caps that are
+ * still in use are not on caps_list and are not touched here.
+ */
+void ceph_caps_finalize(void)
+{
+       struct ceph_cap *cap;
+
+       spin_lock(&caps_list_lock);
+       while (!list_empty(&caps_list)) {
+               cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
+               list_del(&cap->caps_item);
+               kmem_cache_free(ceph_cap_cachep, cap);
+       }
+       caps_total_count = 0;
+       caps_avail_count = 0;
+       caps_use_count = 0;
+       caps_reserve_count = 0;
+       caps_min_count = 0;
+       spin_unlock(&caps_list_lock);
+}
+
+/*
+ * Adjust the number of spare caps ceph_put_cap() keeps pooled.
+ * @delta may be negative; driving the total below zero is a bug.
+ */
+void ceph_adjust_min_caps(int delta)
+{
+       spin_lock(&caps_list_lock);
+       caps_min_count += delta;
+       BUG_ON(caps_min_count < 0);
+       spin_unlock(&caps_list_lock);
+}
+
+/*
+ * Reserve @need caps for @ctx, taking from the preallocated pool when
+ * possible and allocating the remainder.
+ *
+ * On failure nothing stays reserved: the partial pool reservation is
+ * returned to the available count and any caps allocated here are
+ * freed.  Returns 0 on success or -ENOMEM.
+ */
+int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need)
+{
+       int i;
+       struct ceph_cap *cap;
+       int have;
+       int alloc = 0;
+       LIST_HEAD(newcaps);
+       int ret = 0;
+
+       dout("reserve caps ctx=%p need=%d\n", ctx, need);
+
+       /* first reserve any caps that are already allocated */
+       spin_lock(&caps_list_lock);
+       if (caps_avail_count >= need)
+               have = need;
+       else
+               have = caps_avail_count;
+       caps_avail_count -= have;
+       caps_reserve_count += have;
+       BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
+              caps_avail_count);
+       spin_unlock(&caps_list_lock);
+
+       /* allocate the shortfall outside the lock */
+       for (i = have; i < need; i++) {
+               cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
+               if (!cap) {
+                       ret = -ENOMEM;
+                       goto out_alloc_count;
+               }
+               list_add(&cap->caps_item, &newcaps);
+               alloc++;
+       }
+       BUG_ON(have + alloc != need);
+
+       spin_lock(&caps_list_lock);
+       caps_total_count += alloc;
+       caps_reserve_count += alloc;
+       list_splice(&newcaps, &caps_list);
+
+       BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
+              caps_avail_count);
+       spin_unlock(&caps_list_lock);
+
+       ctx->count = need;
+       dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
+            ctx, caps_total_count, caps_use_count, caps_reserve_count,
+            caps_avail_count);
+       return 0;
+
+out_alloc_count:
+       /*
+        * We didn't manage to allocate as much as we needed.  Don't
+        * leak partial state: free anything we allocated above and give
+        * the pool caps we reserved back to the available count.
+        */
+       while (!list_empty(&newcaps)) {
+               cap = list_first_entry(&newcaps, struct ceph_cap, caps_item);
+               list_del(&cap->caps_item);
+               kmem_cache_free(ceph_cap_cachep, cap);
+       }
+       spin_lock(&caps_list_lock);
+       caps_reserve_count -= have;
+       caps_avail_count += have;
+       BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
+              caps_avail_count);
+       spin_unlock(&caps_list_lock);
+       pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
+                  ctx, need, have);
+       return ret;
+}
+
+/*
+ * Release an unused reservation: move ctx->count caps from the
+ * reserved count back to the available count.  Always returns 0.
+ */
+int ceph_unreserve_caps(struct ceph_cap_reservation *ctx)
+{
+       dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
+       if (ctx->count) {
+               spin_lock(&caps_list_lock);
+               BUG_ON(caps_reserve_count < ctx->count);
+               caps_reserve_count -= ctx->count;
+               caps_avail_count += ctx->count;
+               ctx->count = 0;
+               dout("unreserve caps %d = %d used + %d resv + %d avail\n",
+                    caps_total_count, caps_use_count, caps_reserve_count,
+                    caps_avail_count);
+               BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
+                      caps_avail_count);
+               spin_unlock(&caps_list_lock);
+       }
+       return 0;
+}
+
+/*
+ * Take one cap out of the given reservation (which must have at least
+ * one left), or, with ctx == NULL, fall back to a plain allocation.
+ */
+static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx)
+{
+       struct ceph_cap *cap = NULL;
+
+       /* temporary, until we do something about cap import/export */
+       if (!ctx)
+               return kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
+
+       spin_lock(&caps_list_lock);
+       dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
+            ctx, ctx->count, caps_total_count, caps_use_count,
+            caps_reserve_count, caps_avail_count);
+       BUG_ON(!ctx->count);
+       BUG_ON(ctx->count > caps_reserve_count);
+       BUG_ON(list_empty(&caps_list));
+
+       ctx->count--;
+       caps_reserve_count--;
+       caps_use_count++;
+
+       cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
+       list_del(&cap->caps_item);
+
+       BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
+              caps_avail_count);
+       spin_unlock(&caps_list_lock);
+       return cap;
+}
+
+/*
+ * Return a no-longer-used cap to the pool, or free it outright if we
+ * already hold enough unreserved spares (caps_min_count).
+ */
+void ceph_put_cap(struct ceph_cap *cap)
+{
+       spin_lock(&caps_list_lock);
+       dout("put_cap %p %d = %d used + %d resv + %d avail\n",
+            cap, caps_total_count, caps_use_count,
+            caps_reserve_count, caps_avail_count);
+       caps_use_count--;
+       /*
+        * Keep some preallocated caps around (ceph_min_count), to
+        * avoid lots of free/alloc churn.
+        */
+       if (caps_avail_count >= caps_reserve_count + caps_min_count) {
+               caps_total_count--;
+               kmem_cache_free(ceph_cap_cachep, cap);
+       } else {
+               caps_avail_count++;
+               list_add(&cap->caps_item, &caps_list);
+       }
+
+       BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
+              caps_avail_count);
+       spin_unlock(&caps_list_lock);
+}
+
+/*
+ * Report the pool counters.  Reads are done without caps_list_lock,
+ * so the values may be a slightly inconsistent snapshot —
+ * NOTE(review): looks intended for debug reporting only; confirm
+ * callers tolerate the skew.
+ */
+void ceph_reservation_status(struct ceph_client *client,
+                            int *total, int *avail, int *used, int *reserved,
+                            int *min)
+{
+       if (total)
+               *total = caps_total_count;
+       if (avail)
+               *avail = caps_avail_count;
+       if (used)
+               *used = caps_use_count;
+       if (reserved)
+               *reserved = caps_reserve_count;
+       if (min)
+               *min = caps_min_count;
+}
+
+/*
+ * Find ceph_cap for given mds, if any.
+ *
+ * Called with i_lock held.
+ */
+/*
+ * Look up this inode's ceph_cap for the given mds in the i_caps
+ * rbtree (keyed by mds).  Returns NULL if none exists.
+ *
+ * Called with i_lock held.
+ */
+static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
+{
+       struct rb_node *node;
+
+       for (node = ci->i_caps.rb_node; node; ) {
+               struct ceph_cap *cur =
+                       rb_entry(node, struct ceph_cap, ci_node);
+
+               if (mds < cur->mds)
+                       node = node->rb_left;
+               else if (mds > cur->mds)
+                       node = node->rb_right;
+               else
+                       return cur;
+       }
+       return NULL;
+}
+
+/*
+ * Return id of any MDS with a cap, preferably FILE_WR|WRBUFFER|EXCL, else
+ * -1.
+ */
+/*
+ * Walk all caps on the inode; remember the last one seen, stopping
+ * early at the first cap holding write-ish FILE bits.  If @mseq is
+ * non-NULL it receives the mseq of the cap that was chosen.
+ */
+static int __ceph_get_cap_mds(struct ceph_inode_info *ci, u32 *mseq)
+{
+       struct ceph_cap *cap;
+       int mds = -1;
+       struct rb_node *p;
+
+       /* prefer mds with WR|WRBUFFER|EXCL caps */
+       for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+               cap = rb_entry(p, struct ceph_cap, ci_node);
+               mds = cap->mds;
+               if (mseq)
+                       *mseq = cap->mseq;
+               if (cap->issued & (CEPH_CAP_FILE_WR |
+                                  CEPH_CAP_FILE_BUFFER |
+                                  CEPH_CAP_FILE_EXCL))
+                       break;
+       }
+       return mds;
+}
+
+/* locked wrapper around __ceph_get_cap_mds() */
+int ceph_get_cap_mds(struct inode *inode)
+{
+       int mds;
+       spin_lock(&inode->i_lock);
+       mds = __ceph_get_cap_mds(ceph_inode(inode), NULL);
+       spin_unlock(&inode->i_lock);
+       return mds;
+}
+
+/*
+ * Called under i_lock.
+ */
+static void __insert_cap_node(struct ceph_inode_info *ci,
+                             struct ceph_cap *new)
+{
+       struct rb_node **p = &ci->i_caps.rb_node;
+       struct rb_node *parent = NULL;
+       struct ceph_cap *cap = NULL;
+
+       /* find the insertion point, keyed by mds; a duplicate mds is a
+        * bug (callers look up with __get_cap_for_mds() first) */
+       while (*p) {
+               parent = *p;
+               cap = rb_entry(parent, struct ceph_cap, ci_node);
+               if (new->mds < cap->mds)
+                       p = &(*p)->rb_left;
+               else if (new->mds > cap->mds)
+                       p = &(*p)->rb_right;
+               else
+                       BUG();
+       }
+
+       rb_link_node(&new->ci_node, parent, p);
+       rb_insert_color(&new->ci_node, &ci->i_caps);
+}
+
+/*
+ * (re)set cap hold timeouts, which control the delayed release
+ * of unused caps back to the MDS.  Should be called on cap use.
+ */
+static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
+                              struct ceph_inode_info *ci)
+{
+       struct ceph_mount_args *ma = mdsc->client->mount_args;
+
+       /* delay bounds come from the caps_wanted_delay_* mount options */
+       ci->i_hold_caps_min = round_jiffies(jiffies +
+                                           ma->caps_wanted_delay_min * HZ);
+       ci->i_hold_caps_max = round_jiffies(jiffies +
+                                           ma->caps_wanted_delay_max * HZ);
+       dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
+            ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
+}
+
+/*
+ * (Re)queue cap at the end of the delayed cap release list.
+ *
+ * If I_FLUSH is set, leave the inode at the front of the list.
+ *
+ * Caller holds i_lock
+ *    -> we take mdsc->cap_delay_lock
+ */
+static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
+                               struct ceph_inode_info *ci)
+{
+       __cap_set_timeouts(mdsc, ci);
+       dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
+            ci->i_ceph_flags, ci->i_hold_caps_max);
+       /* don't queue new delayed work while the mds client is
+        * shutting down */
+       if (!mdsc->stopping) {
+               spin_lock(&mdsc->cap_delay_lock);
+               if (!list_empty(&ci->i_cap_delay_list)) {
+                       /* I_FLUSH inodes keep their (front) position;
+                        * see __cap_delay_requeue_front() */
+                       if (ci->i_ceph_flags & CEPH_I_FLUSH)
+                               goto no_change;
+                       list_del_init(&ci->i_cap_delay_list);
+               }
+               list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
+no_change:
+               spin_unlock(&mdsc->cap_delay_lock);
+       }
+}
+
+/*
+ * Queue an inode for immediate writeback.  Mark inode with I_FLUSH,
+ * indicating we should send a cap message to flush dirty metadata
+ * asap, and move to the front of the delayed cap list.
+ */
+static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
+                                     struct ceph_inode_info *ci)
+{
+       dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode);
+       spin_lock(&mdsc->cap_delay_lock);
+       /* I_FLUSH makes __cap_delay_requeue() leave us at the front */
+       ci->i_ceph_flags |= CEPH_I_FLUSH;
+       if (!list_empty(&ci->i_cap_delay_list))
+               list_del_init(&ci->i_cap_delay_list);
+       list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
+       spin_unlock(&mdsc->cap_delay_lock);
+}
+
+/*
+ * Cancel delayed work on cap.
+ *
+ * Caller must hold i_lock.
+ */
+static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
+                              struct ceph_inode_info *ci)
+{
+       dout("__cap_delay_cancel %p\n", &ci->vfs_inode);
+       /* unlocked empty check is a fast path: queueing via
+        * __cap_delay_requeue() also happens with i_lock held */
+       if (list_empty(&ci->i_cap_delay_list))
+               return;
+       spin_lock(&mdsc->cap_delay_lock);
+       list_del_init(&ci->i_cap_delay_list);
+       spin_unlock(&mdsc->cap_delay_lock);
+}
+
+/*
+ * Common issue checks for add_cap, handle_cap_grant.
+ */
+/*
+ * React to newly issued cap bits by bumping the relevant generation
+ * counters.  Note: @cap is currently unused here; only the issued
+ * bits matter.
+ */
+static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
+                             unsigned issued)
+{
+       unsigned had = __ceph_caps_issued(ci, NULL);
+
+       /*
+        * Each time we receive FILE_CACHE anew, we increment
+        * i_rdcache_gen.
+        */
+       if ((issued & CEPH_CAP_FILE_CACHE) &&
+           (had & CEPH_CAP_FILE_CACHE) == 0)
+               ci->i_rdcache_gen++;
+
+       /*
+        * if we are newly issued FILE_SHARED, clear I_COMPLETE; we
+        * don't know what happened to this directory while we didn't
+        * have the cap.
+        */
+       if ((issued & CEPH_CAP_FILE_SHARED) &&
+           (had & CEPH_CAP_FILE_SHARED) == 0) {
+               ci->i_shared_gen++;
+               if (S_ISDIR(ci->vfs_inode.i_mode)) {
+                       dout(" marking %p NOT complete\n", &ci->vfs_inode);
+                       ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
+               }
+       }
+}
+
+/*
+ * Add a capability under the given MDS session.
+ *
+ * Caller should hold session snap_rwsem (read) and s_mutex.
+ *
+ * @fmode is the open file mode, if we are opening a file, otherwise
+ * it is < 0.  (This is so we can atomically add the cap and add an
+ * open file reference to it.)
+ */
+int ceph_add_cap(struct inode *inode,
+                struct ceph_mds_session *session, u64 cap_id,
+                int fmode, unsigned issued, unsigned wanted,
+                unsigned seq, unsigned mseq, u64 realmino, int flags,
+                struct ceph_cap_reservation *caps_reservation)
+{
+       struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+       struct ceph_inode_info *ci = ceph_inode(inode);
+       struct ceph_cap *new_cap = NULL;
+       struct ceph_cap *cap;
+       int mds = session->s_mds;
+       int actual_wanted;
+
+       dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
+            session->s_mds, cap_id, ceph_cap_string(issued), seq);
+
+       /*
+        * If we are opening the file, include file mode wanted bits
+        * in wanted.
+        */
+       if (fmode >= 0)
+               wanted |= ceph_caps_for_mode(fmode);
+
+       /* the retry loop exists because get_cap() must run outside
+        * i_lock; a racing adder may have inserted the cap meanwhile */
+retry:
+       spin_lock(&inode->i_lock);
+       cap = __get_cap_for_mds(ci, mds);
+       if (!cap) {
+               if (new_cap) {
+                       cap = new_cap;
+                       new_cap = NULL;
+               } else {
+                       spin_unlock(&inode->i_lock);
+                       new_cap = get_cap(caps_reservation);
+                       if (new_cap == NULL)
+                               return -ENOMEM;
+                       goto retry;
+               }
+
+               cap->issued = 0;
+               cap->implemented = 0;
+               cap->mds = mds;
+               cap->mds_wanted = 0;
+
+               cap->ci = ci;
+               __insert_cap_node(ci, cap);
+
+               /* clear out old exporting info?  (i.e. on cap import) */
+               if (ci->i_cap_exporting_mds == mds) {
+                       ci->i_cap_exporting_issued = 0;
+                       ci->i_cap_exporting_mseq = 0;
+                       ci->i_cap_exporting_mds = -1;
+               }
+
+               /* add to session cap list */
+               cap->session = session;
+               spin_lock(&session->s_cap_lock);
+               list_add_tail(&cap->session_caps, &session->s_caps);
+               session->s_nr_caps++;
+               spin_unlock(&session->s_cap_lock);
+       }
+
+       if (!ci->i_snap_realm) {
+               /*
+                * add this inode to the appropriate snap realm
+                */
+               struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
+                                                              realmino);
+               if (realm) {
+                       ceph_get_snap_realm(mdsc, realm);
+                       spin_lock(&realm->inodes_with_caps_lock);
+                       ci->i_snap_realm = realm;
+                       list_add(&ci->i_snap_realm_item,
+                                &realm->inodes_with_caps);
+                       spin_unlock(&realm->inodes_with_caps_lock);
+               } else {
+                       pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
+                              realmino);
+               }
+       }
+
+       __check_cap_issue(ci, cap, issued);
+
+       /*
+        * If we are issued caps we don't want, or the mds' wanted
+        * value appears to be off, queue a check so we'll release
+        * later and/or update the mds wanted value.
+        */
+       actual_wanted = __ceph_caps_wanted(ci);
+       if ((wanted & ~actual_wanted) ||
+           (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) {
+               dout(" issued %s, mds wanted %s, actual %s, queueing\n",
+                    ceph_cap_string(issued), ceph_cap_string(wanted),
+                    ceph_cap_string(actual_wanted));
+               __cap_delay_requeue(mdsc, ci);
+       }
+
+       if (flags & CEPH_CAP_FLAG_AUTH)
+               ci->i_auth_cap = cap;
+       else if (ci->i_auth_cap == cap)
+               ci->i_auth_cap = NULL;
+
+       dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
+            inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
+            ceph_cap_string(issued|cap->issued), seq, mds);
+       /* merge the new grant into the (possibly preexisting) cap */
+       cap->cap_id = cap_id;
+       cap->issued = issued;
+       cap->implemented |= issued;
+       cap->mds_wanted |= wanted;
+       cap->seq = seq;
+       cap->issue_seq = seq;
+       cap->mseq = mseq;
+       cap->cap_gen = session->s_cap_gen;
+
+       if (fmode >= 0)
+               __ceph_get_fmode(ci, fmode);
+       spin_unlock(&inode->i_lock);
+       /* new caps may satisfy waiters blocked on i_cap_wq */
+       wake_up(&ci->i_cap_wq);
+       return 0;
+}
+
+/*
+ * Return true if cap has not timed out and belongs to the current
+ * generation of the MDS session (i.e. has not gone 'stale' due to
+ * us losing touch with the mds).
+ *
+ * Briefly takes the session's s_cap_lock to snapshot the current
+ * generation and ttl; the answer can race with a session renewal,
+ * so callers should treat it as advisory.
+ */
+static int __cap_is_valid(struct ceph_cap *cap)
+{
+       unsigned long ttl;
+       u32 gen;
+
+       /* snapshot session generation and ttl under s_cap_lock */
+       spin_lock(&cap->session->s_cap_lock);
+       gen = cap->session->s_cap_gen;
+       ttl = cap->session->s_cap_ttl;
+       spin_unlock(&cap->session->s_cap_lock);
+
+       /* stale if issued under an older session generation, or past ttl */
+       if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
+               dout("__cap_is_valid %p cap %p issued %s "
+                    "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode,
+                    cap, ceph_cap_string(cap->issued), cap->cap_gen, gen);
+               return 0;
+       }
+
+       return 1;
+}
+
+/*
+ * Return set of valid cap bits issued to us.  Note that caps time
+ * out, and may be invalidated in bulk if the client session times out
+ * and session->s_cap_gen is bumped.
+ *
+ * If @implemented is non-NULL, also accumulate the 'implemented' bits
+ * (caps the MDS still believes we hold, a superset of issued while a
+ * revocation is in flight).  NOTE(review): the __ prefix here matches
+ * the functions in this file documented as "called under i_lock" --
+ * confirm callers hold i_lock.
+ */
+int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
+{
+       /* start with snap caps, plus caps mid-migration to another mds */
+       int have = ci->i_snap_caps | ci->i_cap_exporting_issued;
+       struct ceph_cap *cap;
+       struct rb_node *p;
+
+       if (implemented)
+               *implemented = 0;
+       for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+               cap = rb_entry(p, struct ceph_cap, ci_node);
+               if (!__cap_is_valid(cap))
+                       continue;
+               dout("__ceph_caps_issued %p cap %p issued %s\n",
+                    &ci->vfs_inode, cap, ceph_cap_string(cap->issued));
+               have |= cap->issued;
+               if (implemented)
+                       *implemented |= cap->implemented;
+       }
+       return have;
+}
+
+/*
+ * Get cap bits issued by caps other than @ocap
+ *
+ * NOTE(review): unlike __ceph_caps_issued(), this starts from
+ * i_snap_caps only and does not include i_cap_exporting_issued --
+ * confirm that is intentional.
+ */
+int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap)
+{
+       int have = ci->i_snap_caps;
+       struct ceph_cap *cap;
+       struct rb_node *p;
+
+       for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+               cap = rb_entry(p, struct ceph_cap, ci_node);
+               /* skip the excluded cap and any stale caps */
+               if (cap == ocap)
+                       continue;
+               if (!__cap_is_valid(cap))
+                       continue;
+               have |= cap->issued;
+       }
+       return have;
+}
+
+/*
+ * Move a cap to the end of the LRU (oldest caps at list head, newest
+ * at list tail).
+ *
+ * If someone is currently walking the session cap list
+ * (s_cap_iterator is set), do nothing: reordering the list out from
+ * under the iterator would break the traversal.
+ */
+static void __touch_cap(struct ceph_cap *cap)
+{
+       struct ceph_mds_session *s = cap->session;
+
+       spin_lock(&s->s_cap_lock);
+       if (s->s_cap_iterator == NULL) {
+               dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap,
+                    s->s_mds);
+               list_move_tail(&cap->session_caps, &s->s_caps);
+       } else {
+               dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n",
+                    &cap->ci->vfs_inode, cap, s->s_mds);
+       }
+       spin_unlock(&s->s_cap_lock);
+}
+
+/*
+ * Check if we hold the given mask.  If so, move the cap(s) to the
+ * front of their respective LRUs.  (This is the preferred way for
+ * callers to check for caps they want.)
+ *
+ * Returns 1 if @mask is satisfied by the snap caps, by a single
+ * valid cap, or by a combination of valid caps; 0 otherwise.  If
+ * @touch is set, every cap that contributed is moved to the tail of
+ * its session LRU via __touch_cap().
+ */
+int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
+{
+       struct ceph_cap *cap;
+       struct rb_node *p;
+       int have = ci->i_snap_caps;
+
+       if ((have & mask) == mask) {
+               dout("__ceph_caps_issued_mask %p snap issued %s"
+                    " (mask %s)\n", &ci->vfs_inode,
+                    ceph_cap_string(have),
+                    ceph_cap_string(mask));
+               return 1;
+       }
+
+       for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+               cap = rb_entry(p, struct ceph_cap, ci_node);
+               if (!__cap_is_valid(cap))
+                       continue;
+               if ((cap->issued & mask) == mask) {
+                       dout("__ceph_caps_issued_mask %p cap %p issued %s"
+                            " (mask %s)\n", &ci->vfs_inode, cap,
+                            ceph_cap_string(cap->issued),
+                            ceph_cap_string(mask));
+                       if (touch)
+                               __touch_cap(cap);
+                       return 1;
+               }
+
+               /* does a combination of caps satisfy mask? */
+               have |= cap->issued;
+               if ((have & mask) == mask) {
+                       /* report the combined set, not just this cap's bits */
+                       dout("__ceph_caps_issued_mask %p combo issued %s"
+                            " (mask %s)\n", &ci->vfs_inode,
+                            ceph_cap_string(have),
+                            ceph_cap_string(mask));
+                       if (touch) {
+                               struct rb_node *q;
+
+                               /* touch this + preceding caps */
+                               __touch_cap(cap);
+                               for (q = rb_first(&ci->i_caps); q != p;
+                                    q = rb_next(q)) {
+                                       cap = rb_entry(q, struct ceph_cap,
+                                                      ci_node);
+                                       if (!__cap_is_valid(cap))
+                                               continue;
+                                       __touch_cap(cap);
+                               }
+                       }
+                       return 1;
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * Return true if mask caps are currently being revoked by an MDS.
+ *
+ * A cap is "revoking" when the MDS still believes we implement bits
+ * (cap->implemented) that are no longer issued to us (cap->issued).
+ * Takes i_lock for the scan.
+ */
+int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
+{
+       struct inode *inode = &ci->vfs_inode;
+       struct ceph_cap *cap;
+       struct rb_node *p;
+       int ret = 0;
+
+       spin_lock(&inode->i_lock);
+       for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+               cap = rb_entry(p, struct ceph_cap, ci_node);
+               if (__cap_is_valid(cap) &&
+                   (cap->implemented & ~cap->issued & mask)) {
+                       ret = 1;
+                       break;
+               }
+       }
+       spin_unlock(&inode->i_lock);
+       dout("ceph_caps_revoking %p %s = %d\n", inode,
+            ceph_cap_string(mask), ret);
+       return ret;
+}
+
+/*
+ * Return the set of caps currently in active use, derived from the
+ * per-inode reference counts (pin, read, readcache, write, buffer).
+ */
+int __ceph_caps_used(struct ceph_inode_info *ci)
+{
+       int used = ci->i_pin_ref ? CEPH_CAP_PIN : 0;
+
+       used |= ci->i_rd_ref ? CEPH_CAP_FILE_RD : 0;
+       used |= (ci->i_rdcache_ref || ci->i_rdcache_gen) ?
+               CEPH_CAP_FILE_CACHE : 0;
+       used |= ci->i_wr_ref ? CEPH_CAP_FILE_WR : 0;
+       used |= ci->i_wrbuffer_ref ? CEPH_CAP_FILE_BUFFER : 0;
+       return used;
+}
+
+/*
+ * Compute the caps wanted as a consequence of the currently open
+ * file modes on this inode.
+ */
+int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
+{
+       int wanted = 0;
+       int m;
+
+       for (m = 0; m < 4; m++) {
+               if (ci->i_nr_by_mode[m] == 0)
+                       continue;
+               wanted |= ceph_caps_for_mode(m);
+       }
+       return wanted;
+}
+
+/*
+ * Return the union of caps we have registered with the MDS(s) as
+ * 'wanted', considering only caps that are still valid.
+ */
+int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
+{
+       int wanted = 0;
+       struct rb_node *node;
+
+       for (node = rb_first(&ci->i_caps); node; node = rb_next(node)) {
+               struct ceph_cap *c = rb_entry(node, struct ceph_cap, ci_node);
+
+               if (__cap_is_valid(c))
+                       wanted |= c->mds_wanted;
+       }
+       return wanted;
+}
+
+/*
+ * Does this inode hold any cap at all -- a real cap in i_caps, or
+ * one that is mid-export to another mds?  called under i_lock.
+ */
+static int __ceph_is_any_caps(struct ceph_inode_info *ci)
+{
+       if (ci->i_cap_exporting_mds >= 0)
+               return 1;
+       return !RB_EMPTY_ROOT(&ci->i_caps);
+}
+
+/*
+ * Remove @cap from its inode and session lists and drop the inode's
+ * snap realm reference if this was the last cap.
+ *
+ * caller should hold i_lock.
+ * caller will not hold session s_mutex if called from destroy_inode.
+ */
+void __ceph_remove_cap(struct ceph_cap *cap)
+{
+       struct ceph_mds_session *session = cap->session;
+       struct ceph_inode_info *ci = cap->ci;
+       struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
+
+       dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
+
+       /* remove from inode list */
+       rb_erase(&cap->ci_node, &ci->i_caps);
+       cap->ci = NULL;
+       if (ci->i_auth_cap == cap)
+               ci->i_auth_cap = NULL;
+
+       /* remove from session list */
+       spin_lock(&session->s_cap_lock);
+       if (session->s_cap_iterator == cap) {
+               /* not yet, we are iterating over this very cap; the
+                * iterator will finish the removal when it is done */
+               dout("__ceph_remove_cap  delaying %p removal from session %p\n",
+                    cap, cap->session);
+       } else {
+               list_del_init(&cap->session_caps);
+               session->s_nr_caps--;
+               cap->session = NULL;
+       }
+       spin_unlock(&session->s_cap_lock);
+
+       /* cap->session was cleared above only if we fully unlinked it */
+       if (cap->session == NULL)
+               ceph_put_cap(cap);
+
+       /* last cap gone: detach inode from its snap realm */
+       if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) {
+               struct ceph_snap_realm *realm = ci->i_snap_realm;
+               spin_lock(&realm->inodes_with_caps_lock);
+               list_del_init(&ci->i_snap_realm_item);
+               ci->i_snap_realm_counter++;
+               ci->i_snap_realm = NULL;
+               spin_unlock(&realm->inodes_with_caps_lock);
+               ceph_put_snap_realm(mdsc, realm);
+       }
+       /* no real (mds-backed) caps left: no delayed cap check needed */
+       if (!__ceph_is_any_real_caps(ci))
+               __cap_delay_cancel(mdsc, ci);
+}
+
+/*
+ * Build and send a cap message to the given MDS.
+ *
+ * Allocates a CEPH_MSG_CLIENT_CAPS message, fills in the wire
+ * struct ceph_mds_caps from the arguments, attaches the xattr blob
+ * (if any) as the message middle, and hands it to ceph_con_send().
+ * Returns 0 on success or a negative errno if message allocation
+ * fails.
+ *
+ * Caller should be holding s_mutex.
+ */
+static int send_cap_msg(struct ceph_mds_session *session,
+                       u64 ino, u64 cid, int op,
+                       int caps, int wanted, int dirty,
+                       u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq,
+                       u64 size, u64 max_size,
+                       struct timespec *mtime, struct timespec *atime,
+                       u64 time_warp_seq,
+                       uid_t uid, gid_t gid, mode_t mode,
+                       u64 xattr_version,
+                       struct ceph_buffer *xattrs_buf,
+                       u64 follows)
+{
+       struct ceph_mds_caps *fc;
+       struct ceph_msg *msg;
+
+       dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
+            " seq %u/%u mseq %u follows %lld size %llu/%llu"
+            " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op),
+            cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted),
+            ceph_cap_string(dirty),
+            seq, issue_seq, mseq, follows, size, max_size,
+            xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
+
+       msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), 0, 0, NULL);
+       if (IS_ERR(msg))
+               return PTR_ERR(msg);
+
+       /* flush_tid lets the MDS ack a specific flush operation */
+       msg->hdr.tid = cpu_to_le64(flush_tid);
+
+       fc = msg->front.iov_base;
+       memset(fc, 0, sizeof(*fc));
+
+       /* all wire fields are little-endian */
+       fc->cap_id = cpu_to_le64(cid);
+       fc->op = cpu_to_le32(op);
+       fc->seq = cpu_to_le32(seq);
+       fc->issue_seq = cpu_to_le32(issue_seq);
+       fc->migrate_seq = cpu_to_le32(mseq);
+       fc->caps = cpu_to_le32(caps);
+       fc->wanted = cpu_to_le32(wanted);
+       fc->dirty = cpu_to_le32(dirty);
+       fc->ino = cpu_to_le64(ino);
+       fc->snap_follows = cpu_to_le64(follows);
+
+       fc->size = cpu_to_le64(size);
+       fc->max_size = cpu_to_le64(max_size);
+       if (mtime)
+               ceph_encode_timespec(&fc->mtime, mtime);
+       if (atime)
+               ceph_encode_timespec(&fc->atime, atime);
+       fc->time_warp_seq = cpu_to_le32(time_warp_seq);
+
+       fc->uid = cpu_to_le32(uid);
+       fc->gid = cpu_to_le32(gid);
+       fc->mode = cpu_to_le32(mode);
+
+       /* xattr blob (if any) rides in the message middle */
+       fc->xattr_version = cpu_to_le64(xattr_version);
+       if (xattrs_buf) {
+               msg->middle = ceph_buffer_get(xattrs_buf);
+               fc->xattr_len = cpu_to_le32(xattrs_buf->vec.iov_len);
+               msg->hdr.middle_len = cpu_to_le32(xattrs_buf->vec.iov_len);
+       }
+
+       ceph_con_send(&session->s_con, msg);
+       return 0;
+}
+
+/*
+ * Queue cap releases when an inode is dropped from our cache.  Since
+ * inode is about to be destroyed, there is no need for i_lock.
+ *
+ * For each cap, append a release item to the session's pending
+ * release message, moving full messages to s_cap_releases_done, then
+ * remove the cap.  (NOTE(review): __ceph_remove_cap() documents that
+ * its caller should hold i_lock; presumably that is safe to skip
+ * here because the inode is unreachable during destruction --
+ * confirm.)
+ */
+void ceph_queue_caps_release(struct inode *inode)
+{
+       struct ceph_inode_info *ci = ceph_inode(inode);
+       struct rb_node *p;
+
+       p = rb_first(&ci->i_caps);
+       while (p) {
+               struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
+               struct ceph_mds_session *session = cap->session;
+               struct ceph_msg *msg;
+               struct ceph_mds_cap_release *head;
+               struct ceph_mds_cap_item *item;
+
+               spin_lock(&session->s_cap_lock);
+               BUG_ON(!session->s_num_cap_releases);
+               msg = list_first_entry(&session->s_cap_releases,
+                                      struct ceph_msg, list_head);
+
+               dout(" adding %p release to mds%d msg %p (%d left)\n",
+                    inode, session->s_mds, msg, session->s_num_cap_releases);
+
+               /* append one item after the existing payload */
+               BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
+               head = msg->front.iov_base;
+               head->num = cpu_to_le32(le32_to_cpu(head->num) + 1);
+               item = msg->front.iov_base + msg->front.iov_len;
+               item->ino = cpu_to_le64(ceph_ino(inode));
+               item->cap_id = cpu_to_le64(cap->cap_id);
+               item->migrate_seq = cpu_to_le32(cap->mseq);
+               item->seq = cpu_to_le32(cap->issue_seq);
+
+               session->s_num_cap_releases--;
+
+               msg->front.iov_len += sizeof(*item);
+               if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
+                       dout(" release msg %p full\n", msg);
+                       list_move_tail(&msg->list_head,
+                                      &session->s_cap_releases_done);
+               } else {
+                       dout(" release msg %p at %d/%d (%d)\n", msg,
+                            (int)le32_to_cpu(head->num),
+                            (int)CEPH_CAPS_PER_RELEASE,
+                            (int)msg->front.iov_len);
+               }
+               spin_unlock(&session->s_cap_lock);
+               /* advance before __ceph_remove_cap erases this node */
+               p = rb_next(p);
+               __ceph_remove_cap(cap);
+       }
+}
+
+/*
+ * Send a cap msg on the given inode.  Update our caps state, then
+ * drop i_lock and send the message.
+ *
+ * Make note of max_size reported/requested from mds, revoked caps
+ * that have now been implemented.
+ *
+ * Make a half-hearted attempt to invalidate page cache if we are
+ * dropping RDCACHE.  Note that this will leave behind locked pages
+ * that we'll then need to deal with elsewhere.
+ *
+ * Return non-zero if delayed release, or we experienced an error
+ * such that the caller should requeue + retry later.
+ *
+ * called with i_lock, then drops it.
+ * caller should hold snap_rwsem (read), s_mutex.
+ */
+static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
+                     int op, int used, int want, int retain, int flushing,
+                     unsigned *pflush_tid)
+       __releases(cap->ci->vfs_inode->i_lock)
+{
+       struct ceph_inode_info *ci = cap->ci;
+       struct inode *inode = &ci->vfs_inode;
+       u64 cap_id = cap->cap_id;
+       int held, revoking, dropping, keep;
+       u64 seq, issue_seq, mseq, time_warp_seq, follows;
+       u64 size, max_size;
+       struct timespec mtime, atime;
+       int wake = 0;
+       mode_t mode;
+       uid_t uid;
+       gid_t gid;
+       struct ceph_mds_session *session;
+       u64 xattr_version = 0;
+       int delayed = 0;
+       u64 flush_tid = 0;
+       int i;
+       int ret;
+
+       held = cap->issued | cap->implemented;
+       revoking = cap->implemented & ~cap->issued;
+       retain &= ~revoking;     /* never retain what is being revoked */
+       dropping = cap->issued & ~retain;
+
+       dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n",
+            inode, cap, cap->session,
+            ceph_cap_string(held), ceph_cap_string(held & retain),
+            ceph_cap_string(revoking));
+       BUG_ON((retain & CEPH_CAP_PIN) == 0);
+
+       session = cap->session;
+
+       /* don't release wanted unless we've waited a bit. */
+       if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
+           time_before(jiffies, ci->i_hold_caps_min)) {
+               dout(" delaying issued %s -> %s, wanted %s -> %s on send\n",
+                    ceph_cap_string(cap->issued),
+                    ceph_cap_string(cap->issued & retain),
+                    ceph_cap_string(cap->mds_wanted),
+                    ceph_cap_string(want));
+               want |= cap->mds_wanted;
+               retain |= cap->issued;
+               delayed = 1;
+       }
+       ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH);
+
+       cap->issued &= retain;  /* drop bits we don't want */
+       if (cap->implemented & ~cap->issued) {
+               /*
+                * Wake up any waiters on wanted -> needed transition.
+                * This is due to the weird transition from buffered
+                * to sync IO... we need to flush dirty pages _before_
+                * allowing sync writes to avoid reordering.
+                */
+               wake = 1;
+       }
+       cap->implemented &= cap->issued | used;
+       cap->mds_wanted = want;
+
+       if (flushing) {
+               /*
+                * assign a tid for flush operations so we can avoid
+                * flush1 -> dirty1 -> flush2 -> flushack1 -> mark
+                * clean type races.  track latest tid for every bit
+                * so we can handle flush AxFw, flush Fw, and have the
+                * first ack clean Ax.
+                */
+               flush_tid = ++ci->i_cap_flush_last_tid;
+               if (pflush_tid)
+                       *pflush_tid = flush_tid;
+               dout(" cap_flush_tid %d\n", (int)flush_tid);
+               for (i = 0; i < CEPH_CAP_BITS; i++)
+                       if (flushing & (1 << i))
+                               ci->i_cap_flush_tid[i] = flush_tid;
+       }
+
+       /* snapshot inode state under i_lock before we drop it */
+       keep = cap->implemented;
+       seq = cap->seq;
+       issue_seq = cap->issue_seq;
+       mseq = cap->mseq;
+       size = inode->i_size;
+       ci->i_reported_size = size;
+       max_size = ci->i_wanted_max_size;
+       ci->i_requested_max_size = max_size;
+       mtime = inode->i_mtime;
+       atime = inode->i_atime;
+       time_warp_seq = ci->i_time_warp_seq;
+       follows = ci->i_snap_realm->cached_context->seq;
+       uid = inode->i_uid;
+       gid = inode->i_gid;
+       mode = inode->i_mode;
+
+       if (dropping & CEPH_CAP_XATTR_EXCL) {
+               __ceph_build_xattrs_blob(ci);
+               xattr_version = ci->i_xattrs.version + 1;
+       }
+
+       spin_unlock(&inode->i_lock);
+
+       ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
+               op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
+               size, max_size, &mtime, &atime, time_warp_seq,
+               uid, gid, mode,
+               xattr_version,
+               (flushing & CEPH_CAP_XATTR_EXCL) ? ci->i_xattrs.blob : NULL,
+               follows);
+       if (ret < 0) {
+               dout("error sending cap msg, must requeue %p\n", inode);
+               delayed = 1;
+       }
+
+       if (wake)
+               wake_up(&ci->i_cap_wq);
+
+       return delayed;
+}
+
+/*
+ * When a snapshot is taken, clients accumulate dirty metadata on
+ * inodes with capabilities in ceph_cap_snaps to describe the file
+ * state at the time the snapshot was taken.  This must be flushed
+ * asynchronously back to the MDS once sync writes complete and dirty
+ * data is written out.
+ *
+ * Called under i_lock.  Takes s_mutex as needed.  If @psession is
+ * non-NULL, the caller may pass in a held session and receives back
+ * whichever session (if any) we end up holding; otherwise any
+ * session taken here is released before returning.
+ */
+void __ceph_flush_snaps(struct ceph_inode_info *ci,
+                       struct ceph_mds_session **psession)
+{
+       struct inode *inode = &ci->vfs_inode;
+       int mds;
+       struct ceph_cap_snap *capsnap;
+       u32 mseq;
+       struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+       struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
+                                                   session->s_mutex */
+       u64 next_follows = 0;  /* keep track of how far we've gotten through the
+                            i_cap_snaps list, and skip these entries next time
+                            around to avoid an infinite loop */
+
+       if (psession)
+               session = *psession;
+
+       dout("__flush_snaps %p\n", inode);
+retry:
+       list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
+               /* avoid an infinite loop after retry */
+               if (capsnap->follows < next_follows)
+                       continue;
+               /*
+                * we need to wait for sync writes to complete and for dirty
+                * pages to be written out.
+                */
+               if (capsnap->dirty_pages || capsnap->writing)
+                       continue;
+
+               /* pick mds, take s_mutex */
+               mds = __ceph_get_cap_mds(ci, &mseq);
+               if (session && session->s_mds != mds) {
+                       /* held session is for the wrong mds; drop it */
+                       dout("oops, wrong session %p mutex\n", session);
+                       mutex_unlock(&session->s_mutex);
+                       ceph_put_mds_session(session);
+                       session = NULL;
+               }
+               if (!session) {
+                       /* drop i_lock to take s_mutex in the right order */
+                       spin_unlock(&inode->i_lock);
+                       mutex_lock(&mdsc->mutex);
+                       session = __ceph_lookup_mds_session(mdsc, mds);
+                       mutex_unlock(&mdsc->mutex);
+                       if (session) {
+                               dout("inverting session/ino locks on %p\n",
+                                    session);
+                               mutex_lock(&session->s_mutex);
+                       }
+                       /*
+                        * if session == NULL, we raced against a cap
+                        * deletion.  retry, and we'll get a better
+                        * @mds value next time.
+                        */
+                       spin_lock(&inode->i_lock);
+                       goto retry;
+               }
+
+               capsnap->flush_tid = ++ci->i_cap_flush_last_tid;
+               /* hold a nref across the unlocked send below */
+               atomic_inc(&capsnap->nref);
+               if (!list_empty(&capsnap->flushing_item))
+                       list_del_init(&capsnap->flushing_item);
+               list_add_tail(&capsnap->flushing_item,
+                             &session->s_cap_snaps_flushing);
+               spin_unlock(&inode->i_lock);
+
+               dout("flush_snaps %p cap_snap %p follows %lld size %llu\n",
+                    inode, capsnap, next_follows, capsnap->size);
+               send_cap_msg(session, ceph_vino(inode).ino, 0,
+                            CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
+                            capsnap->dirty, 0, capsnap->flush_tid, 0, mseq,
+                            capsnap->size, 0,
+                            &capsnap->mtime, &capsnap->atime,
+                            capsnap->time_warp_seq,
+                            capsnap->uid, capsnap->gid, capsnap->mode,
+                            0, NULL,
+                            capsnap->follows);
+
+               next_follows = capsnap->follows + 1;
+               ceph_put_cap_snap(capsnap);
+
+               /* list may have changed while unlocked; restart scan */
+               spin_lock(&inode->i_lock);
+               goto retry;
+       }
+
+       /* we flushed them all; remove this inode from the queue */
+       spin_lock(&mdsc->snap_flush_lock);
+       list_del_init(&ci->i_snap_flush_item);
+       spin_unlock(&mdsc->snap_flush_lock);
+
+       if (psession)
+               *psession = session;
+       else if (session) {
+               mutex_unlock(&session->s_mutex);
+               ceph_put_mds_session(session);
+       }
+}
+
+/*
+ * Flush dirty cap snaps on this inode, taking i_lock around
+ * __ceph_flush_snaps() (which may drop and retake it internally).
+ */
+static void ceph_flush_snaps(struct ceph_inode_info *ci)
+{
+       struct inode *inode = &ci->vfs_inode;
+
+       spin_lock(&inode->i_lock);
+       __ceph_flush_snaps(ci, NULL);
+       spin_unlock(&inode->i_lock);
+}
+
+/*
+ * Mark caps dirty.  If inode is newly dirty, add to the global dirty
+ * list.
+ */
+void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
+{
+       struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
+       struct inode *inode = &ci->vfs_inode;
+       int was = ci->i_dirty_caps;
+       int dirty = 0;
+
+       dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
+            ceph_cap_string(mask), ceph_cap_string(was),
+            ceph_cap_string(was | mask));
+       ci->i_dirty_caps |= mask;
+       if (was == 0) {
+               /* first dirty bit: put inode on the global dirty list */
+               dout(" inode %p now dirty\n", &ci->vfs_inode);
+               BUG_ON(!list_empty(&ci->i_dirty_item));
+               spin_lock(&mdsc->cap_dirty_lock);
+               list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
+               spin_unlock(&mdsc->cap_dirty_lock);
+               if (ci->i_flushing_caps == 0) {
+                       /* pin the inode while it is dirty
+                        * (NOTE(review): presumably released when the
+                        * flush completes -- confirm against the
+                        * flush-ack path) */
+                       igrab(inode);
+                       dirty |= I_DIRTY_SYNC;
+               }
+       }
+       BUG_ON(list_empty(&ci->i_dirty_item));
+       if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
+           (mask & CEPH_CAP_FILE_BUFFER))
+               dirty |= I_DIRTY_DATASYNC;
+       if (dirty)
+               __mark_inode_dirty(inode, dirty);
+       __cap_delay_requeue(mdsc, ci);
+}
+
+/*
+ * Add dirty inode to the flushing list.  Assigned a seq number so we
+ * can wait for caps to flush without starving.
+ *
+ * Moves the full i_dirty_caps set into i_flushing_caps and returns
+ * the bits that are now being flushed.
+ *
+ * Called under i_lock.
+ */
+static int __mark_caps_flushing(struct inode *inode,
+                                struct ceph_mds_session *session)
+{
+       struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+       struct ceph_inode_info *ci = ceph_inode(inode);
+       int flushing;
+
+       BUG_ON(ci->i_dirty_caps == 0);
+       BUG_ON(list_empty(&ci->i_dirty_item));
+
+       /* everything dirty transitions to flushing */
+       flushing = ci->i_dirty_caps;
+       dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n",
+            ceph_cap_string(flushing),
+            ceph_cap_string(ci->i_flushing_caps),
+            ceph_cap_string(ci->i_flushing_caps | flushing));
+       ci->i_flushing_caps |= flushing;
+       ci->i_dirty_caps = 0;
+       dout(" inode %p now !dirty\n", inode);
+
+       spin_lock(&mdsc->cap_dirty_lock);
+       list_del_init(&ci->i_dirty_item);
+
+       /* global seq lets waiters track flush progress fairly */
+       ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
+       if (list_empty(&ci->i_flushing_item)) {
+               list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
+               mdsc->num_cap_flushing++;
+               dout(" inode %p now flushing seq %lld\n", inode,
+                    ci->i_cap_flush_seq);
+       } else {
+               list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
+               dout(" inode %p now flushing (more) seq %lld\n", inode,
+                    ci->i_cap_flush_seq);
+       }
+       spin_unlock(&mdsc->cap_dirty_lock);
+
+       return flushing;
+}
+
+/*
+ * Cheap emptiness test for a mapping: probe only the page at index 0.
+ * (NOTE(review): assumes a non-empty mapping retains page 0 after the
+ * preceding invalidation attempt -- confirm this heuristic.)
+ */
+static int mapping_is_empty(struct address_space *mapping)
+{
+       struct page *page = find_get_page(mapping, 0);
+
+       if (page) {
+               put_page(page);
+               return 0;
+       }
+       return 1;
+}
+
+/*
+ * Attempt a non-blocking invalidation of this inode's page cache.
+ *
+ * Called with i_lock held; drops and retakes it around
+ * invalidate_mapping_pages().  Returns 0 on success (mapping empty
+ * and no new reads were cached meanwhile, i.e. i_rdcache_gen is
+ * unchanged), -1 on failure (e.g. locked pages remained).
+ */
+static int try_nonblocking_invalidate(struct inode *inode)
+{
+       struct ceph_inode_info *ci = ceph_inode(inode);
+       u32 invalidating_gen = ci->i_rdcache_gen;
+
+       spin_unlock(&inode->i_lock);
+       invalidate_mapping_pages(&inode->i_data, 0, -1);
+       spin_lock(&inode->i_lock);
+
+       if (mapping_is_empty(&inode->i_data) &&
+           invalidating_gen == ci->i_rdcache_gen) {
+               /* success. */
+               dout("try_nonblocking_invalidate %p success\n", inode);
+               ci->i_rdcache_gen = 0;
+               ci->i_rdcache_revoking = 0;
+               return 0;
+       }
+       dout("try_nonblocking_invalidate %p failed\n", inode);
+       return -1;
+}
+
+/*
+ * Swiss army knife function to examine currently used and wanted
+ * versus held caps.  Release, flush, ack revoked caps to mds as
+ * appropriate.
+ *
+ *  CHECK_CAPS_NODELAY - caller is delayed work and we should not delay
+ *    cap release further.
+ *  CHECK_CAPS_AUTHONLY - we should only check the auth cap
+ *  CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
+ *    further delay.
+ */
+void ceph_check_caps(struct ceph_inode_info *ci, int flags,
+                    struct ceph_mds_session *session)
+{
+       struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode);
+       struct ceph_mds_client *mdsc = &client->mdsc;
+       struct inode *inode = &ci->vfs_inode;
+       struct ceph_cap *cap;
+       int file_wanted, used;
+       int took_snap_rwsem = 0;             /* true if mdsc->snap_rwsem held */
+       int drop_session_lock = session ? 0 : 1;
+       int issued, implemented, want, retain, revoking, flushing = 0;
+       int mds = -1;   /* keep track of how far we've gone through i_caps list
+                          to avoid an infinite loop on retry */
+       struct rb_node *p;
+       int tried_invalidate = 0;
+       int delayed = 0, sent = 0, force_requeue = 0, num;
+       int queue_invalidate = 0;
+       int is_delayed = flags & CHECK_CAPS_NODELAY;
+
+       /* if we are unmounting, flush any unused caps immediately. */
+       if (mdsc->stopping)
+               is_delayed = 1;
+
+       spin_lock(&inode->i_lock);
+
+       if (ci->i_ceph_flags & CEPH_I_FLUSH)
+               flags |= CHECK_CAPS_FLUSH;
+
+       /* flush snaps first time around only */
+       if (!list_empty(&ci->i_cap_snaps))
+               __ceph_flush_snaps(ci, &session);
+       goto retry_locked;
+retry:
+       spin_lock(&inode->i_lock);
+retry_locked:
+       file_wanted = __ceph_caps_file_wanted(ci);
+       used = __ceph_caps_used(ci);
+       want = file_wanted | used;
+       issued = __ceph_caps_issued(ci, &implemented);
+       revoking = implemented & ~issued;
+
+       retain = want | CEPH_CAP_PIN;
+       if (!mdsc->stopping && inode->i_nlink > 0) {
+               if (want) {
+                       retain |= CEPH_CAP_ANY;       /* be greedy */
+               } else {
+                       retain |= CEPH_CAP_ANY_SHARED;
+                       /*
+                        * keep RD only if we didn't have the file open RW,
+                        * because then the mds would revoke it anyway to
+                        * journal max_size=0.
+                        */
+                       if (ci->i_max_size == 0)
+                               retain |= CEPH_CAP_ANY_RD;
+               }
+       }
+
+       dout("check_caps %p file_want %s used %s dirty %s flushing %s"
+            " issued %s revoking %s retain %s %s%s%s\n", inode,
+            ceph_cap_string(file_wanted),
+            ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
+            ceph_cap_string(ci->i_flushing_caps),
+            ceph_cap_string(issued), ceph_cap_string(revoking),
+            ceph_cap_string(retain),
+            (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
+            (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "",
+            (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
+
+       /*
+        * If we no longer need to hold onto old our caps, and we may
+        * have cached pages, but don't want them, then try to invalidate.
+        * If we fail, it's because pages are locked.... try again later.
+        */
+       if ((!is_delayed || mdsc->stopping) &&
+           ci->i_wrbuffer_ref == 0 &&               /* no dirty pages... */
+           ci->i_rdcache_gen &&                     /* may have cached pages */
+           (file_wanted == 0 ||                     /* no open files */
+            (revoking & CEPH_CAP_FILE_CACHE)) &&     /*  or revoking cache */
+           !tried_invalidate) {
+               dout("check_caps trying to invalidate on %p\n", inode);
+               if (try_nonblocking_invalidate(inode) < 0) {
+                       if (revoking & CEPH_CAP_FILE_CACHE) {
+                               dout("check_caps queuing invalidate\n");
+                               queue_invalidate = 1;
+                               ci->i_rdcache_revoking = ci->i_rdcache_gen;
+                       } else {
+                               dout("check_caps failed to invalidate pages\n");
+                               /* we failed to invalidate pages.  check these
+                                  caps again later. */
+                               force_requeue = 1;
+                               __cap_set_timeouts(mdsc, ci);
+                       }
+               }
+               tried_invalidate = 1;
+               goto retry_locked;
+       }
+
+       num = 0;
+       for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+               cap = rb_entry(p, struct ceph_cap, ci_node);
+               num++;
+
+               /* avoid looping forever */
+               if (mds >= cap->mds ||
+                   ((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap))
+                       continue;
+
+               /* NOTE: no side-effects allowed, until we take s_mutex */
+
+               revoking = cap->implemented & ~cap->issued;
+               if (revoking)
+                       dout(" mds%d revoking %s\n", cap->mds,
+                            ceph_cap_string(revoking));
+
+               if (cap == ci->i_auth_cap &&
+                   (cap->issued & CEPH_CAP_FILE_WR)) {
+                       /* request larger max_size from MDS? */
+                       if (ci->i_wanted_max_size > ci->i_max_size &&
+                           ci->i_wanted_max_size > ci->i_requested_max_size) {
+                               dout("requesting new max_size\n");
+                               goto ack;
+                       }
+
+                       /* approaching file_max? */
+                       if ((inode->i_size << 1) >= ci->i_max_size &&
+                           (ci->i_reported_size << 1) < ci->i_max_size) {
+                               dout("i_size approaching max_size\n");
+                               goto ack;
+                       }
+               }
+               /* flush anything dirty? */
+               if (cap == ci->i_auth_cap && (flags & CHECK_CAPS_FLUSH) &&
+                   ci->i_dirty_caps) {
+                       dout("flushing dirty caps\n");
+                       goto ack;
+               }
+
+               /* completed revocation? going down and there are no caps? */
+               if (revoking && (revoking & used) == 0) {
+                       dout("completed revocation of %s\n",
+                            ceph_cap_string(cap->implemented & ~cap->issued));
+                       goto ack;
+               }
+
+               /* want more caps from mds? */
+               if (want & ~(cap->mds_wanted | cap->issued))
+                       goto ack;
+
+               /* things we might delay */
+               if ((cap->issued & ~retain) == 0 &&
+                   cap->mds_wanted == want)
+                       continue;     /* nope, all good */
+
+               if (is_delayed)
+                       goto ack;
+
+               /* delay? */
+               if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
+                   time_before(jiffies, ci->i_hold_caps_max)) {
+                       dout(" delaying issued %s -> %s, wanted %s -> %s\n",
+                            ceph_cap_string(cap->issued),
+                            ceph_cap_string(cap->issued & retain),
+                            ceph_cap_string(cap->mds_wanted),
+                            ceph_cap_string(want));
+                       delayed++;
+                       continue;
+               }
+
+ack:
+               if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
+                       dout(" skipping %p I_NOFLUSH set\n", inode);
+                       continue;
+               }
+
+               if (session && session != cap->session) {
+                       dout("oops, wrong session %p mutex\n", session);
+                       mutex_unlock(&session->s_mutex);
+                       session = NULL;
+               }
+               if (!session) {
+                       session = cap->session;
+                       if (mutex_trylock(&session->s_mutex) == 0) {
+                               dout("inverting session/ino locks on %p\n",
+                                    session);
+                               spin_unlock(&inode->i_lock);
+                               if (took_snap_rwsem) {
+                                       up_read(&mdsc->snap_rwsem);
+                                       took_snap_rwsem = 0;
+                               }
+                               mutex_lock(&session->s_mutex);
+                               goto retry;
+                       }
+               }
+               /* take snap_rwsem after session mutex */
+               if (!took_snap_rwsem) {
+                       if (down_read_trylock(&mdsc->snap_rwsem) == 0) {
+                               dout("inverting snap/in locks on %p\n",
+                                    inode);
+                               spin_unlock(&inode->i_lock);
+                               down_read(&mdsc->snap_rwsem);
+                               took_snap_rwsem = 1;
+                               goto retry;
+                       }
+                       took_snap_rwsem = 1;
+               }
+
+               if (cap == ci->i_auth_cap && ci->i_dirty_caps)
+                       flushing = __mark_caps_flushing(inode, session);
+
+               mds = cap->mds;  /* remember mds, so we don't repeat */
+               sent++;
+
+               /* __send_cap drops i_lock */
+               delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want,
+                                     retain, flushing, NULL);
+               goto retry; /* retake i_lock and restart our cap scan. */
+       }
+
+       /*
+        * Reschedule delayed caps release if we delayed anything,
+        * otherwise cancel.
+        */
+       if (delayed && is_delayed)
+               force_requeue = 1;   /* __send_cap delayed release; requeue */
+       if (!delayed && !is_delayed)
+               __cap_delay_cancel(mdsc, ci);
+       else if (!is_delayed || force_requeue)
+               __cap_delay_requeue(mdsc, ci);
+
+       spin_unlock(&inode->i_lock);
+
+       if (queue_invalidate)
+               ceph_queue_invalidate(inode);
+
+       if (session && drop_session_lock)
+               mutex_unlock(&session->s_mutex);
+       if (took_snap_rwsem)
+               up_read(&mdsc->snap_rwsem);
+}
+
+/*
+ * Try to flush dirty caps back to the auth mds.
+ *
+ * If @session is NULL, the auth cap's session mutex is taken here and
+ * dropped before returning; otherwise the caller must already hold
+ * @session->s_mutex (we BUG if it is not the auth cap's session) and
+ * it is left held.
+ *
+ * Returns the set of caps that were marked flushing, or 0 if nothing
+ * was flushed.  @flush_tid is passed through to __send_cap so the
+ * caller can later wait for that flush to complete.
+ */
+static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
+                         unsigned *flush_tid)
+{
+       struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+       struct ceph_inode_info *ci = ceph_inode(inode);
+       int unlock_session = session ? 0 : 1;  /* drop s_mutex only if we took it */
+       int flushing = 0;
+
+retry:
+       spin_lock(&inode->i_lock);
+       if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
+               dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode);
+               goto out;
+       }
+       if (ci->i_dirty_caps && ci->i_auth_cap) {
+               struct ceph_cap *cap = ci->i_auth_cap;
+               int used = __ceph_caps_used(ci);
+               int want = __ceph_caps_wanted(ci);
+               int delayed;
+
+               if (!session) {
+                       /* take the session mutex outside i_lock, then
+                        * restart and revalidate state from the top */
+                       spin_unlock(&inode->i_lock);
+                       session = cap->session;
+                       mutex_lock(&session->s_mutex);
+                       goto retry;
+               }
+               BUG_ON(session != cap->session);
+               if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
+                       goto out;
+
+               flushing = __mark_caps_flushing(inode, session);
+
+               /* __send_cap drops i_lock */
+               delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
+                                    cap->issued | cap->implemented, flushing,
+                                    flush_tid);
+               if (!delayed)
+                       goto out_unlocked;
+
+               /* send was delayed; requeue the inode for a later attempt */
+               spin_lock(&inode->i_lock);
+               __cap_delay_requeue(mdsc, ci);
+       }
+out:
+       spin_unlock(&inode->i_lock);
+out_unlocked:
+       if (session && unlock_session)
+               mutex_unlock(&session->s_mutex);
+       return flushing;
+}
+
+/*
+ * Return true if we've flushed caps through the given flush_tid.
+ *
+ * A cap bit is still in flight if it is marked flushing and its
+ * recorded flush tid is <= @tid; in that case return 0 so that
+ * waiters (fsync/write_inode) keep waiting.
+ *
+ * Note: the previous version also computed __ceph_caps_dirty(ci)
+ * into a local that was never read; that dead store is dropped here.
+ */
+static int caps_are_flushed(struct inode *inode, unsigned tid)
+{
+       struct ceph_inode_info *ci = ceph_inode(inode);
+       int i, ret = 1;
+
+       spin_lock(&inode->i_lock);
+       for (i = 0; i < CEPH_CAP_BITS; i++)
+               if ((ci->i_flushing_caps & (1 << i)) &&
+                   ci->i_cap_flush_tid[i] <= tid) {
+                       /* still flushing this bit */
+                       ret = 0;
+                       break;
+               }
+       spin_unlock(&inode->i_lock);
+       return ret;
+}
+
+/*
+ * Wait on any unsafe replies for the given inode.  First wait on the
+ * newest request, and make that the upper bound.  Then, if there are
+ * more requests, keep waiting on the oldest as long as it is still older
+ * than the original request.
+ *
+ * Requests newer than last_tid (queued while we slept) are NOT waited
+ * for; the loop terminates once the head of the list reaches last_tid.
+ */
+static void sync_write_wait(struct inode *inode)
+{
+       struct ceph_inode_info *ci = ceph_inode(inode);
+       struct list_head *head = &ci->i_unsafe_writes;
+       struct ceph_osd_request *req;
+       u64 last_tid;
+
+       spin_lock(&ci->i_unsafe_lock);
+       if (list_empty(head))
+               goto out;
+
+       /* set upper bound as _last_ entry in chain */
+       req = list_entry(head->prev, struct ceph_osd_request,
+                        r_unsafe_item);
+       last_tid = req->r_tid;
+
+       do {
+               /* take a ref so req survives while we drop i_unsafe_lock
+                * to sleep on its completion */
+               ceph_osdc_get_request(req);
+               spin_unlock(&ci->i_unsafe_lock);
+               dout("sync_write_wait on tid %llu (until %llu)\n",
+                    req->r_tid, last_tid);
+               wait_for_completion(&req->r_safe_completion);
+               spin_lock(&ci->i_unsafe_lock);
+               ceph_osdc_put_request(req);
+
+               /*
+                * from here on look at first entry in chain, since we
+                * only want to wait for anything older than last_tid
+                */
+               if (list_empty(head))
+                       break;
+               req = list_entry(head->next, struct ceph_osd_request,
+                                r_unsafe_item);
+       } while (req->r_tid < last_tid);
+out:
+       spin_unlock(&ci->i_unsafe_lock);
+}
+
+/*
+ * fsync: wait out unsafe osd writes, write back dirty pages, then
+ * flush dirty caps to the mds.  For a full (non-datasync) fsync we
+ * also wait until the flush through flush_tid has completed.
+ */
+int ceph_fsync(struct file *file, struct dentry *dentry, int datasync)
+{
+       struct inode *inode = dentry->d_inode;
+       struct ceph_inode_info *ci = ceph_inode(inode);
+       unsigned flush_tid;
+       int dirty;
+       int err;
+
+       dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
+
+       /* wait for unsafe (unacknowledged) osd writes first */
+       sync_write_wait(inode);
+
+       err = filemap_write_and_wait(inode->i_mapping);
+       if (err < 0)
+               return err;
+
+       dirty = try_flush_caps(inode, NULL, &flush_tid);
+       dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
+
+       /*
+        * only wait on non-file metadata writeback (the mds
+        * can recover size and mtime, so we don't need to
+        * wait for that)
+        */
+       if (datasync || !(dirty & ~CEPH_CAP_ANY_FILE_WR))
+               goto done;
+
+       dout("fsync waiting for flush_tid %u\n", flush_tid);
+       err = wait_event_interruptible(ci->i_cap_wq,
+                                      caps_are_flushed(inode, flush_tid));
+done:
+       dout("fsync %p%s done\n", inode, datasync ? " datasync" : "");
+       return err;
+}
+
+/*
+ * Flush any dirty caps back to the mds.  If we aren't asked to wait,
+ * queue inode for flush but don't do so immediately, because we can
+ * get by with fewer MDS messages if we wait for data writeback to
+ * complete first.
+ */
+int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+       struct ceph_inode_info *ci = ceph_inode(inode);
+       int wait = wbc->sync_mode == WB_SYNC_ALL;
+       int err = 0;
+
+       dout("write_inode %p wait=%d\n", inode, wait);
+       if (!wait) {
+               /* async: just (re)queue the inode for delayed cap flush */
+               struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+
+               spin_lock(&inode->i_lock);
+               if (__ceph_caps_dirty(ci))
+                       __cap_delay_requeue_front(mdsc, ci);
+               spin_unlock(&inode->i_lock);
+       } else {
+               /* sync: flush now and wait for the mds to commit */
+               unsigned flush_tid;
+               int dirty = try_flush_caps(inode, NULL, &flush_tid);
+
+               if (dirty)
+                       err = wait_event_interruptible(ci->i_cap_wq,
+                                      caps_are_flushed(inode, flush_tid));
+       }
+       return err;
+}
+
+/*
+ * After a recovering MDS goes active, we need to resend any caps
+ * we were flushing.
+ *
+ * This resends the in-flight cap_snap flushes belonging to @session.
+ *
+ * Caller holds session->s_mutex.
+ */
+static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
+                                  struct ceph_mds_session *session)
+{
+       struct ceph_cap_snap *capsnap;
+
+       dout("kick_flushing_capsnaps mds%d\n", session->s_mds);
+       list_for_each_entry(capsnap, &session->s_cap_snaps_flushing,
+                           flushing_item) {
+               struct ceph_inode_info *ci = capsnap->ci;
+               struct inode *inode = &ci->vfs_inode;
+               struct ceph_cap *cap;
+
+               spin_lock(&inode->i_lock);
+               cap = ci->i_auth_cap;
+               if (cap && cap->session == session) {
+                       dout("kick_flushing_caps %p cap %p capsnap %p\n", inode,
+                            cap, capsnap);
+                       /* presumably drops i_lock (the error path below
+                        * unlocks explicitly) -- TODO(review): confirm */
+                       __ceph_flush_snaps(ci, &session);
+               } else {
+                       /* auth cap migrated away while the flush was in
+                        * flight?  shouldn't happen; log loudly. */
+                       pr_err("%p auth cap %p not mds%d ???\n", inode,
+                              cap, session->s_mds);
+                       spin_unlock(&inode->i_lock);
+               }
+       }
+}
+
+/*
+ * Resend all in-flight cap flushes (snap flushes first, then regular
+ * dirty-cap flushes) for inodes whose flushes were directed at
+ * @session's mds, e.g. after the mds recovers and goes active.
+ *
+ * Caller holds session->s_mutex.
+ */
+void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
+                            struct ceph_mds_session *session)
+{
+       struct ceph_inode_info *ci;
+
+       kick_flushing_capsnaps(mdsc, session);
+
+       dout("kick_flushing_caps mds%d\n", session->s_mds);
+       list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
+               struct inode *inode = &ci->vfs_inode;
+               struct ceph_cap *cap;
+               int delayed = 0;
+
+               spin_lock(&inode->i_lock);
+               cap = ci->i_auth_cap;
+               if (cap && cap->session == session) {
+                       dout("kick_flushing_caps %p cap %p %s\n", inode,
+                            cap, ceph_cap_string(ci->i_flushing_caps));
+                       /* __send_cap drops i_lock */
+                       delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
+                                            __ceph_caps_used(ci),
+                                            __ceph_caps_wanted(ci),
+                                            cap->issued | cap->implemented,
+                                            ci->i_flushing_caps, NULL);
+                       if (delayed) {
+                               /* message delayed; requeue to retry later */
+                               spin_lock(&inode->i_lock);
+                               __cap_delay_requeue(mdsc, ci);
+                               spin_unlock(&inode->i_lock);
+                       }
+               } else {
+                       /* auth cap migrated away mid-flush?  shouldn't
+                        * happen; log loudly. */
+                       pr_err("%p auth cap %p not mds%d ???\n", inode,
+                              cap, session->s_mds);
+                       spin_unlock(&inode->i_lock);
+               }
+       }
+}
+
+
+/*
+ * Take references to capabilities we hold, so that we don't release
+ * them to the MDS prematurely.
+ *
+ * Protected by i_lock.
+ */
+static void __take_cap_refs(struct ceph_inode_info *ci, int got)
+{
+       struct inode *inode = &ci->vfs_inode;
+
+       if (got & CEPH_CAP_PIN)
+               ci->i_pin_ref++;
+       if (got & CEPH_CAP_FILE_RD)
+               ci->i_rd_ref++;
+       if (got & CEPH_CAP_FILE_CACHE)
+               ci->i_rdcache_ref++;
+       if (got & CEPH_CAP_FILE_WR)
+               ci->i_wr_ref++;
+       if (got & CEPH_CAP_FILE_BUFFER) {
+               /* the first buffered-write ref also pins the inode;
+                * the matching iput happens in ceph_put_cap_refs */
+               if (ci->i_wrbuffer_ref++ == 0)
+                       igrab(inode);
+               dout("__take_cap_refs %p wrbuffer %d -> %d (?)\n",
+                    inode, ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref);
+       }
+}
+
+/*
+ * Try to grab cap references.  Specify those refs we @want, and the
+ * minimal set we @need.  Also include the larger offset we are writing
+ * to (when applicable), and check against max_size here as well.
+ * Note that caller is responsible for ensuring max_size increases are
+ * requested from the MDS.
+ *
+ * Returns nonzero when the caller's wait can end: either the refs were
+ * taken (*@got is filled in), the file isn't open for the needed caps
+ * (*@err = -EBADF), or a larger max_size must be requested first
+ * (*@check_max = 1).  Returns 0 if the caller should keep waiting.
+ */
+static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
+                           int *got, loff_t endoff, int *check_max, int *err)
+{
+       struct inode *inode = &ci->vfs_inode;
+       int ret = 0;
+       int have, implemented;
+       int file_wanted;
+
+       dout("get_cap_refs %p need %s want %s\n", inode,
+            ceph_cap_string(need), ceph_cap_string(want));
+       spin_lock(&inode->i_lock);
+
+       /* make sure file is actually open */
+       file_wanted = __ceph_caps_file_wanted(ci);
+       if ((file_wanted & need) == 0) {
+               dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
+                    ceph_cap_string(need), ceph_cap_string(file_wanted));
+               *err = -EBADF;
+               ret = 1;
+               goto out;
+       }
+
+       if (need & CEPH_CAP_FILE_WR) {
+               /* writing past max_size?  caller must ask the mds for
+                * a larger max_size before this write can proceed */
+               if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
+                       dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
+                            inode, endoff, ci->i_max_size);
+                       if (endoff > ci->i_wanted_max_size) {
+                               *check_max = 1;
+                               ret = 1;
+                       }
+                       goto out;
+               }
+               /*
+                * If a sync write is in progress, we must wait, so that we
+                * can get a final snapshot value for size+mtime.
+                */
+               if (__ceph_have_pending_cap_snap(ci)) {
+                       dout("get_cap_refs %p cap_snap_pending\n", inode);
+                       goto out;
+               }
+       }
+       have = __ceph_caps_issued(ci, &implemented);
+
+       /*
+        * disallow writes while a truncate is pending
+        */
+       if (ci->i_truncate_pending)
+               have &= ~CEPH_CAP_FILE_WR;
+
+       if ((have & need) == need) {
+               /*
+                * Look at (implemented & ~have & not) so that we keep waiting
+                * on transition from wanted -> needed caps.  This is needed
+                * for WRBUFFER|WR -> WR to avoid a new WR sync write from
+                * going before a prior buffered writeback happens.
+                */
+               int not = want & ~(have & need);
+               int revoking = implemented & ~have;
+               dout("get_cap_refs %p have %s but not %s (revoking %s)\n",
+                    inode, ceph_cap_string(have), ceph_cap_string(not),
+                    ceph_cap_string(revoking));
+               if ((revoking & not) == 0) {
+                       /* take need plus whatever wanted caps we hold */
+                       *got = need | (have & want);
+                       __take_cap_refs(ci, *got);
+                       ret = 1;
+               }
+       } else {
+               dout("get_cap_refs %p have %s needed %s\n", inode,
+                    ceph_cap_string(have), ceph_cap_string(need));
+       }
+out:
+       spin_unlock(&inode->i_lock);
+       /* NOTE(review): *got may be unset on the 0/-EBADF paths; this
+        * dout reads whatever the caller passed in -- confirm callers
+        * initialize it or that dout is compiled out */
+       dout("get_cap_refs %p ret %d got %s\n", inode,
+            ret, ceph_cap_string(*got));
+       return ret;
+}
+
+/*
+ * Check the offset we are writing up to against our current
+ * max_size.  If necessary, tell the MDS we want to write to
+ * a larger offset.
+ */
+static void check_max_size(struct inode *inode, loff_t endoff)
+{
+       struct ceph_inode_info *ci = ceph_inode(inode);
+       int need_check = 0;
+
+       /* do we need to explicitly request a larger max_size? */
+       spin_lock(&inode->i_lock);
+       if (endoff > ci->i_wanted_max_size &&
+           (endoff >= ci->i_max_size ||
+            endoff > (inode->i_size << 1))) {
+               dout("write %p at large endoff %llu, req max_size\n",
+                    inode, endoff);
+               ci->i_wanted_max_size = endoff;
+               need_check = 1;
+       }
+       spin_unlock(&inode->i_lock);
+
+       /* ask the auth mds, outside i_lock */
+       if (need_check)
+               ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
+}
+
+/*
+ * Wait for caps, and take cap references.  If we can't get a WR cap
+ * due to a small max_size, make sure we check_max_size (and possibly
+ * ask the mds) so we don't get hung up indefinitely.
+ */
+int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, int *got,
+                 loff_t endoff)
+{
+       int ret, err, check_max;
+
+       for (;;) {
+               if (endoff > 0)
+                       check_max_size(&ci->vfs_inode, endoff);
+               check_max = 0;
+               err = 0;
+               ret = wait_event_interruptible(ci->i_cap_wq,
+                                              try_get_cap_refs(ci, need, want,
+                                                               got, endoff,
+                                                               &check_max,
+                                                               &err));
+               if (err)
+                       ret = err;
+               /* loop again if we must first request a larger max_size */
+               if (!check_max)
+                       return ret;
+       }
+}
+
+/*
+ * Take cap refs.  Caller must already know we hold at least one ref
+ * on the caps in question or we don't know this is safe.
+ */
+void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
+{
+       struct inode *inode = &ci->vfs_inode;
+
+       spin_lock(&inode->i_lock);
+       __take_cap_refs(ci, caps);
+       spin_unlock(&inode->i_lock);
+}
+
+/*
+ * Release cap refs.
+ *
+ * If we released the last ref on any given cap, call ceph_check_caps
+ * to release (or schedule a release).
+ *
+ * If we are releasing a WR cap (from a sync write), finalize any affected
+ * cap_snap, and wake up any waiters.
+ */
+void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
+{
+       struct inode *inode = &ci->vfs_inode;
+       int last = 0, put = 0, flushsnaps = 0, wake = 0;
+       struct ceph_cap_snap *capsnap;
+
+       spin_lock(&inode->i_lock);
+       if (had & CEPH_CAP_PIN)
+               --ci->i_pin_ref;
+       if (had & CEPH_CAP_FILE_RD)
+               if (--ci->i_rd_ref == 0)
+                       last++;
+       if (had & CEPH_CAP_FILE_CACHE)
+               if (--ci->i_rdcache_ref == 0)
+                       last++;
+       if (had & CEPH_CAP_FILE_BUFFER) {
+               if (--ci->i_wrbuffer_ref == 0) {
+                       /* drop the inode ref taken by __take_cap_refs
+                        * on the first buffered-write ref */
+                       last++;
+                       put++;
+               }
+               dout("put_cap_refs %p wrbuffer %d -> %d (?)\n",
+                    inode, ci->i_wrbuffer_ref+1, ci->i_wrbuffer_ref);
+       }
+       if (had & CEPH_CAP_FILE_WR)
+               if (--ci->i_wr_ref == 0) {
+                       last++;
+                       /* last writer: finalize the oldest cap_snap if a
+                        * sync write was holding it open */
+                       if (!list_empty(&ci->i_cap_snaps)) {
+                               capsnap = list_first_entry(&ci->i_cap_snaps,
+                                                    struct ceph_cap_snap,
+                                                    ci_item);
+                               if (capsnap->writing) {
+                                       capsnap->writing = 0;
+                                       flushsnaps =
+                                               __ceph_finish_cap_snap(ci,
+                                                                      capsnap);
+                                       wake = 1;
+                               }
+                       }
+               }
+       spin_unlock(&inode->i_lock);
+
+       dout("put_cap_refs %p had %s %s\n", inode, ceph_cap_string(had),
+            last ? "last" : "");
+
+       /* side effects only after dropping i_lock */
+       if (last && !flushsnaps)
+               ceph_check_caps(ci, 0, NULL);
+       else if (flushsnaps)
+               ceph_flush_snaps(ci);
+       if (wake)
+               wake_up(&ci->i_cap_wq);
+       if (put)
+               iput(inode);
+}
+
+/*
+ * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
+ * context.  Adjust per-snap dirty page accounting as appropriate.
+ * Once all dirty data for a cap_snap is flushed, flush snapped file
+ * metadata back to the MDS.  If we dropped the last ref, call
+ * ceph_check_caps.
+ *
+ * @snapc must be either the head snap context or the context of one
+ * of our cap_snaps; we BUG otherwise.
+ */
+void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
+                               struct ceph_snap_context *snapc)
+{
+       struct inode *inode = &ci->vfs_inode;
+       int last = 0;         /* dropped the last wrbuffer ref overall? */
+       int last_snap = 0;    /* dropped the last dirty page of a cap_snap? */
+       int found = 0;
+       struct ceph_cap_snap *capsnap = NULL;
+
+       spin_lock(&inode->i_lock);
+       ci->i_wrbuffer_ref -= nr;
+       last = !ci->i_wrbuffer_ref;
+
+       if (ci->i_head_snapc == snapc) {
+               /* pages belonged to the current (head) snap context */
+               ci->i_wrbuffer_ref_head -= nr;
+               if (!ci->i_wrbuffer_ref_head) {
+                       ceph_put_snap_context(ci->i_head_snapc);
+                       ci->i_head_snapc = NULL;
+               }
+               dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n",
+                    inode,
+                    ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr,
+                    ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
+                    last ? " LAST" : "");
+       } else {
+               /* pages belonged to an older, snapped context; find it */
+               list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
+                       if (capsnap->context == snapc) {
+                               found = 1;
+                               capsnap->dirty_pages -= nr;
+                               last_snap = !capsnap->dirty_pages;
+                               break;
+                       }
+               }
+               BUG_ON(!found);
+               dout("put_wrbuffer_cap_refs on %p cap_snap %p "
+                    " snap %lld %d/%d -> %d/%d %s%s\n",
+                    inode, capsnap, capsnap->context->seq,
+                    ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
+                    ci->i_wrbuffer_ref, capsnap->dirty_pages,
+                    last ? " (wrbuffer last)" : "",
+                    last_snap ? " (capsnap last)" : "");
+       }
+
+       spin_unlock(&inode->i_lock);
+
+       /* side effects only after dropping i_lock */
+       if (last) {
+               ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
+               iput(inode);
+       } else if (last_snap) {
+               ceph_flush_snaps(ci);
+               wake_up(&ci->i_cap_wq);
+       }
+}
+
+/*
+ * Handle a cap GRANT message from the MDS.  (Note that a GRANT may
+ * actually be a revocation if it specifies a smaller cap set.)
+ *
+ * caller holds s_mutex.
+ * return value:
+ *  0 - ok
+ *  1 - check_caps on auth cap only (writeback)
+ *  2 - check_caps (ack revoke)
+ */
+static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
+                           struct ceph_mds_session *session,
+                           struct ceph_cap *cap,
+                           struct ceph_buffer *xattr_buf)
+       __releases(inode->i_lock)
+
+{
+       struct ceph_inode_info *ci = ceph_inode(inode);
+       int mds = session->s_mds;
+       int seq = le32_to_cpu(grant->seq);
+       int newcaps = le32_to_cpu(grant->caps);
+       int issued, implemented, used, wanted, dirty;
+       u64 size = le64_to_cpu(grant->size);
+       u64 max_size = le64_to_cpu(grant->max_size);
+       struct timespec mtime, atime, ctime;
+       int reply = 0;
+       int wake = 0;
+       int writeback = 0;
+       int revoked_rdcache = 0;
+       int queue_invalidate = 0;
+
+       dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
+            inode, cap, mds, seq, ceph_cap_string(newcaps));
+       dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
+               inode->i_size);
+
+       /*
+        * If CACHE is being revoked, and we have no dirty buffers,
+        * try to invalidate (once).  (If there are dirty buffers, we
+        * will invalidate _after_ writeback.)
+        */
+       if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
+           !ci->i_wrbuffer_ref) {
+               if (try_nonblocking_invalidate(inode) == 0) {
+                       revoked_rdcache = 1;
+               } else {
+                       /* there were locked pages.. invalidate later
+                          in a separate thread. */
+                       if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
+                               queue_invalidate = 1;
+                               ci->i_rdcache_revoking = ci->i_rdcache_gen;
+                       }
+               }
+       }
+
+       /* side effects now are allowed */
+
+       issued = __ceph_caps_issued(ci, &implemented);
+       issued |= implemented | __ceph_caps_dirty(ci);
+
+       cap->cap_gen = session->s_cap_gen;
+
+       __check_cap_issue(ci, cap, newcaps);
+
+       if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
+               inode->i_mode = le32_to_cpu(grant->mode);
+               inode->i_uid = le32_to_cpu(grant->uid);
+               inode->i_gid = le32_to_cpu(grant->gid);
+               dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
+                    inode->i_uid, inode->i_gid);
+       }
+
+       if ((issued & CEPH_CAP_LINK_EXCL) == 0)
+               inode->i_nlink = le32_to_cpu(grant->nlink);
+
+       if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) {
+               int len = le32_to_cpu(grant->xattr_len);
+               u64 version = le64_to_cpu(grant->xattr_version);
+
+               if (version > ci->i_xattrs.version) {
+                       dout(" got new xattrs v%llu on %p len %d\n",
+                            version, inode, len);
+                       if (ci->i_xattrs.blob)
+                               ceph_buffer_put(ci->i_xattrs.blob);
+                       ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
+                       ci->i_xattrs.version = version;
+               }
+       }
+
+       /* size/ctime/mtime/atime? */
+       ceph_fill_file_size(inode, issued,
+                           le32_to_cpu(grant->truncate_seq),
+                           le64_to_cpu(grant->truncate_size), size);
+       ceph_decode_timespec(&mtime, &grant->mtime);
+       ceph_decode_timespec(&atime, &grant->atime);
+       ceph_decode_timespec(&ctime, &grant->ctime);
+       ceph_fill_file_time(inode, issued,
+                           le32_to_cpu(grant->time_warp_seq), &ctime, &mtime,
+                           &atime);
+
+       /* max size increase? */
+       if (max_size != ci->i_max_size) {
+               dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);
+               ci->i_max_size = max_size;
+               if (max_size >= ci->i_wanted_max_size) {
+                       ci->i_wanted_max_size = 0;  /* reset */
+                       ci->i_requested_max_size = 0;
+               }
+               wake = 1;
+       }
+
+       /* check cap bits */
+       wanted = __ceph_caps_wanted(ci);
+       used = __ceph_caps_used(ci);
+       dirty = __ceph_caps_dirty(ci);
+       dout(" my wanted = %s, used = %s, dirty %s\n",
+            ceph_cap_string(wanted),
+            ceph_cap_string(used),
+            ceph_cap_string(dirty));
+       if (wanted != le32_to_cpu(grant->wanted)) {
+               dout("mds wanted %s -> %s\n",
+                    ceph_cap_string(le32_to_cpu(grant->wanted)),
+                    ceph_cap_string(wanted));
+               grant->wanted = cpu_to_le32(wanted);
+       }
+
+       cap->seq = seq;
+
+       /* file layout may have changed */
+       ci->i_layout = grant->layout;
+
+       /* revocation, grant, or no-op? */
+       if (cap->issued & ~newcaps) {
+               dout("revocation: %s -> %s\n", ceph_cap_string(cap->issued),
+                    ceph_cap_string(newcaps));
+               if ((used & ~newcaps) & CEPH_CAP_FILE_BUFFER)
+                       writeback = 1; /* will delay ack */
+               else if (dirty & ~newcaps)
+                       reply = 1;     /* initiate writeback in check_caps */
+               else if (((used & ~newcaps) & CEPH_CAP_FILE_CACHE) == 0 ||
+                          revoked_rdcache)
+                       reply = 2;     /* send revoke ack in check_caps */
+               cap->issued = newcaps;
+       } else if (cap->issued == newcaps) {
+               dout("caps unchanged: %s -> %s\n",
+                    ceph_cap_string(cap->issued), ceph_cap_string(newcaps));
+       } else {
+               dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
+                    ceph_cap_string(newcaps));
+               cap->issued = newcaps;
+               cap->implemented |= newcaps; /* add bits only, to
+                                             * avoid stepping on a
+                                             * pending revocation */
+               wake = 1;
+       }
+
+       spin_unlock(&inode->i_lock);
+       if (writeback)
+               /*
+                * queue inode for writeback: we can't actually call
+                * filemap_write_and_wait, etc. from message handler
+                * context.
+                */
+               ceph_queue_writeback(inode);
+       if (queue_invalidate)
+               ceph_queue_invalidate(inode);
+       if (wake)
+               wake_up(&ci->i_cap_wq);
+       return reply;
+}
+
+/*
+ * Handle FLUSH_ACK from MDS, indicating that metadata we sent to the
+ * MDS has been safely committed.
+ */
+static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
+                                struct ceph_mds_caps *m,
+                                struct ceph_mds_session *session,
+                                struct ceph_cap *cap)
+       __releases(inode->i_lock)
+{
+       struct ceph_inode_info *ci = ceph_inode(inode);
+       struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+       unsigned seq = le32_to_cpu(m->seq);
+       int dirty = le32_to_cpu(m->dirty);
+       int cleaned = 0;        /* caps this ack confirms as flushed */
+       int drop = 0;           /* drop an inode ref after i_lock is released? */
+       int i;
+
+       /*
+        * a dirty cap bit only counts as cleaned if the ack's tid matches
+        * the tid recorded when that bit was flushed; an older ack must
+        * not clear a bit that has been re-flushed since.
+        */
+       for (i = 0; i < CEPH_CAP_BITS; i++)
+               if ((dirty & (1 << i)) &&
+                   flush_tid == ci->i_cap_flush_tid[i])
+                       cleaned |= 1 << i;
+
+       dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
+            " flushing %s -> %s\n",
+            inode, session->s_mds, seq, ceph_cap_string(dirty),
+            ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
+            ceph_cap_string(ci->i_flushing_caps & ~cleaned));
+
+       /* nothing newly cleaned?  then there is no state to update */
+       if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned))
+               goto out;
+
+       ci->i_flushing_caps &= ~cleaned;
+
+       /* update per-mdsc flushing accounting under cap_dirty_lock */
+       spin_lock(&mdsc->cap_dirty_lock);
+       if (ci->i_flushing_caps == 0) {
+               list_del_init(&ci->i_flushing_item);
+               if (!list_empty(&session->s_cap_flushing))
+                       dout(" mds%d still flushing cap on %p\n",
+                            session->s_mds,
+                            &list_entry(session->s_cap_flushing.next,
+                                        struct ceph_inode_info,
+                                        i_flushing_item)->vfs_inode);
+               mdsc->num_cap_flushing--;
+               wake_up(&mdsc->cap_flushing_wq);
+               dout(" inode %p now !flushing\n", inode);
+
+               if (ci->i_dirty_caps == 0) {
+                       dout(" inode %p now clean\n", inode);
+                       BUG_ON(!list_empty(&ci->i_dirty_item));
+                       /*
+                        * fully clean: drop the reference that pinned the
+                        * inode while caps were dirty/flushing (presumably
+                        * taken when the caps were first dirtied -- TODO
+                        * confirm against the mark-dirty/flushing paths)
+                        */
+                       drop = 1;
+               } else {
+                       BUG_ON(list_empty(&ci->i_dirty_item));
+               }
+       }
+       spin_unlock(&mdsc->cap_dirty_lock);
+       wake_up(&ci->i_cap_wq);
+
+out:
+       /* i_lock was taken by the caller; release it here, per __releases */
+       spin_unlock(&inode->i_lock);
+       if (drop)
+               iput(inode);
+}
+
+/*
+ * Handle FLUSHSNAP_ACK.  MDS has flushed snap data to disk and we can
+ * throw away our cap_snap.
+ *
+ * Caller holds s_mutex.
+ */
+static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
+                                    struct ceph_mds_caps *m,
+                                    struct ceph_mds_session *session)
+{
+       struct ceph_inode_info *ci = ceph_inode(inode);
+       u64 follows = le64_to_cpu(m->snap_follows);
+       struct ceph_cap_snap *capsnap;
+       int drop = 0;   /* drop an inode ref after i_lock is released? */
+
+       dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
+            inode, ci, session->s_mds, follows);
+
+       spin_lock(&inode->i_lock);
+       /* find the cap_snap this ack refers to: match on (follows, tid) */
+       list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
+               if (capsnap->follows == follows) {
+                       if (capsnap->flush_tid != flush_tid) {
+                               /* tid mismatch: stale ack, leave it alone */
+                               dout(" cap_snap %p follows %lld tid %lld !="
+                                    " %lld\n", capsnap, follows,
+                                    flush_tid, capsnap->flush_tid);
+                               break;
+                       }
+                       /* by now there should be no dirty pages or writer */
+                       WARN_ON(capsnap->dirty_pages || capsnap->writing);
+                       dout(" removing cap_snap %p follows %lld\n",
+                            capsnap, follows);
+                       ceph_put_snap_context(capsnap->context);
+                       list_del(&capsnap->ci_item);
+                       list_del(&capsnap->flushing_item);
+                       ceph_put_cap_snap(capsnap);
+                       /*
+                        * drop the ref that pinned the inode for this snap
+                        * flush -- presumably taken when the flush was
+                        * queued; TODO confirm
+                        */
+                       drop = 1;
+                       break;
+               } else {
+                       dout(" skipping cap_snap %p follows %lld\n",
+                            capsnap, capsnap->follows);
+               }
+       }
+       spin_unlock(&inode->i_lock);
+       if (drop)
+               iput(inode);
+}
+
+/*
+ * Handle TRUNC from MDS, indicating file truncation.
+ *
+ * caller holds s_mutex.
+ */
+static void handle_cap_trunc(struct inode *inode,
+                            struct ceph_mds_caps *trunc,
+                            struct ceph_mds_session *session)
+       __releases(inode->i_lock)
+{
+       struct ceph_inode_info *ci = ceph_inode(inode);
+       int mds = session->s_mds;
+       int seq = le32_to_cpu(trunc->seq);
+       u32 truncate_seq = le32_to_cpu(trunc->truncate_seq);
+       u64 truncate_size = le64_to_cpu(trunc->truncate_size);
+       u64 size = le64_to_cpu(trunc->size);
+       int implemented = 0;
+       int dirty = __ceph_caps_dirty(ci);
+       int issued = __ceph_caps_issued(ceph_inode(inode), &implemented);
+       int queue_trunc = 0;
+
+       /* treat implemented and dirty caps as issued for the size update */
+       issued |= implemented | dirty;
+
+       dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n",
+            inode, mds, seq, truncate_size, truncate_seq);
+       queue_trunc = ceph_fill_file_size(inode, issued,
+                                         truncate_seq, truncate_size, size);
+       /* i_lock was taken by the caller; release it here, per __releases */
+       spin_unlock(&inode->i_lock);
+
+       /*
+        * the actual vmtruncate cannot run in message-handler context;
+        * queue it for async processing instead
+        */
+       if (queue_trunc)
+               ceph_queue_vmtruncate(inode);
+}
+
+/*
+ * Handle EXPORT from MDS.  Cap is being migrated _from_ this mds to a
+ * different one.  If we are the most recent migration we've seen (as
+ * indicated by mseq), make note of the migrating cap bits for the
+ * duration (until we see the corresponding IMPORT).
+ *
+ * caller holds s_mutex
+ */
+static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
+                             struct ceph_mds_session *session)
+{
+       struct ceph_inode_info *ci = ceph_inode(inode);
+       int mds = session->s_mds;
+       unsigned mseq = le32_to_cpu(ex->migrate_seq);
+       struct ceph_cap *cap = NULL, *t;
+       struct rb_node *p;
+       int remember = 1;       /* record exporting state unless mseq is stale */
+
+       dout("handle_cap_export inode %p ci %p mds%d mseq %d\n",
+            inode, ci, mds, mseq);
+
+       spin_lock(&inode->i_lock);
+
+       /* make sure we haven't seen a higher mseq */
+       for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+               t = rb_entry(p, struct ceph_cap, ci_node);
+               if (ceph_seq_cmp(t->mseq, mseq) > 0) {
+                       dout(" higher mseq on cap from mds%d\n",
+                            t->session->s_mds);
+                       remember = 0;
+               }
+               /* while scanning, also locate our cap for the exporting mds */
+               if (t->session->s_mds == mds)
+                       cap = t;
+       }
+
+       if (cap) {
+               if (remember) {
+                       /* make note */
+                       ci->i_cap_exporting_mds = mds;
+                       ci->i_cap_exporting_mseq = mseq;
+                       ci->i_cap_exporting_issued = cap->issued;
+               }
+               __ceph_remove_cap(cap);
+       } else {
+               /* an EXPORT for a cap we do not hold is unexpected */
+               WARN_ON(!cap);
+       }
+
+       spin_unlock(&inode->i_lock);
+}
+
+/*
+ * Handle cap IMPORT.  If there are temp bits from an older EXPORT,
+ * clean them up.
+ *
+ * caller holds s_mutex.
+ */
+static void handle_cap_import(struct ceph_mds_client *mdsc,
+                             struct inode *inode, struct ceph_mds_caps *im,
+                             struct ceph_mds_session *session,
+                             void *snaptrace, int snaptrace_len)
+{
+       struct ceph_inode_info *ci = ceph_inode(inode);
+       int mds = session->s_mds;
+       unsigned issued = le32_to_cpu(im->caps);
+       unsigned wanted = le32_to_cpu(im->wanted);
+       unsigned seq = le32_to_cpu(im->seq);
+       unsigned mseq = le32_to_cpu(im->migrate_seq);
+       u64 realmino = le64_to_cpu(im->realm);
+       u64 cap_id = le64_to_cpu(im->cap_id);
+
+       /* clear exporting state left by the matching (older-mseq) EXPORT */
+       if (ci->i_cap_exporting_mds >= 0 &&
+           ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) {
+               dout("handle_cap_import inode %p ci %p mds%d mseq %d"
+                    " - cleared exporting from mds%d\n",
+                    inode, ci, mds, mseq,
+                    ci->i_cap_exporting_mds);
+               ci->i_cap_exporting_issued = 0;
+               ci->i_cap_exporting_mseq = 0;
+               ci->i_cap_exporting_mds = -1;
+       } else {
+               dout("handle_cap_import inode %p ci %p mds%d mseq %d\n",
+                    inode, ci, mds, mseq);
+       }
+
+       /*
+        * update the snap trace with snap_rwsem held for write, then
+        * downgrade (without dropping the lock) since ceph_add_cap and
+        * try_flush_caps only need read access
+        */
+       down_write(&mdsc->snap_rwsem);
+       ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len,
+                              false);
+       downgrade_write(&mdsc->snap_rwsem);
+       ceph_add_cap(inode, session, cap_id, -1,
+                    issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH,
+                    NULL /* no caps context */);
+       try_flush_caps(inode, session, NULL);
+       up_read(&mdsc->snap_rwsem);
+}
+
+/*
+ * Handle a caps message from the MDS.
+ *
+ * Identify the appropriate session, inode, and call the right handler
+ * based on the cap op.
+ */
+void ceph_handle_caps(struct ceph_mds_session *session,
+                     struct ceph_msg *msg)
+{
+       struct ceph_mds_client *mdsc = session->s_mdsc;
+       struct super_block *sb = mdsc->client->sb;
+       struct inode *inode;
+       struct ceph_cap *cap;
+       struct ceph_mds_caps *h;
+       int mds = session->s_mds;
+       int op;
+       u32 seq;
+       struct ceph_vino vino;
+       u64 cap_id;
+       u64 size, max_size;
+       u64 tid;
+       int check_caps = 0;     /* re-check caps after s_mutex is dropped? */
+       void *snaptrace;
+       int r;
+
+       dout("handle_caps from mds%d\n", mds);
+
+       /* decode */
+       tid = le64_to_cpu(msg->hdr.tid);
+       if (msg->front.iov_len < sizeof(*h))
+               goto bad;
+       h = msg->front.iov_base;
+       snaptrace = h + 1;      /* snap trace blob follows the fixed header */
+       op = le32_to_cpu(h->op);
+       vino.ino = le64_to_cpu(h->ino);
+       vino.snap = CEPH_NOSNAP;
+       cap_id = le64_to_cpu(h->cap_id);
+       seq = le32_to_cpu(h->seq);
+       size = le64_to_cpu(h->size);
+       max_size = le64_to_cpu(h->max_size);
+
+       /* cap processing for this session is serialized by s_mutex */
+       mutex_lock(&session->s_mutex);
+       session->s_seq++;
+       dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
+            (unsigned)seq);
+
+       /* lookup ino */
+       inode = ceph_find_inode(sb, vino);
+       dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
+            vino.snap, inode);
+       if (!inode) {
+               dout(" i don't have ino %llx\n", vino.ino);
+               goto done;
+       }
+
+       /* these will work even if we don't have a cap yet */
+       switch (op) {
+       case CEPH_CAP_OP_FLUSHSNAP_ACK:
+               handle_cap_flushsnap_ack(inode, tid, h, session);
+               goto done;
+
+       case CEPH_CAP_OP_EXPORT:
+               handle_cap_export(inode, h, session);
+               goto done;
+
+       case CEPH_CAP_OP_IMPORT:
+               handle_cap_import(mdsc, inode, h, session,
+                                 snaptrace, le32_to_cpu(h->snap_trace_len));
+               check_caps = 1; /* we may have sent a RELEASE to the old auth */
+               goto done;
+       }
+
+       /* the rest require a cap */
+       spin_lock(&inode->i_lock);
+       cap = __get_cap_for_mds(ceph_inode(inode), mds);
+       if (!cap) {
+               dout("no cap on %p ino %llx.%llx from mds%d, releasing\n",
+                    inode, ceph_ino(inode), ceph_snap(inode), mds);
+               spin_unlock(&inode->i_lock);
+               goto done;
+       }
+
+       /* note that each of these drops i_lock for us */
+       switch (op) {
+       case CEPH_CAP_OP_REVOKE:
+       case CEPH_CAP_OP_GRANT:
+               /* r selects the kind of check_caps follow-up needed */
+               r = handle_cap_grant(inode, h, session, cap, msg->middle);
+               if (r == 1)
+                       ceph_check_caps(ceph_inode(inode),
+                                       CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
+                                       session);
+               else if (r == 2)
+                       ceph_check_caps(ceph_inode(inode),
+                                       CHECK_CAPS_NODELAY,
+                                       session);
+               break;
+
+       case CEPH_CAP_OP_FLUSH_ACK:
+               handle_cap_flush_ack(inode, tid, h, session, cap);
+               break;
+
+       case CEPH_CAP_OP_TRUNC:
+               handle_cap_trunc(inode, h, session);
+               break;
+
+       default:
+               spin_unlock(&inode->i_lock);
+               pr_err("ceph_handle_caps: unknown cap op %d %s\n", op,
+                      ceph_cap_op_name(op));
+       }
+
+done:
+       mutex_unlock(&session->s_mutex);
+
+       /* do the deferred check_caps outside s_mutex */
+       if (check_caps)
+               ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY, NULL);
+       /* drop the ref from ceph_find_inode */
+       if (inode)
+               iput(inode);
+       return;
+
+bad:
+       /* only reachable before s_mutex is taken and the inode looked up */
+       pr_err("ceph_handle_caps: corrupt message\n");
+       ceph_msg_dump(msg);
+       return;
+}
+
+/*
+ * Delayed work handler to process end of delayed cap release LRU list.
+ */
+void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
+{
+       struct ceph_inode_info *ci;
+       int flags = CHECK_CAPS_NODELAY;
+
+       dout("check_delayed_caps\n");
+       while (1) {
+               spin_lock(&mdsc->cap_delay_lock);
+               if (list_empty(&mdsc->cap_delay_list))
+                       break;          /* exits with cap_delay_lock held */
+               ci = list_first_entry(&mdsc->cap_delay_list,
+                                     struct ceph_inode_info,
+                                     i_cap_delay_list);
+               /*
+                * stop at the first entry whose hold period has not yet
+                * expired, unless it is flagged for an explicit flush
+                */
+               if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 &&
+                   time_before(jiffies, ci->i_hold_caps_max))
+                       break;          /* exits with cap_delay_lock held */
+               list_del_init(&ci->i_cap_delay_list);
+               /* drop the lock: ceph_check_caps may block/take other locks */
+               spin_unlock(&mdsc->cap_delay_lock);
+               dout("check_delayed_caps on %p\n", &ci->vfs_inode);
+               ceph_check_caps(ci, flags, NULL);
+       }
+       /* both break paths leave cap_delay_lock held; release it here */
+       spin_unlock(&mdsc->cap_delay_lock);
+}
+
+/*
+ * Flush all dirty caps to the mds
+ */
+void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
+{
+       struct ceph_inode_info *ci, *nci = NULL;
+       struct inode *inode, *ninode = NULL;
+       struct list_head *p, *n;
+
+       dout("flush_dirty_caps\n");
+       spin_lock(&mdsc->cap_dirty_lock);
+       list_for_each_safe(p, n, &mdsc->cap_dirty) {
+               /*
+                * nci/ninode were pinned (igrab + CEPH_I_NOFLUSH) on the
+                * previous iteration, so this entry is still valid even
+                * though cap_dirty_lock was dropped in between.
+                */
+               if (nci) {
+                       ci = nci;
+                       inode = ninode;
+                       ci->i_ceph_flags &= ~CEPH_I_NOFLUSH;
+                       dout("flush_dirty_caps inode %p (was next inode)\n",
+                            inode);
+               } else {
+                       ci = list_entry(p, struct ceph_inode_info,
+                                       i_dirty_item);
+                       inode = igrab(&ci->vfs_inode);
+                       BUG_ON(!inode);
+                       dout("flush_dirty_caps inode %p\n", inode);
+               }
+               /*
+                * pin the *next* entry before dropping cap_dirty_lock;
+                * NOFLUSH keeps it from being flushed (and unlinked from
+                * the list) underneath us while we work on this inode
+                */
+               if (n != &mdsc->cap_dirty) {
+                       nci = list_entry(n, struct ceph_inode_info,
+                                        i_dirty_item);
+                       ninode = igrab(&nci->vfs_inode);
+                       BUG_ON(!ninode);
+                       nci->i_ceph_flags |= CEPH_I_NOFLUSH;
+                       dout("flush_dirty_caps next inode %p, noflush\n",
+                            ninode);
+               } else {
+                       nci = NULL;
+                       ninode = NULL;
+               }
+               spin_unlock(&mdsc->cap_dirty_lock);
+               if (inode) {
+                       ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH,
+                                       NULL);
+                       iput(inode);
+               }
+               spin_lock(&mdsc->cap_dirty_lock);
+       }
+       spin_unlock(&mdsc->cap_dirty_lock);
+}
+
+/*
+ * Drop open file reference.  If we were the last open file,
+ * we may need to release capabilities to the MDS (or schedule
+ * their delayed release).
+ */
+void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
+{
+       struct inode *inode = &ci->vfs_inode;
+       int was_last = 0;       /* did this put drop the final ref for fmode? */
+
+       /* i_nr_by_mode is protected by i_lock */
+       spin_lock(&inode->i_lock);
+       dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode,
+            ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1);
+       BUG_ON(ci->i_nr_by_mode[fmode] == 0);
+       ci->i_nr_by_mode[fmode]--;
+       if (ci->i_nr_by_mode[fmode] == 0)
+               was_last = 1;
+       spin_unlock(&inode->i_lock);
+
+       /* last open ref in this mode: see if caps can be released */
+       if (was_last && ci->i_vino.snap == CEPH_NOSNAP)
+               ceph_check_caps(ci, 0, NULL);
+}
+
+/*
+ * Helpers for embedding cap and dentry lease releases into mds
+ * requests.
+ *
+ * @force is used by dentry_release (below) to force inclusion of a
+ * record for the directory inode, even when there aren't any caps to
+ * drop.
+ */
+/*
+ * Encode a release for the caps in @drop held from @mds into *p,
+ * unless any cap in @unless is also held.  With @force, encode a
+ * record even when nothing is actually dropped (used by
+ * ceph_encode_dentry_release for the directory inode).
+ *
+ * Advances *p past the encoded record.  Returns 1 if a record was
+ * written, 0 otherwise.
+ */
+int ceph_encode_inode_release(void **p, struct inode *inode,
+                             int mds, int drop, int unless, int force)
+{
+       struct ceph_inode_info *ci = ceph_inode(inode);
+       struct ceph_cap *cap;
+       struct ceph_mds_request_release *rel = *p;
+       int ret = 0;
+
+       dout("encode_inode_release %p mds%d drop %s unless %s\n", inode,
+            mds, ceph_cap_string(drop), ceph_cap_string(unless));
+
+       spin_lock(&inode->i_lock);
+       cap = __get_cap_for_mds(ci, mds);
+       if (cap && __cap_is_valid(cap)) {
+               if (force ||
+                   ((cap->issued & drop) &&
+                    (cap->issued & unless) == 0)) {
+                       if ((cap->issued & drop) &&
+                           (cap->issued & unless) == 0) {
+                               dout("encode_inode_release %p cap %p %s -> "
+                                    "%s\n", inode, cap,
+                                    ceph_cap_string(cap->issued),
+                                    ceph_cap_string(cap->issued & ~drop));
+                               cap->issued &= ~drop;
+                               cap->implemented &= ~drop;
+                               if (ci->i_ceph_flags & CEPH_I_NODELAY) {
+                                       int wanted = __ceph_caps_wanted(ci);
+                                       dout("  wanted %s -> %s (act %s)\n",
+                                            ceph_cap_string(cap->mds_wanted),
+                                            ceph_cap_string(cap->mds_wanted &
+                                                            ~wanted),
+                                            ceph_cap_string(wanted));
+                                       cap->mds_wanted &= wanted;
+                               }
+                       } else {
+                               /* @force only: record it, drop nothing */
+                               dout("encode_inode_release %p cap %p %s"
+                                    " (force)\n", inode, cap,
+                                    ceph_cap_string(cap->issued));
+                       }
+
+                       rel->ino = cpu_to_le64(ceph_ino(inode));
+                       rel->cap_id = cpu_to_le64(cap->cap_id);
+                       rel->seq = cpu_to_le32(cap->seq);
+                       /* this line previously ended with a stray ','
+                        * (comma operator); behavior was identical, but
+                        * fix the typo to a proper statement */
+                       rel->issue_seq = cpu_to_le32(cap->issue_seq);
+                       rel->mseq = cpu_to_le32(cap->mseq);
+                       rel->caps = cpu_to_le32(cap->issued);
+                       rel->wanted = cpu_to_le32(cap->mds_wanted);
+                       rel->dname_len = 0;
+                       rel->dname_seq = 0;
+                       *p += sizeof(*rel);
+                       ret = 1;
+               } else {
+                       dout("encode_inode_release %p cap %p %s\n",
+                            inode, cap, ceph_cap_string(cap->issued));
+               }
+       }
+       spin_unlock(&inode->i_lock);
+       return ret;
+}
+
+int ceph_encode_dentry_release(void **p, struct dentry *dentry,
+                              int mds, int drop, int unless)
+{
+       struct inode *dir = dentry->d_parent->d_inode;
+       struct ceph_mds_request_release *rel = *p;
+       struct ceph_dentry_info *di = ceph_dentry(dentry);
+       int force = 0;
+       int ret;
+
+       /*
+        * force a record for the directory caps if we have a dentry lease.
+        * this is racy (can't take i_lock and d_lock together), but it
+        * doesn't have to be perfect; the mds will revoke anything we don't
+        * release.
+        */
+       spin_lock(&dentry->d_lock);
+       if (di->lease_session && di->lease_session->s_mds == mds)
+               force = 1;
+       spin_unlock(&dentry->d_lock);
+
+       ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
+
+       /* re-check the lease under d_lock before appending the dname */
+       spin_lock(&dentry->d_lock);
+       if (ret && di->lease_session && di->lease_session->s_mds == mds) {
+               dout("encode_dentry_release %p mds%d seq %d\n",
+                    dentry, mds, (int)di->lease_seq);
+               rel->dname_len = cpu_to_le32(dentry->d_name.len);
+               memcpy(*p, dentry->d_name.name, dentry->d_name.len);
+               *p += dentry->d_name.len;
+               rel->dname_seq = cpu_to_le32(di->lease_seq);
+       }
+       spin_unlock(&dentry->d_lock);
+       return ret;
+}
diff --git a/fs/ceph/ceph_debug.h b/fs/ceph/ceph_debug.h
new file mode 100644 (file)
index 0000000..1818c23
--- /dev/null
@@ -0,0 +1,37 @@
+#ifndef _FS_CEPH_DEBUG_H
+#define _FS_CEPH_DEBUG_H
+
+/* prefix all pr_* output from this module with the module name */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#ifdef CONFIG_CEPH_FS_PRETTYDEBUG
+
+/*
+ * wrap pr_debug to include a filename:lineno prefix on each line.
+ * this incurs some overhead (kernel size and execution time) due to
+ * the extra function call at each call site.
+ */
+
+# if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
+extern const char *ceph_file_part(const char *s, int len);
+#  define dout(fmt, ...)                                               \
+       pr_debug(" %12.12s:%-4d : " fmt,                                \
+                ceph_file_part(__FILE__, sizeof(__FILE__)),            \
+                __LINE__, ##__VA_ARGS__)
+# else
+/* faux printk call just to see any compiler warnings. */
+#  define dout(fmt, ...)       do {                            \
+               if (0)                                          \
+                       printk(KERN_DEBUG fmt, ##__VA_ARGS__);  \
+       } while (0)
+# endif /* DEBUG || CONFIG_DYNAMIC_DEBUG */
+
+#else
+
+/*
+ * or, just wrap pr_debug
+ */
+# define dout(fmt, ...)        pr_debug(" " fmt, ##__VA_ARGS__)
+
+#endif /* CONFIG_CEPH_FS_PRETTYDEBUG */
+
+#endif /* _FS_CEPH_DEBUG_H */
diff --git a/fs/ceph/ceph_frag.c b/fs/ceph/ceph_frag.c
new file mode 100644 (file)
index 0000000..ab6cf35
--- /dev/null
@@ -0,0 +1,21 @@
+/*
+ * Ceph 'frag' type
+ */
+#include "types.h"
+
+int ceph_frag_compare(__u32 a, __u32 b)
+{
+       unsigned va = ceph_frag_value(a);
+       unsigned vb = ceph_frag_value(b);
+
+       /* order primarily by value (position in the number space)... */
+       if (va != vb)
+               return va < vb ? -1 : 1;
+       /* ...then by number of split bits (specificity) */
+       va = ceph_frag_bits(a);
+       vb = ceph_frag_bits(b);
+       if (va != vb)
+               return va < vb ? -1 : 1;
+       return 0;
+}
diff --git a/fs/ceph/ceph_frag.h b/fs/ceph/ceph_frag.h
new file mode 100644 (file)
index 0000000..793f50c
--- /dev/null
@@ -0,0 +1,109 @@
+#ifndef _FS_CEPH_FRAG_H
+#define _FS_CEPH_FRAG_H
+
+/*
+ * "Frags" are a way to describe a subset of a 32-bit number space,
+ * using a mask and a value to match against that mask.  Any given frag
+ * (subset of the number space) can be partitioned into 2^n sub-frags.
+ *
+ * Frags are encoded into a 32-bit word:
+ *   8 upper bits = "bits"
+ *  24 lower bits = "value"
+ * (We could go to 5+27 bits, but who cares.)
+ *
+ * We use the _most_ significant bits of the 24 bit value.  This makes
+ * values logically sort.
+ *
+ * Unfortunately, because the "bits" field is still in the high bits, we
+ * can't sort encoded frags numerically.  However, it does allow you
+ * to feed encoded frags as values into frag_contains_value.
+ */
+/* build a frag from "bits" b and a "value" v (kept in the upper bits) */
+static inline __u32 ceph_frag_make(__u32 b, __u32 v)
+{
+       return (b << 24) |
+               (v & (0xffffffu << (24-b)) & 0xffffffu);
+}
+/* number of significant (mask) bits */
+static inline __u32 ceph_frag_bits(__u32 f)
+{
+       return f >> 24;
+}
+/* 24-bit value portion */
+static inline __u32 ceph_frag_value(__u32 f)
+{
+       return f & 0xffffffu;
+}
+/* mask covering the significant bits of the value */
+static inline __u32 ceph_frag_mask(__u32 f)
+{
+       return (0xffffffu << (24-ceph_frag_bits(f))) & 0xffffffu;
+}
+/* how far the mask is shifted up within the 24-bit value space */
+static inline __u32 ceph_frag_mask_shift(__u32 f)
+{
+       return 24 - ceph_frag_bits(f);
+}
+
+/* does frag f contain the value v? */
+static inline int ceph_frag_contains_value(__u32 f, __u32 v)
+{
+       return (v & ceph_frag_mask(f)) == ceph_frag_value(f);
+}
+/* does frag f entirely contain frag sub? */
+static inline int ceph_frag_contains_frag(__u32 f, __u32 sub)
+{
+       /* is sub as specific as us, and contained by us? */
+       return ceph_frag_bits(sub) >= ceph_frag_bits(f) &&
+              (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f);
+}
+
+/* parent frag: one fewer bit of specificity */
+static inline __u32 ceph_frag_parent(__u32 f)
+{
+       return ceph_frag_make(ceph_frag_bits(f) - 1,
+                        ceph_frag_value(f) & (ceph_frag_mask(f) << 1));
+}
+/* is f the left (branch bit clear) child of its parent?  root has none */
+static inline int ceph_frag_is_left_child(__u32 f)
+{
+       return ceph_frag_bits(f) > 0 &&
+               (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0;
+}
+}
+/* is f the right (branch bit set) child of its parent?  root has none */
+static inline int ceph_frag_is_right_child(__u32 f)
+{
+       /*
+        * the branch bit for f sits at position (24 - bits) of the value,
+        * so test the masked bit for non-zero; the previous "== 1"
+        * comparison only held when bits == 24, making this function
+        * return 0 for almost every right child.
+        */
+       return ceph_frag_bits(f) > 0 &&
+               (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) != 0;
+}
+/* the other child of f's parent (branch bit flipped) */
+static inline __u32 ceph_frag_sibling(__u32 f)
+{
+       return ceph_frag_make(ceph_frag_bits(f),
+                     ceph_frag_value(f) ^ (0x1000000 >> ceph_frag_bits(f)));
+}
+/* left child: one more bit of specificity, branch bit clear */
+static inline __u32 ceph_frag_left_child(__u32 f)
+{
+       return ceph_frag_make(ceph_frag_bits(f)+1, ceph_frag_value(f));
+}
+/* right child: one more bit of specificity, branch bit set */
+static inline __u32 ceph_frag_right_child(__u32 f)
+{
+       return ceph_frag_make(ceph_frag_bits(f)+1,
+             ceph_frag_value(f) | (0x1000000 >> (1+ceph_frag_bits(f))));
+}
+/* the i'th child after splitting f by another 'by' bits */
+static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
+{
+       int newbits = ceph_frag_bits(f) + by;
+       return ceph_frag_make(newbits,
+                        ceph_frag_value(f) | (i << (24 - newbits)));
+}
+/* is f the first frag at its level of specificity? */
+static inline int ceph_frag_is_leftmost(__u32 f)
+{
+       return ceph_frag_value(f) == 0;
+}
+/* is f the last frag at its level of specificity? */
+static inline int ceph_frag_is_rightmost(__u32 f)
+{
+       return ceph_frag_value(f) == ceph_frag_mask(f);
+}
+/* next frag at the same level; presumably callers check is_rightmost
+ * first, since the value would overflow otherwise -- TODO confirm */
+static inline __u32 ceph_frag_next(__u32 f)
+{
+       return ceph_frag_make(ceph_frag_bits(f),
+                        ceph_frag_value(f) + (0x1000000 >> ceph_frag_bits(f)));
+}
+
+/*
+ * comparator to sort frags logically, as when traversing the
+ * number space in ascending order...
+ */
+int ceph_frag_compare(__u32 a, __u32 b);
+
+#endif
diff --git a/fs/ceph/ceph_fs.c b/fs/ceph/ceph_fs.c
new file mode 100644 (file)
index 0000000..79d76bc
--- /dev/null
@@ -0,0 +1,74 @@
+/*
+ * Some non-inline ceph helpers
+ */
+#include "types.h"
+
+/*
+ * return true if @layout appears to be valid
+ */
+int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
+{
+       __u32 su = le32_to_cpu(layout->fl_stripe_unit);
+       __u32 sc = le32_to_cpu(layout->fl_stripe_count);
+       __u32 os = le32_to_cpu(layout->fl_object_size);
+
+       /* stripe unit and object size: non-zero, 64k increments */
+       if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1)) ||
+           !os || (os & (CEPH_MIN_STRIPE_UNIT-1)))
+               return 0;
+       /* object size must be a whole (non-zero) number of stripe units */
+       if (os < su || os % su)
+               return 0;
+       /* and there must be at least one stripe */
+       return sc != 0;
+}
+
+
+/* map open(2) flags to a ceph file mode */
+int ceph_flags_to_mode(int flags)
+{
+#ifdef O_DIRECTORY  /* fixme */
+       /* a directory open just pins the inode */
+       if ((flags & O_DIRECTORY) == O_DIRECTORY)
+               return CEPH_FILE_MODE_PIN;
+#endif
+#ifdef O_LAZY
+       if (flags & O_LAZY)
+               return CEPH_FILE_MODE_LAZY;
+#endif
+       /* appending implies writing */
+       if (flags & O_APPEND)
+               flags |= O_WRONLY;
+
+       /* only the access-mode bits matter from here on */
+       flags &= O_ACCMODE;
+       if (flags & O_RDWR)
+               return CEPH_FILE_MODE_RDWR;
+       if (flags & O_WRONLY)
+               return CEPH_FILE_MODE_WR;
+       return CEPH_FILE_MODE_RD;
+}
+
+int ceph_caps_for_mode(int mode)
+{
+       switch (mode) {
+       case CEPH_FILE_MODE_PIN:
+               /* just pin the inode; no file data caps */
+               return CEPH_CAP_PIN;
+       case CEPH_FILE_MODE_RD:
+               /* read + page cache */
+               return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
+                       CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
+       case CEPH_FILE_MODE_RDWR:
+               /* full read/write/buffer caps plus auth and xattr caps */
+               return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
+                       CEPH_CAP_FILE_EXCL |
+                       CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE |
+                       CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
+                       CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
+                       CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
+       case CEPH_FILE_MODE_WR:
+               /* like RDWR, minus the read/cache caps */
+               return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
+                       CEPH_CAP_FILE_EXCL |
+                       CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
+                       CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
+                       CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
+       }
+       /* unknown mode: no caps */
+       return 0;
+}
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
new file mode 100644 (file)
index 0000000..0c2241e
--- /dev/null
@@ -0,0 +1,650 @@
+/*
+ * ceph_fs.h - Ceph constants and data types to share between kernel and
+ * user space.
+ *
+ * Most types in this file are defined as little-endian, and are
+ * primarily intended to describe data structures that pass over the
+ * wire or that are stored on disk.
+ *
+ * LGPL2
+ */
+
+#ifndef _FS_CEPH_CEPH_FS_H
+#define _FS_CEPH_CEPH_FS_H
+
+#include "msgr.h"
+#include "rados.h"
+
+/*
+ * Ceph release version
+ */
+#define CEPH_VERSION_MAJOR 0
+#define CEPH_VERSION_MINOR 19
+#define CEPH_VERSION_PATCH 0
+
+#define _CEPH_STRINGIFY(x) #x
+#define CEPH_STRINGIFY(x) _CEPH_STRINGIFY(x)
+#define CEPH_MAKE_VERSION(x, y, z) CEPH_STRINGIFY(x) "." CEPH_STRINGIFY(y) \
+       "." CEPH_STRINGIFY(z)
+#define CEPH_VERSION CEPH_MAKE_VERSION(CEPH_VERSION_MAJOR, \
+                                      CEPH_VERSION_MINOR, CEPH_VERSION_PATCH)
+
+/*
+ * subprotocol versions.  when specific messages types or high-level
+ * protocols change, bump the affected components.  we keep rev
+ * internal cluster protocols separately from the public,
+ * client-facing protocol.
+ */
+#define CEPH_OSD_PROTOCOL     8 /* cluster internal */
+#define CEPH_MDS_PROTOCOL     9 /* cluster internal */
+#define CEPH_MON_PROTOCOL     5 /* cluster internal */
+#define CEPH_OSDC_PROTOCOL   24 /* server/client */
+#define CEPH_MDSC_PROTOCOL   32 /* server/client */
+#define CEPH_MONC_PROTOCOL   15 /* server/client */
+
+
+#define CEPH_INO_ROOT  1
+#define CEPH_INO_CEPH  2        /* hidden .ceph dir */
+
+/* arbitrary limit on max # of monitors (cluster of 3 is typical) */
+#define CEPH_MAX_MON   31
+
+
+/*
+ * feature bits
+ */
+#define CEPH_FEATURE_SUPPORTED  0
+#define CEPH_FEATURE_REQUIRED   0
+
+
+/*
+ * ceph_file_layout - describe data layout for a file/inode
+ */
+struct ceph_file_layout {
+       /* file -> object mapping */
+       __le32 fl_stripe_unit;     /* stripe unit, in bytes.  must be multiple
+                                     of page size. */
+       __le32 fl_stripe_count;    /* over this many objects */
+       __le32 fl_object_size;     /* until objects are this big, then move to
+                                     new objects */
+       __le32 fl_cas_hash;        /* 0 = none; 1 = sha256 */
+
+       /* pg -> disk layout */
+       __le32 fl_object_stripe_unit;  /* for per-object parity, if any */
+
+       /* object -> pg layout */
+       __le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */
+       __le32 fl_pg_pool;      /* namespace, crush ruleset, rep level */
+} __attribute__ ((packed));
+
+#define CEPH_MIN_STRIPE_UNIT 65536
+
+int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
+
+
+/* crypto algorithms */
+#define CEPH_CRYPTO_NONE 0x0
+#define CEPH_CRYPTO_AES  0x1
+
+/* security/authentication protocols */
+#define CEPH_AUTH_UNKNOWN      0x0
+#define CEPH_AUTH_NONE         0x1
+#define CEPH_AUTH_CEPHX                0x2
+
+
+/*********************************************
+ * message layer
+ */
+
+/*
+ * message types
+ */
+
+/* misc */
+#define CEPH_MSG_SHUTDOWN               1
+#define CEPH_MSG_PING                   2
+
+/* client <-> monitor */
+#define CEPH_MSG_MON_MAP                4
+#define CEPH_MSG_MON_GET_MAP            5
+#define CEPH_MSG_STATFS                 13
+#define CEPH_MSG_STATFS_REPLY           14
+#define CEPH_MSG_MON_SUBSCRIBE          15
+#define CEPH_MSG_MON_SUBSCRIBE_ACK      16
+#define CEPH_MSG_AUTH                  17
+#define CEPH_MSG_AUTH_REPLY            18
+
+/* client <-> mds */
+#define CEPH_MSG_MDS_MAP                21
+
+#define CEPH_MSG_CLIENT_SESSION         22
+#define CEPH_MSG_CLIENT_RECONNECT       23
+
+#define CEPH_MSG_CLIENT_REQUEST         24
+#define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
+#define CEPH_MSG_CLIENT_REPLY           26
+#define CEPH_MSG_CLIENT_CAPS            0x310
+#define CEPH_MSG_CLIENT_LEASE           0x311
+#define CEPH_MSG_CLIENT_SNAP            0x312
+#define CEPH_MSG_CLIENT_CAPRELEASE      0x313
+
+/* osd */
+#define CEPH_MSG_OSD_MAP          41
+#define CEPH_MSG_OSD_OP           42
+#define CEPH_MSG_OSD_OPREPLY      43
+
+struct ceph_mon_request_header {
+       __le64 have_version;
+       __le16 session_mon;
+       __le64 session_mon_tid;
+} __attribute__ ((packed));
+
+struct ceph_mon_statfs {
+       struct ceph_mon_request_header monhdr;
+       struct ceph_fsid fsid;
+} __attribute__ ((packed));
+
+struct ceph_statfs {
+       __le64 kb, kb_used, kb_avail;
+       __le64 num_objects;
+} __attribute__ ((packed));
+
+struct ceph_mon_statfs_reply {
+       struct ceph_fsid fsid;
+       __le64 version;
+       struct ceph_statfs st;
+} __attribute__ ((packed));
+
+struct ceph_osd_getmap {
+       struct ceph_mon_request_header monhdr;
+       struct ceph_fsid fsid;
+       __le32 start;
+} __attribute__ ((packed));
+
+struct ceph_mds_getmap {
+       struct ceph_mon_request_header monhdr;
+       struct ceph_fsid fsid;
+} __attribute__ ((packed));
+
+struct ceph_client_mount {
+       struct ceph_mon_request_header monhdr;
+} __attribute__ ((packed));
+
+struct ceph_mon_subscribe_item {
+       __le64 have_version;    __le64 have;
+       __u8 onetime;
+} __attribute__ ((packed));
+
+struct ceph_mon_subscribe_ack {
+       __le32 duration;         /* seconds */
+       struct ceph_fsid fsid;
+} __attribute__ ((packed));
+
+/*
+ * mds states
+ *   > 0 -> in
+ *  <= 0 -> out
+ */
+#define CEPH_MDS_STATE_DNE          0  /* down, does not exist. */
+#define CEPH_MDS_STATE_STOPPED     -1  /* down, once existed, but no subtrees.
+                                         empty log. */
+#define CEPH_MDS_STATE_BOOT        -4  /* up, boot announcement. */
+#define CEPH_MDS_STATE_STANDBY     -5  /* up, idle.  waiting for assignment. */
+#define CEPH_MDS_STATE_CREATING    -6  /* up, creating MDS instance. */
+#define CEPH_MDS_STATE_STARTING    -7  /* up, starting previously stopped mds */
+#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */
+
+#define CEPH_MDS_STATE_REPLAY       8  /* up, replaying journal. */
+#define CEPH_MDS_STATE_RESOLVE      9  /* up, disambiguating distributed
+                                         operations (import, rename, etc.) */
+#define CEPH_MDS_STATE_RECONNECT    10 /* up, reconnect to clients */
+#define CEPH_MDS_STATE_REJOIN       11 /* up, rejoining distributed cache */
+#define CEPH_MDS_STATE_CLIENTREPLAY 12 /* up, replaying client operations */
+#define CEPH_MDS_STATE_ACTIVE       13 /* up, active */
+#define CEPH_MDS_STATE_STOPPING     14 /* up, but exporting metadata */
+
+extern const char *ceph_mds_state_name(int s);
+
+
+/*
+ * metadata lock types.
+ *  - these are bitmasks.. we can compose them
+ *  - they also define the lock ordering by the MDS
+ *  - a few of these are internal to the mds
+ */
+#define CEPH_LOCK_DN          1
+#define CEPH_LOCK_ISNAP       2
+#define CEPH_LOCK_IVERSION    4     /* mds internal */
+#define CEPH_LOCK_IFILE       8     /* mds internal */
+#define CEPH_LOCK_IAUTH       32
+#define CEPH_LOCK_ILINK       64
+#define CEPH_LOCK_IDFT        128   /* dir frag tree */
+#define CEPH_LOCK_INEST       256   /* mds internal */
+#define CEPH_LOCK_IXATTR      512
+#define CEPH_LOCK_INO         2048  /* immutable inode bits; not a lock */
+
+/* client_session ops */
+enum {
+       CEPH_SESSION_REQUEST_OPEN,
+       CEPH_SESSION_OPEN,
+       CEPH_SESSION_REQUEST_CLOSE,
+       CEPH_SESSION_CLOSE,
+       CEPH_SESSION_REQUEST_RENEWCAPS,
+       CEPH_SESSION_RENEWCAPS,
+       CEPH_SESSION_STALE,
+       CEPH_SESSION_RECALL_STATE,
+};
+
+extern const char *ceph_session_op_name(int op);
+
+struct ceph_mds_session_head {
+       __le32 op;
+       __le64 seq;
+       struct ceph_timespec stamp;
+       __le32 max_caps, max_leases;
+} __attribute__ ((packed));
+
+/* client_request */
+/*
+ * metadata ops.
+ *  & 0x001000 -> write op
+ *  & 0x010000 -> follow symlink (e.g. stat(), not lstat()).
+ *  & 0x100000 -> use weird ino/path trace
+ */
+#define CEPH_MDS_OP_WRITE        0x001000
+enum {
+       CEPH_MDS_OP_LOOKUP     = 0x00100,
+       CEPH_MDS_OP_GETATTR    = 0x00101,
+       CEPH_MDS_OP_LOOKUPHASH = 0x00102,
+       CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
+
+       CEPH_MDS_OP_SETXATTR   = 0x01105,
+       CEPH_MDS_OP_RMXATTR    = 0x01106,
+       CEPH_MDS_OP_SETLAYOUT  = 0x01107,
+       CEPH_MDS_OP_SETATTR    = 0x01108,
+
+       CEPH_MDS_OP_MKNOD      = 0x01201,
+       CEPH_MDS_OP_LINK       = 0x01202,
+       CEPH_MDS_OP_UNLINK     = 0x01203,
+       CEPH_MDS_OP_RENAME     = 0x01204,
+       CEPH_MDS_OP_MKDIR      = 0x01220,
+       CEPH_MDS_OP_RMDIR      = 0x01221,
+       CEPH_MDS_OP_SYMLINK    = 0x01222,
+
+       CEPH_MDS_OP_CREATE     = 0x01301,
+       CEPH_MDS_OP_OPEN       = 0x00302,
+       CEPH_MDS_OP_READDIR    = 0x00305,
+
+       CEPH_MDS_OP_LOOKUPSNAP = 0x00400,
+       CEPH_MDS_OP_MKSNAP     = 0x01400,
+       CEPH_MDS_OP_RMSNAP     = 0x01401,
+       CEPH_MDS_OP_LSSNAP     = 0x00402,
+};
+
+extern const char *ceph_mds_op_name(int op);
+
+
+#define CEPH_SETATTR_MODE   1
+#define CEPH_SETATTR_UID    2
+#define CEPH_SETATTR_GID    4
+#define CEPH_SETATTR_MTIME  8
+#define CEPH_SETATTR_ATIME 16
+#define CEPH_SETATTR_SIZE  32
+#define CEPH_SETATTR_CTIME 64
+
+union ceph_mds_request_args {
+       struct {
+               __le32 mask;                 /* CEPH_CAP_* */
+       } __attribute__ ((packed)) getattr;
+       struct {
+               __le32 mode;
+               __le32 uid;
+               __le32 gid;
+               struct ceph_timespec mtime;
+               struct ceph_timespec atime;
+               __le64 size, old_size;       /* old_size needed by truncate */
+               __le32 mask;                 /* CEPH_SETATTR_* */
+       } __attribute__ ((packed)) setattr;
+       struct {
+               __le32 frag;                 /* which dir fragment */
+               __le32 max_entries;          /* how many dentries to grab */
+       } __attribute__ ((packed)) readdir;
+       struct {
+               __le32 mode;
+               __le32 rdev;
+       } __attribute__ ((packed)) mknod;
+       struct {
+               __le32 mode;
+       } __attribute__ ((packed)) mkdir;
+       struct {
+               __le32 flags;
+               __le32 mode;
+               __le32 stripe_unit;          /* layout for newly created file */
+               __le32 stripe_count;         /* ... */
+               __le32 object_size;
+               __le32 file_replication;
+               __le32 preferred;
+       } __attribute__ ((packed)) open;
+       struct {
+               __le32 flags;
+       } __attribute__ ((packed)) setxattr;
+       struct {
+               struct ceph_file_layout layout;
+       } __attribute__ ((packed)) setlayout;
+} __attribute__ ((packed));
+
+#define CEPH_MDS_FLAG_REPLAY        1  /* this is a replayed op */
+#define CEPH_MDS_FLAG_WANT_DENTRY   2  /* want dentry in reply */
+
+struct ceph_mds_request_head {
+       __le64 oldest_client_tid;
+       __le32 mdsmap_epoch;           /* on client */
+       __le32 flags;                  /* CEPH_MDS_FLAG_* */
+       __u8 num_retry, num_fwd;       /* count retry, fwd attempts */
+       __le16 num_releases;           /* # include cap/lease release records */
+       __le32 op;                     /* mds op code */
+       __le32 caller_uid, caller_gid;
+       __le64 ino;                    /* use this ino for openc, mkdir, mknod,
+                                         etc. (if replaying) */
+       union ceph_mds_request_args args;
+} __attribute__ ((packed));
+
+/* cap/lease release record */
+struct ceph_mds_request_release {
+       __le64 ino, cap_id;            /* ino and unique cap id */
+       __le32 caps, wanted;           /* new issued, wanted */
+       __le32 seq, issue_seq, mseq;
+       __le32 dname_seq;              /* if releasing a dentry lease, a */
+       __le32 dname_len;              /* string follows. */
+} __attribute__ ((packed));
+
+/* client reply */
+struct ceph_mds_reply_head {
+       __le32 op;
+       __le32 result;
+       __le32 mdsmap_epoch;
+       __u8 safe;                     /* true if committed to disk */
+       __u8 is_dentry, is_target;     /* true if dentry, target inode records
+                                         are included with reply */
+} __attribute__ ((packed));
+
+/* one for each node split */
+struct ceph_frag_tree_split {
+       __le32 frag;                   /* this frag splits... */
+       __le32 by;                     /* ...by this many bits */
+} __attribute__ ((packed));
+
+struct ceph_frag_tree_head {
+       __le32 nsplits;                /* num ceph_frag_tree_split records */
+       struct ceph_frag_tree_split splits[];
+} __attribute__ ((packed));
+
+/* capability issue, for bundling with mds reply */
+struct ceph_mds_reply_cap {
+       __le32 caps, wanted;           /* caps issued, wanted */
+       __le64 cap_id;
+       __le32 seq, mseq;
+       __le64 realm;                  /* snap realm */
+       __u8 flags;                    /* CEPH_CAP_FLAG_* */
+} __attribute__ ((packed));
+
+#define CEPH_CAP_FLAG_AUTH  1          /* cap is issued by auth mds */
+
+/* inode record, for bundling with mds reply */
+struct ceph_mds_reply_inode {
+       __le64 ino;
+       __le64 snapid;
+       __le32 rdev;
+       __le64 version;                /* inode version */
+       __le64 xattr_version;          /* version for xattr blob */
+       struct ceph_mds_reply_cap cap; /* caps issued for this inode */
+       struct ceph_file_layout layout;
+       struct ceph_timespec ctime, mtime, atime;
+       __le32 time_warp_seq;
+       __le64 size, max_size, truncate_size;
+       __le32 truncate_seq;
+       __le32 mode, uid, gid;
+       __le32 nlink;
+       __le64 files, subdirs, rbytes, rfiles, rsubdirs;  /* dir stats */
+       struct ceph_timespec rctime;
+       struct ceph_frag_tree_head fragtree;  /* (must be at end of struct) */
+} __attribute__ ((packed));
+/* followed by frag array, then symlink string, then xattr blob */
+
+/* reply_lease follows dname, and reply_inode */
+struct ceph_mds_reply_lease {
+       __le16 mask;            /* lease type(s) */
+       __le32 duration_ms;     /* lease duration */
+       __le32 seq;
+} __attribute__ ((packed));
+
+struct ceph_mds_reply_dirfrag {
+       __le32 frag;            /* fragment */
+       __le32 auth;            /* auth mds, if this is a delegation point */
+       __le32 ndist;           /* number of mds' this is replicated on */
+       __le32 dist[];
+} __attribute__ ((packed));
+
+/* file access modes */
+#define CEPH_FILE_MODE_PIN        0
+#define CEPH_FILE_MODE_RD         1
+#define CEPH_FILE_MODE_WR         2
+#define CEPH_FILE_MODE_RDWR       3  /* RD | WR */
+#define CEPH_FILE_MODE_LAZY       4  /* lazy io */
+#define CEPH_FILE_MODE_NUM        8  /* bc these are bit fields.. mostly */
+
+int ceph_flags_to_mode(int flags);
+
+
+/* capability bits */
+#define CEPH_CAP_PIN         1  /* no specific capabilities beyond the pin */
+
+/* generic cap bits */
+#define CEPH_CAP_GSHARED     1  /* client can read */
+#define CEPH_CAP_GEXCL       2  /* client can read and update */
+#define CEPH_CAP_GCACHE      4  /* (file) client can cache reads */
+#define CEPH_CAP_GRD         8  /* (file) client can read */
+#define CEPH_CAP_GWR        16  /* (file) client can write */
+#define CEPH_CAP_GBUFFER    32  /* (file) client can buffer writes */
+#define CEPH_CAP_GWREXTEND  64  /* (file) client can extend EOF */
+#define CEPH_CAP_GLAZYIO   128  /* (file) client can perform lazy io */
+
+/* per-lock shift */
+#define CEPH_CAP_SAUTH      2
+#define CEPH_CAP_SLINK      4
+#define CEPH_CAP_SXATTR     6
+#define CEPH_CAP_SFILE      8   /* goes at the end (uses >2 cap bits) */
+
+#define CEPH_CAP_BITS       16
+
+/* composed values */
+#define CEPH_CAP_AUTH_SHARED  (CEPH_CAP_GSHARED  << CEPH_CAP_SAUTH)
+#define CEPH_CAP_AUTH_EXCL     (CEPH_CAP_GEXCL     << CEPH_CAP_SAUTH)
+#define CEPH_CAP_LINK_SHARED  (CEPH_CAP_GSHARED  << CEPH_CAP_SLINK)
+#define CEPH_CAP_LINK_EXCL     (CEPH_CAP_GEXCL     << CEPH_CAP_SLINK)
+#define CEPH_CAP_XATTR_SHARED (CEPH_CAP_GSHARED  << CEPH_CAP_SXATTR)
+#define CEPH_CAP_XATTR_EXCL    (CEPH_CAP_GEXCL     << CEPH_CAP_SXATTR)
+#define CEPH_CAP_FILE(x)    (x << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_SHARED   (CEPH_CAP_GSHARED   << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_EXCL     (CEPH_CAP_GEXCL     << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_CACHE    (CEPH_CAP_GCACHE    << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_RD       (CEPH_CAP_GRD       << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_WR       (CEPH_CAP_GWR       << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_BUFFER   (CEPH_CAP_GBUFFER   << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_LAZYIO   (CEPH_CAP_GLAZYIO   << CEPH_CAP_SFILE)
+
+/* cap masks (for getattr) */
+#define CEPH_STAT_CAP_INODE    CEPH_CAP_PIN
+#define CEPH_STAT_CAP_TYPE     CEPH_CAP_PIN  /* mode >> 12 */
+#define CEPH_STAT_CAP_SYMLINK  CEPH_CAP_PIN
+#define CEPH_STAT_CAP_UID      CEPH_CAP_AUTH_SHARED
+#define CEPH_STAT_CAP_GID      CEPH_CAP_AUTH_SHARED
+#define CEPH_STAT_CAP_MODE     CEPH_CAP_AUTH_SHARED
+#define CEPH_STAT_CAP_NLINK    CEPH_CAP_LINK_SHARED
+#define CEPH_STAT_CAP_LAYOUT   CEPH_CAP_FILE_SHARED
+#define CEPH_STAT_CAP_MTIME    CEPH_CAP_FILE_SHARED
+#define CEPH_STAT_CAP_SIZE     CEPH_CAP_FILE_SHARED
+#define CEPH_STAT_CAP_ATIME    CEPH_CAP_FILE_SHARED  /* fixme */
+#define CEPH_STAT_CAP_XATTR    CEPH_CAP_XATTR_SHARED
+#define CEPH_STAT_CAP_INODE_ALL (CEPH_CAP_PIN |                        \
+                                CEPH_CAP_AUTH_SHARED | \
+                                CEPH_CAP_LINK_SHARED | \
+                                CEPH_CAP_FILE_SHARED | \
+                                CEPH_CAP_XATTR_SHARED)
+
+#define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED |                    \
+                             CEPH_CAP_LINK_SHARED |                    \
+                             CEPH_CAP_XATTR_SHARED |                   \
+                             CEPH_CAP_FILE_SHARED)
+#define CEPH_CAP_ANY_RD   (CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_RD |    \
+                          CEPH_CAP_FILE_CACHE)
+
+#define CEPH_CAP_ANY_EXCL (CEPH_CAP_AUTH_EXCL |                \
+                          CEPH_CAP_LINK_EXCL |         \
+                          CEPH_CAP_XATTR_EXCL |        \
+                          CEPH_CAP_FILE_EXCL)
+#define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |        \
+                             CEPH_CAP_FILE_EXCL)
+#define CEPH_CAP_ANY_WR   (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR)
+#define CEPH_CAP_ANY      (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
+                          CEPH_CAP_ANY_FILE_WR | CEPH_CAP_PIN)
+
+#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
+                       CEPH_LOCK_IXATTR)
+
+int ceph_caps_for_mode(int mode);
+
+enum {
+       CEPH_CAP_OP_GRANT,         /* mds->client grant */
+       CEPH_CAP_OP_REVOKE,        /* mds->client revoke */
+       CEPH_CAP_OP_TRUNC,         /* mds->client trunc notify */
+       CEPH_CAP_OP_EXPORT,        /* mds has exported the cap */
+       CEPH_CAP_OP_IMPORT,        /* mds has imported the cap */
+       CEPH_CAP_OP_UPDATE,        /* client->mds update */
+       CEPH_CAP_OP_DROP,          /* client->mds drop cap bits */
+       CEPH_CAP_OP_FLUSH,         /* client->mds cap writeback */
+       CEPH_CAP_OP_FLUSH_ACK,     /* mds->client flushed */
+       CEPH_CAP_OP_FLUSHSNAP,     /* client->mds flush snapped metadata */
+       CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */
+       CEPH_CAP_OP_RELEASE,       /* client->mds release (clean) cap */
+       CEPH_CAP_OP_RENEW,         /* client->mds renewal request */
+};
+
+extern const char *ceph_cap_op_name(int op);
+
+/*
+ * caps message, used for capability callbacks, acks, requests, etc.
+ */
+struct ceph_mds_caps {
+       __le32 op;                  /* CEPH_CAP_OP_* */
+       __le64 ino, realm;
+       __le64 cap_id;
+       __le32 seq, issue_seq;
+       __le32 caps, wanted, dirty; /* latest issued/wanted/dirty */
+       __le32 migrate_seq;
+       __le64 snap_follows;
+       __le32 snap_trace_len;
+
+       /* authlock */
+       __le32 uid, gid, mode;
+
+       /* linklock */
+       __le32 nlink;
+
+       /* xattrlock */
+       __le32 xattr_len;
+       __le64 xattr_version;
+
+       /* filelock */
+       __le64 size, max_size, truncate_size;
+       __le32 truncate_seq;
+       struct ceph_timespec mtime, atime, ctime;
+       struct ceph_file_layout layout;
+       __le32 time_warp_seq;
+} __attribute__ ((packed));
+
+/* cap release msg head */
+struct ceph_mds_cap_release {
+       __le32 num;                /* number of cap_items that follow */
+} __attribute__ ((packed));
+
+struct ceph_mds_cap_item {
+       __le64 ino;
+       __le64 cap_id;
+       __le32 migrate_seq, seq;
+} __attribute__ ((packed));
+
+#define CEPH_MDS_LEASE_REVOKE           1  /*    mds  -> client */
+#define CEPH_MDS_LEASE_RELEASE          2  /* client  -> mds    */
+#define CEPH_MDS_LEASE_RENEW            3  /* client <-> mds    */
+#define CEPH_MDS_LEASE_REVOKE_ACK       4  /* client  -> mds    */
+
+extern const char *ceph_lease_op_name(int o);
+
+/* lease msg header */
+struct ceph_mds_lease {
+       __u8 action;            /* CEPH_MDS_LEASE_* */
+       __le16 mask;            /* which lease */
+       __le64 ino;
+       __le64 first, last;     /* snap range */
+       __le32 seq;
+       __le32 duration_ms;     /* duration of renewal */
+} __attribute__ ((packed));
+/* followed by a __le32+string for dname */
+
+/* client reconnect */
+struct ceph_mds_cap_reconnect {
+       __le64 cap_id;
+       __le32 wanted;
+       __le32 issued;
+       __le64 size;
+       struct ceph_timespec mtime, atime;
+       __le64 snaprealm;
+       __le64 pathbase;        /* base ino for our path to this ino */
+} __attribute__ ((packed));
+/* followed by encoded string */
+
+struct ceph_mds_snaprealm_reconnect {
+       __le64 ino;     /* snap realm base */
+       __le64 seq;     /* snap seq for this snap realm */
+       __le64 parent;  /* parent realm */
+} __attribute__ ((packed));
+
+/*
+ * snaps
+ */
+enum {
+       CEPH_SNAP_OP_UPDATE,  /* CREATE or DESTROY */
+       CEPH_SNAP_OP_CREATE,
+       CEPH_SNAP_OP_DESTROY,
+       CEPH_SNAP_OP_SPLIT,
+};
+
+extern const char *ceph_snap_op_name(int o);
+
+/* snap msg header */
+struct ceph_mds_snap_head {
+       __le32 op;                /* CEPH_SNAP_OP_* */
+       __le64 split;             /* ino to split off, if any */
+       __le32 num_split_inos;    /* # inos belonging to new child realm */
+       __le32 num_split_realms;  /* # child realms under new child realm */
+       __le32 trace_len;         /* size of snap trace blob */
+} __attribute__ ((packed));
+/* followed by split ino list, then split realms, then the trace blob */
+
+/*
+ * encode info about a snaprealm, as viewed by a client
+ */
+struct ceph_mds_snap_realm {
+       __le64 ino;           /* ino */
+       __le64 created;       /* snap: when created */
+       __le64 parent;        /* ino: parent realm */
+       __le64 parent_since;  /* snap: same parent since */
+       __le64 seq;           /* snap: version */
+       __le32 num_snaps;
+       __le32 num_prior_parent_snaps;
+} __attribute__ ((packed));
+/* followed by my snap list, then prior parent snap list */
+
+#endif
diff --git a/fs/ceph/ceph_hash.c b/fs/ceph/ceph_hash.c
new file mode 100644 (file)
index 0000000..bd57001
--- /dev/null
@@ -0,0 +1,118 @@
+
+#include "types.h"
+
+/*
+ * Robert Jenkin's hash function.
+ * http://burtleburtle.net/bob/hash/evahash.html
+ * This is in the public domain.
+ */
+#define mix(a, b, c)                                           \
+       do {                                                    \
+               a = a - b;  a = a - c;  a = a ^ (c >> 13);      \
+               b = b - c;  b = b - a;  b = b ^ (a << 8);       \
+               c = c - a;  c = c - b;  c = c ^ (b >> 13);      \
+               a = a - b;  a = a - c;  a = a ^ (c >> 12);      \
+               b = b - c;  b = b - a;  b = b ^ (a << 16);      \
+               c = c - a;  c = c - b;  c = c ^ (b >> 5);       \
+               a = a - b;  a = a - c;  a = a ^ (c >> 3);       \
+               b = b - c;  b = b - a;  b = b ^ (a << 10);      \
+               c = c - a;  c = c - b;  c = c ^ (b >> 15);      \
+       } while (0)
+
+/*
+ * Hash @length bytes of @str: mix 12 bytes per round, then fold in
+ * the remaining 0-11 bytes via the deliberate fall-through switch.
+ */
+unsigned ceph_str_hash_rjenkins(const char *str, unsigned length)
+{
+       const unsigned char *k = (const unsigned char *)str;
+       __u32 a, b, c;  /* the internal state */
+       __u32 len;      /* how many key bytes still need mixing */
+
+       /* Set up the internal state */
+       len = length;
+       a = 0x9e3779b9;      /* the golden ratio; an arbitrary value */
+       b = a;
+       c = 0;               /* variable initialization of internal state */
+
+       /* handle most of the key */
+       while (len >= 12) {
+               a = a + (k[0] + ((__u32)k[1] << 8) + ((__u32)k[2] << 16) +
+                        ((__u32)k[3] << 24));
+               b = b + (k[4] + ((__u32)k[5] << 8) + ((__u32)k[6] << 16) +
+                        ((__u32)k[7] << 24));
+               c = c + (k[8] + ((__u32)k[9] << 8) + ((__u32)k[10] << 16) +
+                        ((__u32)k[11] << 24));
+               mix(a, b, c);
+               k = k + 12;
+               len = len - 12;
+       }
+
+       /* handle the last 11 bytes */
+       c = c + length;
+       switch (len) {            /* all the case statements fall through */
+       case 11:
+               c = c + ((__u32)k[10] << 24);
+       case 10:
+               c = c + ((__u32)k[9] << 16);
+       case 9:
+               c = c + ((__u32)k[8] << 8);
+               /* the first byte of c is reserved for the length */
+       case 8:
+               b = b + ((__u32)k[7] << 24);
+       case 7:
+               b = b + ((__u32)k[6] << 16);
+       case 6:
+               b = b + ((__u32)k[5] << 8);
+       case 5:
+               b = b + k[4];
+       case 4:
+               a = a + ((__u32)k[3] << 24);
+       case 3:
+               a = a + ((__u32)k[2] << 16);
+       case 2:
+               a = a + ((__u32)k[1] << 8);
+       case 1:
+               a = a + k[0];
+               /* case 0: nothing left to add */
+       }
+       mix(a, b, c);
+
+       return c;
+}
+
+/*
+ * linux dcache hash
+ */
+/*
+ * Hash @length bytes of @str in the style of the linux dcache hash.
+ * The accumulator is unsigned long; the return value truncates it to
+ * unsigned.
+ */
+unsigned ceph_str_hash_linux(const char *str, unsigned length)
+{
+       unsigned long hash = 0;
+       unsigned char c;
+
+       while (length--) {
+               c = *str++;
+               hash = (hash + (c << 4) + (c >> 4)) * 11;
+       }
+       return hash;
+}
+
+
+/*
+ * Dispatch to the string hash selected by @type (CEPH_STR_HASH_*).
+ * An unknown type yields -1, which wraps to UINT_MAX in the unsigned
+ * return.
+ */
+unsigned ceph_str_hash(int type, const char *s, unsigned len)
+{
+       switch (type) {
+       case CEPH_STR_HASH_LINUX:
+               return ceph_str_hash_linux(s, len);
+       case CEPH_STR_HASH_RJENKINS:
+               return ceph_str_hash_rjenkins(s, len);
+       default:
+               return -1;
+       }
+}
+
+/* Human-readable name for a CEPH_STR_HASH_* type. */
+const char *ceph_str_hash_name(int type)
+{
+       switch (type) {
+       case CEPH_STR_HASH_LINUX:
+               return "linux";
+       case CEPH_STR_HASH_RJENKINS:
+               return "rjenkins";
+       default:
+               return "unknown";
+       }
+}
diff --git a/fs/ceph/ceph_hash.h b/fs/ceph/ceph_hash.h
new file mode 100644 (file)
index 0000000..5ac470c
--- /dev/null
@@ -0,0 +1,13 @@
+#ifndef _FS_CEPH_HASH_H
+#define _FS_CEPH_HASH_H
+
+#define CEPH_STR_HASH_LINUX      0x1  /* linux dcache hash */
+#define CEPH_STR_HASH_RJENKINS   0x2  /* robert jenkins' */
+
+extern unsigned ceph_str_hash_linux(const char *s, unsigned len);
+extern unsigned ceph_str_hash_rjenkins(const char *s, unsigned len);
+
+extern unsigned ceph_str_hash(int type, const char *s, unsigned len);
+extern const char *ceph_str_hash_name(int type);
+
+#endif
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/ceph_strings.c
new file mode 100644 (file)
index 0000000..8e4be6a
--- /dev/null
@@ -0,0 +1,176 @@
+/*
+ * Ceph string constants
+ */
+#include "types.h"
+
+/* Human-readable name for a CEPH_ENTITY_TYPE_* value. */
+const char *ceph_entity_type_name(int type)
+{
+       switch (type) {
+       case CEPH_ENTITY_TYPE_MDS: return "mds";
+       case CEPH_ENTITY_TYPE_OSD: return "osd";
+       case CEPH_ENTITY_TYPE_MON: return "mon";
+       case CEPH_ENTITY_TYPE_CLIENT: return "client";
+       case CEPH_ENTITY_TYPE_ADMIN: return "admin";
+       case CEPH_ENTITY_TYPE_AUTH: return "auth";
+       default: return "unknown";
+       }
+}
+
+/* Human-readable name for a CEPH_OSD_OP_* opcode; "???" if unknown. */
+const char *ceph_osd_op_name(int op)
+{
+       switch (op) {
+       case CEPH_OSD_OP_READ: return "read";
+       case CEPH_OSD_OP_STAT: return "stat";
+
+       case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
+
+       case CEPH_OSD_OP_WRITE: return "write";
+       case CEPH_OSD_OP_DELETE: return "delete";
+       case CEPH_OSD_OP_TRUNCATE: return "truncate";
+       case CEPH_OSD_OP_ZERO: return "zero";
+       case CEPH_OSD_OP_WRITEFULL: return "writefull";
+
+       case CEPH_OSD_OP_APPEND: return "append";
+       case CEPH_OSD_OP_STARTSYNC: return "startsync";
+       case CEPH_OSD_OP_SETTRUNC: return "settrunc";
+       case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc";
+
+       case CEPH_OSD_OP_TMAPUP: return "tmapup";
+       case CEPH_OSD_OP_TMAPGET: return "tmapget";
+       case CEPH_OSD_OP_TMAPPUT: return "tmapput";
+
+       case CEPH_OSD_OP_GETXATTR: return "getxattr";
+       case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
+       case CEPH_OSD_OP_SETXATTR: return "setxattr";
+       case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
+       case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
+       case CEPH_OSD_OP_RMXATTR: return "rmxattr";
+
+       case CEPH_OSD_OP_PULL: return "pull";
+       case CEPH_OSD_OP_PUSH: return "push";
+       case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
+       case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
+       case CEPH_OSD_OP_SCRUB: return "scrub";
+
+       case CEPH_OSD_OP_WRLOCK: return "wrlock";
+       case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
+       case CEPH_OSD_OP_RDLOCK: return "rdlock";
+       case CEPH_OSD_OP_RDUNLOCK: return "rdunlock";
+       case CEPH_OSD_OP_UPLOCK: return "uplock";
+       case CEPH_OSD_OP_DNLOCK: return "dnlock";
+
+       case CEPH_OSD_OP_CALL: return "call";
+
+       case CEPH_OSD_OP_PGLS: return "pgls";
+       }
+       return "???";
+}
+
+/* Human-readable name for a CEPH_MDS_STATE_* value; "???" if unknown. */
+const char *ceph_mds_state_name(int s)
+{
+       switch (s) {
+               /* down and out */
+       case CEPH_MDS_STATE_DNE:        return "down:dne";
+       case CEPH_MDS_STATE_STOPPED:    return "down:stopped";
+               /* up and out */
+       case CEPH_MDS_STATE_BOOT:       return "up:boot";
+       case CEPH_MDS_STATE_STANDBY:    return "up:standby";
+       case CEPH_MDS_STATE_STANDBY_REPLAY:    return "up:standby-replay";
+       case CEPH_MDS_STATE_CREATING:   return "up:creating";
+       case CEPH_MDS_STATE_STARTING:   return "up:starting";
+               /* up and in */
+       case CEPH_MDS_STATE_REPLAY:     return "up:replay";
+       case CEPH_MDS_STATE_RESOLVE:    return "up:resolve";
+       case CEPH_MDS_STATE_RECONNECT:  return "up:reconnect";
+       case CEPH_MDS_STATE_REJOIN:     return "up:rejoin";
+       case CEPH_MDS_STATE_CLIENTREPLAY: return "up:clientreplay";
+       case CEPH_MDS_STATE_ACTIVE:     return "up:active";
+       case CEPH_MDS_STATE_STOPPING:   return "up:stopping";
+       }
+       return "???";
+}
+
+/* Human-readable name for a CEPH_SESSION_* op; "???" if unknown. */
+const char *ceph_session_op_name(int op)
+{
+       switch (op) {
+       case CEPH_SESSION_REQUEST_OPEN: return "request_open";
+       case CEPH_SESSION_OPEN: return "open";
+       case CEPH_SESSION_REQUEST_CLOSE: return "request_close";
+       case CEPH_SESSION_CLOSE: return "close";
+       case CEPH_SESSION_REQUEST_RENEWCAPS: return "request_renewcaps";
+       case CEPH_SESSION_RENEWCAPS: return "renewcaps";
+       case CEPH_SESSION_STALE: return "stale";
+       case CEPH_SESSION_RECALL_STATE: return "recall_state";
+       }
+       return "???";
+}
+
+/* Human-readable name for a CEPH_MDS_OP_* opcode; "???" if unknown. */
+const char *ceph_mds_op_name(int op)
+{
+       switch (op) {
+       case CEPH_MDS_OP_LOOKUP:  return "lookup";
+       case CEPH_MDS_OP_LOOKUPHASH:  return "lookuphash";
+       case CEPH_MDS_OP_LOOKUPPARENT:  return "lookupparent";
+       case CEPH_MDS_OP_GETATTR:  return "getattr";
+       case CEPH_MDS_OP_SETXATTR: return "setxattr";
+       case CEPH_MDS_OP_SETATTR: return "setattr";
+       case CEPH_MDS_OP_RMXATTR: return "rmxattr";
+       case CEPH_MDS_OP_READDIR: return "readdir";
+       case CEPH_MDS_OP_MKNOD: return "mknod";
+       case CEPH_MDS_OP_LINK: return "link";
+       case CEPH_MDS_OP_UNLINK: return "unlink";
+       case CEPH_MDS_OP_RENAME: return "rename";
+       case CEPH_MDS_OP_MKDIR: return "mkdir";
+       case CEPH_MDS_OP_RMDIR: return "rmdir";
+       case CEPH_MDS_OP_SYMLINK: return "symlink";
+       case CEPH_MDS_OP_CREATE: return "create";
+       case CEPH_MDS_OP_OPEN: return "open";
+       case CEPH_MDS_OP_LOOKUPSNAP: return "lookupsnap";
+       case CEPH_MDS_OP_LSSNAP: return "lssnap";
+       case CEPH_MDS_OP_MKSNAP: return "mksnap";
+       case CEPH_MDS_OP_RMSNAP: return "rmsnap";
+       }
+       return "???";
+}
+
+/* Human-readable name for a CEPH_CAP_OP_* value; "???" if unknown. */
+const char *ceph_cap_op_name(int op)
+{
+       switch (op) {
+       case CEPH_CAP_OP_GRANT: return "grant";
+       case CEPH_CAP_OP_REVOKE: return "revoke";
+       case CEPH_CAP_OP_TRUNC: return "trunc";
+       case CEPH_CAP_OP_EXPORT: return "export";
+       case CEPH_CAP_OP_IMPORT: return "import";
+       case CEPH_CAP_OP_UPDATE: return "update";
+       case CEPH_CAP_OP_DROP: return "drop";
+       case CEPH_CAP_OP_FLUSH: return "flush";
+       case CEPH_CAP_OP_FLUSH_ACK: return "flush_ack";
+       case CEPH_CAP_OP_FLUSHSNAP: return "flushsnap";
+       case CEPH_CAP_OP_FLUSHSNAP_ACK: return "flushsnap_ack";
+       case CEPH_CAP_OP_RELEASE: return "release";
+       case CEPH_CAP_OP_RENEW: return "renew";
+       }
+       return "???";
+}
+
+/* Human-readable name for a CEPH_MDS_LEASE_* action; "???" if unknown. */
+const char *ceph_lease_op_name(int o)
+{
+       switch (o) {
+       case CEPH_MDS_LEASE_REVOKE: return "revoke";
+       case CEPH_MDS_LEASE_RELEASE: return "release";
+       case CEPH_MDS_LEASE_RENEW: return "renew";
+       case CEPH_MDS_LEASE_REVOKE_ACK: return "revoke_ack";
+       }
+       return "???";
+}
+
+/* Human-readable name for a CEPH_SNAP_OP_* value; "???" if unknown. */
+const char *ceph_snap_op_name(int o)
+{
+       switch (o) {
+       case CEPH_SNAP_OP_UPDATE: return "update";
+       case CEPH_SNAP_OP_CREATE: return "create";
+       case CEPH_SNAP_OP_DESTROY: return "destroy";
+       case CEPH_SNAP_OP_SPLIT: return "split";
+       }
+       return "???";
+}
diff --git a/fs/ceph/crush/crush.c b/fs/ceph/crush/crush.c
new file mode 100644 (file)
index 0000000..fabd302
--- /dev/null
@@ -0,0 +1,151 @@
+
+#ifdef __KERNEL__
+# include <linux/slab.h>
+#else
+# include <stdlib.h>
+# include <assert.h>
+# define kfree(x) do { if (x) free(x); } while (0)
+# define BUG_ON(x) assert(!(x))
+#endif
+
+#include "crush.h"
+
+const char *crush_bucket_alg_name(int alg)
+{
+       switch (alg) {
+       case CRUSH_BUCKET_UNIFORM: return "uniform";
+       case CRUSH_BUCKET_LIST: return "list";
+       case CRUSH_BUCKET_TREE: return "tree";
+       case CRUSH_BUCKET_STRAW: return "straw";
+       default: return "unknown";
+       }
+}
+
+/**
+ * crush_get_bucket_item_weight - Get weight of an item in given bucket
+ * @b: bucket pointer
+ * @p: item index in bucket
+ */
+int crush_get_bucket_item_weight(struct crush_bucket *b, int p)
+{
+       if (p >= b->size)
+               return 0;
+
+       switch (b->alg) {
+       case CRUSH_BUCKET_UNIFORM:
+               return ((struct crush_bucket_uniform *)b)->item_weight;
+       case CRUSH_BUCKET_LIST:
+               return ((struct crush_bucket_list *)b)->item_weights[p];
+       case CRUSH_BUCKET_TREE:
+               if (p & 1)
+                       return ((struct crush_bucket_tree *)b)->node_weights[p];
+               return 0;
+       case CRUSH_BUCKET_STRAW:
+               return ((struct crush_bucket_straw *)b)->item_weights[p];
+       }
+       return 0;
+}
+
+/**
+ * crush_calc_parents - Calculate parent vectors for the given crush map.
+ * @map: crush_map pointer
+ */
+void crush_calc_parents(struct crush_map *map)
+{
+       int i, b, c;
+
+       for (b = 0; b < map->max_buckets; b++) {
+               if (map->buckets[b] == NULL)
+                       continue;
+               for (i = 0; i < map->buckets[b]->size; i++) {
+                       c = map->buckets[b]->items[i];
+                       BUG_ON(c >= map->max_devices ||
+                              c < -map->max_buckets);
+                       if (c >= 0)
+                               map->device_parents[c] = map->buckets[b]->id;
+                       else
+                               map->bucket_parents[-1-c] = map->buckets[b]->id;
+               }
+       }
+}
+
+void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
+{
+       kfree(b->h.perm);
+       kfree(b->h.items);
+       kfree(b);
+}
+
+void crush_destroy_bucket_list(struct crush_bucket_list *b)
+{
+       kfree(b->item_weights);
+       kfree(b->sum_weights);
+       kfree(b->h.perm);
+       kfree(b->h.items);
+       kfree(b);
+}
+
+void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
+{
+       kfree(b->node_weights);
+       kfree(b);
+}
+
+void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
+{
+       kfree(b->straws);
+       kfree(b->item_weights);
+       kfree(b->h.perm);
+       kfree(b->h.items);
+       kfree(b);
+}
+
+void crush_destroy_bucket(struct crush_bucket *b)
+{
+       switch (b->alg) {
+       case CRUSH_BUCKET_UNIFORM:
+               crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b);
+               break;
+       case CRUSH_BUCKET_LIST:
+               crush_destroy_bucket_list((struct crush_bucket_list *)b);
+               break;
+       case CRUSH_BUCKET_TREE:
+               crush_destroy_bucket_tree((struct crush_bucket_tree *)b);
+               break;
+       case CRUSH_BUCKET_STRAW:
+               crush_destroy_bucket_straw((struct crush_bucket_straw *)b);
+               break;
+       }
+}
+
+/**
+ * crush_destroy - Destroy a crush_map
+ * @map: crush_map pointer
+ */
+void crush_destroy(struct crush_map *map)
+{
+       int b;
+
+       /* buckets */
+       if (map->buckets) {
+               for (b = 0; b < map->max_buckets; b++) {
+                       if (map->buckets[b] == NULL)
+                               continue;
+                       crush_destroy_bucket(map->buckets[b]);
+               }
+               kfree(map->buckets);
+       }
+
+       /* rules */
+       if (map->rules) {
+               for (b = 0; b < map->max_rules; b++)
+                       kfree(map->rules[b]);
+               kfree(map->rules);
+       }
+
+       kfree(map->bucket_parents);
+       kfree(map->device_parents);
+       kfree(map);
+}
+
+
diff --git a/fs/ceph/crush/crush.h b/fs/ceph/crush/crush.h
new file mode 100644 (file)
index 0000000..dcd7e75
--- /dev/null
@@ -0,0 +1,180 @@
+#ifndef _CRUSH_CRUSH_H
+#define _CRUSH_CRUSH_H
+
+#include <linux/types.h>
+
+/*
+ * CRUSH is a pseudo-random data distribution algorithm that
+ * efficiently distributes input values (typically, data objects)
+ * across a heterogeneous, structured storage cluster.
+ *
+ * The algorithm was originally described in detail in this paper
+ * (although the algorithm has evolved somewhat since then):
+ *
+ *     http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf
+ *
+ * LGPL2
+ */
+
+
+#define CRUSH_MAGIC 0x00010000ul   /* for detecting algorithm revisions */
+
+
+#define CRUSH_MAX_DEPTH 10  /* max crush hierarchy depth */
+#define CRUSH_MAX_SET   10  /* max size of a mapping result */
+
+
+/*
+ * CRUSH uses user-defined "rules" to describe how inputs should be
+ * mapped to devices.  A rule consists of sequence of steps to perform
+ * to generate the set of output devices.
+ */
+struct crush_rule_step {
+       __u32 op;
+       __s32 arg1;
+       __s32 arg2;
+};
+
+/* step op codes */
+enum {
+       CRUSH_RULE_NOOP = 0,
+       CRUSH_RULE_TAKE = 1,          /* arg1 = value to start with */
+       CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */
+                                     /* arg2 = type */
+       CRUSH_RULE_CHOOSE_INDEP = 3,  /* same */
+       CRUSH_RULE_EMIT = 4,          /* no args */
+       CRUSH_RULE_CHOOSE_LEAF_FIRSTN = 6,
+       CRUSH_RULE_CHOOSE_LEAF_INDEP = 7,
+};
+
+/*
+ * for specifying choose num (arg1) relative to the max parameter
+ * passed to do_rule
+ */
+#define CRUSH_CHOOSE_N            0
+#define CRUSH_CHOOSE_N_MINUS(x)   (-(x))
+
+/*
+ * The rule mask is used to describe what the rule is intended for.
+ * Given a ruleset and size of output set, we search through the
+ * rule list for a matching rule_mask.
+ */
+struct crush_rule_mask {
+       __u8 ruleset;
+       __u8 type;
+       __u8 min_size;
+       __u8 max_size;
+};
+
+struct crush_rule {
+       __u32 len;
+       struct crush_rule_mask mask;
+       struct crush_rule_step steps[0];
+};
+
+#define crush_rule_size(len) (sizeof(struct crush_rule) + \
+                             (len)*sizeof(struct crush_rule_step))
+
+
+
+/*
+ * A bucket is a named container of other items (either devices or
+ * other buckets).  Items within a bucket are chosen using one of a
+ * few different algorithms.  The table summarizes how the speed of
+ * each option measures up against mapping stability when items are
+ * added or removed.
+ *
+ *  Bucket Alg     Speed       Additions    Removals
+ *  ------------------------------------------------
+ *  uniform         O(1)       poor         poor
+ *  list            O(n)       optimal      poor
+ *  tree            O(log n)   good         good
+ *  straw           O(n)       optimal      optimal
+ */
+enum {
+       CRUSH_BUCKET_UNIFORM = 1,
+       CRUSH_BUCKET_LIST = 2,
+       CRUSH_BUCKET_TREE = 3,
+       CRUSH_BUCKET_STRAW = 4
+};
+extern const char *crush_bucket_alg_name(int alg);
+
+struct crush_bucket {
+       __s32 id;        /* this'll be negative */
+       __u16 type;      /* non-zero; type=0 is reserved for devices */
+       __u8 alg;        /* one of CRUSH_BUCKET_* */
+       __u8 hash;       /* which hash function to use, CRUSH_HASH_* */
+       __u32 weight;    /* 16-bit fixed point */
+       __u32 size;      /* num items */
+       __s32 *items;
+
+       /*
+        * cached random permutation: used for uniform bucket and for
+        * the linear search fallback for the other bucket types.
+        */
+       __u32 perm_x;  /* @x for which *perm is defined */
+       __u32 perm_n;  /* num elements of *perm that are permuted/defined */
+       __u32 *perm;
+};
+
+struct crush_bucket_uniform {
+       struct crush_bucket h;
+       __u32 item_weight;  /* 16-bit fixed point; all items equally weighted */
+};
+
+struct crush_bucket_list {
+       struct crush_bucket h;
+       __u32 *item_weights;  /* 16-bit fixed point */
+       __u32 *sum_weights;   /* 16-bit fixed point.  element i is sum
+                                of weights 0..i, inclusive */
+};
+
+struct crush_bucket_tree {
+       struct crush_bucket h;  /* note: h.size is _tree_ size, not number of
+                                  actual items */
+       __u8 num_nodes;
+       __u32 *node_weights;
+};
+
+struct crush_bucket_straw {
+       struct crush_bucket h;
+       __u32 *item_weights;   /* 16-bit fixed point */
+       __u32 *straws;         /* 16-bit fixed point */
+};
+
+
+
/*
 * CRUSH map includes all buckets, rules, etc.
 */
struct crush_map {
	struct crush_bucket **buckets;
	struct crush_rule **rules;

	/*
	 * Parent pointers identify the parent bucket of each device or
	 * bucket in the hierarchy.  If an item appears more than
	 * once, this is the _last_ time it appeared (where buckets
	 * are processed in bucket id order, from -1 on down to
	 * -max_buckets).
	 */
	__u32 *bucket_parents;   /* indexed by -1-id for bucket id */
	__u32 *device_parents;   /* indexed by device id */

	__s32 max_buckets;
	__u32 max_rules;
	__s32 max_devices;
};
+
+
+/* crush.c */
+extern int crush_get_bucket_item_weight(struct crush_bucket *b, int pos);
+extern void crush_calc_parents(struct crush_map *map);
+extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
+extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
+extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
+extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
+extern void crush_destroy_bucket(struct crush_bucket *b);
+extern void crush_destroy(struct crush_map *map);
+
+#endif
diff --git a/fs/ceph/crush/hash.c b/fs/ceph/crush/hash.c
new file mode 100644 (file)
index 0000000..5873aed
--- /dev/null
@@ -0,0 +1,149 @@
+
+#include <linux/types.h>
+#include "hash.h"
+
+/*
+ * Robert Jenkins' function for mixing 32-bit values
+ * http://burtleburtle.net/bob/hash/evahash.html
+ * a, b = random bits, c = input and output
+ */
+#define crush_hashmix(a, b, c) do {                    \
+               a = a-b;  a = a-c;  a = a^(c>>13);      \
+               b = b-c;  b = b-a;  b = b^(a<<8);       \
+               c = c-a;  c = c-b;  c = c^(b>>13);      \
+               a = a-b;  a = a-c;  a = a^(c>>12);      \
+               b = b-c;  b = b-a;  b = b^(a<<16);      \
+               c = c-a;  c = c-b;  c = c^(b>>5);       \
+               a = a-b;  a = a-c;  a = a^(c>>3);       \
+               b = b-c;  b = b-a;  b = b^(a<<10);      \
+               c = c-a;  c = c-b;  c = c^(b>>15);      \
+       } while (0)
+
+#define crush_hash_seed 1315423911
+
+static __u32 crush_hash32_rjenkins1(__u32 a)
+{
+       __u32 hash = crush_hash_seed ^ a;
+       __u32 b = a;
+       __u32 x = 231232;
+       __u32 y = 1232;
+       crush_hashmix(b, x, hash);
+       crush_hashmix(y, a, hash);
+       return hash;
+}
+
+static __u32 crush_hash32_rjenkins1_2(__u32 a, __u32 b)
+{
+       __u32 hash = crush_hash_seed ^ a ^ b;
+       __u32 x = 231232;
+       __u32 y = 1232;
+       crush_hashmix(a, b, hash);
+       crush_hashmix(x, a, hash);
+       crush_hashmix(b, y, hash);
+       return hash;
+}
+
+static __u32 crush_hash32_rjenkins1_3(__u32 a, __u32 b, __u32 c)
+{
+       __u32 hash = crush_hash_seed ^ a ^ b ^ c;
+       __u32 x = 231232;
+       __u32 y = 1232;
+       crush_hashmix(a, b, hash);
+       crush_hashmix(c, x, hash);
+       crush_hashmix(y, a, hash);
+       crush_hashmix(b, x, hash);
+       crush_hashmix(y, c, hash);
+       return hash;
+}
+
+static __u32 crush_hash32_rjenkins1_4(__u32 a, __u32 b, __u32 c, __u32 d)
+{
+       __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d;
+       __u32 x = 231232;
+       __u32 y = 1232;
+       crush_hashmix(a, b, hash);
+       crush_hashmix(c, d, hash);
+       crush_hashmix(a, x, hash);
+       crush_hashmix(y, b, hash);
+       crush_hashmix(c, x, hash);
+       crush_hashmix(y, d, hash);
+       return hash;
+}
+
+static __u32 crush_hash32_rjenkins1_5(__u32 a, __u32 b, __u32 c, __u32 d,
+                                     __u32 e)
+{
+       __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e;
+       __u32 x = 231232;
+       __u32 y = 1232;
+       crush_hashmix(a, b, hash);
+       crush_hashmix(c, d, hash);
+       crush_hashmix(e, x, hash);
+       crush_hashmix(y, a, hash);
+       crush_hashmix(b, x, hash);
+       crush_hashmix(y, c, hash);
+       crush_hashmix(d, x, hash);
+       crush_hashmix(y, e, hash);
+       return hash;
+}
+
+
+__u32 crush_hash32(int type, __u32 a)
+{
+       switch (type) {
+       case CRUSH_HASH_RJENKINS1:
+               return crush_hash32_rjenkins1(a);
+       default:
+               return 0;
+       }
+}
+
+__u32 crush_hash32_2(int type, __u32 a, __u32 b)
+{
+       switch (type) {
+       case CRUSH_HASH_RJENKINS1:
+               return crush_hash32_rjenkins1_2(a, b);
+       default:
+               return 0;
+       }
+}
+
+__u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c)
+{
+       switch (type) {
+       case CRUSH_HASH_RJENKINS1:
+               return crush_hash32_rjenkins1_3(a, b, c);
+       default:
+               return 0;
+       }
+}
+
+__u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d)
+{
+       switch (type) {
+       case CRUSH_HASH_RJENKINS1:
+               return crush_hash32_rjenkins1_4(a, b, c, d);
+       default:
+               return 0;
+       }
+}
+
+__u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, __u32 e)
+{
+       switch (type) {
+       case CRUSH_HASH_RJENKINS1:
+               return crush_hash32_rjenkins1_5(a, b, c, d, e);
+       default:
+               return 0;
+       }
+}
+
+const char *crush_hash_name(int type)
+{
+       switch (type) {
+       case CRUSH_HASH_RJENKINS1:
+               return "rjenkins1";
+       default:
+               return "unknown";
+       }
+}
diff --git a/fs/ceph/crush/hash.h b/fs/ceph/crush/hash.h
new file mode 100644 (file)
index 0000000..ff48e11
--- /dev/null
@@ -0,0 +1,17 @@
+#ifndef _CRUSH_HASH_H
+#define _CRUSH_HASH_H
+
+#define CRUSH_HASH_RJENKINS1   0
+
+#define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1
+
+extern const char *crush_hash_name(int type);
+
+extern __u32 crush_hash32(int type, __u32 a);
+extern __u32 crush_hash32_2(int type, __u32 a, __u32 b);
+extern __u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c);
+extern __u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d);
+extern __u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d,
+                           __u32 e);
+
+#endif
diff --git a/fs/ceph/crush/mapper.c b/fs/ceph/crush/mapper.c
new file mode 100644 (file)
index 0000000..9ba54ef
--- /dev/null
@@ -0,0 +1,596 @@
+
+#ifdef __KERNEL__
+# include <linux/string.h>
+# include <linux/slab.h>
+# include <linux/bug.h>
+# include <linux/kernel.h>
+# ifndef dprintk
+#  define dprintk(args...)
+# endif
+#else
+# include <string.h>
+# include <stdio.h>
+# include <stdlib.h>
+# include <assert.h>
+# define BUG_ON(x) assert(!(x))
+# define dprintk(args...) /* printf(args) */
+# define kmalloc(x, f) malloc(x)
+# define kfree(x) free(x)
+#endif
+
+#include "crush.h"
+#include "hash.h"
+
+/*
+ * Implement the core CRUSH mapping algorithm.
+ */
+
+/**
+ * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
+ * @map: the crush_map
+ * @ruleset: the storage ruleset id (user defined)
+ * @type: storage ruleset type (user defined)
+ * @size: output set size
+ */
+int crush_find_rule(struct crush_map *map, int ruleset, int type, int size)
+{
+       int i;
+
+       for (i = 0; i < map->max_rules; i++) {
+               if (map->rules[i] &&
+                   map->rules[i]->mask.ruleset == ruleset &&
+                   map->rules[i]->mask.type == type &&
+                   map->rules[i]->mask.min_size <= size &&
+                   map->rules[i]->mask.max_size >= size)
+                       return i;
+       }
+       return -1;
+}
+
+
+/*
+ * bucket choose methods
+ *
+ * For each bucket algorithm, we have a "choose" method that, given a
+ * crush input @x and replica position (usually, position in output set) @r,
+ * will produce an item in the bucket.
+ */
+
+/*
+ * Choose based on a random permutation of the bucket.
+ *
+ * We used to use some prime number arithmetic to do this, but it
+ * wasn't very random, and had some other bad behaviors.  Instead, we
+ * calculate an actual random permutation of the bucket members.
+ * Since this is expensive, we optimize for the r=0 case, which
+ * captures the vast majority of calls.
+ */
/*
 * Pick the item at position @r of a pseudo-random (Fisher-Yates style)
 * permutation of the bucket, keyed by @x.  The permutation state is
 * cached in bucket->perm/perm_x/perm_n and extended lazily only as far
 * as needed.  NOTE(review): assumes bucket->size > 0 (r % bucket->size)
 * — callers appear to guarantee this; confirm before reuse.
 */
static int bucket_perm_choose(struct crush_bucket *bucket,
			      int x, int r)
{
	unsigned pr = r % bucket->size;
	unsigned i, s;

	/* start a new permutation if @x has changed */
	if (bucket->perm_x != x || bucket->perm_n == 0) {
		dprintk("bucket %d new x=%d\n", bucket->id, x);
		bucket->perm_x = x;

		/* optimize common r=0 case */
		if (pr == 0) {
			/* only element 0 of the permutation is computed */
			s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
				bucket->size;
			bucket->perm[0] = s;
			bucket->perm_n = 0xffff;   /* magic value, see below */
			goto out;
		}

		/* full restart: identity permutation, nothing fixed yet */
		for (i = 0; i < bucket->size; i++)
			bucket->perm[i] = i;
		bucket->perm_n = 0;
	} else if (bucket->perm_n == 0xffff) {
		/* clean up after the r=0 case above */
		/* rebuild identity for 1..size-1, then swap perm[0] into place */
		for (i = 1; i < bucket->size; i++)
			bucket->perm[i] = i;
		bucket->perm[bucket->perm[0]] = 0;
		bucket->perm_n = 1;
	}

	/* calculate permutation up to pr */
	for (i = 0; i < bucket->perm_n; i++)
		dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
	while (bucket->perm_n <= pr) {
		unsigned p = bucket->perm_n;
		/* no point in swapping the final entry */
		if (p < bucket->size - 1) {
			/* pick a hash-determined element from the tail */
			i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
				(bucket->size - p);
			if (i) {
				unsigned t = bucket->perm[p + i];
				bucket->perm[p + i] = bucket->perm[p];
				bucket->perm[p] = t;
			}
			dprintk(" perm_choose swap %d with %d\n", p, p+i);
		}
		bucket->perm_n++;
	}
	for (i = 0; i < bucket->size; i++)
		dprintk(" perm_choose  %d: %d\n", i, bucket->perm[i]);

	s = bucket->perm[pr];
out:
	dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
		bucket->size, x, r, pr, s);
	return bucket->items[s];
}
+
+/* uniform */
+static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
+                                int x, int r)
+{
+       return bucket_perm_choose(&bucket->h, x, r);
+}
+
+/* list */
+static int bucket_list_choose(struct crush_bucket_list *bucket,
+                             int x, int r)
+{
+       int i;
+
+       for (i = bucket->h.size-1; i >= 0; i--) {
+               __u64 w = crush_hash32_4(bucket->h.hash,x, bucket->h.items[i],
+                                        r, bucket->h.id);
+               w &= 0xffff;
+               dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
+                       "sw %x rand %llx",
+                       i, x, r, bucket->h.items[i], bucket->item_weights[i],
+                       bucket->sum_weights[i], w);
+               w *= bucket->sum_weights[i];
+               w = w >> 16;
+               /*dprintk(" scaled %llx\n", w);*/
+               if (w < bucket->item_weights[i])
+                       return bucket->h.items[i];
+       }
+
+       BUG_ON(1);
+       return 0;
+}
+
+
/* (binary) tree */

/* height of a tree node: the number of trailing zero bits in its index */
static int height(int n)
{
	int h;

	for (h = 0; (n & 1) == 0; n >>= 1)
		h++;
	return h;
}
+
/* index of the left child of tree node @x (one level down) */
static int left(int x)
{
	return x - (1 << (height(x) - 1));
}
+
/* index of the right child of tree node @x (one level down) */
static int right(int x)
{
	return x + (1 << (height(x) - 1));
}
+
/* leaf (terminal) tree nodes have odd indices */
static int terminal(int x)
{
	return (x & 1) != 0;
}
+
+static int bucket_tree_choose(struct crush_bucket_tree *bucket,
+                             int x, int r)
+{
+       int n, l;
+       __u32 w;
+       __u64 t;
+
+       /* start at root */
+       n = bucket->num_nodes >> 1;
+
+       while (!terminal(n)) {
+               /* pick point in [0, w) */
+               w = bucket->node_weights[n];
+               t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
+                                         bucket->h.id) * (__u64)w;
+               t = t >> 32;
+
+               /* descend to the left or right? */
+               l = left(n);
+               if (t < bucket->node_weights[l])
+                       n = l;
+               else
+                       n = right(n);
+       }
+
+       return bucket->h.items[n >> 1];
+}
+
+
+/* straw */
+
+static int bucket_straw_choose(struct crush_bucket_straw *bucket,
+                              int x, int r)
+{
+       int i;
+       int high = 0;
+       __u64 high_draw = 0;
+       __u64 draw;
+
+       for (i = 0; i < bucket->h.size; i++) {
+               draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r);
+               draw &= 0xffff;
+               draw *= bucket->straws[i];
+               if (i == 0 || draw > high_draw) {
+                       high = i;
+                       high_draw = draw;
+               }
+       }
+       return bucket->h.items[high];
+}
+
+static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
+{
+       dprintk("choose %d x=%d r=%d\n", in->id, x, r);
+       switch (in->alg) {
+       case CRUSH_BUCKET_UNIFORM:
+               return bucket_uniform_choose((struct crush_bucket_uniform *)in,
+                                         x, r);
+       case CRUSH_BUCKET_LIST:
+               return bucket_list_choose((struct crush_bucket_list *)in,
+                                         x, r);
+       case CRUSH_BUCKET_TREE:
+               return bucket_tree_choose((struct crush_bucket_tree *)in,
+                                         x, r);
+       case CRUSH_BUCKET_STRAW:
+               return bucket_straw_choose((struct crush_bucket_straw *)in,
+                                          x, r);
+       default:
+               BUG_ON(1);
+               return in->items[0];
+       }
+}
+
+/*
+ * true if device is marked "out" (failed, fully offloaded)
+ * of the cluster
+ */
+static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
+{
+       if (weight[item] >= 0x1000)
+               return 0;
+       if (weight[item] == 0)
+               return 1;
+       if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff)
+           < weight[item])
+               return 0;
+       return 1;
+}
+
+/**
+ * crush_choose - choose numrep distinct items of given type
+ * @map: the crush_map
+ * @bucket: the bucket we are choose an item from
+ * @x: crush input value
+ * @numrep: the number of items to choose
+ * @type: the type of item to choose
+ * @out: pointer to output vector
+ * @outpos: our position in that vector
+ * @firstn: true if choosing "first n" items, false if choosing "indep"
+ * @recurse_to_leaf: true if we want one device under each item of given type
+ * @out2: second output vector for leaf items (if @recurse_to_leaf)
+ */
+static int crush_choose(struct crush_map *map,
+                       struct crush_bucket *bucket,
+                       __u32 *weight,
+                       int x, int numrep, int type,
+                       int *out, int outpos,
+                       int firstn, int recurse_to_leaf,
+                       int *out2)
+{
+       int rep;
+       int ftotal, flocal;
+       int retry_descent, retry_bucket, skip_rep;
+       struct crush_bucket *in = bucket;
+       int r;
+       int i;
+       int item = 0;
+       int itemtype;
+       int collide, reject;
+       const int orig_tries = 5; /* attempts before we fall back to search */
+       dprintk("choose bucket %d x %d outpos %d\n", bucket->id, x, outpos);
+
+       for (rep = outpos; rep < numrep; rep++) {
+               /* keep trying until we get a non-out, non-colliding item */
+               ftotal = 0;
+               skip_rep = 0;
+               do {
+                       retry_descent = 0;
+                       in = bucket;               /* initial bucket */
+
+                       /* choose through intervening buckets */
+                       flocal = 0;
+                       do {
+                               collide = 0;
+                               retry_bucket = 0;
+                               r = rep;
+                               if (in->alg == CRUSH_BUCKET_UNIFORM) {
+                                       /* be careful */
+                                       if (firstn || numrep >= in->size)
+                                               /* r' = r + f_total */
+                                               r += ftotal;
+                                       else if (in->size % numrep == 0)
+                                               /* r'=r+(n+1)*f_local */
+                                               r += (numrep+1) *
+                                                       (flocal+ftotal);
+                                       else
+                                               /* r' = r + n*f_local */
+                                               r += numrep * (flocal+ftotal);
+                               } else {
+                                       if (firstn)
+                                               /* r' = r + f_total */
+                                               r += ftotal;
+                                       else
+                                               /* r' = r + n*f_local */
+                                               r += numrep * (flocal+ftotal);
+                               }
+
+                               /* bucket choose */
+                               if (in->size == 0) {
+                                       reject = 1;
+                                       goto reject;
+                               }
+                               if (flocal >= (in->size>>1) &&
+                                   flocal > orig_tries)
+                                       item = bucket_perm_choose(in, x, r);
+                               else
+                                       item = crush_bucket_choose(in, x, r);
+                               BUG_ON(item >= map->max_devices);
+
+                               /* desired type? */
+                               if (item < 0)
+                                       itemtype = map->buckets[-1-item]->type;
+                               else
+                                       itemtype = 0;
+                               dprintk("  item %d type %d\n", item, itemtype);
+
+                               /* keep going? */
+                               if (itemtype != type) {
+                                       BUG_ON(item >= 0 ||
+                                              (-1-item) >= map->max_buckets);
+                                       in = map->buckets[-1-item];
+                                       continue;
+                               }
+
+                               /* collision? */
+                               for (i = 0; i < outpos; i++) {
+                                       if (out[i] == item) {
+                                               collide = 1;
+                                               break;
+                                       }
+                               }
+
+                               if (recurse_to_leaf &&
+                                   item < 0 &&
+                                   crush_choose(map, map->buckets[-1-item],
+                                                weight,
+                                                x, outpos+1, 0,
+                                                out2, outpos,
+                                                firstn, 0, NULL) <= outpos) {
+                                       reject = 1;
+                               } else {
+                                       /* out? */
+                                       if (itemtype == 0)
+                                               reject = is_out(map, weight,
+                                                               item, x);
+                                       else
+                                               reject = 0;
+                               }
+
+reject:
+                               if (reject || collide) {
+                                       ftotal++;
+                                       flocal++;
+
+                                       if (collide && flocal < 3)
+                                               /* retry locally a few times */
+                                               retry_bucket = 1;
+                                       else if (flocal < in->size + orig_tries)
+                                               /* exhaustive bucket search */
+                                               retry_bucket = 1;
+                                       else if (ftotal < 20)
+                                               /* then retry descent */
+                                               retry_descent = 1;
+                                       else
+                                               /* else give up */
+                                               skip_rep = 1;
+                                       dprintk("  reject %d  collide %d  "
+                                       &nbs