ceph: fix flush_dirty_caps race with caps migration
Sage Weil [Mon, 1 Mar 2010 23:16:56 +0000 (15:16 -0800)]
The flush_dirty_caps() used to loop over the first entry of the cap_dirty
dirty list on the assumption that after calling ceph_check_caps() it would
be removed from the list.  This isn't true for caps that are being
migrated between MDSs, where we've received the EXPORT but not the IMPORT.

Instead, do a safe list iteration, and pin the next inode on the list via
the CEPH_I_NOFLUSH flag.

Signed-off-by: Sage Weil <sage@newdream.net>

fs/ceph/caps.c
fs/ceph/super.h

index 295b7e5..8b89b91 100644 (file)
@@ -1573,6 +1573,11 @@ retry_locked:
                }
 
 ack:
+               if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
+                       dout(" skipping %p I_NOFLUSH set\n", inode);
+                       continue;
+               }
+
                if (session && session != cap->session) {
                        dout("oops, wrong session %p mutex\n", session);
                        mutex_unlock(&session->s_mutex);
@@ -1652,6 +1657,10 @@ static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
 
 retry:
        spin_lock(&inode->i_lock);
+       if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
+               dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode);
+               goto out;
+       }
        if (ci->i_dirty_caps && ci->i_auth_cap) {
                struct ceph_cap *cap = ci->i_auth_cap;
                int used = __ceph_caps_used(ci);
@@ -2747,16 +2756,38 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
  */
 void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
 {
-       struct ceph_inode_info *ci;
-       struct inode *inode;
+       struct ceph_inode_info *ci, *nci = NULL;
+       struct inode *inode, *ninode = NULL;
+       struct list_head *p, *n;
 
        dout("flush_dirty_caps\n");
        spin_lock(&mdsc->cap_dirty_lock);
-       while (!list_empty(&mdsc->cap_dirty)) {
-               ci = list_first_entry(&mdsc->cap_dirty,
-                                     struct ceph_inode_info,
-                                     i_dirty_item);
-               inode = igrab(&ci->vfs_inode);
+       list_for_each_safe(p, n, &mdsc->cap_dirty) {
+               if (nci) {
+                       ci = nci;
+                       inode = ninode;
+                       ci->i_ceph_flags &= ~CEPH_I_NOFLUSH;
+                       dout("flush_dirty_caps inode %p (was next inode)\n",
+                            inode);
+               } else {
+                       ci = list_entry(p, struct ceph_inode_info,
+                                       i_dirty_item);
+                       inode = igrab(&ci->vfs_inode);
+                       BUG_ON(!inode);
+                       dout("flush_dirty_caps inode %p\n", inode);
+               }
+               if (n != &mdsc->cap_dirty) {
+                       nci = list_entry(n, struct ceph_inode_info,
+                                        i_dirty_item);
+                       ninode = igrab(&nci->vfs_inode);
+                       BUG_ON(!ninode);
+                       nci->i_ceph_flags |= CEPH_I_NOFLUSH;
+                       dout("flush_dirty_caps next inode %p, noflush\n",
+                            ninode);
+               } else {
+                       nci = NULL;
+                       ninode = NULL;
+               }
                spin_unlock(&mdsc->cap_dirty_lock);
                if (inode) {
                        ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH,
index ff7aaa3..6a778f2 100644 (file)
@@ -289,6 +289,7 @@ struct ceph_inode_xattrs_info {
 #define CEPH_I_COMPLETE  1  /* we have complete directory cached */
 #define CEPH_I_NODELAY   4  /* do not delay cap release */
 #define CEPH_I_FLUSH     8  /* do not delay flush of dirty metadata */
+#define CEPH_I_NOFLUSH  16  /* do not flush dirty caps */
 
 struct ceph_inode_info {
        struct ceph_vino i_vino;   /* ceph ino + snap */