Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/signal
Linus Torvalds [Fri, 1 Jun 2012 18:53:44 +0000 (11:53 -0700)]
Pull third pile of signal handling patches from Al Viro:
 "This time it's mostly helpers and conversions to them; there's a lot
  of stuff remaining in the tree, but that'll either go in -rc2
  (isolated bug fixes, ideally via arch maintainers' trees) or will sit
  there until the next cycle."

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/signal:
  x86: get rid of calling do_notify_resume() when returning to kernel mode
  blackfin: check __get_user() return value
  whack-a-mole with TIF_FREEZE
  FRV: Optimise the system call exit path in entry.S [ver #2]
  FRV: Shrink TIF_WORK_MASK [ver #2]
  FRV: Prevent syscall exit tracing and notify_resume at end of kernel exceptions
  new helper: signal_delivered()
  powerpc: get rid of restore_sigmask()
  most of set_current_blocked() callers want SIGKILL/SIGSTOP removed from set
  set_restore_sigmask() is never called without SIGPENDING (and never should be)
  TIF_RESTORE_SIGMASK can be set only when TIF_SIGPENDING is set
  don't call try_to_freeze() from do_signal()
  pull clearing RESTORE_SIGMASK into block_sigmask()
  sh64: failure to build sigframe != signal without handler
  openrisc: tracehook_signal_handler() is supposed to be called on success
  new helper: sigmask_to_save()
  new helper: restore_saved_sigmask()
  new helpers: {clear,test,test_and_clear}_restore_sigmask()
  HAVE_RESTORE_SIGMASK is defined on all architectures now

228 files changed:
Documentation/filesystems/Locking
Documentation/filesystems/vfs.txt
arch/alpha/include/asm/posix_types.h
arch/arm/include/asm/posix_types.h
arch/avr32/include/asm/posix_types.h
arch/blackfin/include/asm/posix_types.h
arch/cris/include/asm/posix_types.h
arch/frv/include/asm/posix_types.h
arch/h8300/include/asm/posix_types.h
arch/ia64/include/asm/posix_types.h
arch/ia64/kernel/perfmon.c
arch/ia64/kernel/sys_ia64.c
arch/m32r/include/asm/posix_types.h
arch/m68k/include/asm/posix_types.h
arch/mips/include/asm/posix_types.h
arch/mips/include/asm/stat.h
arch/mn10300/include/asm/posix_types.h
arch/parisc/include/asm/posix_types.h
arch/parisc/include/asm/stat.h
arch/powerpc/include/asm/posix_types.h
arch/powerpc/include/asm/stat.h
arch/s390/include/asm/posix_types.h
arch/sh/include/asm/posix_types_32.h
arch/sh/include/asm/posix_types_64.h
arch/sparc/include/asm/posix_types.h
arch/sparc/kernel/sys_sparc_64.c
arch/tile/include/asm/compat.h
arch/x86/include/asm/posix_types_32.h
drivers/base/soc.c
drivers/gpu/drm/i810/i810_dma.c
fs/9p/vfs_inode_dotl.c
fs/affs/affs.h
fs/aio.c
fs/attr.c
fs/binfmt_elf.c
fs/binfmt_flat.c
fs/btrfs/acl.c
fs/btrfs/backref.c
fs/btrfs/backref.h
fs/btrfs/btrfs_inode.h
fs/btrfs/check-integrity.c
fs/btrfs/ctree.c
fs/btrfs/ctree.h
fs/btrfs/delayed-inode.c
fs/btrfs/delayed-ref.c
fs/btrfs/delayed-ref.h
fs/btrfs/disk-io.c
fs/btrfs/disk-io.h
fs/btrfs/export.c
fs/btrfs/extent-tree.c
fs/btrfs/extent_io.c
fs/btrfs/extent_io.h
fs/btrfs/file.c
fs/btrfs/free-space-cache.c
fs/btrfs/inode.c
fs/btrfs/ioctl.c
fs/btrfs/ioctl.h
fs/btrfs/ordered-data.c
fs/btrfs/ordered-data.h
fs/btrfs/print-tree.c
fs/btrfs/reada.c
fs/btrfs/scrub.c
fs/btrfs/super.c
fs/btrfs/transaction.c
fs/btrfs/tree-log.c
fs/btrfs/ulist.c
fs/btrfs/ulist.h
fs/btrfs/volumes.c
fs/btrfs/volumes.h
fs/btrfs/xattr.c
fs/buffer.c
fs/ceph/export.c
fs/compat.c
fs/dcache.c
fs/ecryptfs/inode.c
fs/exec.c
fs/exportfs/expfs.c
fs/ext4/Kconfig
fs/ext4/balloc.c
fs/ext4/bitmap.c
fs/ext4/dir.c
fs/ext4/ext4.h
fs/ext4/ext4_extents.h
fs/ext4/ext4_jbd2.c
fs/ext4/ext4_jbd2.h
fs/ext4/extents.c
fs/ext4/file.c
fs/ext4/ialloc.c
fs/ext4/inode.c
fs/ext4/ioctl.c
fs/ext4/mballoc.c
fs/ext4/mmp.c
fs/ext4/namei.c
fs/ext4/resize.c
fs/ext4/super.c
fs/ext4/xattr.c
fs/ext4/xattr.h
fs/fat/inode.c
fs/fcntl.c
fs/file_table.c
fs/fuse/file.c
fs/fuse/inode.c
fs/gfs2/export.c
fs/hpfs/alloc.c
fs/hpfs/anode.c
fs/hpfs/dir.c
fs/hpfs/dnode.c
fs/hpfs/ea.c
fs/hpfs/hpfs.h
fs/hpfs/hpfs_fn.h
fs/hpfs/inode.c
fs/hpfs/map.c
fs/hpfs/namei.c
fs/hpfs/super.c
fs/inode.c
fs/internal.h
fs/isofs/export.c
fs/jbd2/Kconfig
fs/jbd2/commit.c
fs/jbd2/journal.c
fs/jbd2/recovery.c
fs/jbd2/revoke.c
fs/jbd2/transaction.c
fs/jffs2/jffs2_fs_sb.h
fs/jffs2/os-linux.h
fs/jffs2/super.c
fs/jffs2/wbuf.c
fs/lockd/svc.c
fs/locks.c
fs/namei.c
fs/namespace.c
fs/ncpfs/file.c
fs/ncpfs/ncp_fs_sb.h
fs/nfs/callback.c
fs/nfs/dir.c
fs/nfs/file.c
fs/nfsd/auth.c
fs/nfsd/export.c
fs/nfsd/fault_inject.c
fs/nfsd/nfs4callback.c
fs/nfsd/nfs4idmap.c
fs/nfsd/nfs4recover.c
fs/nfsd/nfs4state.c
fs/nfsd/nfs4xdr.c
fs/nfsd/nfsctl.c
fs/nfsd/nfssvc.c
fs/nfsd/state.h
fs/nfsd/xdr4.h
fs/nilfs2/namei.c
fs/notify/fsnotify.c
fs/ntfs/file.c
fs/ocfs2/blockcheck.c
fs/ocfs2/dlm/dlmast.c
fs/ocfs2/dlm/dlmcommon.h
fs/ocfs2/dlm/dlmdomain.c
fs/ocfs2/export.c
fs/ocfs2/inode.c
fs/ocfs2/ioctl.c
fs/ocfs2/move_extents.c
fs/ocfs2/namei.c
fs/ocfs2/symlink.c
fs/ocfs2/symlink.h
fs/open.c
fs/pipe.c
fs/pnode.c
fs/proc_namespace.c
fs/readdir.c
fs/reiserfs/inode.c
fs/reiserfs/journal.c
fs/reiserfs/reiserfs.h
fs/reiserfs/resize.c
fs/reiserfs/super.c
fs/signalfd.c
fs/splice.c
fs/statfs.c
fs/sync.c
fs/ubifs/dir.c
fs/udf/namei.c
fs/utimes.c
fs/xattr.c
fs/xfs/kmem.c
fs/xfs/kmem.h
fs/xfs/xfs_export.c
fs/xfs/xfs_file.c
fs/xfs/xfs_log.c
fs/xfs/xfs_log_priv.h
fs/xfs/xfs_trans.c
fs/xfs/xfs_trans.h
include/asm-generic/posix_types.h
include/linux/errno.h
include/linux/exportfs.h
include/linux/fs.h
include/linux/fsnotify_backend.h
include/linux/jbd2.h
include/linux/jbd_common.h
include/linux/lglock.h
include/linux/mm.h
include/linux/security.h
include/linux/sunrpc/svc.h
include/linux/sunrpc/svcauth.h
include/linux/sunrpc/svcauth_gss.h
include/linux/types.h
ipc/shm.c
kernel/Makefile
kernel/lglock.c [new file with mode: 0644]
mm/cleancache.c
mm/filemap.c
mm/filemap_xip.c
mm/internal.h
mm/mmap.c
mm/mremap.c
mm/nommu.c
mm/shmem.c
mm/util.c
net/sched/sch_atm.c
net/sunrpc/auth_gss/gss_krb5_wrap.c
net/sunrpc/auth_gss/svcauth_gss.c
net/sunrpc/rpcb_clnt.c
net/sunrpc/svc.c
net/sunrpc/svc_xprt.c
net/sunrpc/svcauth_unix.c
security/apparmor/lsm.c
security/capability.c
security/commoncap.c
security/security.c
security/selinux/hooks.c
security/selinux/selinuxfs.c
security/smack/smack_lsm.c

index d449e63..8e2da1e 100644 (file)
@@ -61,6 +61,7 @@ ata *);
        ssize_t (*listxattr) (struct dentry *, char *, size_t);
        int (*removexattr) (struct dentry *, const char *);
        int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len);
+       void (*update_time)(struct inode *, struct timespec *, int);
 
 locking rules:
        all may block
@@ -87,6 +88,8 @@ getxattr:     no
 listxattr:     no
 removexattr:   yes
 fiemap:                no
+update_time:   no
+
        Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on
 victim.
        cross-directory ->rename() has (per-superblock) ->s_vfs_rename_sem.
index ef19f91..efd23f4 100644 (file)
@@ -363,6 +363,7 @@ struct inode_operations {
        ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
        ssize_t (*listxattr) (struct dentry *, char *, size_t);
        int (*removexattr) (struct dentry *, const char *);
+       void (*update_time)(struct inode *, struct timespec *, int);
 };
 
 Again, all methods are called without any locks being held, unless
@@ -471,6 +472,9 @@ otherwise noted.
   removexattr: called by the VFS to remove an extended attribute from
        a file. This method is called by removexattr(2) system call.
 
+  update_time: called by the VFS to update a specific time or the i_version of
+       an inode.  If this is not defined the VFS will update the inode itself
+       and call mark_inode_dirty_sync.
 
 The Address Space Object
 ========================
index 24779fc..5a8a483 100644 (file)
@@ -10,9 +10,6 @@
 typedef unsigned int   __kernel_ino_t;
 #define __kernel_ino_t __kernel_ino_t
 
-typedef unsigned int   __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
 typedef unsigned long  __kernel_sigset_t;      /* at least 32 bits */
 
 #include <asm-generic/posix_types.h>
index efdf990..d2de9cb 100644 (file)
@@ -22,9 +22,6 @@
 typedef unsigned short         __kernel_mode_t;
 #define __kernel_mode_t __kernel_mode_t
 
-typedef unsigned short         __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
 typedef unsigned short         __kernel_ipc_pid_t;
 #define __kernel_ipc_pid_t __kernel_ipc_pid_t
 
index 74667bf..9ba9e74 100644 (file)
@@ -17,9 +17,6 @@
 typedef unsigned short  __kernel_mode_t;
 #define __kernel_mode_t __kernel_mode_t
 
-typedef unsigned short  __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
 typedef unsigned short  __kernel_ipc_pid_t;
 #define __kernel_ipc_pid_t __kernel_ipc_pid_t
 
index 41bc187..1bd3436 100644 (file)
@@ -10,9 +10,6 @@
 typedef unsigned short __kernel_mode_t;
 #define __kernel_mode_t __kernel_mode_t
 
-typedef unsigned short __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
 typedef unsigned int __kernel_ipc_pid_t;
 #define __kernel_ipc_pid_t __kernel_ipc_pid_t
 
index 234891c..ce4e517 100644 (file)
@@ -15,9 +15,6 @@
 typedef unsigned short __kernel_mode_t;
 #define __kernel_mode_t __kernel_mode_t
 
-typedef unsigned short __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
 typedef unsigned short  __kernel_ipc_pid_t;
 #define __kernel_ipc_pid_t __kernel_ipc_pid_t
 
index 3f34cb4..fe512af 100644 (file)
@@ -10,9 +10,6 @@
 typedef unsigned short __kernel_mode_t;
 #define __kernel_mode_t __kernel_mode_t
 
-typedef unsigned short __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
 typedef unsigned short __kernel_ipc_pid_t;
 #define __kernel_ipc_pid_t __kernel_ipc_pid_t
 
index bc4c34e..91e62ba 100644 (file)
@@ -10,9 +10,6 @@
 typedef unsigned short __kernel_mode_t;
 #define __kernel_mode_t __kernel_mode_t
 
-typedef unsigned short __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
 typedef unsigned short __kernel_ipc_pid_t;
 #define __kernel_ipc_pid_t __kernel_ipc_pid_t
 
index 7323ab9..99ee1d6 100644 (file)
@@ -1,9 +1,6 @@
 #ifndef _ASM_IA64_POSIX_TYPES_H
 #define _ASM_IA64_POSIX_TYPES_H
 
-typedef unsigned int   __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
 typedef unsigned long  __kernel_sigset_t;      /* at least 32 bits */
 
 #include <asm-generic/posix_types.h>
index f00ba02..d7f558c 100644 (file)
@@ -604,12 +604,6 @@ pfm_unprotect_ctx_ctxsw(pfm_context_t *x, unsigned long f)
        spin_unlock(&(x)->ctx_lock);
 }
 
-static inline unsigned long 
-pfm_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, unsigned long exec)
-{
-       return get_unmapped_area(file, addr, len, pgoff, flags);
-}
-
 /* forward declaration */
 static const struct dentry_operations pfmfs_dentry_operations;
 
@@ -2333,8 +2327,8 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t
        down_write(&task->mm->mmap_sem);
 
        /* find some free area in address space, must have mmap sem held */
-       vma->vm_start = pfm_get_unmapped_area(NULL, 0, size, 0, MAP_PRIVATE|MAP_ANONYMOUS, 0);
-       if (vma->vm_start == 0UL) {
+       vma->vm_start = get_unmapped_area(NULL, 0, size, 0, MAP_PRIVATE|MAP_ANONYMOUS);
+       if (IS_ERR_VALUE(vma->vm_start)) {
                DPRINT(("Cannot find unmapped area for size %ld\n", size));
                up_write(&task->mm->mmap_sem);
                goto error;
index 609d500..d9439ef 100644 (file)
@@ -171,22 +171,9 @@ asmlinkage unsigned long
 ia64_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags,
             unsigned long new_addr)
 {
-       extern unsigned long do_mremap (unsigned long addr,
-                                       unsigned long old_len,
-                                       unsigned long new_len,
-                                       unsigned long flags,
-                                       unsigned long new_addr);
-
-       down_write(&current->mm->mmap_sem);
-       {
-               addr = do_mremap(addr, old_len, new_len, flags, new_addr);
-       }
-       up_write(&current->mm->mmap_sem);
-
-       if (IS_ERR((void *) addr))
-               return addr;
-
-       force_successful_syscall_return();
+       addr = sys_mremap(addr, old_len, new_len, flags, new_addr);
+       if (!IS_ERR((void *) addr))
+               force_successful_syscall_return();
        return addr;
 }
 
index 0195850..236de26 100644 (file)
@@ -10,9 +10,6 @@
 typedef unsigned short __kernel_mode_t;
 #define __kernel_mode_t __kernel_mode_t
 
-typedef unsigned short __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
 typedef unsigned short __kernel_ipc_pid_t;
 #define __kernel_ipc_pid_t __kernel_ipc_pid_t
 
index 6373093..cf4dbf7 100644 (file)
@@ -10,9 +10,6 @@
 typedef unsigned short __kernel_mode_t;
 #define __kernel_mode_t __kernel_mode_t
 
-typedef unsigned short __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
 typedef unsigned short __kernel_ipc_pid_t;
 #define __kernel_ipc_pid_t __kernel_ipc_pid_t
 
index e0308dc..fa03ec3 100644 (file)
  * assume GCC is being used.
  */
 
-#if (_MIPS_SZLONG == 64)
-typedef unsigned int   __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-#endif
-
 typedef long           __kernel_daddr_t;
 #define __kernel_daddr_t __kernel_daddr_t
 
index 6e00f75..fe9a4c3 100644 (file)
@@ -20,7 +20,7 @@ struct stat {
        long            st_pad1[3];             /* Reserved for network id */
        ino_t           st_ino;
        mode_t          st_mode;
-       nlink_t         st_nlink;
+       __u32           st_nlink;
        uid_t           st_uid;
        gid_t           st_gid;
        unsigned        st_rdev;
@@ -55,7 +55,7 @@ struct stat64 {
        unsigned long long      st_ino;
 
        mode_t          st_mode;
-       nlink_t         st_nlink;
+       __u32           st_nlink;
 
        uid_t           st_uid;
        gid_t           st_gid;
@@ -96,7 +96,7 @@ struct stat {
        unsigned long           st_ino;
 
        mode_t                  st_mode;
-       nlink_t                 st_nlink;
+       __u32                   st_nlink;
 
        uid_t                   st_uid;
        gid_t                   st_gid;
index ab50618..d31eeea 100644 (file)
@@ -20,9 +20,6 @@
 typedef unsigned short __kernel_mode_t;
 #define __kernel_mode_t __kernel_mode_t
 
-typedef unsigned short __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
 typedef unsigned short __kernel_ipc_pid_t;
 #define __kernel_ipc_pid_t __kernel_ipc_pid_t
 
index 5212b03..b934425 100644 (file)
@@ -10,9 +10,6 @@
 typedef unsigned short         __kernel_mode_t;
 #define __kernel_mode_t __kernel_mode_t
 
-typedef unsigned short         __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
 typedef unsigned short         __kernel_ipc_pid_t;
 #define __kernel_ipc_pid_t __kernel_ipc_pid_t
 
index 9d5fbbc..d76fbda 100644 (file)
@@ -7,7 +7,7 @@ struct stat {
        unsigned int    st_dev;         /* dev_t is 32 bits on parisc */
        ino_t           st_ino;         /* 32 bits */
        mode_t          st_mode;        /* 16 bits */
-       nlink_t         st_nlink;       /* 16 bits */
+       unsigned short  st_nlink;       /* 16 bits */
        unsigned short  st_reserved1;   /* old st_uid */
        unsigned short  st_reserved2;   /* old st_gid */
        unsigned int    st_rdev;
@@ -42,7 +42,7 @@ struct hpux_stat64 {
        unsigned int    st_dev;         /* dev_t is 32 bits on parisc */
        ino_t           st_ino;         /* 32 bits */
        mode_t          st_mode;        /* 16 bits */
-       nlink_t         st_nlink;       /* 16 bits */
+       unsigned short  st_nlink;       /* 16 bits */
        unsigned short  st_reserved1;   /* old st_uid */
        unsigned short  st_reserved2;   /* old st_gid */
        unsigned int    st_rdev;
index f139325..2958c5b 100644 (file)
@@ -16,9 +16,6 @@ typedef int           __kernel_ssize_t;
 typedef long           __kernel_ptrdiff_t;
 #define __kernel_size_t __kernel_size_t
 
-typedef unsigned short __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
 typedef short          __kernel_ipc_pid_t;
 #define __kernel_ipc_pid_t __kernel_ipc_pid_t
 #endif
index e4edc51..10cfb55 100644 (file)
@@ -30,11 +30,11 @@ struct stat {
        unsigned long   st_dev;
        ino_t           st_ino;
 #ifdef __powerpc64__
-       nlink_t         st_nlink;
+       unsigned short  st_nlink;
        mode_t          st_mode;
 #else
        mode_t          st_mode;
-       nlink_t         st_nlink;
+       unsigned short  st_nlink;
 #endif
        uid_t           st_uid;
        gid_t           st_gid;
index edf8527..7be104c 100644 (file)
@@ -24,7 +24,6 @@ typedef unsigned short        __kernel_old_dev_t;
 
 typedef unsigned long   __kernel_ino_t;
 typedef unsigned short  __kernel_mode_t;
-typedef unsigned short  __kernel_nlink_t;
 typedef unsigned short  __kernel_ipc_pid_t;
 typedef unsigned short  __kernel_uid_t;
 typedef unsigned short  __kernel_gid_t;
@@ -35,7 +34,6 @@ typedef int             __kernel_ptrdiff_t;
 
 typedef unsigned int    __kernel_ino_t;
 typedef unsigned int    __kernel_mode_t;
-typedef unsigned int    __kernel_nlink_t;
 typedef int             __kernel_ipc_pid_t;
 typedef unsigned int    __kernel_uid_t;
 typedef unsigned int    __kernel_gid_t;
@@ -47,7 +45,6 @@ typedef unsigned long   __kernel_sigset_t;      /* at least 32 bits */
 
 #define __kernel_ino_t  __kernel_ino_t
 #define __kernel_mode_t __kernel_mode_t
-#define __kernel_nlink_t __kernel_nlink_t
 #define __kernel_ipc_pid_t __kernel_ipc_pid_t
 #define __kernel_uid_t __kernel_uid_t
 #define __kernel_gid_t __kernel_gid_t
index abda584..ba0bdc4 100644 (file)
@@ -3,8 +3,6 @@
 
 typedef unsigned short __kernel_mode_t;
 #define __kernel_mode_t __kernel_mode_t
-typedef unsigned short __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
 typedef unsigned short __kernel_ipc_pid_t;
 #define __kernel_ipc_pid_t __kernel_ipc_pid_t
 typedef unsigned short __kernel_uid_t;
index fcda07b..244f7e9 100644 (file)
@@ -3,8 +3,6 @@
 
 typedef unsigned short __kernel_mode_t;
 #define __kernel_mode_t __kernel_mode_t
-typedef unsigned short __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
 typedef unsigned short __kernel_ipc_pid_t;
 #define __kernel_ipc_pid_t __kernel_ipc_pid_t
 typedef unsigned short __kernel_uid_t;
index 3070f25..156220e 100644 (file)
@@ -9,8 +9,6 @@
 
 #if defined(__sparc__) && defined(__arch64__)
 /* sparc 64 bit */
-typedef unsigned int           __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
 
 typedef unsigned short                __kernel_old_uid_t;
 typedef unsigned short         __kernel_old_gid_t;
@@ -38,9 +36,6 @@ typedef unsigned short         __kernel_gid_t;
 typedef unsigned short         __kernel_mode_t;
 #define __kernel_mode_t __kernel_mode_t
 
-typedef short                  __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
 typedef long                   __kernel_daddr_t;
 #define __kernel_daddr_t __kernel_daddr_t
 
index 3ee51f1..275f74f 100644 (file)
@@ -580,16 +580,9 @@ SYSCALL_DEFINE5(64_mremap, unsigned long, addr,    unsigned long, old_len,
                unsigned long, new_len, unsigned long, flags,
                unsigned long, new_addr)
 {
-       unsigned long ret = -EINVAL;
-
        if (test_thread_flag(TIF_32BIT))
-               goto out;
-
-       down_write(&current->mm->mmap_sem);
-       ret = do_mremap(addr, old_len, new_len, flags, new_addr);
-       up_write(&current->mm->mmap_sem);
-out:
-       return ret;       
+               return -EINVAL;
+       return sys_mremap(addr, old_len, new_len, flags, new_addr);
 }
 
 /* we come to here via sys_nis_syscall so it can setup the regs argument */
index 69adc08..6e74450 100644 (file)
@@ -44,7 +44,6 @@ typedef __kernel_uid32_t __compat_gid32_t;
 typedef __kernel_mode_t compat_mode_t;
 typedef __kernel_dev_t compat_dev_t;
 typedef __kernel_loff_t compat_loff_t;
-typedef __kernel_nlink_t compat_nlink_t;
 typedef __kernel_ipc_pid_t compat_ipc_pid_t;
 typedef __kernel_daddr_t compat_daddr_t;
 typedef __kernel_fsid_t        compat_fsid_t;
index 99f262e..8e52505 100644 (file)
@@ -10,9 +10,6 @@
 typedef unsigned short __kernel_mode_t;
 #define __kernel_mode_t __kernel_mode_t
 
-typedef unsigned short __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
 typedef unsigned short __kernel_ipc_pid_t;
 #define __kernel_ipc_pid_t __kernel_ipc_pid_t
 
index ba29b2e..72b5e72 100644 (file)
@@ -42,7 +42,7 @@ struct device *soc_device_to_device(struct soc_device *soc_dev)
        return &soc_dev->dev;
 }
 
-static mode_t soc_attribute_mode(struct kobject *kobj,
+static umode_t soc_attribute_mode(struct kobject *kobj,
                                  struct attribute *attr,
                                  int index)
 {
index f920fb5..fa94391 100644 (file)
@@ -130,11 +130,10 @@ static int i810_map_buffer(struct drm_buf *buf, struct drm_file *file_priv)
                return -EINVAL;
 
        /* This is all entirely broken */
-       down_write(&current->mm->mmap_sem);
        old_fops = file_priv->filp->f_op;
        file_priv->filp->f_op = &i810_buffer_fops;
        dev_priv->mmap_buffer = buf;
-       buf_priv->virtual = (void *)do_mmap(file_priv->filp, 0, buf->total,
+       buf_priv->virtual = (void *)vm_mmap(file_priv->filp, 0, buf->total,
                                            PROT_READ | PROT_WRITE,
                                            MAP_SHARED, buf->bus_address);
        dev_priv->mmap_buffer = NULL;
@@ -145,7 +144,6 @@ static int i810_map_buffer(struct drm_buf *buf, struct drm_file *file_priv)
                retcode = PTR_ERR(buf_priv->virtual);
                buf_priv->virtual = NULL;
        }
-       up_write(&current->mm->mmap_sem);
 
        return retcode;
 }
index a1e6c99..e3dd2a1 100644 (file)
@@ -68,24 +68,6 @@ static gid_t v9fs_get_fsgid_for_create(struct inode *dir_inode)
        return current_fsgid();
 }
 
-/**
- * v9fs_dentry_from_dir_inode - helper function to get the dentry from
- * dir inode.
- *
- */
-
-static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode)
-{
-       struct dentry *dentry;
-
-       spin_lock(&inode->i_lock);
-       /* Directory should have only one entry. */
-       BUG_ON(S_ISDIR(inode->i_mode) && !list_is_singular(&inode->i_dentry));
-       dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias);
-       spin_unlock(&inode->i_lock);
-       return dentry;
-}
-
 static int v9fs_test_inode_dotl(struct inode *inode, void *data)
 {
        struct v9fs_inode *v9inode = V9FS_I(inode);
@@ -415,7 +397,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
        if (dir->i_mode & S_ISGID)
                omode |= S_ISGID;
 
-       dir_dentry = v9fs_dentry_from_dir_inode(dir);
+       dir_dentry = dentry->d_parent;
        dfid = v9fs_fid_lookup(dir_dentry);
        if (IS_ERR(dfid)) {
                err = PTR_ERR(dfid);
@@ -793,7 +775,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
                 dir->i_ino, old_dentry->d_name.name, dentry->d_name.name);
 
        v9ses = v9fs_inode2v9ses(dir);
-       dir_dentry = v9fs_dentry_from_dir_inode(dir);
+       dir_dentry = dentry->d_parent;
        dfid = v9fs_fid_lookup(dir_dentry);
        if (IS_ERR(dfid))
                return PTR_ERR(dfid);
@@ -858,7 +840,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
                return -EINVAL;
 
        v9ses = v9fs_inode2v9ses(dir);
-       dir_dentry = v9fs_dentry_from_dir_inode(dir);
+       dir_dentry = dentry->d_parent;
        dfid = v9fs_fid_lookup(dir_dentry);
        if (IS_ERR(dfid)) {
                err = PTR_ERR(dfid);
index 45a0ce4..1fceb32 100644 (file)
 #define AFFS_GET_HASHENTRY(data,hashkey) be32_to_cpu(((struct dir_front *)data)->hashtable[hashkey])
 #define AFFS_BLOCK(sb, bh, blk)                (AFFS_HEAD(bh)->table[AFFS_SB(sb)->s_hashsize-1-(blk)])
 
-#ifdef __LITTLE_ENDIAN
-#define BO_EXBITS      0x18UL
-#elif defined(__BIG_ENDIAN)
-#define BO_EXBITS      0x00UL
-#else
-#error Endianness must be known for affs to work.
-#endif
-
 #define AFFS_HEAD(bh)          ((struct affs_head *)(bh)->b_data)
 #define AFFS_TAIL(sb, bh)      ((struct affs_tail *)((bh)->b_data+(sb)->s_blocksize-sizeof(struct affs_tail)))
 #define AFFS_ROOT_HEAD(bh)     ((struct affs_root_head *)(bh)->b_data)
index 8c7c8b8..55c4c76 100644 (file)
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -134,9 +134,9 @@ static int aio_setup_ring(struct kioctx *ctx)
        info->mmap_size = nr_pages * PAGE_SIZE;
        dprintk("attempting mmap of %lu bytes\n", info->mmap_size);
        down_write(&ctx->mm->mmap_sem);
-       info->mmap_base = do_mmap(NULL, 0, info->mmap_size, 
-                                 PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE,
-                                 0);
+       info->mmap_base = do_mmap_pgoff(NULL, 0, info->mmap_size, 
+                                       PROT_READ|PROT_WRITE,
+                                       MAP_ANONYMOUS|MAP_PRIVATE, 0);
        if (IS_ERR((void *)info->mmap_base)) {
                up_write(&ctx->mm->mmap_sem);
                info->mmap_size = 0;
index 584620e..0da9095 100644 (file)
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -176,6 +176,11 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
                        return -EPERM;
        }
 
+       if ((ia_valid & ATTR_SIZE) && IS_I_VERSION(inode)) {
+               if (attr->ia_size != inode->i_size)
+                       inode_inc_iversion(inode);
+       }
+
        if ((ia_valid & ATTR_MODE)) {
                umode_t amode = attr->ia_mode;
                /* Flag setting protected by i_mutex */
index e658dd1..1b52956 100644 (file)
@@ -329,7 +329,6 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
        if (!size)
                return addr;
 
-       down_write(&current->mm->mmap_sem);
        /*
        * total_size is the size of the ELF (interpreter) image.
        * The _first_ mmap needs to know the full size, otherwise
@@ -340,13 +339,12 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
        */
        if (total_size) {
                total_size = ELF_PAGEALIGN(total_size);
-               map_addr = do_mmap(filep, addr, total_size, prot, type, off);
+               map_addr = vm_mmap(filep, addr, total_size, prot, type, off);
                if (!BAD_ADDR(map_addr))
-                       do_munmap(current->mm, map_addr+size, total_size-size);
+                       vm_munmap(map_addr+size, total_size-size);
        } else
-               map_addr = do_mmap(filep, addr, size, prot, type, off);
+               map_addr = vm_mmap(filep, addr, size, prot, type, off);
 
-       up_write(&current->mm->mmap_sem);
        return(map_addr);
 }
 
index 6b2daf9..178cb70 100644 (file)
@@ -562,7 +562,7 @@ static int load_flat_file(struct linux_binprm * bprm,
                                realdatastart = (unsigned long) -ENOMEM;
                        printk("Unable to allocate RAM for process data, errno %d\n",
                                        (int)-realdatastart);
-                       do_munmap(current->mm, textpos, text_len);
+                       vm_munmap(textpos, text_len);
                        ret = realdatastart;
                        goto err;
                }
@@ -586,8 +586,8 @@ static int load_flat_file(struct linux_binprm * bprm,
                }
                if (IS_ERR_VALUE(result)) {
                        printk("Unable to read data+bss, errno %d\n", (int)-result);
-                       do_munmap(current->mm, textpos, text_len);
-                       do_munmap(current->mm, realdatastart, len);
+                       vm_munmap(textpos, text_len);
+                       vm_munmap(realdatastart, len);
                        ret = result;
                        goto err;
                }
@@ -654,7 +654,7 @@ static int load_flat_file(struct linux_binprm * bprm,
                }
                if (IS_ERR_VALUE(result)) {
                        printk("Unable to read code+data+bss, errno %d\n",(int)-result);
-                       do_munmap(current->mm, textpos, text_len + data_len + extra +
+                       vm_munmap(textpos, text_len + data_len + extra +
                                MAX_SHARED_LIBS * sizeof(unsigned long));
                        ret = result;
                        goto err;
index 89b156d..761e2cd 100644 (file)
@@ -227,7 +227,11 @@ int btrfs_init_acl(struct btrfs_trans_handle *trans,
                if (ret > 0) {
                        /* we need an acl */
                        ret = btrfs_set_acl(trans, inode, acl, ACL_TYPE_ACCESS);
+               } else {
+                       cache_no_acl(inode);
                }
+       } else {
+               cache_no_acl(inode);
        }
 failed:
        posix_acl_release(acl);
index bcec067..3f75895 100644 (file)
 #include "delayed-ref.h"
 #include "locking.h"
 
+struct extent_inode_elem {
+       u64 inum;
+       u64 offset;
+       struct extent_inode_elem *next;
+};
+
+static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb,
+                               struct btrfs_file_extent_item *fi,
+                               u64 extent_item_pos,
+                               struct extent_inode_elem **eie)
+{
+       u64 data_offset;
+       u64 data_len;
+       struct extent_inode_elem *e;
+
+       data_offset = btrfs_file_extent_offset(eb, fi);
+       data_len = btrfs_file_extent_num_bytes(eb, fi);
+
+       if (extent_item_pos < data_offset ||
+           extent_item_pos >= data_offset + data_len)
+               return 1;
+
+       e = kmalloc(sizeof(*e), GFP_NOFS);
+       if (!e)
+               return -ENOMEM;
+
+       e->next = *eie;
+       e->inum = key->objectid;
+       e->offset = key->offset + (extent_item_pos - data_offset);
+       *eie = e;
+
+       return 0;
+}
+
+static int find_extent_in_eb(struct extent_buffer *eb, u64 wanted_disk_byte,
+                               u64 extent_item_pos,
+                               struct extent_inode_elem **eie)
+{
+       u64 disk_byte;
+       struct btrfs_key key;
+       struct btrfs_file_extent_item *fi;
+       int slot;
+       int nritems;
+       int extent_type;
+       int ret;
+
+       /*
+        * from the shared data ref, we only have the leaf but we need
+        * the key. thus, we must look into all items and see that we
+        * find one (some) with a reference to our extent item.
+        */
+       nritems = btrfs_header_nritems(eb);
+       for (slot = 0; slot < nritems; ++slot) {
+               btrfs_item_key_to_cpu(eb, &key, slot);
+               if (key.type != BTRFS_EXTENT_DATA_KEY)
+                       continue;
+               fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+               extent_type = btrfs_file_extent_type(eb, fi);
+               if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+                       continue;
+               /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */
+               disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
+               if (disk_byte != wanted_disk_byte)
+                       continue;
+
+               ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie);
+               if (ret < 0)
+                       return ret;
+       }
+
+       return 0;
+}
+
 /*
  * this structure records all encountered refs on the way up to the root
  */
 struct __prelim_ref {
        struct list_head list;
        u64 root_id;
-       struct btrfs_key key;
+       struct btrfs_key key_for_search;
        int level;
        int count;
+       struct extent_inode_elem *inode_list;
        u64 parent;
        u64 wanted_disk_byte;
 };
 
+/*
+ * the rules for all callers of this function are:
+ * - obtaining the parent is the goal
+ * - if you add a key, you must know that it is a correct key
+ * - if you cannot add the parent or a correct key, then we will look into the
+ *   block later to set a correct key
+ *
+ * delayed refs
+ * ============
+ *        backref type | shared | indirect | shared | indirect
+ * information         |   tree |     tree |   data |     data
+ * --------------------+--------+----------+--------+----------
+ *      parent logical |    y   |     -    |    -   |     -
+ *      key to resolve |    -   |     y    |    y   |     y
+ *  tree block logical |    -   |     -    |    -   |     -
+ *  root for resolving |    y   |     y    |    y   |     y
+ *
+ * - column 1:       we've the parent -> done
+ * - column 2, 3, 4: we use the key to find the parent
+ *
+ * on disk refs (inline or keyed)
+ * ==============================
+ *        backref type | shared | indirect | shared | indirect
+ * information         |   tree |     tree |   data |     data
+ * --------------------+--------+----------+--------+----------
+ *      parent logical |    y   |     -    |    y   |     -
+ *      key to resolve |    -   |     -    |    -   |     y
+ *  tree block logical |    y   |     y    |    y   |     y
+ *  root for resolving |    -   |     y    |    y   |     y
+ *
+ * - column 1, 3: we've the parent -> done
+ * - column 2:    we take the first key from the block to find the parent
+ *                (see __add_missing_keys)
+ * - column 4:    we use the key to find the parent
+ *
+ * additional information that's available but not required to find the parent
+ * block might help in merging entries to gain some speed.
+ */
+
 static int __add_prelim_ref(struct list_head *head, u64 root_id,
-                           struct btrfs_key *key, int level, u64 parent,
-                           u64 wanted_disk_byte, int count)
+                           struct btrfs_key *key, int level,
+                           u64 parent, u64 wanted_disk_byte, int count)
 {
        struct __prelim_ref *ref;
 
@@ -50,10 +163,11 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id,
 
        ref->root_id = root_id;
        if (key)
-               ref->key = *key;
+               ref->key_for_search = *key;
        else
-               memset(&ref->key, 0, sizeof(ref->key));
+               memset(&ref->key_for_search, 0, sizeof(ref->key_for_search));
 
+       ref->inode_list = NULL;
        ref->level = level;
        ref->count = count;
        ref->parent = parent;
@@ -64,18 +178,26 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id,
 }
 
 static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
-                               struct ulist *parents,
-                               struct extent_buffer *eb, int level,
-                               u64 wanted_objectid, u64 wanted_disk_byte)
+                               struct ulist *parents, int level,
+                               struct btrfs_key *key, u64 wanted_disk_byte,
+                               const u64 *extent_item_pos)
 {
        int ret;
-       int slot;
+       int slot = path->slots[level];
+       struct extent_buffer *eb = path->nodes[level];
        struct btrfs_file_extent_item *fi;
-       struct btrfs_key key;
+       struct extent_inode_elem *eie = NULL;
        u64 disk_byte;
+       u64 wanted_objectid = key->objectid;
 
 add_parent:
-       ret = ulist_add(parents, eb->start, 0, GFP_NOFS);
+       if (level == 0 && extent_item_pos) {
+               fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+               ret = check_extent_in_eb(key, eb, fi, *extent_item_pos, &eie);
+               if (ret < 0)
+                       return ret;
+       }
+       ret = ulist_add(parents, eb->start, (unsigned long)eie, GFP_NOFS);
        if (ret < 0)
                return ret;
 
@@ -89,6 +211,7 @@ add_parent:
         * repeat this until we don't find any additional EXTENT_DATA items.
         */
        while (1) {
+               eie = NULL;
                ret = btrfs_next_leaf(root, path);
                if (ret < 0)
                        return ret;
@@ -97,9 +220,9 @@ add_parent:
 
                eb = path->nodes[0];
                for (slot = 0; slot < btrfs_header_nritems(eb); ++slot) {
-                       btrfs_item_key_to_cpu(eb, &key, slot);
-                       if (key.objectid != wanted_objectid ||
-                           key.type != BTRFS_EXTENT_DATA_KEY)
+                       btrfs_item_key_to_cpu(eb, key, slot);
+                       if (key->objectid != wanted_objectid ||
+                           key->type != BTRFS_EXTENT_DATA_KEY)
                                return 0;
                        fi = btrfs_item_ptr(eb, slot,
                                                struct btrfs_file_extent_item);
@@ -118,8 +241,10 @@ add_parent:
  */
 static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
                                        int search_commit_root,
+                                       u64 time_seq,
                                        struct __prelim_ref *ref,
-                                       struct ulist *parents)
+                                       struct ulist *parents,
+                                       const u64 *extent_item_pos)
 {
        struct btrfs_path *path;
        struct btrfs_root *root;
@@ -152,12 +277,13 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
                goto out;
 
        path->lowest_level = level;
-       ret = btrfs_search_slot(NULL, root, &ref->key, path, 0, 0);
+       ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq);
        pr_debug("search slot in root %llu (level %d, ref count %d) returned "
                 "%d for key (%llu %u %llu)\n",
                 (unsigned long long)ref->root_id, level, ref->count, ret,
-                (unsigned long long)ref->key.objectid, ref->key.type,
-                (unsigned long long)ref->key.offset);
+                (unsigned long long)ref->key_for_search.objectid,
+                ref->key_for_search.type,
+                (unsigned long long)ref->key_for_search.offset);
        if (ret < 0)
                goto out;
 
@@ -179,9 +305,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
                btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
        }
 
-       /* the last two parameters will only be used for level == 0 */
-       ret = add_all_parents(root, path, parents, eb, level, key.objectid,
-                               ref->wanted_disk_byte);
+       ret = add_all_parents(root, path, parents, level, &key,
+                               ref->wanted_disk_byte, extent_item_pos);
 out:
        btrfs_free_path(path);
        return ret;
@@ -191,8 +316,9 @@ out:
  * resolve all indirect backrefs from the list
  */
 static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
-                                  int search_commit_root,
-                                  struct list_head *head)
+                                  int search_commit_root, u64 time_seq,
+                                  struct list_head *head,
+                                  const u64 *extent_item_pos)
 {
        int err;
        int ret = 0;
@@ -201,6 +327,7 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
        struct __prelim_ref *new_ref;
        struct ulist *parents;
        struct ulist_node *node;
+       struct ulist_iterator uiter;
 
        parents = ulist_alloc(GFP_NOFS);
        if (!parents)
@@ -217,7 +344,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
                if (ref->count == 0)
                        continue;
                err = __resolve_indirect_ref(fs_info, search_commit_root,
-                                            ref, parents);
+                                            time_seq, ref, parents,
+                                            extent_item_pos);
                if (err) {
                        if (ret == 0)
                                ret = err;
@@ -225,11 +353,14 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
                }
 
                /* we put the first parent into the ref at hand */
-               node = ulist_next(parents, NULL);
+               ULIST_ITER_INIT(&uiter);
+               node = ulist_next(parents, &uiter);
                ref->parent = node ? node->val : 0;
+               ref->inode_list =
+                       node ? (struct extent_inode_elem *)node->aux : 0;
 
                /* additional parents require new refs being added here */
-               while ((node = ulist_next(parents, node))) {
+               while ((node = ulist_next(parents, &uiter))) {
                        new_ref = kmalloc(sizeof(*new_ref), GFP_NOFS);
                        if (!new_ref) {
                                ret = -ENOMEM;
@@ -237,6 +368,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
                        }
                        memcpy(new_ref, ref, sizeof(*ref));
                        new_ref->parent = node->val;
+                       new_ref->inode_list =
+                                       (struct extent_inode_elem *)node->aux;
                        list_add(&new_ref->list, &ref->list);
                }
                ulist_reinit(parents);
@@ -246,10 +379,65 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
        return ret;
 }
 
+/*
+ * compare two prelim refs field by field: level, root_id, the search key
+ * (type, objectid, offset) and parent. returns 1 if all of them match,
+ * 0 otherwise.
+ */
+static inline int ref_for_same_block(struct __prelim_ref *ref1,
+                                    struct __prelim_ref *ref2)
+{
+       const struct btrfs_key *k1 = &ref1->key_for_search;
+       const struct btrfs_key *k2 = &ref2->key_for_search;
+
+       return ref1->level == ref2->level &&
+              ref1->root_id == ref2->root_id &&
+              k1->type == k2->type &&
+              k1->objectid == k2->objectid &&
+              k1->offset == k2->offset &&
+              ref1->parent == ref2->parent;
+}
+
+/*
+ * read tree blocks and add keys where required.
+ *
+ * refs that have neither a parent nor a search key get the first key of the
+ * block at wanted_disk_byte. returns 0 on success, -EIO if a block cannot be
+ * read.
+ */
+static int __add_missing_keys(struct btrfs_fs_info *fs_info,
+                             struct list_head *head)
+{
+       struct __prelim_ref *ref;
+       struct extent_buffer *eb;
+
+       list_for_each_entry(ref, head, list) {
+               if (ref->parent || ref->key_for_search.type)
+                       continue;
+               BUG_ON(!ref->wanted_disk_byte);
+               eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte,
+                                    fs_info->tree_root->leafsize, 0);
+               if (!eb)
+                       return -EIO;
+               btrfs_tree_read_lock(eb);
+               if (btrfs_header_level(eb) == 0)
+                       btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0);
+               else
+                       btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0);
+               btrfs_tree_read_unlock(eb);
+               free_extent_buffer(eb);
+       }
+       return 0;
+}
+
 /*
  * merge two lists of backrefs and adjust counts accordingly
  *
  * mode = 1: merge identical keys, if key is set
+ *    FIXME: if we add more keys in __add_prelim_ref, we can merge more here.
+ *           additionally, we could even add a key range for the blocks we
+ *           looked into to merge even more (-> replace unresolved refs by those
+ *           having a parent).
  * mode = 2: merge identical parents
  */
 static int __merge_refs(struct list_head *head, int mode)
@@ -263,20 +451,21 @@ static int __merge_refs(struct list_head *head, int mode)
 
                ref1 = list_entry(pos1, struct __prelim_ref, list);
 
-               if (mode == 1 && ref1->key.type == 0)
-                       continue;
                for (pos2 = pos1->next, n2 = pos2->next; pos2 != head;
                     pos2 = n2, n2 = pos2->next) {
                        struct __prelim_ref *ref2;
+                       struct __prelim_ref *xchg;
 
                        ref2 = list_entry(pos2, struct __prelim_ref, list);
 
                        if (mode == 1) {
-                               if (memcmp(&ref1->key, &ref2->key,
-                                          sizeof(ref1->key)) ||
-                                   ref1->level != ref2->level ||
-                                   ref1->root_id != ref2->root_id)
+                               if (!ref_for_same_block(ref1, ref2))
                                        continue;
+                               if (!ref1->parent && ref2->parent) {
+                                       xchg = ref1;
+                                       ref1 = ref2;
+                                       ref2 = xchg;
+                               }
                                ref1->count += ref2->count;
                        } else {
                                if (ref1->parent != ref2->parent)
@@ -296,16 +485,17 @@ static int __merge_refs(struct list_head *head, int mode)
  * smaller or equal that seq to the list
  */
 static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
-                             struct btrfs_key *info_key,
                              struct list_head *prefs)
 {
        struct btrfs_delayed_extent_op *extent_op = head->extent_op;
        struct rb_node *n = &head->node.rb_node;
+       struct btrfs_key key;
+       struct btrfs_key op_key = {0};
        int sgn;
        int ret = 0;
 
        if (extent_op && extent_op->update_key)
-               btrfs_disk_key_to_cpu(info_key, &extent_op->key);
+               btrfs_disk_key_to_cpu(&op_key, &extent_op->key);
 
        while ((n = rb_prev(n))) {
                struct btrfs_delayed_ref_node *node;
@@ -337,7 +527,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
                        struct btrfs_delayed_tree_ref *ref;
 
                        ref = btrfs_delayed_node_to_tree_ref(node);
-                       ret = __add_prelim_ref(prefs, ref->root, info_key,
+                       ret = __add_prelim_ref(prefs, ref->root, &op_key,
                                               ref->level + 1, 0, node->bytenr,
                                               node->ref_mod * sgn);
                        break;
@@ -346,7 +536,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
                        struct btrfs_delayed_tree_ref *ref;
 
                        ref = btrfs_delayed_node_to_tree_ref(node);
-                       ret = __add_prelim_ref(prefs, ref->root, info_key,
+                       ret = __add_prelim_ref(prefs, ref->root, NULL,
                                               ref->level + 1, ref->parent,
                                               node->bytenr,
                                               node->ref_mod * sgn);
@@ -354,8 +544,6 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
                }
                case BTRFS_EXTENT_DATA_REF_KEY: {
                        struct btrfs_delayed_data_ref *ref;
-                       struct btrfs_key key;
-
                        ref = btrfs_delayed_node_to_data_ref(node);
 
                        key.objectid = ref->objectid;
@@ -368,7 +556,6 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
                }
                case BTRFS_SHARED_DATA_REF_KEY: {
                        struct btrfs_delayed_data_ref *ref;
-                       struct btrfs_key key;
 
                        ref = btrfs_delayed_node_to_data_ref(node);
 
@@ -394,8 +581,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
  */
 static int __add_inline_refs(struct btrfs_fs_info *fs_info,
                             struct btrfs_path *path, u64 bytenr,
-                            struct btrfs_key *info_key, int *info_level,
-                            struct list_head *prefs)
+                            int *info_level, struct list_head *prefs)
 {
        int ret = 0;
        int slot;
@@ -411,7 +597,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
         * enumerate all inline refs
         */
        leaf = path->nodes[0];
-       slot = path->slots[0] - 1;
+       slot = path->slots[0];
 
        item_size = btrfs_item_size_nr(leaf, slot);
        BUG_ON(item_size < sizeof(*ei));
@@ -424,12 +610,9 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 
        if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
                struct btrfs_tree_block_info *info;
-               struct btrfs_disk_key disk_key;
 
                info = (struct btrfs_tree_block_info *)ptr;
                *info_level = btrfs_tree_block_level(leaf, info);
-               btrfs_tree_block_key(leaf, info, &disk_key);
-               btrfs_disk_key_to_cpu(info_key, &disk_key);
                ptr += sizeof(struct btrfs_tree_block_info);
                BUG_ON(ptr > end);
        } else {
@@ -447,7 +630,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 
                switch (type) {
                case BTRFS_SHARED_BLOCK_REF_KEY:
-                       ret = __add_prelim_ref(prefs, 0, info_key,
+                       ret = __add_prelim_ref(prefs, 0, NULL,
                                                *info_level + 1, offset,
                                                bytenr, 1);
                        break;
@@ -462,8 +645,9 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
                        break;
                }
                case BTRFS_TREE_BLOCK_REF_KEY:
-                       ret = __add_prelim_ref(prefs, offset, info_key,
-                                              *info_level + 1, 0, bytenr, 1);
+                       ret = __add_prelim_ref(prefs, offset, NULL,
+                                              *info_level + 1, 0,
+                                              bytenr, 1);
                        break;
                case BTRFS_EXTENT_DATA_REF_KEY: {
                        struct btrfs_extent_data_ref *dref;
@@ -477,8 +661,8 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
                        key.type = BTRFS_EXTENT_DATA_KEY;
                        key.offset = btrfs_extent_data_ref_offset(leaf, dref);
                        root = btrfs_extent_data_ref_root(leaf, dref);
-                       ret = __add_prelim_ref(prefs, root, &key, 0, 0, bytenr,
-                                               count);
+                       ret = __add_prelim_ref(prefs, root, &key, 0, 0,
+                                              bytenr, count);
                        break;
                }
                default:
@@ -496,8 +680,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
  */
 static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
                            struct btrfs_path *path, u64 bytenr,
-                           struct btrfs_key *info_key, int info_level,
-                           struct list_head *prefs)
+                           int info_level, struct list_head *prefs)
 {
        struct btrfs_root *extent_root = fs_info->extent_root;
        int ret;
@@ -527,7 +710,7 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
 
                switch (key.type) {
                case BTRFS_SHARED_BLOCK_REF_KEY:
-                       ret = __add_prelim_ref(prefs, 0, info_key,
+                       ret = __add_prelim_ref(prefs, 0, NULL,
                                                info_level + 1, key.offset,
                                                bytenr, 1);
                        break;
@@ -543,8 +726,9 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
                        break;
                }
                case BTRFS_TREE_BLOCK_REF_KEY:
-                       ret = __add_prelim_ref(prefs, key.offset, info_key,
-                                               info_level + 1, 0, bytenr, 1);
+                       ret = __add_prelim_ref(prefs, key.offset, NULL,
+                                              info_level + 1, 0,
+                                              bytenr, 1);
                        break;
                case BTRFS_EXTENT_DATA_REF_KEY: {
                        struct btrfs_extent_data_ref *dref;
@@ -560,7 +744,7 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
                        key.offset = btrfs_extent_data_ref_offset(leaf, dref);
                        root = btrfs_extent_data_ref_root(leaf, dref);
                        ret = __add_prelim_ref(prefs, root, &key, 0, 0,
-                                               bytenr, count);
+                                              bytenr, count);
                        break;
                }
                default:
@@ -582,11 +766,12 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
  */
 static int find_parent_nodes(struct btrfs_trans_handle *trans,
                             struct btrfs_fs_info *fs_info, u64 bytenr,
-                            u64 seq, struct ulist *refs, struct ulist *roots)
+                            u64 delayed_ref_seq, u64 time_seq,
+                            struct ulist *refs, struct ulist *roots,
+                            const u64 *extent_item_pos)
 {
        struct btrfs_key key;
        struct btrfs_path *path;
-       struct btrfs_key info_key = { 0 };
        struct btrfs_delayed_ref_root *delayed_refs = NULL;
        struct btrfs_delayed_ref_head *head;
        int info_level = 0;
@@ -645,7 +830,7 @@ again:
                                btrfs_put_delayed_ref(&head->node);
                                goto again;
                        }
-                       ret = __add_delayed_refs(head, seq, &info_key,
+                       ret = __add_delayed_refs(head, delayed_ref_seq,
                                                 &prefs_delayed);
                        if (ret) {
                                spin_unlock(&delayed_refs->lock);
@@ -659,16 +844,17 @@ again:
                struct extent_buffer *leaf;
                int slot;
 
+               path->slots[0]--;
                leaf = path->nodes[0];
-               slot = path->slots[0] - 1;
+               slot = path->slots[0];
                btrfs_item_key_to_cpu(leaf, &key, slot);
                if (key.objectid == bytenr &&
                    key.type == BTRFS_EXTENT_ITEM_KEY) {
                        ret = __add_inline_refs(fs_info, path, bytenr,
-                                               &info_key, &info_level, &prefs);
+                                               &info_level, &prefs);
                        if (ret)
                                goto out;
-                       ret = __add_keyed_refs(fs_info, path, bytenr, &info_key,
+                       ret = __add_keyed_refs(fs_info, path, bytenr,
                                               info_level, &prefs);
                        if (ret)
                                goto out;
@@ -676,21 +862,18 @@ again:
        }
        btrfs_release_path(path);
 
-       /*
-        * when adding the delayed refs above, the info_key might not have
-        * been known yet. Go over the list and replace the missing keys
-        */
-       list_for_each_entry(ref, &prefs_delayed, list) {
-               if ((ref->key.offset | ref->key.type | ref->key.objectid) == 0)
-                       memcpy(&ref->key, &info_key, sizeof(ref->key));
-       }
        list_splice_init(&prefs_delayed, &prefs);
 
+       ret = __add_missing_keys(fs_info, &prefs);
+       if (ret)
+               goto out;
+
        ret = __merge_refs(&prefs, 1);
        if (ret)
                goto out;
 
-       ret = __resolve_indirect_refs(fs_info, search_commit_root, &prefs);
+       ret = __resolve_indirect_refs(fs_info, search_commit_root, time_seq,
+                                     &prefs, extent_item_pos);
        if (ret)
                goto out;
 
@@ -709,7 +892,33 @@ again:
                        BUG_ON(ret < 0);
                }
                if (ref->count && ref->parent) {
-                       ret = ulist_add(refs, ref->parent, 0, GFP_NOFS);
+                       struct extent_inode_elem *eie = NULL;
+                       if (extent_item_pos && !ref->inode_list) {
+                               u32 bsz;
+                               struct extent_buffer *eb;
+                               bsz = btrfs_level_size(fs_info->extent_root,
+                                                       info_level);
+                               eb = read_tree_block(fs_info->extent_root,
+                                                          ref->parent, bsz, 0);
+                               BUG_ON(!eb);
+                               ret = find_extent_in_eb(eb, bytenr,
+                                                       *extent_item_pos, &eie);
+                               ref->inode_list = eie;
+                               free_extent_buffer(eb);
+                       }
+                       ret = ulist_add_merge(refs, ref->parent,
+                                             (unsigned long)ref->inode_list,
+                                             (unsigned long *)&eie, GFP_NOFS);
+                       if (!ret && extent_item_pos) {
+                               /*
+                                * we've recorded that parent, so we must extend
+                                * its inode list here
+                                */
+                               BUG_ON(!eie);
+                               while (eie->next)
+                                       eie = eie->next;
+                               eie->next = ref->inode_list;
+                       }
                        BUG_ON(ret < 0);
                }
                kfree(ref);
@@ -734,6 +943,28 @@ out:
        return ret;
 }
 
+/* free the inode list kept in each node's aux, then the ulist itself */
+static void free_leaf_list(struct ulist *blocks)
+{
+       struct extent_inode_elem *cur, *nxt;
+       struct ulist_iterator uiter;
+       struct ulist_node *node;
+
+       ULIST_ITER_INIT(&uiter);
+       while ((node = ulist_next(blocks, &uiter))) {
+               cur = (struct extent_inode_elem *)node->aux;
+               if (!cur)
+                       continue;
+               do {
+                       nxt = cur->next;
+                       kfree(cur);
+                       cur = nxt;
+               } while (cur);
+               node->aux = 0;
+       }
+       ulist_free(blocks);
+}
+
 /*
  * Finds all leafs with a reference to the specified combination of bytenr and
  * offset. key_list_head will point to a list of corresponding keys (caller must
@@ -744,7 +975,9 @@ out:
  */
 static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
                                struct btrfs_fs_info *fs_info, u64 bytenr,
-                               u64 num_bytes, u64 seq, struct ulist **leafs)
+                               u64 delayed_ref_seq, u64 time_seq,
+                               struct ulist **leafs,
+                               const u64 *extent_item_pos)
 {
        struct ulist *tmp;
        int ret;
@@ -758,11 +991,12 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
                return -ENOMEM;
        }
 
-       ret = find_parent_nodes(trans, fs_info, bytenr, seq, *leafs, tmp);
+       ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq,
+                               time_seq, *leafs, tmp, extent_item_pos);
        ulist_free(tmp);
 
        if (ret < 0 && ret != -ENOENT) {
-               ulist_free(*leafs);
+               free_leaf_list(*leafs);
                return ret;
        }
 
@@ -784,10 +1018,12 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
  */
 int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
                                struct btrfs_fs_info *fs_info, u64 bytenr,
-                               u64 num_bytes, u64 seq, struct ulist **roots)
+                               u64 delayed_ref_seq, u64 time_seq,
+                               struct ulist **roots)
 {
        struct ulist *tmp;
        struct ulist_node *node = NULL;
+       struct ulist_iterator uiter;
        int ret;
 
        tmp = ulist_alloc(GFP_NOFS);
@@ -799,15 +1035,16 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
                return -ENOMEM;
        }
 
+       ULIST_ITER_INIT(&uiter);
        while (1) {
-               ret = find_parent_nodes(trans, fs_info, bytenr, seq,
-                                       tmp, *roots);
+               ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq,
+                                       time_seq, tmp, *roots, NULL);
                if (ret < 0 && ret != -ENOENT) {
                        ulist_free(tmp);
                        ulist_free(*roots);
                        return ret;
                }
-               node = ulist_next(tmp, node);
+               node = ulist_next(tmp, &uiter);
                if (!node)
                        break;
                bytenr = node->val;
@@ -1093,67 +1330,25 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
        return 0;
 }
 
-static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, u64 logical,
-                               u64 orig_extent_item_objectid,
-                               u64 extent_item_pos, u64 root,
+static int iterate_leaf_refs(struct extent_inode_elem *inode_list,
+                               u64 root, u64 extent_item_objectid,
                                iterate_extent_inodes_t *iterate, void *ctx)
 {
-       u64 disk_byte;
-       struct btrfs_key key;
-       struct btrfs_file_extent_item *fi;
-       struct extent_buffer *eb;
-       int slot;
-       int nritems;
+       struct extent_inode_elem *eie;
        int ret = 0;
-       int extent_type;
-       u64 data_offset;
-       u64 data_len;
-
-       eb = read_tree_block(fs_info->tree_root, logical,
-                               fs_info->tree_root->leafsize, 0);
-       if (!eb)
-               return -EIO;
-
-       /*
-        * from the shared data ref, we only have the leaf but we need
-        * the key. thus, we must look into all items and see that we
-        * find one (some) with a reference to our extent item.
-        */
-       nritems = btrfs_header_nritems(eb);
-       for (slot = 0; slot < nritems; ++slot) {
-               btrfs_item_key_to_cpu(eb, &key, slot);
-               if (key.type != BTRFS_EXTENT_DATA_KEY)
-                       continue;
-               fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
-               extent_type = btrfs_file_extent_type(eb, fi);
-               if (extent_type == BTRFS_FILE_EXTENT_INLINE)
-                       continue;
-               /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */
-               disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
-               if (disk_byte != orig_extent_item_objectid)
-                       continue;
-
-               data_offset = btrfs_file_extent_offset(eb, fi);
-               data_len = btrfs_file_extent_num_bytes(eb, fi);
-
-               if (extent_item_pos < data_offset ||
-                   extent_item_pos >= data_offset + data_len)
-                       continue;
 
+       for (eie = inode_list; eie; eie = eie->next) {
                pr_debug("ref for %llu resolved, key (%llu EXTEND_DATA %llu), "
-                               "root %llu\n", orig_extent_item_objectid,
-                               key.objectid, key.offset, root);
-               ret = iterate(key.objectid,
-                               key.offset + (extent_item_pos - data_offset),
-                               root, ctx);
+                        "root %llu\n", extent_item_objectid,
+                        eie->inum, eie->offset, root);
+               ret = iterate(eie->inum, eie->offset, root, ctx);
                if (ret) {
-                       pr_debug("stopping iteration because ret=%d\n", ret);
+                       pr_debug("stopping iteration for %llu due to ret=%d\n",
+                                extent_item_objectid, ret);
                        break;
                }
        }
 
-       free_extent_buffer(eb);
-
        return ret;
 }
 
@@ -1175,7 +1370,10 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
        struct ulist *roots = NULL;
        struct ulist_node *ref_node = NULL;
        struct ulist_node *root_node = NULL;
-       struct seq_list seq_elem;
+       struct seq_list seq_elem = {};
+       struct seq_list tree_mod_seq_elem = {};
+       struct ulist_iterator ref_uiter;
+       struct ulist_iterator root_uiter;
        struct btrfs_delayed_ref_root *delayed_refs = NULL;
 
        pr_debug("resolving all inodes for extent %llu\n",
@@ -1192,34 +1390,41 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
                spin_lock(&delayed_refs->lock);
                btrfs_get_delayed_seq(delayed_refs, &seq_elem);
                spin_unlock(&delayed_refs->lock);
+               btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
        }
 
        ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
-                                  extent_item_pos, seq_elem.seq,
-                                  &refs);
-
+                                  seq_elem.seq, tree_mod_seq_elem.seq, &refs,
+                                  &extent_item_pos);
        if (ret)
                goto out;
 
-       while (!ret && (ref_node = ulist_next(refs, ref_node))) {
-               ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, -1,
-                                               seq_elem.seq, &roots);
+       ULIST_ITER_INIT(&ref_uiter);
+       while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) {
+               ret = btrfs_find_all_roots(trans, fs_info, ref_node->val,
+                                               seq_elem.seq,
+                                               tree_mod_seq_elem.seq, &roots);
                if (ret)
                        break;
-               while (!ret && (root_node = ulist_next(roots, root_node))) {
-                       pr_debug("root %llu references leaf %llu\n",
-                                       root_node->val, ref_node->val);
-                       ret = iterate_leaf_refs(fs_info, ref_node->val,
-                                               extent_item_objectid,
-                                               extent_item_pos, root_node->val,
-                                               iterate, ctx);
+               ULIST_ITER_INIT(&root_uiter);
+               while (!ret && (root_node = ulist_next(roots, &root_uiter))) {
+                       pr_debug("root %llu references leaf %llu, data list "
+                                "%#lx\n", root_node->val, ref_node->val,
+                                ref_node->aux);
+                       ret = iterate_leaf_refs(
+                               (struct extent_inode_elem *)ref_node->aux,
+                               root_node->val, extent_item_objectid,
+                               iterate, ctx);
                }
+               ulist_free(roots);
+               roots = NULL;
        }
 
-       ulist_free(refs);
+       free_leaf_list(refs);
        ulist_free(roots);
 out:
        if (!search_commit_root) {
+               btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
                btrfs_put_delayed_seq(delayed_refs, &seq_elem);
                btrfs_end_transaction(trans, fs_info->extent_root);
        }
index 57ea2e9..c18d8ac 100644 (file)
@@ -58,7 +58,8 @@ int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
 
 int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
                                struct btrfs_fs_info *fs_info, u64 bytenr,
-                               u64 num_bytes, u64 seq, struct ulist **roots);
+                               u64 delayed_ref_seq, u64 time_seq,
+                               struct ulist **roots);
 
 struct btrfs_data_container *init_data_container(u32 total_bytes);
 struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
index 9b9b15f..e616f88 100644 (file)
 #include "ordered-data.h"
 #include "delayed-inode.h"
 
+/*
+ * ordered_data_close is set by truncate when a file that used
+ * to have good data has been truncated to zero.  When it is set
+ * the btrfs file release call will add this inode to the
+ * ordered operations list so that we make sure to flush out any
+ * new data the application may have written before commit.
+ */
+#define BTRFS_INODE_ORDERED_DATA_CLOSE         0
+#define BTRFS_INODE_ORPHAN_META_RESERVED       1
+#define BTRFS_INODE_DUMMY                      2
+#define BTRFS_INODE_IN_DEFRAG                  3
+#define BTRFS_INODE_DELALLOC_META_RESERVED     4
+#define BTRFS_INODE_HAS_ORPHAN_ITEM            5
+
 /* in memory btrfs inode */
 struct btrfs_inode {
        /* which subvolume this inode belongs to */
@@ -57,9 +71,6 @@ struct btrfs_inode {
        /* used to order data wrt metadata */
        struct btrfs_ordered_inode_tree ordered_tree;
 
-       /* for keeping track of orphaned inodes */
-       struct list_head i_orphan;
-
        /* list of all the delalloc inodes in the FS.  There are times we need
         * to write all the delalloc pages to disk, and this list is used
         * to walk them all.
@@ -78,14 +89,13 @@ struct btrfs_inode {
        /* the space_info for where this inode's data allocations are done */
        struct btrfs_space_info *space_info;
 
+       unsigned long runtime_flags;
+
        /* full 64 bit generation number, struct vfs_inode doesn't have a big
         * enough field for this.
         */
        u64 generation;
 
-       /* sequence number for NFS changes */
-       u64 sequence;
-
        /*
         * transid of the trans_handle that last modified this inode
         */
@@ -145,22 +155,9 @@ struct btrfs_inode {
        unsigned reserved_extents;
 
        /*
-        * ordered_data_close is set by truncate when a file that used
-        * to have good data has been truncated to zero.  When it is set
-        * the btrfs file release call will add this inode to the
-        * ordered operations list so that we make sure to flush out any
-        * new data the application may have written before commit.
-        */
-       unsigned ordered_data_close:1;
-       unsigned orphan_meta_reserved:1;
-       unsigned dummy_inode:1;
-       unsigned in_defrag:1;
-       unsigned delalloc_meta_reserved:1;
-
-       /*
         * always compress this one file
         */
-       unsigned force_compress:4;
+       unsigned force_compress;
 
        struct btrfs_delayed_node *delayed_node;
 
@@ -202,4 +199,17 @@ static inline bool btrfs_is_free_space_inode(struct btrfs_root *root,
        return false;
 }
 
+static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
+{
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       int ret = 0;
+
+       mutex_lock(&root->log_mutex);
+       if (BTRFS_I(inode)->logged_trans == generation &&
+           BTRFS_I(inode)->last_sub_trans <= root->last_log_commit)
+               ret = 1;
+       mutex_unlock(&root->log_mutex);
+       return ret;
+}
+
 #endif
index c053e90..9cebb1f 100644 (file)
 #define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300
 #define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6)   /* in characters,
                                                         * excluding " [...]" */
-#define BTRFSIC_BLOCK_SIZE PAGE_SIZE
-
 #define BTRFSIC_GENERATION_UNKNOWN ((u64)-1)
 
 /*
@@ -210,8 +208,9 @@ struct btrfsic_block_data_ctx {
        u64 dev_bytenr;         /* physical bytenr on device */
        u32 len;
        struct btrfsic_dev_state *dev;
-       char *data;
-       struct buffer_head *bh; /* do not use if set to NULL */
+       char **datav;
+       struct page **pagev;
+       void *mem_to_free;
 };
 
 /* This structure is used to implement recursion without occupying
@@ -243,6 +242,8 @@ struct btrfsic_state {
        struct btrfs_root *root;
        u64 max_superblock_generation;
        struct btrfsic_block *latest_superblock;
+       u32 metablock_size;
+       u32 datablock_size;
 };
 
 static void btrfsic_block_init(struct btrfsic_block *b);
@@ -290,8 +291,10 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
 static int btrfsic_process_metablock(struct btrfsic_state *state,
                                     struct btrfsic_block *block,
                                     struct btrfsic_block_data_ctx *block_ctx,
-                                    struct btrfs_header *hdr,
                                     int limit_nesting, int force_iodone_flag);
+static void btrfsic_read_from_block_data(
+       struct btrfsic_block_data_ctx *block_ctx,
+       void *dst, u32 offset, size_t len);
 static int btrfsic_create_link_to_next_block(
                struct btrfsic_state *state,
                struct btrfsic_block *block,
@@ -318,12 +321,13 @@ static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
 static int btrfsic_read_block(struct btrfsic_state *state,
                              struct btrfsic_block_data_ctx *block_ctx);
 static void btrfsic_dump_database(struct btrfsic_state *state);
+static void btrfsic_complete_bio_end_io(struct bio *bio, int err);
 static int btrfsic_test_for_metadata(struct btrfsic_state *state,
-                                    const u8 *data, unsigned int size);
+                                    char **datav, unsigned int num_pages);
 static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
-                                         u64 dev_bytenr, u8 *mapped_data,
-                                         unsigned int len, struct bio *bio,
-                                         int *bio_is_patched,
+                                         u64 dev_bytenr, char **mapped_datav,
+                                         unsigned int num_pages,
+                                         struct bio *bio, int *bio_is_patched,
                                          struct buffer_head *bh,
                                          int submit_bio_bh_rw);
 static int btrfsic_process_written_superblock(
@@ -375,7 +379,7 @@ static struct btrfsic_dev_state *btrfsic_dev_state_lookup(
 static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
                                           u64 bytenr,
                                           struct btrfsic_dev_state *dev_state,
-                                          u64 dev_bytenr, char *data);
+                                          u64 dev_bytenr);
 
 static struct mutex btrfsic_mutex;
 static int btrfsic_is_initialized;
@@ -651,7 +655,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
        int pass;
 
        BUG_ON(NULL == state);
-       selected_super = kmalloc(sizeof(*selected_super), GFP_NOFS);
+       selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS);
        if (NULL == selected_super) {
                printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
                return -1;
@@ -718,7 +722,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
 
                num_copies =
                    btrfs_num_copies(&state->root->fs_info->mapping_tree,
-                                    next_bytenr, PAGE_SIZE);
+                                    next_bytenr, state->metablock_size);
                if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
                        printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
                               (unsigned long long)next_bytenr, num_copies);
@@ -727,9 +731,9 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
                        struct btrfsic_block *next_block;
                        struct btrfsic_block_data_ctx tmp_next_block_ctx;
                        struct btrfsic_block_link *l;
-                       struct btrfs_header *hdr;
 
-                       ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
+                       ret = btrfsic_map_block(state, next_bytenr,
+                                               state->metablock_size,
                                                &tmp_next_block_ctx,
                                                mirror_num);
                        if (ret) {
@@ -758,7 +762,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
                        BUG_ON(NULL == l);
 
                        ret = btrfsic_read_block(state, &tmp_next_block_ctx);
-                       if (ret < (int)BTRFSIC_BLOCK_SIZE) {
+                       if (ret < (int)PAGE_CACHE_SIZE) {
                                printk(KERN_INFO
                                       "btrfsic: read @logical %llu failed!\n",
                                       (unsigned long long)
@@ -768,11 +772,9 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
                                return -1;
                        }
 
-                       hdr = (struct btrfs_header *)tmp_next_block_ctx.data;
                        ret = btrfsic_process_metablock(state,
                                                        next_block,
                                                        &tmp_next_block_ctx,
-                                                       hdr,
                                                        BTRFS_MAX_LEVEL + 3, 1);
                        btrfsic_release_block_ctx(&tmp_next_block_ctx);
                }
@@ -799,7 +801,10 @@ static int btrfsic_process_superblock_dev_mirror(
 
        /* super block bytenr is always the unmapped device bytenr */
        dev_bytenr = btrfs_sb_offset(superblock_mirror_num);
-       bh = __bread(superblock_bdev, dev_bytenr / 4096, 4096);
+       if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes)
+               return -1;
+       bh = __bread(superblock_bdev, dev_bytenr / 4096,
+                    BTRFS_SUPER_INFO_SIZE);
        if (NULL == bh)
                return -1;
        super_tmp = (struct btrfs_super_block *)
@@ -808,7 +813,10 @@ static int btrfsic_process_superblock_dev_mirror(
        if (btrfs_super_bytenr(super_tmp) != dev_bytenr ||
            strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC,
                    sizeof(super_tmp->magic)) ||
-           memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE)) {
+           memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) ||
+           btrfs_super_nodesize(super_tmp) != state->metablock_size ||
+           btrfs_super_leafsize(super_tmp) != state->metablock_size ||
+           btrfs_super_sectorsize(super_tmp) != state->datablock_size) {
                brelse(bh);
                return 0;
        }
@@ -893,7 +901,7 @@ static int btrfsic_process_superblock_dev_mirror(
 
                num_copies =
                    btrfs_num_copies(&state->root->fs_info->mapping_tree,
-                                    next_bytenr, PAGE_SIZE);
+                                    next_bytenr, state->metablock_size);
                if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
                        printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
                               (unsigned long long)next_bytenr, num_copies);
@@ -902,7 +910,8 @@ static int btrfsic_process_superblock_dev_mirror(
                        struct btrfsic_block_data_ctx tmp_next_block_ctx;
                        struct btrfsic_block_link *l;
 
-                       if (btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
+                       if (btrfsic_map_block(state, next_bytenr,
+                                             state->metablock_size,
                                              &tmp_next_block_ctx,
                                              mirror_num)) {
                                printk(KERN_INFO "btrfsic: btrfsic_map_block("
@@ -966,13 +975,15 @@ static int btrfsic_process_metablock(
                struct btrfsic_state *state,
                struct btrfsic_block *const first_block,
                struct btrfsic_block_data_ctx *const first_block_ctx,
-               struct btrfs_header *const first_hdr,
                int first_limit_nesting, int force_iodone_flag)
 {
        struct btrfsic_stack_frame initial_stack_frame = { 0 };
        struct btrfsic_stack_frame *sf;
        struct btrfsic_stack_frame *next_stack;
+       struct btrfs_header *const first_hdr =
+               (struct btrfs_header *)first_block_ctx->datav[0];
 
+       BUG_ON(!first_hdr);
        sf = &initial_stack_frame;
        sf->error = 0;
        sf->i = -1;
@@ -1012,21 +1023,47 @@ continue_with_current_leaf_stack_frame:
                }
 
                if (sf->i < sf->nr) {
-                       struct btrfs_item *disk_item = leafhdr->items + sf->i;
-                       struct btrfs_disk_key *disk_key = &disk_item->key;
+                       struct btrfs_item disk_item;
+                       u32 disk_item_offset =
+                               (uintptr_t)(leafhdr->items + sf->i) -
+                               (uintptr_t)leafhdr;
+                       struct btrfs_disk_key *disk_key;
                        u8 type;
-                       const u32 item_offset = le32_to_cpu(disk_item->offset);
+                       u32 item_offset;
 
+                       if (disk_item_offset + sizeof(struct btrfs_item) >
+                           sf->block_ctx->len) {
+leaf_item_out_of_bounce_error:
+                               printk(KERN_INFO
+                                      "btrfsic: leaf item out of bounce at logical %llu, dev %s\n",
+                                      sf->block_ctx->start,
+                                      sf->block_ctx->dev->name);
+                               goto one_stack_frame_backwards;
+                       }
+                       btrfsic_read_from_block_data(sf->block_ctx,
+                                                    &disk_item,
+                                                    disk_item_offset,
+                                                    sizeof(struct btrfs_item));
+                       item_offset = le32_to_cpu(disk_item.offset);
+                       disk_key = &disk_item.key;
                        type = disk_key->type;
 
                        if (BTRFS_ROOT_ITEM_KEY == type) {
-                               const struct btrfs_root_item *const root_item =
-                                   (struct btrfs_root_item *)
-                                   (sf->block_ctx->data +
-                                    offsetof(struct btrfs_leaf, items) +
-                                    item_offset);
-                               const u64 next_bytenr =
-                                   le64_to_cpu(root_item->bytenr);
+                               struct btrfs_root_item root_item;
+                               u32 root_item_offset;
+                               u64 next_bytenr;
+
+                               root_item_offset = item_offset +
+                                       offsetof(struct btrfs_leaf, items);
+                               if (root_item_offset +
+                                   sizeof(struct btrfs_root_item) >
+                                   sf->block_ctx->len)
+                                       goto leaf_item_out_of_bounce_error;
+                               btrfsic_read_from_block_data(
+                                       sf->block_ctx, &root_item,
+                                       root_item_offset,
+                                       sizeof(struct btrfs_root_item));
+                               next_bytenr = le64_to_cpu(root_item.bytenr);
 
                                sf->error =
                                    btrfsic_create_link_to_next_block(
@@ -1041,7 +1078,7 @@ continue_with_current_leaf_stack_frame:
                                                &sf->num_copies,
                                                &sf->mirror_num,
                                                disk_key,
-                                               le64_to_cpu(root_item->
+                                               le64_to_cpu(root_item.
                                                generation));
                                if (sf->error)
                                        goto one_stack_frame_backwards;
@@ -1049,7 +1086,7 @@ continue_with_current_leaf_stack_frame:
                                if (NULL != sf->next_block) {
                                        struct btrfs_header *const next_hdr =
                                            (struct btrfs_header *)
-                                           sf->next_block_ctx.data;
+                                           sf->next_block_ctx.datav[0];
 
                                        next_stack =
                                            btrfsic_stack_frame_alloc();
@@ -1111,10 +1148,24 @@ continue_with_current_node_stack_frame:
                }
 
                if (sf->i < sf->nr) {
-                       struct btrfs_key_ptr *disk_key_ptr =
-                           nodehdr->ptrs + sf->i;
-                       const u64 next_bytenr =
-                           le64_to_cpu(disk_key_ptr->blockptr);
+                       struct btrfs_key_ptr key_ptr;
+                       u32 key_ptr_offset;
+                       u64 next_bytenr;
+
+                       key_ptr_offset = (uintptr_t)(nodehdr->ptrs + sf->i) -
+                                         (uintptr_t)nodehdr;
+                       if (key_ptr_offset + sizeof(struct btrfs_key_ptr) >
+                           sf->block_ctx->len) {
+                               printk(KERN_INFO
+                                      "btrfsic: node item out of bounce at logical %llu, dev %s\n",
+                                      sf->block_ctx->start,
+                                      sf->block_ctx->dev->name);
+                               goto one_stack_frame_backwards;
+                       }
+                       btrfsic_read_from_block_data(
+                               sf->block_ctx, &key_ptr, key_ptr_offset,
+                               sizeof(struct btrfs_key_ptr));
+                       next_bytenr = le64_to_cpu(key_ptr.blockptr);
 
                        sf->error = btrfsic_create_link_to_next_block(
                                        state,
@@ -1127,15 +1178,15 @@ continue_with_current_node_stack_frame:
                                        force_iodone_flag,
                                        &sf->num_copies,
                                        &sf->mirror_num,
-                                       &disk_key_ptr->key,
-                                       le64_to_cpu(disk_key_ptr->generation));
+                                       &key_ptr.key,
+                                       le64_to_cpu(key_ptr.generation));
                        if (sf->error)
                                goto one_stack_frame_backwards;
 
                        if (NULL != sf->next_block) {
                                struct btrfs_header *const next_hdr =
                                    (struct btrfs_header *)
-                                   sf->next_block_ctx.data;
+                                   sf->next_block_ctx.datav[0];
 
                                next_stack = btrfsic_stack_frame_alloc();
                                if (NULL == next_stack)
@@ -1181,6 +1232,35 @@ one_stack_frame_backwards:
        return sf->error;
 }
 
+static void btrfsic_read_from_block_data(
+       struct btrfsic_block_data_ctx *block_ctx,
+       void *dstv, u32 offset, size_t len)
+{
+       size_t cur;
+       size_t offset_in_page;
+       char *kaddr;
+       char *dst = (char *)dstv;
+       size_t start_offset = block_ctx->start & ((u64)PAGE_CACHE_SIZE - 1);
+       unsigned long i = (start_offset + offset) >> PAGE_CACHE_SHIFT;
+
+       WARN_ON(offset + len > block_ctx->len);
+       offset_in_page = (start_offset + offset) &
+                        ((unsigned long)PAGE_CACHE_SIZE - 1);
+
+       while (len > 0) {
+               cur = min(len, ((size_t)PAGE_CACHE_SIZE - offset_in_page));
+               BUG_ON(i >= (block_ctx->len + PAGE_CACHE_SIZE - 1) >>
+                           PAGE_CACHE_SHIFT);
+               kaddr = block_ctx->datav[i];
+               memcpy(dst, kaddr + offset_in_page, cur);
+
+               dst += cur;
+               len -= cur;
+               offset_in_page = 0;
+               i++;
+       }
+}
+
 static int btrfsic_create_link_to_next_block(
                struct btrfsic_state *state,
                struct btrfsic_block *block,
@@ -1204,7 +1284,7 @@ static int btrfsic_create_link_to_next_block(
        if (0 == *num_copiesp) {
                *num_copiesp =
                    btrfs_num_copies(&state->root->fs_info->mapping_tree,
-                                    next_bytenr, PAGE_SIZE);
+                                    next_bytenr, state->metablock_size);
                if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
                        printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
                               (unsigned long long)next_bytenr, *num_copiesp);
@@ -1219,7 +1299,7 @@ static int btrfsic_create_link_to_next_block(
                       "btrfsic_create_link_to_next_block(mirror_num=%d)\n",
                       *mirror_nump);
        ret = btrfsic_map_block(state, next_bytenr,
-                               BTRFSIC_BLOCK_SIZE,
+                               state->metablock_size,
                                next_block_ctx, *mirror_nump);
        if (ret) {
                printk(KERN_INFO
@@ -1314,7 +1394,7 @@ static int btrfsic_create_link_to_next_block(
 
        if (limit_nesting > 0 && did_alloc_block_link) {
                ret = btrfsic_read_block(state, next_block_ctx);
-               if (ret < (int)BTRFSIC_BLOCK_SIZE) {
+               if (ret < (int)next_block_ctx->len) {
                        printk(KERN_INFO
                               "btrfsic: read block @logical %llu failed!\n",
                               (unsigned long long)next_bytenr);
@@ -1339,43 +1419,74 @@ static int btrfsic_handle_extent_data(
                u32 item_offset, int force_iodone_flag)
 {
        int ret;
-       struct btrfs_file_extent_item *file_extent_item =
-           (struct btrfs_file_extent_item *)(block_ctx->data +
-                                             offsetof(struct btrfs_leaf,
-                                                      items) + item_offset);
-       u64 next_bytenr =
-           le64_to_cpu(file_extent_item->disk_bytenr) +
-           le64_to_cpu(file_extent_item->offset);
-       u64 num_bytes = le64_to_cpu(file_extent_item->num_bytes);
-       u64 generation = le64_to_cpu(file_extent_item->generation);
+       struct btrfs_file_extent_item file_extent_item;
+       u64 file_extent_item_offset;
+       u64 next_bytenr;
+       u64 num_bytes;
+       u64 generation;
        struct btrfsic_block_link *l;
 
+       file_extent_item_offset = offsetof(struct btrfs_leaf, items) +
+                                 item_offset;
+       if (file_extent_item_offset +
+           offsetof(struct btrfs_file_extent_item, disk_num_bytes) >
+           block_ctx->len) {
+               printk(KERN_INFO
+                      "btrfsic: file item out of bounce at logical %llu, dev %s\n",
+                      block_ctx->start, block_ctx->dev->name);
+               return -1;
+       }
+
+       btrfsic_read_from_block_data(block_ctx, &file_extent_item,
+               file_extent_item_offset,
+               offsetof(struct btrfs_file_extent_item, disk_num_bytes));
+       if (BTRFS_FILE_EXTENT_REG != file_extent_item.type ||
+           ((u64)0) == le64_to_cpu(file_extent_item.disk_bytenr)) {
+               if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
+                       printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu\n",
+                              file_extent_item.type,
+                              (unsigned long long)
+                              le64_to_cpu(file_extent_item.disk_bytenr));
+               return 0;
+       }
+
+       if (file_extent_item_offset + sizeof(struct btrfs_file_extent_item) >
+           block_ctx->len) {
+               printk(KERN_INFO
+                      "btrfsic: file item out of bounce at logical %llu, dev %s\n",
+                      block_ctx->start, block_ctx->dev->name);
+               return -1;
+       }
+       btrfsic_read_from_block_data(block_ctx, &file_extent_item,
+                                    file_extent_item_offset,
+                                    sizeof(struct btrfs_file_extent_item));
+       next_bytenr = le64_to_cpu(file_extent_item.disk_bytenr) +
+                     le64_to_cpu(file_extent_item.offset);
+       generation = le64_to_cpu(file_extent_item.generation);
+       num_bytes = le64_to_cpu(file_extent_item.num_bytes);
+       generation = le64_to_cpu(file_extent_item.generation);
+
        if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
                printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu,"
                       " offset = %llu, num_bytes = %llu\n",
-                      file_extent_item->type,
+                      file_extent_item.type,
                       (unsigned long long)
-                      le64_to_cpu(file_extent_item->disk_bytenr),
-                      (unsigned long long)
-                      le64_to_cpu(file_extent_item->offset),
-                      (unsigned long long)
-                      le64_to_cpu(file_extent_item->num_bytes));
-       if (BTRFS_FILE_EXTENT_REG != file_extent_item->type ||
-           ((u64)0) == le64_to_cpu(file_extent_item->disk_bytenr))
-               return 0;
+                      le64_to_cpu(file_extent_item.disk_bytenr),
+                      (unsigned long long)le64_to_cpu(file_extent_item.offset),
+                      (unsigned long long)num_bytes);
        while (num_bytes > 0) {
                u32 chunk_len;
                int num_copies;
                int mirror_num;
 
-               if (num_bytes > BTRFSIC_BLOCK_SIZE)
-                       chunk_len = BTRFSIC_BLOCK_SIZE;
+               if (num_bytes > state->datablock_size)
+                       chunk_len = state->datablock_size;
                else
                        chunk_len = num_bytes;
 
                num_copies =
                    btrfs_num_copies(&state->root->fs_info->mapping_tree,
-                                    next_bytenr, PAGE_SIZE);
+                                    next_bytenr, state->datablock_size);
                if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
                        printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
                               (unsigned long long)next_bytenr, num_copies);
@@ -1475,8 +1586,9 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
        block_ctx_out->dev_bytenr = multi->stripes[0].physical;
        block_ctx_out->start = bytenr;
        block_ctx_out->len = len;
-       block_ctx_out->data = NULL;
-       block_ctx_out->bh = NULL;
+       block_ctx_out->datav = NULL;
+       block_ctx_out->pagev = NULL;
+       block_ctx_out->mem_to_free = NULL;
 
        if (0 == ret)
                kfree(multi);
@@ -1496,8 +1608,9 @@ static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
        block_ctx_out->dev_bytenr = bytenr;
        block_ctx_out->start = bytenr;
        block_ctx_out->len = len;
-       block_ctx_out->data = NULL;
-       block_ctx_out->bh = NULL;
+       block_ctx_out->datav = NULL;
+       block_ctx_out->pagev = NULL;
+       block_ctx_out->mem_to_free = NULL;
        if (NULL != block_ctx_out->dev) {
                return 0;
        } else {
@@ -1508,38 +1621,127 @@ static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
 
 static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
 {
-       if (NULL != block_ctx->bh) {
-               brelse(block_ctx->bh);
-               block_ctx->bh = NULL;
+       if (block_ctx->mem_to_free) {
+               unsigned int num_pages;
+
+               BUG_ON(!block_ctx->datav);
+               BUG_ON(!block_ctx->pagev);
+               num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >>
+                           PAGE_CACHE_SHIFT;
+               while (num_pages > 0) {
+                       num_pages--;
+                       if (block_ctx->datav[num_pages]) {
+                               kunmap(block_ctx->pagev[num_pages]);
+                               block_ctx->datav[num_pages] = NULL;
+                       }
+                       if (block_ctx->pagev[num_pages]) {
+                               __free_page(block_ctx->pagev[num_pages]);
+                               block_ctx->pagev[num_pages] = NULL;
+                       }
+               }
+
+               kfree(block_ctx->mem_to_free);
+               block_ctx->mem_to_free = NULL;
+               block_ctx->pagev = NULL;
+               block_ctx->datav = NULL;
        }
 }
 
 static int btrfsic_read_block(struct btrfsic_state *state,
                              struct btrfsic_block_data_ctx *block_ctx)
 {
-       block_ctx->bh = NULL;
-       if (block_ctx->dev_bytenr & 4095) {
+       unsigned int num_pages;
+       unsigned int i;
+       u64 dev_bytenr;
+       int ret;
+
+       BUG_ON(block_ctx->datav);
+       BUG_ON(block_ctx->pagev);
+       BUG_ON(block_ctx->mem_to_free);
+       if (block_ctx->dev_bytenr & ((u64)PAGE_CACHE_SIZE - 1)) {
                printk(KERN_INFO
                       "btrfsic: read_block() with unaligned bytenr %llu\n",
                       (unsigned long long)block_ctx->dev_bytenr);
                return -1;
        }
-       if (block_ctx->len > 4096) {
-               printk(KERN_INFO
-                      "btrfsic: read_block() with too huge size %d\n",
-                      block_ctx->len);
+
+       num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >>
+                   PAGE_CACHE_SHIFT;
+       block_ctx->mem_to_free = kzalloc((sizeof(*block_ctx->datav) +
+                                         sizeof(*block_ctx->pagev)) *
+                                        num_pages, GFP_NOFS);
+       if (!block_ctx->mem_to_free)
                return -1;
+       block_ctx->datav = block_ctx->mem_to_free;
+       block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages);
+       for (i = 0; i < num_pages; i++) {
+               block_ctx->pagev[i] = alloc_page(GFP_NOFS);
+               if (!block_ctx->pagev[i])
+                       return -1;
        }
 
-       block_ctx->bh = __bread(block_ctx->dev->bdev,
-                               block_ctx->dev_bytenr >> 12, 4096);
-       if (NULL == block_ctx->bh)
-               return -1;
-       block_ctx->data = block_ctx->bh->b_data;
+       dev_bytenr = block_ctx->dev_bytenr;
+       for (i = 0; i < num_pages;) {
+               struct bio *bio;
+               unsigned int j;
+               DECLARE_COMPLETION_ONSTACK(complete);
+
+               bio = bio_alloc(GFP_NOFS, num_pages - i);
+               if (!bio) {
+                       printk(KERN_INFO
+                              "btrfsic: bio_alloc() for %u pages failed!\n",
+                              num_pages - i);
+                       return -1;
+               }
+               bio->bi_bdev = block_ctx->dev->bdev;
+               bio->bi_sector = dev_bytenr >> 9;
+               bio->bi_end_io = btrfsic_complete_bio_end_io;
+               bio->bi_private = &complete;
+
+               for (j = i; j < num_pages; j++) {
+                       ret = bio_add_page(bio, block_ctx->pagev[j],
+                                          PAGE_CACHE_SIZE, 0);
+                       if (PAGE_CACHE_SIZE != ret)
+                               break;
+               }
+               if (j == i) {
+                       printk(KERN_INFO
+                              "btrfsic: error, failed to add a single page!\n");
+                       return -1;
+               }
+               submit_bio(READ, bio);
+
+               /* this will also unplug the queue */
+               wait_for_completion(&complete);
+
+               if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+                       printk(KERN_INFO
+                              "btrfsic: read error at logical %llu dev %s!\n",
+                              block_ctx->start, block_ctx->dev->name);
+                       bio_put(bio);
+                       return -1;
+               }
+               bio_put(bio);
+               dev_bytenr += (j - i) * PAGE_CACHE_SIZE;
+               i = j;
+       }
+       for (i = 0; i < num_pages; i++) {
+               block_ctx->datav[i] = kmap(block_ctx->pagev[i]);
+               if (!block_ctx->datav[i]) {
+                       printk(KERN_INFO "btrfsic: kmap() failed (dev %s)!\n",
+                              block_ctx->dev->name);
+                       return -1;
+               }
+       }
 
        return block_ctx->len;
 }
 
+static void btrfsic_complete_bio_end_io(struct bio *bio, int err)
+{
+       complete((struct completion *)bio->bi_private);
+}
+
 static void btrfsic_dump_database(struct btrfsic_state *state)
 {
        struct list_head *elem_all;
@@ -1617,32 +1819,39 @@ static void btrfsic_dump_database(struct btrfsic_state *state)
  * (note that this test fails for the super block)
  */
 static int btrfsic_test_for_metadata(struct btrfsic_state *state,
-                                    const u8 *data, unsigned int size)
+                                    char **datav, unsigned int num_pages)
 {
        struct btrfs_header *h;
        u8 csum[BTRFS_CSUM_SIZE];
        u32 crc = ~(u32)0;
-       int fail = 0;
-       int crc_fail = 0;
+       unsigned int i;
 
-       h = (struct btrfs_header *)data;
+       if (num_pages * PAGE_CACHE_SIZE < state->metablock_size)
+               return 1; /* not metadata */
+       num_pages = state->metablock_size >> PAGE_CACHE_SHIFT;
+       h = (struct btrfs_header *)datav[0];
 
        if (memcmp(h->fsid, state->root->fs_info->fsid, BTRFS_UUID_SIZE))
-               fail++;
+               return 1;
+
+       for (i = 0; i < num_pages; i++) {
+               u8 *data = i ? datav[i] : (datav[i] + BTRFS_CSUM_SIZE);
+               size_t sublen = i ? PAGE_CACHE_SIZE :
+                                   (PAGE_CACHE_SIZE - BTRFS_CSUM_SIZE);
 
-       crc = crc32c(crc, data + BTRFS_CSUM_SIZE, PAGE_SIZE - BTRFS_CSUM_SIZE);
+               crc = crc32c(crc, data, sublen);
+       }
        btrfs_csum_final(crc, csum);
        if (memcmp(csum, h->csum, state->csum_size))
-               crc_fail++;
+               return 1;
 
-       return fail || crc_fail;
+       return 0; /* is metadata */
 }
 
 static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
-                                         u64 dev_bytenr,
-                                         u8 *mapped_data, unsigned int len,
-                                         struct bio *bio,
-                                         int *bio_is_patched,
+                                         u64 dev_bytenr, char **mapped_datav,
+                                         unsigned int num_pages,
+                                         struct bio *bio, int *bio_is_patched,
                                          struct buffer_head *bh,
                                          int submit_bio_bh_rw)
 {
@@ -1652,12 +1861,19 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
        int ret;
        struct btrfsic_state *state = dev_state->state;
        struct block_device *bdev = dev_state->bdev;
+       unsigned int processed_len;
 
-       WARN_ON(len > PAGE_SIZE);
-       is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_data, len));
        if (NULL != bio_is_patched)
                *bio_is_patched = 0;
 
+again:
+       if (num_pages == 0)
+               return;
+
+       processed_len = 0;
+       is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_datav,
+                                                     num_pages));
+
        block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr,
                                               &state->block_hashtable);
        if (NULL != block) {
@@ -1667,8 +1883,16 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
 
                if (block->is_superblock) {
                        bytenr = le64_to_cpu(((struct btrfs_super_block *)
-                                             mapped_data)->bytenr);
+                                             mapped_datav[0])->bytenr);
+                       if (num_pages * PAGE_CACHE_SIZE <
+                           BTRFS_SUPER_INFO_SIZE) {
+                               printk(KERN_INFO
+                                      "btrfsic: cannot work with too short bios!\n");
+                               return;
+                       }
                        is_metadata = 1;
+                       BUG_ON(BTRFS_SUPER_INFO_SIZE & (PAGE_CACHE_SIZE - 1));
+                       processed_len = BTRFS_SUPER_INFO_SIZE;
                        if (state->print_mask &
                            BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) {
                                printk(KERN_INFO
@@ -1678,12 +1902,18 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
                }
                if (is_metadata) {
                        if (!block->is_superblock) {
+                               if (num_pages * PAGE_CACHE_SIZE <
+                                   state->metablock_size) {
+                                       printk(KERN_INFO
+                                              "btrfsic: cannot work with too short bios!\n");
+                                       return;
+                               }
+                               processed_len = state->metablock_size;
                                bytenr = le64_to_cpu(((struct btrfs_header *)
-                                                     mapped_data)->bytenr);
+                                                     mapped_datav[0])->bytenr);
                                btrfsic_cmp_log_and_dev_bytenr(state, bytenr,
                                                               dev_state,
-                                                              dev_bytenr,
-                                                              mapped_data);
+                                                              dev_bytenr);
                        }
                        if (block->logical_bytenr != bytenr) {
                                printk(KERN_INFO
@@ -1710,6 +1940,13 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
                                       block->mirror_num,
                                       btrfsic_get_block_type(state, block));
                } else {
+                       if (num_pages * PAGE_CACHE_SIZE <
+                           state->datablock_size) {
+                               printk(KERN_INFO
+                                      "btrfsic: cannot work with too short bios!\n");
+                               return;
+                       }
+                       processed_len = state->datablock_size;
                        bytenr = block->logical_bytenr;
                        if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
                                printk(KERN_INFO
@@ -1747,7 +1984,7 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
                               le64_to_cpu(block->disk_key.offset),
                               (unsigned long long)
                               le64_to_cpu(((struct btrfs_header *)
-                                           mapped_data)->generation),
+                                           mapped_datav[0])->generation),
                               (unsigned long long)
                               state->max_superblock_generation);
                        btrfsic_dump_tree(state);
@@ -1765,10 +2002,10 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
                               (unsigned long long)block->generation,
                               (unsigned long long)
                               le64_to_cpu(((struct btrfs_header *)
-                                           mapped_data)->generation));
+                                           mapped_datav[0])->generation));
                        /* it would not be safe to go on */
                        btrfsic_dump_tree(state);
-                       return;
+                       goto continue_loop;
                }
 
                /*
@@ -1796,18 +2033,19 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
                }
 
                if (block->is_superblock)
-                       ret = btrfsic_map_superblock(state, bytenr, len,
+                       ret = btrfsic_map_superblock(state, bytenr,
+                                                    processed_len,
                                                     bdev, &block_ctx);
                else
-                       ret = btrfsic_map_block(state, bytenr, len,
+                       ret = btrfsic_map_block(state, bytenr, processed_len,
                                                &block_ctx, 0);
                if (ret) {
                        printk(KERN_INFO
                               "btrfsic: btrfsic_map_block(root @%llu)"
                               " failed!\n", (unsigned long long)bytenr);
-                       return;
+                       goto continue_loop;
                }
-               block_ctx.data = mapped_data;
+               block_ctx.datav = mapped_datav;
                /* the following is required in case of writes to mirrors,
                 * use the same that was used for the lookup */
                block_ctx.dev = dev_state;
@@ -1863,11 +2101,13 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
                        block->logical_bytenr = bytenr;
                        block->is_metadata = 1;
                        if (block->is_superblock) {
+                               BUG_ON(PAGE_CACHE_SIZE !=
+                                      BTRFS_SUPER_INFO_SIZE);
                                ret = btrfsic_process_written_superblock(
                                                state,
                                                block,
                                                (struct btrfs_super_block *)
-                                               mapped_data);
+                                               mapped_datav[0]);
                                if (state->print_mask &
                                    BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) {
                                        printk(KERN_INFO
@@ -1880,8 +2120,6 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
                                                state,
                                                block,
                                                &block_ctx,
-                                               (struct btrfs_header *)
-                                               block_ctx.data,
                                                0, 0);
                        }
                        if (ret)
@@ -1912,26 +2150,30 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
                u64 bytenr;
 
                if (!is_metadata) {
+                       processed_len = state->datablock_size;
                        if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
                                printk(KERN_INFO "Written block (%s/%llu/?)"
                                       " !found in hash table, D.\n",
                                       dev_state->name,
                                       (unsigned long long)dev_bytenr);
-                       if (!state->include_extent_data)
-                               return; /* ignore that written D block */
+                       if (!state->include_extent_data) {
+                               /* ignore that written D block */
+                               goto continue_loop;
+                       }
 
                        /* this is getting ugly for the
                         * include_extent_data case... */
                        bytenr = 0;     /* unknown */
                        block_ctx.start = bytenr;
-                       block_ctx.len = len;
-                       block_ctx.bh = NULL;
+                       block_ctx.len = processed_len;
+                       block_ctx.mem_to_free = NULL;
+                       block_ctx.pagev = NULL;
                } else {
+                       processed_len = state->metablock_size;
                        bytenr = le64_to_cpu(((struct btrfs_header *)
-                                             mapped_data)->bytenr);
+                                             mapped_datav[0])->bytenr);
                        btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state,
-                                                      dev_bytenr,
-                                                      mapped_data);
+                                                      dev_bytenr);
                        if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
                                printk(KERN_INFO
                                       "Written block @%llu (%s/%llu/?)"
@@ -1940,17 +2182,17 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
                                       dev_state->name,
                                       (unsigned long long)dev_bytenr);
 
-                       ret = btrfsic_map_block(state, bytenr, len, &block_ctx,
-                                               0);
+                       ret = btrfsic_map_block(state, bytenr, processed_len,
+                                               &block_ctx, 0);
                        if (ret) {
                                printk(KERN_INFO
                                       "btrfsic: btrfsic_map_block(root @%llu)"
                                       " failed!\n",
                                       (unsigned long long)dev_bytenr);
-                               return;
+                               goto continue_loop;
                        }
                }
-               block_ctx.data = mapped_data;
+               block_ctx.datav = mapped_datav;
                /* the following is required in case of writes to mirrors,
                 * use the same that was used for the lookup */
                block_ctx.dev = dev_state;
@@ -1960,7 +2202,7 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
                if (NULL == block) {
                        printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
                        btrfsic_release_block_ctx(&block_ctx);
-                       return;
+                       goto continue_loop;
                }
                block->dev_state = dev_state;
                block->dev_bytenr = dev_bytenr;
@@ -2020,9 +2262,7 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
 
                if (is_metadata) {
                        ret = btrfsic_process_metablock(state, block,
-                                                       &block_ctx,
-                                                       (struct btrfs_header *)
-                                                       block_ctx.data, 0, 0);
+                                                       &block_ctx, 0, 0);
                        if (ret)
                                printk(KERN_INFO
                                       "btrfsic: process_metablock(root @%llu)"
@@ -2031,6 +2271,13 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
                }
                btrfsic_release_block_ctx(&block_ctx);
        }
+
+continue_loop:
+       BUG_ON(!processed_len);
+       dev_bytenr += processed_len;
+       mapped_datav += processed_len >> PAGE_CACHE_SHIFT;
+       num_pages -= processed_len >> PAGE_CACHE_SHIFT;
+       goto again;
 }
 
 static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status)
@@ -2213,7 +2460,7 @@ static int btrfsic_process_written_superblock(
 
                num_copies =
                    btrfs_num_copies(&state->root->fs_info->mapping_tree,
-                                    next_bytenr, PAGE_SIZE);
+                                    next_bytenr, BTRFS_SUPER_INFO_SIZE);
                if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
                        printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
                               (unsigned long long)next_bytenr, num_copies);
@@ -2224,7 +2471,8 @@ static int btrfsic_process_written_superblock(
                                printk(KERN_INFO
                                       "btrfsic_process_written_superblock("
                                       "mirror_num=%d)\n", mirror_num);
-                       ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
+                       ret = btrfsic_map_block(state, next_bytenr,
+                                               BTRFS_SUPER_INFO_SIZE,
                                                &tmp_next_block_ctx,
                                                mirror_num);
                        if (ret) {
@@ -2689,7 +2937,7 @@ static struct btrfsic_block *btrfsic_block_lookup_or_add(
 static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
                                           u64 bytenr,
                                           struct btrfsic_dev_state *dev_state,
-                                          u64 dev_bytenr, char *data)
+                                          u64 dev_bytenr)
 {
        int num_copies;
        int mirror_num;
@@ -2698,10 +2946,10 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
        int match = 0;
 
        num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree,
-                                     bytenr, PAGE_SIZE);
+                                     bytenr, state->metablock_size);
 
        for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
-               ret = btrfsic_map_block(state, bytenr, PAGE_SIZE,
+               ret = btrfsic_map_block(state, bytenr, state->metablock_size,
                                        &block_ctx, mirror_num);
                if (ret) {
                        printk(KERN_INFO "btrfsic:"
@@ -2727,7 +2975,8 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
                       (unsigned long long)bytenr, dev_state->name,
                       (unsigned long long)dev_bytenr);
                for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
-                       ret = btrfsic_map_block(state, bytenr, PAGE_SIZE,
+                       ret = btrfsic_map_block(state, bytenr,
+                                               state->metablock_size,
                                                &block_ctx, mirror_num);
                        if (ret)
                                continue;
@@ -2781,13 +3030,13 @@ int btrfsic_submit_bh(int rw, struct buffer_head *bh)
                               (unsigned long)bh->b_size, bh->b_data,
                               bh->b_bdev);
                btrfsic_process_written_block(dev_state, dev_bytenr,
-                                             bh->b_data, bh->b_size, NULL,
+                                             &bh->b_data, 1, NULL,
                                              NULL, bh, rw);
        } else if (NULL != dev_state && (rw & REQ_FLUSH)) {
                if (dev_state->state->print_mask &
                    BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
                        printk(KERN_INFO
-                              "submit_bh(rw=0x%x) FLUSH, bdev=%p)\n",
+                              "submit_bh(rw=0x%x FLUSH, bdev=%p)\n",
                               rw, bh->b_bdev);
                if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
                        if ((dev_state->state->print_mask &
@@ -2836,6 +3085,7 @@ void btrfsic_submit_bio(int rw, struct bio *bio)
                unsigned int i;
                u64 dev_bytenr;
                int bio_is_patched;
+               char **mapped_datav;
 
                dev_bytenr = 512 * bio->bi_sector;
                bio_is_patched = 0;
@@ -2848,35 +3098,46 @@ void btrfsic_submit_bio(int rw, struct bio *bio)
                               (unsigned long long)dev_bytenr,
                               bio->bi_bdev);
 
+               mapped_datav = kmalloc(sizeof(*mapped_datav) * bio->bi_vcnt,
+                                      GFP_NOFS);
+               if (!mapped_datav)
+                       goto leave;
                for (i = 0; i < bio->bi_vcnt; i++) {
-                       u8 *mapped_data;
-
-                       mapped_data = kmap(bio->bi_io_vec[i].bv_page);
+                       BUG_ON(bio->bi_io_vec[i].bv_len != PAGE_CACHE_SIZE);
+                       mapped_datav[i] = kmap(bio->bi_io_vec[i].bv_page);
+                       if (!mapped_datav[i]) {
+                               while (i > 0) {
+                                       i--;
+                                       kunmap(bio->bi_io_vec[i].bv_page);
+                               }
+                               kfree(mapped_datav);
+                               goto leave;
+                       }
                        if ((BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
                             BTRFSIC_PRINT_MASK_VERBOSE) ==
                            (dev_state->state->print_mask &
                             (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
                              BTRFSIC_PRINT_MASK_VERBOSE)))
                                printk(KERN_INFO
-                                      "#%u: page=%p, mapped=%p, len=%u,"
-                                      " offset=%u\n",
+                                      "#%u: page=%p, len=%u, offset=%u\n",
                                       i, bio->bi_io_vec[i].bv_page,
-                                      mapped_data,
                                       bio->bi_io_vec[i].bv_len,
                                       bio->bi_io_vec[i].bv_offset);
-                       btrfsic_process_written_block(dev_state, dev_bytenr,
-                                                     mapped_data,
-                                                     bio->bi_io_vec[i].bv_len,
-                                                     bio, &bio_is_patched,
-                                                     NULL, rw);
+               }
+               btrfsic_process_written_block(dev_state, dev_bytenr,
+                                             mapped_datav, bio->bi_vcnt,
+                                             bio, &bio_is_patched,
+                                             NULL, rw);
+               while (i > 0) {
+                       i--;
                        kunmap(bio->bi_io_vec[i].bv_page);
-                       dev_bytenr += bio->bi_io_vec[i].bv_len;
                }
+               kfree(mapped_datav);
        } else if (NULL != dev_state && (rw & REQ_FLUSH)) {
                if (dev_state->state->print_mask &
                    BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
                        printk(KERN_INFO
-                              "submit_bio(rw=0x%x) FLUSH, bdev=%p)\n",
+                              "submit_bio(rw=0x%x FLUSH, bdev=%p)\n",
                               rw, bio->bi_bdev);
                if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
                        if ((dev_state->state->print_mask &
@@ -2903,6 +3164,7 @@ void btrfsic_submit_bio(int rw, struct bio *bio)
                        bio->bi_end_io = btrfsic_bio_end_io;
                }
        }
+leave:
        mutex_unlock(&btrfsic_mutex);
 
        submit_bio(rw, bio);
@@ -2917,6 +3179,30 @@ int btrfsic_mount(struct btrfs_root *root,
        struct list_head *dev_head = &fs_devices->devices;
        struct btrfs_device *device;
 
+       if (root->nodesize != root->leafsize) {
+               printk(KERN_INFO
+                      "btrfsic: cannot handle nodesize %d != leafsize %d!\n",
+                      root->nodesize, root->leafsize);
+               return -1;
+       }
+       if (root->nodesize & ((u64)PAGE_CACHE_SIZE - 1)) {
+               printk(KERN_INFO
+                      "btrfsic: cannot handle nodesize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
+                      root->nodesize, (unsigned long)PAGE_CACHE_SIZE);
+               return -1;
+       }
+       if (root->leafsize & ((u64)PAGE_CACHE_SIZE - 1)) {
+               printk(KERN_INFO
+                      "btrfsic: cannot handle leafsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
+                      root->leafsize, (unsigned long)PAGE_CACHE_SIZE);
+               return -1;
+       }
+       if (root->sectorsize & ((u64)PAGE_CACHE_SIZE - 1)) {
+               printk(KERN_INFO
+                      "btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
+                      root->sectorsize, (unsigned long)PAGE_CACHE_SIZE);
+               return -1;
+       }
        state = kzalloc(sizeof(*state), GFP_NOFS);
        if (NULL == state) {
                printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n");
@@ -2933,6 +3219,8 @@ int btrfsic_mount(struct btrfs_root *root,
        state->print_mask = print_mask;
        state->include_extent_data = including_extent_data;
        state->csum_size = 0;
+       state->metablock_size = root->nodesize;
+       state->datablock_size = root->sectorsize;
        INIT_LIST_HEAD(&state->all_blocks_list);
        btrfsic_block_hashtable_init(&state->block_hashtable);
        btrfsic_block_link_hashtable_init(&state->block_link_hashtable);
@@ -3049,7 +3337,7 @@ void btrfsic_unmount(struct btrfs_root *root,
                                btrfsic_block_link_free(l);
                }
 
-               if (b_all->is_iodone)
+               if (b_all->is_iodone || b_all->never_written)
                        btrfsic_block_free(b_all);
                else
                        printk(KERN_INFO "btrfs: attempt to free %c-block"
index 4106264..d7a96cf 100644 (file)
@@ -18,6 +18,7 @@
 
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/rbtree.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -37,7 +38,16 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
                              struct extent_buffer *dst_buf,
                              struct extent_buffer *src_buf);
 static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-                  struct btrfs_path *path, int level, int slot);
+                   struct btrfs_path *path, int level, int slot,
+                   int tree_mod_log);
+static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
+                                struct extent_buffer *eb);
+struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr,
+                                         u32 blocksize, u64 parent_transid,
+                                         u64 time_seq);
+struct extent_buffer *btrfs_find_old_tree_block(struct btrfs_root *root,
+                                               u64 bytenr, u32 blocksize,
+                                               u64 time_seq);
 
 struct btrfs_path *btrfs_alloc_path(void)
 {
@@ -255,7 +265,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 
        cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
                                     new_root_objectid, &disk_key, level,
-                                    buf->start, 0, 1);
+                                    buf->start, 0);
        if (IS_ERR(cow))
                return PTR_ERR(cow);
 
@@ -288,6 +298,434 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
        return 0;
 }
 
+/*
+ * kinds of modifications that are recorded in the tree modification log.
+ * __tree_mod_log_rewind undoes each recorded operation according to its kind.
+ */
+enum mod_log_op {
+       /* a key pointer in a slot is about to be overwritten in place */
+       MOD_LOG_KEY_REPLACE,
+       /* a key pointer is added to a slot (logged for copy destinations) */
+       MOD_LOG_KEY_ADD,
+       /* a key pointer is removed from a slot (logged for copy sources) */
+       MOD_LOG_KEY_REMOVE,
+       /* logged for every slot of a node by tree_mod_log_free_eb */
+       MOD_LOG_KEY_REMOVE_WHILE_FREEING,
+       /* logged for the slots in front of the source slot of a move */
+       MOD_LOG_KEY_REMOVE_WHILE_MOVING,
+       /* a range of key pointers is moved within one node */
+       MOD_LOG_MOVE_KEYS,
+       /* the root node of a tree is replaced by another block */
+       MOD_LOG_ROOT_REPLACE,
+};
+
+/*
+ * payload of a MOD_LOG_MOVE_KEYS entry. the source slot of the move is kept
+ * in the slot member of the enclosing struct tree_mod_elem.
+ */
+struct tree_mod_move {
+       int dst_slot;
+       int nr_items;
+};
+
+/*
+ * payload of a MOD_LOG_ROOT_REPLACE entry: ->start and header level of the
+ * root buffer that got replaced.
+ */
+struct tree_mod_root {
+       u64 logical;
+       u8 level;
+};
+
+/*
+ * one entry of the tree modification log. entries live in the rb-tree
+ * fs_info->tree_mod_log (ordered by index, then by elem.seq) and are linked
+ * into fs_info->tree_mod_seq_list through elem at the same time.
+ */
+struct tree_mod_elem {
+       struct rb_node node;
+       u64 index;              /* shifted logical */
+       struct seq_list elem;
+       enum mod_log_op op;
+
+       /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
+       int slot;
+
+       /* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
+       u64 generation;
+
+       /*
+        * these are filled in for every MOD_LOG_KEY_* operation except
+        * MOD_LOG_KEY_ADD
+        */
+       struct btrfs_disk_key key;
+       u64 blockptr;
+
+       /* this is used for op == MOD_LOG_MOVE_KEYS */
+       struct tree_mod_move move;
+
+       /* this is used for op == MOD_LOG_ROOT_REPLACE */
+       struct tree_mod_root old_root;
+};
+
+/*
+ * hand out the next sequence number to elem and queue it at the tail of the
+ * sequence list. both callers in this file hold tree_mod_seq_lock.
+ */
+static inline void
+__get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem)
+{
+       struct list_head *seq_list = &fs_info->tree_mod_seq_list;
+
+       elem->seq = atomic_inc_return(&fs_info->tree_mod_seq);
+       list_add_tail(&elem->list, seq_list);
+}
+
+/*
+ * put elem on the sequence list with a freshly allocated sequence number.
+ * flag bit 0 is set on it first; btrfs_put_tree_mod_seq only treats list
+ * elements with that bit as blockers.
+ */
+void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
+                           struct seq_list *elem)
+{
+       elem->flags = 1;
+       spin_lock(&fs_info->tree_mod_seq_lock);
+       __get_tree_mod_seq(fs_info, elem);
+       spin_unlock(&fs_info->tree_mod_seq_lock);
+}
+
+/*
+ * take elem off the sequence list again. unless a blocker with a lower
+ * sequence number is still on the list, all log entries whose sequence number
+ * does not exceed that of the oldest remaining blocker are freed (the whole
+ * log, if no blocker remains).
+ */
+void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
+                           struct seq_list *elem)
+{
+       struct rb_root *log_root = &fs_info->tree_mod_log;
+       struct rb_node *pos;
+       struct rb_node *following;
+       struct seq_list *blocker;
+       struct tree_mod_elem *entry;
+       u64 oldest_seq = (u64)-1;
+       u64 our_seq = elem->seq;
+
+       /* nothing to do for an element without a sequence number */
+       if (!our_seq)
+               return;
+
+       BUG_ON(!(elem->flags & 1));
+       spin_lock(&fs_info->tree_mod_seq_lock);
+       list_del(&elem->list);
+
+       list_for_each_entry(blocker, &fs_info->tree_mod_seq_list, list) {
+               /* only flagged elements that lower the minimum matter */
+               if (!(blocker->flags & 1) || blocker->seq >= oldest_seq)
+                       continue;
+               /*
+                * blocker with lower sequence number exists, we cannot remove
+                * anything from the log
+                */
+               if (blocker->seq < our_seq)
+                       goto out;
+               oldest_seq = blocker->seq;
+       }
+
+       /*
+        * anything that's not newer than the lowest existing (read: blocked)
+        * sequence number can be removed from the tree.
+        */
+       write_lock(&fs_info->tree_mod_log_lock);
+       pos = rb_first(log_root);
+       while (pos) {
+               /* fetch the successor first, pos may get erased below */
+               following = rb_next(pos);
+               entry = container_of(pos, struct tree_mod_elem, node);
+               if (entry->elem.seq <= oldest_seq) {
+                       rb_erase(pos, log_root);
+                       list_del(&entry->elem.list);
+                       kfree(entry);
+               }
+               pos = following;
+       }
+       write_unlock(&fs_info->tree_mod_log_lock);
+out:
+       spin_unlock(&fs_info->tree_mod_seq_lock);
+}
+
+/*
+ * key order of the log:
+ *       index -> sequence
+ *
+ * the index is the shifted logical of the *new* root node for root replace
+ * operations, or the shifted logical of the affected block for all other
+ * operations.
+ *
+ * inserts tm into the log and returns 0. if an entry with the same index and
+ * sequence number already exists, tm is unlinked from the sequence list and
+ * freed, and -EEXIST is returned; the caller must not touch tm afterwards.
+ */
+static noinline int
+__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
+{
+       struct rb_root *tm_root;
+       struct rb_node **new;
+       struct rb_node *parent = NULL;
+       struct tree_mod_elem *cur;
+       int ret = 0;
+
+       BUG_ON(!tm || !tm->elem.seq);
+
+       write_lock(&fs_info->tree_mod_log_lock);
+       tm_root = &fs_info->tree_mod_log;
+       new = &tm_root->rb_node;
+       while (*new) {
+               cur = container_of(*new, struct tree_mod_elem, node);
+               parent = *new;
+               /* larger (index, seq) pairs are placed in the left subtree */
+               if (cur->index < tm->index)
+                       new = &((*new)->rb_left);
+               else if (cur->index > tm->index)
+                       new = &((*new)->rb_right);
+               else if (cur->elem.seq < tm->elem.seq)
+                       new = &((*new)->rb_left);
+               else if (cur->elem.seq > tm->elem.seq)
+                       new = &((*new)->rb_right);
+               else {
+                       ret = -EEXIST;
+                       break;
+               }
+       }
+
+       if (!ret) {
+               rb_link_node(&tm->node, parent, new);
+               rb_insert_color(&tm->node, tm_root);
+       }
+       write_unlock(&fs_info->tree_mod_log_lock);
+
+       if (ret) {
+               /*
+                * the callers in this file get tm from tree_mod_alloc, which
+                * has linked tm->elem into tree_mod_seq_list. it has to be
+                * unlinked before tm is freed, otherwise the list keeps a
+                * pointer into freed memory. tree_mod_seq_lock is only taken
+                * after tree_mod_log_lock was dropped, because
+                * btrfs_put_tree_mod_seq nests the two locks the other way
+                * round.
+                */
+               spin_lock(&fs_info->tree_mod_seq_lock);
+               list_del(&tm->elem.list);
+               spin_unlock(&fs_info->tree_mod_seq_lock);
+               kfree(tm);
+       }
+
+       return ret;
+}
+
+/*
+ * tells whether logging can be skipped. returns 1 if the sequence list is
+ * empty or if eb is given and is a leaf (header level 0), 0 otherwise.
+ */
+static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
+                                   struct extent_buffer *eb) {
+       smp_mb();
+       if (list_empty(&fs_info->tree_mod_seq_list))
+               return 1;
+
+       /* without a buffer, only the list check above applies */
+       return eb && btrfs_header_level(eb) == 0;
+}
+
+/*
+ * allocate a log entry and assign a sequence number to it.
+ *
+ * returns the (positive) sequence number and stores the new entry in *tm_ret
+ * on success. returns 0 if nothing has to be logged and -ENOMEM if the
+ * allocation failed; *tm_ret is set to NULL in both of these cases, so that
+ * callers never see an uninitialized or already freed pointer.
+ */
+static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags,
+                                struct tree_mod_elem **tm_ret)
+{
+       struct tree_mod_elem *tm;
+       int seq;
+
+       *tm_ret = NULL;
+
+       if (tree_mod_dont_log(fs_info, NULL))
+               return 0;
+
+       tm = kzalloc(sizeof(*tm), flags);
+       if (!tm)
+               return -ENOMEM;
+
+       tm->elem.flags = 0;
+       spin_lock(&fs_info->tree_mod_seq_lock);
+       if (list_empty(&fs_info->tree_mod_seq_list)) {
+               /*
+                * someone emptied the list while we were waiting for the lock.
+                * we must not add to the list, because no blocker exists. items
+                * are removed from the list only when the existing blocker is
+                * removed from the list.
+                */
+               kfree(tm);
+               seq = 0;
+       } else {
+               __get_tree_mod_seq(fs_info, &tm->elem);
+               seq = tm->elem.seq;
+               /* only hand the entry out once it is fully set up */
+               *tm_ret = tm;
+       }
+       spin_unlock(&fs_info->tree_mod_seq_lock);
+
+       return seq;
+}
+
+/*
+ * record operation op for the key pointer in the given slot of eb, using
+ * flags for the allocation. key and block pointer of the slot are saved for
+ * every operation except MOD_LOG_KEY_ADD. returns 0 if nothing was logged or
+ * the entry was inserted, a negative errno otherwise.
+ */
+static noinline int
+tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info,
+                            struct extent_buffer *eb, int slot,
+                            enum mod_log_op op, gfp_t flags)
+{
+       struct tree_mod_elem *entry;
+       int err = tree_mod_alloc(fs_info, flags, &entry);
+
+       if (err <= 0)
+               return err;
+
+       entry->op = op;
+       entry->slot = slot;
+       entry->index = eb->start >> PAGE_CACHE_SHIFT;
+       entry->generation = btrfs_node_ptr_generation(eb, slot);
+       if (op != MOD_LOG_KEY_ADD) {
+               btrfs_node_key(eb, &entry->key, slot);
+               entry->blockptr = btrfs_node_blockptr(eb, slot);
+       }
+
+       return __tree_mod_log_insert(fs_info, entry);
+}
+
+/*
+ * same as tree_mod_log_insert_key_mask, with GFP_NOFS as allocation flags.
+ */
+static noinline int
+tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
+                       int slot, enum mod_log_op op)
+{
+       return tree_mod_log_insert_key_mask(fs_info, eb, slot, op, GFP_NOFS);
+}
+
+/*
+ * log a move of nr_items key pointers from src_slot to dst_slot within eb.
+ * the slots starting at dst_slot that lie in front of src_slot (at most
+ * nr_items of them) are logged as MOD_LOG_KEY_REMOVE_WHILE_MOVING first, then
+ * a single MOD_LOG_MOVE_KEYS entry is added. returns 0 if nothing was logged
+ * or the entry was inserted, a negative errno otherwise.
+ */
+static noinline int
+tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
+                        struct extent_buffer *eb, int dst_slot, int src_slot,
+                        int nr_items, gfp_t flags)
+{
+       struct tree_mod_elem *entry;
+       int err;
+       int off;
+
+       if (tree_mod_dont_log(fs_info, eb))
+               return 0;
+
+       for (off = 0; off < nr_items && dst_slot + off < src_slot; off++) {
+               err = tree_mod_log_insert_key(fs_info, eb, dst_slot + off,
+                                             MOD_LOG_KEY_REMOVE_WHILE_MOVING);
+               BUG_ON(err < 0);
+       }
+
+       err = tree_mod_alloc(fs_info, flags, &entry);
+       if (err <= 0)
+               return err;
+
+       entry->op = MOD_LOG_MOVE_KEYS;
+       entry->index = eb->start >> PAGE_CACHE_SHIFT;
+       entry->slot = src_slot;
+       entry->move.nr_items = nr_items;
+       entry->move.dst_slot = dst_slot;
+
+       return __tree_mod_log_insert(fs_info, entry);
+}
+
+/*
+ * log the replacement of old_root by new_root. the entry is indexed by the
+ * new root and remembers start, level and generation of the old one. returns
+ * 0 if nothing was logged or the entry was inserted, a negative errno
+ * otherwise.
+ */
+static noinline int
+tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
+                        struct extent_buffer *old_root,
+                        struct extent_buffer *new_root, gfp_t flags)
+{
+       struct tree_mod_elem *entry;
+       int err = tree_mod_alloc(fs_info, flags, &entry);
+
+       if (err <= 0)
+               return err;
+
+       entry->op = MOD_LOG_ROOT_REPLACE;
+       entry->index = new_root->start >> PAGE_CACHE_SHIFT;
+       entry->generation = btrfs_header_generation(old_root);
+       entry->old_root.level = btrfs_header_level(old_root);
+       entry->old_root.logical = old_root->start;
+
+       return __tree_mod_log_insert(fs_info, entry);
+}
+
+/*
+ * look up a log entry for the block starting at start, ignoring entries with
+ * a sequence number below min_seq. if smallest is set, the matching entry
+ * with the lowest sequence number is returned, otherwise the one with the
+ * highest. returns NULL if there is no matching entry.
+ *
+ * tree_mod_log_lock is only held during the lookup, it is dropped again
+ * before the entry is returned.
+ */
+static struct tree_mod_elem *
+__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
+                     int smallest)
+{
+       struct rb_root *tm_root;
+       struct rb_node *node;
+       struct tree_mod_elem *cur = NULL;
+       struct tree_mod_elem *found = NULL;
+       u64 index = start >> PAGE_CACHE_SHIFT;
+
+       read_lock(&fs_info->tree_mod_log_lock);
+       tm_root = &fs_info->tree_mod_log;
+       node = tm_root->rb_node;
+       /*
+        * __tree_mod_log_insert puts larger (index, seq) pairs into the left
+        * subtree, so going left means going towards larger values here.
+        */
+       while (node) {
+               cur = container_of(node, struct tree_mod_elem, node);
+               if (cur->index < index) {
+                       node = node->rb_left;
+               } else if (cur->index > index) {
+                       node = node->rb_right;
+               } else if (cur->elem.seq < min_seq) {
+                       node = node->rb_left;
+               } else if (!smallest) {
+                       /* we want the node with the highest seq */
+                       if (found)
+                               BUG_ON(found->elem.seq > cur->elem.seq);
+                       found = cur;
+                       node = node->rb_left;
+               } else if (cur->elem.seq > min_seq) {
+                       /* we want the node with the smallest seq */
+                       if (found)
+                               BUG_ON(found->elem.seq < cur->elem.seq);
+                       found = cur;
+                       node = node->rb_right;
+               } else {
+                       /* exact match on min_seq, nothing smaller qualifies */
+                       found = cur;
+                       break;
+               }
+       }
+       read_unlock(&fs_info->tree_mod_log_lock);
+
+       return found;
+}
+
+/*
+ * this returns the element from the log with the smallest time sequence
+ * value that's in the log (the oldest log item). any element with a time
+ * sequence lower than min_seq will be ignored. returns NULL if no element
+ * for the block at start is left after that.
+ */
+static struct tree_mod_elem *
+tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
+                          u64 min_seq)
+{
+       return __tree_mod_log_search(fs_info, start, min_seq, 1);
+}
+
+/*
+ * this returns the element from the log with the largest time sequence
+ * value that's in the log (the most recent log item). any element with
+ * a time sequence lower than min_seq will be ignored. returns NULL if no
+ * element for the block at start is left after that.
+ */
+static struct tree_mod_elem *
+tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
+{
+       return __tree_mod_log_search(fs_info, start, min_seq, 0);
+}
+
+/*
+ * log a copy of nr_items key pointers from src (starting at src_offset) to
+ * dst (starting at dst_offset): for each item, a MOD_LOG_KEY_REMOVE is logged
+ * for the source slot and a MOD_LOG_KEY_ADD for the destination slot. nothing
+ * is logged if the sequence list is empty or if both buffers are leaves.
+ */
+static inline void
+tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
+                    struct extent_buffer *src, unsigned long dst_offset,
+                    unsigned long src_offset, int nr_items)
+{
+       int err;
+       int nr;
+
+       if (tree_mod_dont_log(fs_info, NULL) ||
+           (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0))
+               return;
+
+       /* speed this up by single seq for all operations? */
+       for (nr = 0; nr < nr_items; nr++) {
+               err = tree_mod_log_insert_key(fs_info, src, src_offset + nr,
+                                             MOD_LOG_KEY_REMOVE);
+               BUG_ON(err < 0);
+
+               err = tree_mod_log_insert_key(fs_info, dst, dst_offset + nr,
+                                             MOD_LOG_KEY_ADD);
+               BUG_ON(err < 0);
+       }
+}
+
+/*
+ * log a move of nr_items key pointers within dst from src_offset to
+ * dst_offset (both are slot numbers). allocates with GFP_NOFS and BUGs if
+ * logging fails.
+ */
+static inline void
+tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
+                    int dst_offset, int src_offset, int nr_items)
+{
+       int ret;
+       ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset,
+                                      nr_items, GFP_NOFS);
+       BUG_ON(ret < 0);
+}
+
+/*
+ * log a MOD_LOG_KEY_REPLACE for the given slot of eb; the callers do this
+ * right before they overwrite the key in that slot. the allocation uses
+ * GFP_ATOMIC if atomic is set and GFP_NOFS otherwise. disk_key is not looked
+ * at here, the key that gets saved is read from eb. BUGs if logging fails.
+ */
+static inline void
+tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
+                         struct extent_buffer *eb,
+                         struct btrfs_disk_key *disk_key, int slot, int atomic)
+{
+       gfp_t mask;
+       int err;
+
+       if (atomic)
+               mask = GFP_ATOMIC;
+       else
+               mask = GFP_NOFS;
+
+       err = tree_mod_log_insert_key_mask(fs_info, eb, slot,
+                                          MOD_LOG_KEY_REPLACE, mask);
+       BUG_ON(err < 0);
+}
+
+/*
+ * log a MOD_LOG_KEY_REMOVE_WHILE_FREEING for every slot of eb, walking from
+ * the last slot down to slot 0. nothing is logged if the sequence list is
+ * empty or eb is a leaf. BUGs if logging fails.
+ */
+static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
+                                struct extent_buffer *eb)
+{
+       int slot;
+       int err;
+       u32 count;
+
+       if (tree_mod_dont_log(fs_info, eb))
+               return;
+
+       count = btrfs_header_nritems(eb);
+       slot = count - 1;
+       while (slot >= 0) {
+               err = tree_mod_log_insert_key(fs_info, eb, slot,
+                                             MOD_LOG_KEY_REMOVE_WHILE_FREEING);
+               BUG_ON(err < 0);
+               slot--;
+       }
+}
+
+/*
+ * log that root->node is going to be replaced by new_root_node: first all
+ * slots of the current root node are logged as freed, then the root
+ * replacement itself is logged. BUGs if the latter fails.
+ */
+static inline void
+tree_mod_log_set_root_pointer(struct btrfs_root *root,
+                             struct extent_buffer *new_root_node)
+{
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       int err;
+
+       tree_mod_log_free_eb(fs_info, root->node);
+
+       err = tree_mod_log_insert_root(fs_info, root->node, new_root_node,
+                                      GFP_NOFS);
+       BUG_ON(err < 0);
+}
+
 /*
  * check if the tree block can be shared by multiple trees
  */
@@ -409,6 +847,12 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
                        ret = btrfs_dec_ref(trans, root, buf, 1, 1);
                        BUG_ON(ret); /* -ENOMEM */
                }
+               /*
+                * don't log freeing in case we're freeing the root node, this
+                * is done by tree_mod_log_set_root_pointer later
+                */
+               if (buf != root->node && btrfs_header_level(buf) != 0)
+                       tree_mod_log_free_eb(root->fs_info, buf);
                clean_tree_block(trans, root, buf);
                *last_ref = 1;
        }
@@ -467,7 +911,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 
        cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
                                     root->root_key.objectid, &disk_key,
-                                    level, search_start, empty_size, 1);
+                                    level, search_start, empty_size);
        if (IS_ERR(cow))
                return PTR_ERR(cow);
 
@@ -506,10 +950,11 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
                        parent_start = 0;
 
                extent_buffer_get(cow);
+               tree_mod_log_set_root_pointer(root, cow);
                rcu_assign_pointer(root->node, cow);
 
                btrfs_free_tree_block(trans, root, buf, parent_start,
-                                     last_ref, 1);
+                                     last_ref);
                free_extent_buffer(buf);
                add_root_to_dirty_list(root);
        } else {
@@ -519,13 +964,15 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
                        parent_start = 0;
 
                WARN_ON(trans->transid != btrfs_header_generation(parent));
+               tree_mod_log_insert_key(root->fs_info, parent, parent_slot,
+                                       MOD_LOG_KEY_REPLACE);
                btrfs_set_node_blockptr(parent, parent_slot,
                                        cow->start);
                btrfs_set_node_ptr_generation(parent, parent_slot,
                                              trans->transid);
                btrfs_mark_buffer_dirty(parent);
                btrfs_free_tree_block(trans, root, buf, parent_start,
-                                     last_ref, 1);
+                                     last_ref);
        }
        if (unlock_orig)
                btrfs_tree_unlock(buf);
@@ -535,6 +982,210 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
        return 0;
 }
 
+/*
+ * follows the chain of logged root replacements, starting at the current root
+ * node, and returns the MOD_LOG_ROOT_REPLACE entry that describes the oldest
+ * predecessor of the given root. entries older than time_seq are ignored.
+ *
+ * returns NULL if time_seq is 0 or if no root replacement is logged for the
+ * current root node.
+ */
+static struct tree_mod_elem *
+__tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
+                          struct btrfs_root *root, u64 time_seq)
+{
+       struct tree_mod_elem *tm;
+       struct tree_mod_elem *found = NULL;
+       u64 root_logical = root->node->start;
+       int looped = 0;
+
+       if (!time_seq)
+               return NULL;
+
+       /*
+        * the very last operation that's logged for a root is the replacement
+        * operation (if it is replaced at all). this has the index of the *new*
+        * root, making it the very first operation that's logged for this root.
+        */
+       while (1) {
+               tm = tree_mod_log_search_oldest(fs_info, root_logical,
+                                               time_seq);
+               if (!looped && !tm)
+                       return NULL;
+               /*
+                * we must have key remove operations in the log before the
+                * replace operation.
+                */
+               BUG_ON(!tm);
+
+               if (tm->op != MOD_LOG_ROOT_REPLACE)
+                       break;
+
+               /* step back to the root that this one has replaced */
+               found = tm;
+               root_logical = tm->old_root.logical;
+               BUG_ON(root_logical == root->node->start);
+               looped = 1;
+       }
+
+       return found;
+}
+
+/*
+ * first_tm is a pointer to the first operation to rewind within eb. then, all
+ * previous operations will be rewound (until we reach something older than
+ * time_seq or an entry that belongs to a different block).
+ *
+ * eb is modified in place; its header nritems is updated once at the end.
+ */
+static void
+__tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
+                     struct tree_mod_elem *first_tm)
+{
+       u32 n;
+       struct rb_node *next;
+       struct tree_mod_elem *tm = first_tm;
+       unsigned long o_dst;
+       unsigned long o_src;
+       unsigned long p_size = sizeof(struct btrfs_key_ptr);
+
+       /* n tracks the number of items while operations get undone */
+       n = btrfs_header_nritems(eb);
+       while (tm && tm->elem.seq >= time_seq) {
+               /*
+                * all the operations are recorded with the operator used for
+                * the modification. as we're going backwards, we do the
+                * opposite of each operation here.
+                */
+               switch (tm->op) {
+               case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
+                       BUG_ON(tm->slot < n);
+                       /* fall through */
+               case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
+               case MOD_LOG_KEY_REMOVE:
+                       /* undo a removal: put the saved key pointer back */
+                       btrfs_set_node_key(eb, &tm->key, tm->slot);
+                       btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
+                       btrfs_set_node_ptr_generation(eb, tm->slot,
+                                                     tm->generation);
+                       n++;
+                       break;
+               case MOD_LOG_KEY_REPLACE:
+                       BUG_ON(tm->slot >= n);
+                       btrfs_set_node_key(eb, &tm->key, tm->slot);
+                       btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
+                       btrfs_set_node_ptr_generation(eb, tm->slot,
+                                                     tm->generation);
+                       break;
+               case MOD_LOG_KEY_ADD:
+                       /* undo an addition: drop the item from the count */
+                       if (tm->slot != n - 1) {
+                               o_dst = btrfs_node_key_ptr_offset(tm->slot);
+                               o_src = btrfs_node_key_ptr_offset(tm->slot + 1);
+                               memmove_extent_buffer(eb, o_dst, o_src, p_size);
+                       }
+                       n--;
+                       break;
+               case MOD_LOG_MOVE_KEYS:
+                       /* undo a move: copy back from dst_slot to the source */
+                       o_dst = btrfs_node_key_ptr_offset(tm->slot);
+                       o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
+                       memmove_extent_buffer(eb, o_dst, o_src,
+                                             tm->move.nr_items * p_size);
+                       break;
+               case MOD_LOG_ROOT_REPLACE:
+                       /*
+                        * this operation is special. for roots, this must be
+                        * handled explicitly before rewinding.
+                        * for non-roots, this operation may exist if the node
+                        * was a root: root A -> child B; then A gets empty and
+                        * B is promoted to the new root. in the mod log, we'll
+                        * have a root-replace operation for B, a tree block
+                        * that is no root. we simply ignore that operation.
+                        */
+                       break;
+               }
+               /* move on to the next entry, stop at the end of this block's */
+               next = rb_next(&tm->node);
+               if (!next)
+                       break;
+               tm = container_of(next, struct tree_mod_elem, node);
+               if (tm->index != first_tm->index)
+                       break;
+       }
+       btrfs_set_header_nritems(eb, n);
+}
+
+/*
+ * returns a version of eb with all logged operations back to time_seq undone.
+ * eb itself is returned if time_seq is 0, eb is a leaf, or there is no log
+ * entry for it. otherwise the operations are undone on a private buffer (a
+ * dummy buffer if the most recent entry is a removal while freeing, a clone
+ * of eb in all other cases); an extra reference is taken on that buffer and
+ * free_extent_buffer is called on eb. BUGs if no buffer can be allocated.
+ */
+static struct extent_buffer *
+tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
+                   u64 time_seq)
+{
+       struct extent_buffer *rewound;
+       struct tree_mod_elem *first_tm;
+
+       if (!time_seq || btrfs_header_level(eb) == 0)
+               return eb;
+
+       first_tm = tree_mod_log_search(fs_info, eb->start, time_seq);
+       if (!first_tm)
+               return eb;
+
+       if (first_tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
+               rewound = btrfs_clone_extent_buffer(eb);
+               BUG_ON(!rewound);
+       } else {
+               /* start from an empty buffer and copy some header fields */
+               BUG_ON(first_tm->slot != 0);
+               rewound = alloc_dummy_extent_buffer(eb->start,
+                                               fs_info->tree_root->nodesize);
+               BUG_ON(!rewound);
+               btrfs_set_header_bytenr(rewound, eb->start);
+               btrfs_set_header_backref_rev(rewound,
+                                            btrfs_header_backref_rev(eb));
+               btrfs_set_header_owner(rewound, btrfs_header_owner(eb));
+               btrfs_set_header_level(rewound, btrfs_header_level(eb));
+       }
+
+       extent_buffer_get(rewound);
+       free_extent_buffer(eb);
+
+       __tree_mod_log_rewind(rewound, time_seq, first_tm);
+
+       return rewound;
+}
+
+/*
+ * returns the root node of the given tree as it was at time_seq.
+ *
+ * if no root replacement is logged, root->node is returned as is. otherwise
+ * a private buffer is set up (a clone of root->node or a dummy buffer),
+ * stamped with level and generation of the old root and rewound to time_seq.
+ * returns NULL if that buffer cannot be allocated.
+ */
+static inline struct extent_buffer *
+get_old_root(struct btrfs_root *root, u64 time_seq)
+{
+       struct tree_mod_elem *tm;
+       struct extent_buffer *eb;
+       struct tree_mod_root *old_root;
+       u64 old_generation;
+
+       tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq);
+       if (!tm)
+               return root->node;
+
+       old_root = &tm->old_root;
+       old_generation = tm->generation;
+
+       tm = tree_mod_log_search(root->fs_info, old_root->logical, time_seq);
+       /*
+        * there was an item in the log when __tree_mod_log_oldest_root
+        * returned. this one must not go away, because the time_seq passed to
+        * us must be blocking its removal.
+        */
+       BUG_ON(!tm);
+
+       if (old_root->logical == root->node->start) {
+               /* there are logged operations for the current root */
+               eb = btrfs_clone_extent_buffer(root->node);
+               if (!eb)
+                       return NULL;
+       } else {
+               /* there's a root replace operation for the current root */
+               eb = alloc_dummy_extent_buffer(tm->index << PAGE_CACHE_SHIFT,
+                                              root->nodesize);
+               /* check before the header setters below dereference eb */
+               if (!eb)
+                       return NULL;
+               btrfs_set_header_bytenr(eb, eb->start);
+               btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
+               btrfs_set_header_owner(eb, root->root_key.objectid);
+       }
+       btrfs_set_header_level(eb, old_root->level);
+       btrfs_set_header_generation(eb, old_generation);
+       __tree_mod_log_rewind(eb, time_seq, tm);
+
+       return eb;
+}
+
 static inline int should_cow_block(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root,
                                   struct extent_buffer *buf)
@@ -739,7 +1390,11 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
                                if (!cur)
                                        return -EIO;
                        } else if (!uptodate) {
-                               btrfs_read_buffer(cur, gen);
+                               err = btrfs_read_buffer(cur, gen);
+                               if (err) {
+                                       free_extent_buffer(cur);
+                                       return err;
+                               }
                        }
                }
                if (search_start == 0)
@@ -854,20 +1509,18 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
 static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
                      int level, int *slot)
 {
-       if (level == 0) {
+       if (level == 0)
                return generic_bin_search(eb,
                                          offsetof(struct btrfs_leaf, items),
                                          sizeof(struct btrfs_item),
                                          key, btrfs_header_nritems(eb),
                                          slot);
-       } else {
+       else
                return generic_bin_search(eb,
                                          offsetof(struct btrfs_node, ptrs),
                                          sizeof(struct btrfs_key_ptr),
                                          key, btrfs_header_nritems(eb),
                                          slot);
-       }
-       return -1;
 }
 
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
@@ -974,6 +1627,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                        goto enospc;
                }
 
+               tree_mod_log_set_root_pointer(root, child);
                rcu_assign_pointer(root->node, child);
 
                add_root_to_dirty_list(root);
@@ -987,7 +1641,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                free_extent_buffer(mid);
 
                root_sub_used(root, mid->len);
-               btrfs_free_tree_block(trans, root, mid, 0, 1, 0);
+               btrfs_free_tree_block(trans, root, mid, 0, 1);
                /* once for the root ptr */
                free_extent_buffer_stale(mid);
                return 0;
@@ -1040,14 +1694,16 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                if (btrfs_header_nritems(right) == 0) {
                        clean_tree_block(trans, root, right);
                        btrfs_tree_unlock(right);
-                       del_ptr(trans, root, path, level + 1, pslot + 1);
+                       del_ptr(trans, root, path, level + 1, pslot + 1, 1);
                        root_sub_used(root, right->len);
-                       btrfs_free_tree_block(trans, root, right, 0, 1, 0);
+                       btrfs_free_tree_block(trans, root, right, 0, 1);
                        free_extent_buffer_stale(right);
                        right = NULL;
                } else {
                        struct btrfs_disk_key right_key;
                        btrfs_node_key(right, &right_key, 0);
+                       tree_mod_log_set_node_key(root->fs_info, parent,
+                                                 &right_key, pslot + 1, 0);
                        btrfs_set_node_key(parent, &right_key, pslot + 1);
                        btrfs_mark_buffer_dirty(parent);
                }
@@ -1082,15 +1738,17 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
        if (btrfs_header_nritems(mid) == 0) {
                clean_tree_block(trans, root, mid);
                btrfs_tree_unlock(mid);
-               del_ptr(trans, root, path, level + 1, pslot);
+               del_ptr(trans, root, path, level + 1, pslot, 1);
                root_sub_used(root, mid->len);
-               btrfs_free_tree_block(trans, root, mid, 0, 1, 0);
+               btrfs_free_tree_block(trans, root, mid, 0, 1);
                free_extent_buffer_stale(mid);
                mid = NULL;
        } else {
                /* update the parent key to reflect our changes */
                struct btrfs_disk_key mid_key;
                btrfs_node_key(mid, &mid_key, 0);
+               tree_mod_log_set_node_key(root->fs_info, parent, &mid_key,
+                                         pslot, 0);
                btrfs_set_node_key(parent, &mid_key, pslot);
                btrfs_mark_buffer_dirty(parent);
        }
@@ -1188,6 +1846,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
                        struct btrfs_disk_key disk_key;
                        orig_slot += left_nr;
                        btrfs_node_key(mid, &disk_key, 0);
+                       tree_mod_log_set_node_key(root->fs_info, parent,
+                                                 &disk_key, pslot, 0);
                        btrfs_set_node_key(parent, &disk_key, pslot);
                        btrfs_mark_buffer_dirty(parent);
                        if (btrfs_header_nritems(left) > orig_slot) {
@@ -1239,6 +1899,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
                        struct btrfs_disk_key disk_key;
 
                        btrfs_node_key(right, &disk_key, 0);
+                       tree_mod_log_set_node_key(root->fs_info, parent,
+                                                 &disk_key, pslot + 1, 0);
                        btrfs_set_node_key(parent, &disk_key, pslot + 1);
                        btrfs_mark_buffer_dirty(parent);
 
@@ -1496,7 +2158,7 @@ static int
 read_block_for_search(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root, struct btrfs_path *p,
                       struct extent_buffer **eb_ret, int level, int slot,
-                      struct btrfs_key *key)
+                      struct btrfs_key *key, u64 time_seq)
 {
        u64 blocknr;
        u64 gen;
@@ -1850,7 +2512,7 @@ cow_done:
                        }
 
                        err = read_block_for_search(trans, root, p,
-                                                   &b, level, slot, key);
+                                                   &b, level, slot, key, 0);
                        if (err == -EAGAIN)
                                goto again;
                        if (err) {
@@ -1922,6 +2584,115 @@ done:
 }
 
 /*
+ * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
+ * current state of the tree together with the operations recorded in the tree
+ * modification log to search for the key in a previous version of this tree, as
+ * denoted by the time_seq parameter.
+ *
+ * Naturally, there is no support for insert, delete or cow operations.
+ *
+ * The resulting path and return value will be set up as if we called
+ * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
+ */
+int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
+                         struct btrfs_path *p, u64 time_seq)
+{
+       struct extent_buffer *b;
+       int slot;
+       int ret;
+       int err;
+       int level;
+       int lowest_unlock = 1;
+       u8 lowest_level = 0;
+
+       lowest_level = p->lowest_level;
+       WARN_ON(p->nodes[0] != NULL); /* nodes[0] must be unset */
+
+       if (p->search_commit_root) {
+               BUG_ON(time_seq); /* time_seq must be 0 here */
+               return btrfs_search_slot(NULL, root, key, p, 0, 0);
+       }
+
+again: /* retried when read_block_for_search() returns -EAGAIN */
+       b = get_old_root(root, time_seq);
+       extent_buffer_get(b);
+       level = btrfs_header_level(b);
+       btrfs_tree_read_lock(b);
+       p->locks[level] = BTRFS_READ_LOCK;
+
+       while (b) {
+               level = btrfs_header_level(b);
+               p->nodes[level] = b;
+               btrfs_clear_path_blocking(p, NULL, 0);
+
+               /*
+                * We hold a lock on b and, as long as we aren't changing
+                * the tree, there is no way for the items in b to change.
+                * It is safe to drop the lock on our parent before we
+                * go through the expensive btree search on b.
+                */
+               btrfs_unlock_up_safe(p, level + 1);
+
+               ret = bin_search(b, key, level, &slot);
+
+               if (level != 0) {
+                       int dec = 0;
+                       if (ret && slot > 0) { /* ret != 0: step back */
+                               dec = 1;
+                               slot -= 1;
+                       }
+                       p->slots[level] = slot;
+                       unlock_up(p, level, lowest_unlock, 0, NULL);
+
+                       if (level == lowest_level) {
+                               if (dec) /* undo the step back */
+                                       p->slots[level]++;
+                               goto done;
+                       }
+
+                       err = read_block_for_search(NULL, root, p, &b, level,
+                                                   slot, key, time_seq);
+                       if (err == -EAGAIN)
+                               goto again;
+                       if (err) {
+                               ret = err;
+                               goto done;
+                       }
+
+                       level = btrfs_header_level(b);
+                       err = btrfs_try_tree_read_lock(b);
+                       if (!err) {
+                               btrfs_set_path_blocking(p);
+                               btrfs_tree_read_lock(b);
+                               btrfs_clear_path_blocking(p, b,
+                                                         BTRFS_READ_LOCK);
+                       }
+                       p->locks[level] = BTRFS_READ_LOCK;
+                       p->nodes[level] = b;
+                       b = tree_mod_log_rewind(root->fs_info, b, time_seq);
+                       if (b != p->nodes[level]) { /* rewind replaced b */
+                               btrfs_tree_unlock_rw(p->nodes[level],
+                                                    p->locks[level]);
+                               p->locks[level] = 0;
+                               p->nodes[level] = b;
+                       }
+               } else {
+                       p->slots[level] = slot;
+                       unlock_up(p, level, lowest_unlock, 0, NULL);
+                       goto done;
+               }
+       }
+       ret = 1; /* reached only when b is NULL */
+done:
+       if (!p->leave_spinning)
+               btrfs_set_path_blocking(p);
+       if (ret < 0) /* on error, release the path */
+               btrfs_release_path(p);
+
+       return ret;
+}
+
+/*
  * adjust the pointers going up the tree, starting at level
  * making sure the right key of each node is points to 'key'.
  * This is used after shifting pointers to the left, so it stops
@@ -1941,6 +2712,7 @@ static void fixup_low_keys(struct btrfs_trans_handle *trans,
                if (!path->nodes[i])
                        break;
                t = path->nodes[i];
+               tree_mod_log_set_node_key(root->fs_info, t, key, tslot, 1);
                btrfs_set_node_key(t, key, tslot);
                btrfs_mark_buffer_dirty(path->nodes[i]);
                if (tslot != 0)
@@ -2023,12 +2795,16 @@ static int push_node_left(struct btrfs_trans_handle *trans,
        } else
                push_items = min(src_nritems - 8, push_items);
 
+       tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0,
+                            push_items);
        copy_extent_buffer(dst, src,
                           btrfs_node_key_ptr_offset(dst_nritems),
                           btrfs_node_key_ptr_offset(0),
                           push_items * sizeof(struct btrfs_key_ptr));
 
        if (push_items < src_nritems) {
+               tree_mod_log_eb_move(root->fs_info, src, 0, push_items,
+                                    src_nritems - push_items);
                memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
                                      btrfs_node_key_ptr_offset(push_items),
                                      (src_nritems - push_items) *
@@ -2082,11 +2858,14 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
        if (max_push < push_items)
                push_items = max_push;
 
+       tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems);
        memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
                                      btrfs_node_key_ptr_offset(0),
                                      (dst_nritems) *
                                      sizeof(struct btrfs_key_ptr));
 
+       tree_mod_log_eb_copy(root->fs_info, dst, src, 0,
+                            src_nritems - push_items, push_items);
        copy_extent_buffer(dst, src,
                           btrfs_node_key_ptr_offset(0),
                           btrfs_node_key_ptr_offset(src_nritems - push_items),
@@ -2129,7 +2908,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
 
        c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
                                   root->root_key.objectid, &lower_key,
-                                  level, root->node->start, 0, 0);
+                                  level, root->node->start, 0);
        if (IS_ERR(c))
                return PTR_ERR(c);
 
@@ -2161,6 +2940,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
        btrfs_mark_buffer_dirty(c);
 
        old = root->node;
+       tree_mod_log_set_root_pointer(root, c);
        rcu_assign_pointer(root->node, c);
 
        /* the super has an extra ref to root->node */
@@ -2184,10 +2964,11 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
 static void insert_ptr(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root, struct btrfs_path *path,
                       struct btrfs_disk_key *key, u64 bytenr,
-                      int slot, int level)
+                      int slot, int level, int tree_mod_log)
 {
        struct extent_buffer *lower;
        int nritems;
+       int ret;
 
        BUG_ON(!path->nodes[level]);
        btrfs_assert_tree_locked(path->nodes[level]);
@@ -2196,11 +2977,19 @@ static void insert_ptr(struct btrfs_trans_handle *trans,
        BUG_ON(slot > nritems);
        BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root));
        if (slot != nritems) {
+               if (tree_mod_log && level)
+                       tree_mod_log_eb_move(root->fs_info, lower, slot + 1,
+                                            slot, nritems - slot);
                memmove_extent_buffer(lower,
                              btrfs_node_key_ptr_offset(slot + 1),
                              btrfs_node_key_ptr_offset(slot),
                              (nritems - slot) * sizeof(struct btrfs_key_ptr));
        }
+       if (tree_mod_log && level) {
+               ret = tree_mod_log_insert_key(root->fs_info, lower, slot,
+                                             MOD_LOG_KEY_ADD);
+               BUG_ON(ret < 0);
+       }
        btrfs_set_node_key(lower, key, slot);
        btrfs_set_node_blockptr(lower, slot, bytenr);
        WARN_ON(trans->transid == 0);
@@ -2252,7 +3041,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
 
        split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
                                        root->root_key.objectid,
-                                       &disk_key, level, c->start, 0, 0);
+                                       &disk_key, level, c->start, 0);
        if (IS_ERR(split))
                return PTR_ERR(split);
 
@@ -2271,7 +3060,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
                            (unsigned long)btrfs_header_chunk_tree_uuid(split),
                            BTRFS_UUID_SIZE);
 
-
+       tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid);
        copy_extent_buffer(split, c,
                           btrfs_node_key_ptr_offset(0),
                           btrfs_node_key_ptr_offset(mid),
@@ -2284,7 +3073,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
        btrfs_mark_buffer_dirty(split);
 
        insert_ptr(trans, root, path, &disk_key, split->start,
-                  path->slots[level + 1] + 1, level + 1);
+                  path->slots[level + 1] + 1, level + 1, 1);
 
        if (path->slots[level] >= mid) {
                path->slots[level] -= mid;
@@ -2821,7 +3610,7 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans,
        btrfs_set_header_nritems(l, mid);
        btrfs_item_key(right, &disk_key, 0);
        insert_ptr(trans, root, path, &disk_key, right->start,
-                  path->slots[1] + 1, 1);
+                  path->slots[1] + 1, 1, 0);
 
        btrfs_mark_buffer_dirty(right);
        btrfs_mark_buffer_dirty(l);
@@ -3004,7 +3793,7 @@ again:
 
        right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
                                        root->root_key.objectid,
-                                       &disk_key, 0, l->start, 0, 0);
+                                       &disk_key, 0, l->start, 0);
        if (IS_ERR(right))
                return PTR_ERR(right);
 
@@ -3028,7 +3817,7 @@ again:
                if (mid <= slot) {
                        btrfs_set_header_nritems(right, 0);
                        insert_ptr(trans, root, path, &disk_key, right->start,
-                                  path->slots[1] + 1, 1);
+                                  path->slots[1] + 1, 1, 0);
                        btrfs_tree_unlock(path->nodes[0]);
                        free_extent_buffer(path->nodes[0]);
                        path->nodes[0] = right;
@@ -3037,7 +3826,7 @@ again:
                } else {
                        btrfs_set_header_nritems(right, 0);
                        insert_ptr(trans, root, path, &disk_key, right->start,
-                                         path->slots[1], 1);
+                                         path->slots[1], 1, 0);
                        btrfs_tree_unlock(path->nodes[0]);
                        free_extent_buffer(path->nodes[0]);
                        path->nodes[0] = right;
@@ -3749,19 +4538,29 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
  * empty a node.
  */
 static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-                   struct btrfs_path *path, int level, int slot)
+                   struct btrfs_path *path, int level, int slot,
+                   int tree_mod_log)
 {
        struct extent_buffer *parent = path->nodes[level];
        u32 nritems;
+       int ret;
 
        nritems = btrfs_header_nritems(parent);
        if (slot != nritems - 1) {
+               if (tree_mod_log && level)
+                       tree_mod_log_eb_move(root->fs_info, parent, slot,
+                                            slot + 1, nritems - slot - 1);
                memmove_extent_buffer(parent,
                              btrfs_node_key_ptr_offset(slot),
                              btrfs_node_key_ptr_offset(slot + 1),
                              sizeof(struct btrfs_key_ptr) *
                              (nritems - slot - 1));
+       } else if (tree_mod_log && level) {
+               ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
+                                             MOD_LOG_KEY_REMOVE);
+               BUG_ON(ret < 0);
        }
+
        nritems--;
        btrfs_set_header_nritems(parent, nritems);
        if (nritems == 0 && parent == root->node) {
@@ -3793,7 +4592,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
                                    struct extent_buffer *leaf)
 {
        WARN_ON(btrfs_header_generation(leaf) != trans->transid);
-       del_ptr(trans, root, path, 1, path->slots[1]);
+       del_ptr(trans, root, path, 1, path->slots[1], 1);
 
        /*
         * btrfs_free_extent is expensive, we want to make sure we
@@ -3804,7 +4603,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
        root_sub_used(root, leaf->len);
 
        extent_buffer_get(leaf);
-       btrfs_free_tree_block(trans, root, leaf, 0, 1, 0);
+       btrfs_free_tree_block(trans, root, leaf, 0, 1);
        free_extent_buffer_stale(leaf);
 }
 /*
@@ -4271,7 +5070,7 @@ again:
                next = c;
                next_rw_lock = path->locks[level];
                ret = read_block_for_search(NULL, root, path, &next, level,
-                                           slot, &key);
+                                           slot, &key, 0);
                if (ret == -EAGAIN)
                        goto again;
 
@@ -4308,7 +5107,7 @@ again:
                        break;
 
                ret = read_block_for_search(NULL, root, path, &next, level,
-                                           0, &key);
+                                           0, &key, 0);
                if (ret == -EAGAIN)
                        goto again;
 
index 8fd7233..0236d03 100644 (file)
@@ -173,6 +173,9 @@ static int btrfs_csum_sizes[] = { 4, 0 };
 #define BTRFS_FT_XATTR         8
 #define BTRFS_FT_MAX           9
 
+/* ioprio of readahead is set to idle */
+#define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0))
+
 /*
  * The key defines the order in the tree, and so it also defines (optimal)
  * block layout.
@@ -823,6 +826,14 @@ struct btrfs_csum_item {
        u8 csum;
 } __attribute__ ((__packed__));
 
+struct btrfs_dev_stats_item {
+       /*
+        * grow this item struct at the end for future enhancements and keep
+        * the existing values unchanged
+        */
+       __le64 values[BTRFS_DEV_STAT_VALUES_MAX];
+} __attribute__ ((__packed__));
+
 /* different types of block groups (and chunks) */
 #define BTRFS_BLOCK_GROUP_DATA         (1ULL << 0)
 #define BTRFS_BLOCK_GROUP_SYSTEM       (1ULL << 1)
@@ -1129,6 +1140,15 @@ struct btrfs_fs_info {
        spinlock_t delayed_iput_lock;
        struct list_head delayed_iputs;
 
+       /* this protects tree_mod_seq_list */
+       spinlock_t tree_mod_seq_lock;
+       atomic_t tree_mod_seq;
+       struct list_head tree_mod_seq_list;
+
+       /* this protects tree_mod_log */
+       rwlock_t tree_mod_log_lock;
+       struct rb_root tree_mod_log;
+
        atomic_t nr_async_submits;
        atomic_t async_submit_draining;
        atomic_t nr_async_bios;
@@ -1375,7 +1395,7 @@ struct btrfs_root {
        struct list_head root_list;
 
        spinlock_t orphan_lock;
-       struct list_head orphan_list;
+       atomic_t orphan_inodes;
        struct btrfs_block_rsv *orphan_block_rsv;
        int orphan_item_inserted;
        int orphan_cleanup_state;
@@ -1508,6 +1528,12 @@ struct btrfs_ioctl_defrag_range_args {
 #define BTRFS_BALANCE_ITEM_KEY 248
 
 /*
+ * Persistently stores the io stats in the device tree.
+ * One key for all stats, (0, BTRFS_DEV_STATS_KEY, devid).
+ */
+#define BTRFS_DEV_STATS_KEY    249
+
+/*
  * string items are for debugging.  They just store a short string of
  * data in the FS
  */
@@ -2415,6 +2441,30 @@ static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
        return btrfs_item_size(eb, e) - offset;
 }
 
+/* btrfs_dev_stats_item: ptr is used as an offset, never dereferenced */
+static inline u64 btrfs_dev_stats_value(struct extent_buffer *eb,
+                                       struct btrfs_dev_stats_item *ptr,
+                                       int index)
+{
+       u64 val; /* NOTE(review): raw copy, no le64_to_cpu -- confirm */
+
+       read_extent_buffer(eb, &val,
+                          offsetof(struct btrfs_dev_stats_item, values) +
+                           ((unsigned long)ptr) + (index * sizeof(u64)),
+                          sizeof(val));
+       return val;
+}
+
+static inline void btrfs_set_dev_stats_value(struct extent_buffer *eb,
+                                            struct btrfs_dev_stats_item *ptr,
+                                            int index, u64 val)
+{
+       write_extent_buffer(eb, &val,
+                           offsetof(struct btrfs_dev_stats_item, values) +
+                            ((unsigned long)ptr) + (index * sizeof(u64)),
+                           sizeof(val));
+}
+
 static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
 {
        return sb->s_fs_info;
@@ -2496,11 +2546,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
                                        struct btrfs_root *root, u32 blocksize,
                                        u64 parent, u64 root_objectid,
                                        struct btrfs_disk_key *key, int level,
-                                       u64 hint, u64 empty_size, int for_cow);
+                                       u64 hint, u64 empty_size);
 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root,
                           struct extent_buffer *buf,
-                          u64 parent, int last_ref, int for_cow);
+                          u64 parent, int last_ref);
 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
                                            struct btrfs_root *root,
                                            u64 bytenr, u32 blocksize,
@@ -2659,6 +2709,8 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
                      *root, struct btrfs_key *key, struct btrfs_path *p, int
                      ins_len, int cow);
+int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
+                         struct btrfs_path *p, u64 time_seq);
 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root, struct extent_buffer *parent,
                       int start_slot, int cache_only, u64 *last_ret,
@@ -2922,7 +2974,6 @@ int btrfs_readpage(struct file *file, struct page *page);
 void btrfs_evict_inode(struct inode *inode);
 int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
 int btrfs_dirty_inode(struct inode *inode);
-int btrfs_update_time(struct file *file);
 struct inode *btrfs_alloc_inode(struct super_block *sb);
 void btrfs_destroy_inode(struct inode *inode);
 int btrfs_drop_inode(struct inode *inode);
@@ -3098,4 +3149,23 @@ void btrfs_reada_detach(void *handle);
 int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
                         u64 start, int err);
 
+/* delayed seq elem */
+struct seq_list {
+       struct list_head list;
+       u64 seq;
+       u32 flags;
+};
+
+void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
+                           struct seq_list *elem);
+void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
+                           struct seq_list *elem);
+
+static inline int is_fstree(u64 rootid)
+{
+       if (rootid == BTRFS_FS_TREE_OBJECTID ||
+           (s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
+               return 1;
+       return 0;
+}
 #endif
index 03e3748..c18d044 100644 (file)
@@ -669,8 +669,8 @@ static int btrfs_delayed_inode_reserve_metadata(
                return ret;
        } else if (src_rsv == &root->fs_info->delalloc_block_rsv) {
                spin_lock(&BTRFS_I(inode)->lock);
-               if (BTRFS_I(inode)->delalloc_meta_reserved) {
-                       BTRFS_I(inode)->delalloc_meta_reserved = 0;
+               if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+                                      &BTRFS_I(inode)->runtime_flags)) {
                        spin_unlock(&BTRFS_I(inode)->lock);
                        release = true;
                        goto migrate;
@@ -1706,7 +1706,7 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
        btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode));
        btrfs_set_stack_inode_generation(inode_item,
                                         BTRFS_I(inode)->generation);
-       btrfs_set_stack_inode_sequence(inode_item, BTRFS_I(inode)->sequence);
+       btrfs_set_stack_inode_sequence(inode_item, inode->i_version);
        btrfs_set_stack_inode_transid(inode_item, trans->transid);
        btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev);
        btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags);
@@ -1754,7 +1754,7 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
        set_nlink(inode, btrfs_stack_inode_nlink(inode_item));
        inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item));
        BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item);
-       BTRFS_I(inode)->sequence = btrfs_stack_inode_sequence(inode_item);
+       inode->i_version = btrfs_stack_inode_sequence(inode_item);
        inode->i_rdev = 0;
        *rdev = btrfs_stack_inode_rdev(inode_item);
        BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item);
index 69f22e3..13ae7b0 100644 (file)
@@ -525,7 +525,7 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
        ref->is_head = 0;
        ref->in_tree = 1;
 
-       if (need_ref_seq(for_cow, ref_root))
+       if (is_fstree(ref_root))
                seq = inc_delayed_seq(delayed_refs);
        ref->seq = seq;
 
@@ -584,7 +584,7 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info,
        ref->is_head = 0;
        ref->in_tree = 1;
 
-       if (need_ref_seq(for_cow, ref_root))
+       if (is_fstree(ref_root))
                seq = inc_delayed_seq(delayed_refs);
        ref->seq = seq;
 
@@ -658,10 +658,11 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
        add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr,
                                   num_bytes, parent, ref_root, level, action,
                                   for_cow);
-       if (!need_ref_seq(for_cow, ref_root) &&
+       if (!is_fstree(ref_root) &&
            waitqueue_active(&delayed_refs->seq_wait))
                wake_up(&delayed_refs->seq_wait);
        spin_unlock(&delayed_refs->lock);
+
        return 0;
 }
 
@@ -706,10 +707,11 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
        add_delayed_data_ref(fs_info, trans, &ref->node, bytenr,
                                   num_bytes, parent, ref_root, owner, offset,
                                   action, for_cow);
-       if (!need_ref_seq(for_cow, ref_root) &&
+       if (!is_fstree(ref_root) &&
            waitqueue_active(&delayed_refs->seq_wait))
                wake_up(&delayed_refs->seq_wait);
        spin_unlock(&delayed_refs->lock);
+
        return 0;
 }
 
index d8f244d..413927f 100644 (file)
@@ -195,11 +195,6 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
 int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
                           struct list_head *cluster, u64 search_start);
 
-struct seq_list {
-       struct list_head list;
-       u64 seq;
-};
-
 static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs)
 {
        assert_spin_locked(&delayed_refs->lock);
@@ -230,25 +225,6 @@ int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
                            u64 seq);
 
 /*
- * delayed refs with a ref_seq > 0 must be held back during backref walking.
- * this only applies to items in one of the fs-trees. for_cow items never need
- * to be held back, so they won't get a ref_seq number.
- */
-static inline int need_ref_seq(int for_cow, u64 rootid)
-{
-       if (for_cow)
-               return 0;
-
-       if (rootid == BTRFS_FS_TREE_OBJECTID)
-               return 1;
-
-       if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
-               return 1;
-
-       return 0;
-}
-
-/*
  * a node might live in a head or a regular ref, this lets you
  * test for the proper type to use.
  */
index e1fe74a..7ae51de 100644 (file)
@@ -1153,7 +1153,6 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
        root->orphan_block_rsv = NULL;
 
        INIT_LIST_HEAD(&root->dirty_list);
-       INIT_LIST_HEAD(&root->orphan_list);
        INIT_LIST_HEAD(&root->root_list);
        spin_lock_init(&root->orphan_lock);
        spin_lock_init(&root->inode_lock);
@@ -1166,6 +1165,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
        atomic_set(&root->log_commit[0], 0);
        atomic_set(&root->log_commit[1], 0);
        atomic_set(&root->log_writers, 0);
+       atomic_set(&root->orphan_inodes, 0);
        root->log_batch = 0;
        root->log_transid = 0;
        root->last_log_commit = 0;
@@ -1252,7 +1252,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
 
        leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
                                      BTRFS_TREE_LOG_OBJECTID, NULL,
-                                     0, 0, 0, 0);
+                                     0, 0, 0);
        if (IS_ERR(leaf)) {
                kfree(root);
                return ERR_CAST(leaf);
@@ -1914,11 +1914,14 @@ int open_ctree(struct super_block *sb,
        spin_lock_init(&fs_info->delayed_iput_lock);
        spin_lock_init(&fs_info->defrag_inodes_lock);
        spin_lock_init(&fs_info->free_chunk_lock);
+       spin_lock_init(&fs_info->tree_mod_seq_lock);
+       rwlock_init(&fs_info->tree_mod_log_lock);
        mutex_init(&fs_info->reloc_mutex);
 
        init_completion(&fs_info->kobj_unregister);
        INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
        INIT_LIST_HEAD(&fs_info->space_info);
+       INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
        btrfs_mapping_init(&fs_info->mapping_tree);
        btrfs_init_block_rsv(&fs_info->global_block_rsv);
        btrfs_init_block_rsv(&fs_info->delalloc_block_rsv);
@@ -1931,12 +1934,14 @@ int open_ctree(struct super_block *sb,
        atomic_set(&fs_info->async_submit_draining, 0);
        atomic_set(&fs_info->nr_async_bios, 0);
        atomic_set(&fs_info->defrag_running, 0);
+       atomic_set(&fs_info->tree_mod_seq, 0);
        fs_info->sb = sb;
        fs_info->max_inline = 8192 * 1024;
        fs_info->metadata_ratio = 0;
        fs_info->defrag_inodes = RB_ROOT;
        fs_info->trans_no_join = 0;
        fs_info->free_chunk_space = 0;
+       fs_info->tree_mod_log = RB_ROOT;
 
        /* readahead state */
        INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
@@ -2001,7 +2006,8 @@ int open_ctree(struct super_block *sb,
        BTRFS_I(fs_info->btree_inode)->root = tree_root;
        memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
               sizeof(struct btrfs_key));
-       BTRFS_I(fs_info->btree_inode)->dummy_inode = 1;
+       set_bit(BTRFS_INODE_DUMMY,
+               &BTRFS_I(fs_info->btree_inode)->runtime_flags);
        insert_inode_hash(fs_info->btree_inode);
 
        spin_lock_init(&fs_info->block_group_cache_lock);
@@ -2353,6 +2359,13 @@ retry_root_backup:
        fs_info->generation = generation;
        fs_info->last_trans_committed = generation;
 
+       ret = btrfs_init_dev_stats(fs_info);
+       if (ret) {
+               printk(KERN_ERR "btrfs: failed to init dev_stats: %d\n",
+                      ret);
+               goto fail_block_groups;
+       }
+
        ret = btrfs_init_space_info(fs_info);
        if (ret) {
                printk(KERN_ERR "Failed to initial space info: %d\n", ret);
@@ -2556,18 +2569,19 @@ recovery_tree_root:
 
 static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 {
-       char b[BDEVNAME_SIZE];
-
        if (uptodate) {
                set_buffer_uptodate(bh);
        } else {
+               struct btrfs_device *device = (struct btrfs_device *)
+                       bh->b_private;
+
                printk_ratelimited(KERN_WARNING "lost page write due to "
-                                       "I/O error on %s\n",
-                                      bdevname(bh->b_bdev, b));
+                                  "I/O error on %s\n", device->name);
                /* note, we dont' set_buffer_write_io_error because we have
                 * our own ways of dealing with the IO errors
                 */
                clear_buffer_uptodate(bh);
+               btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS);
        }
        unlock_buffer(bh);
        put_bh(bh);
@@ -2682,6 +2696,7 @@ static int write_dev_supers(struct btrfs_device *device,
                        set_buffer_uptodate(bh);
                        lock_buffer(bh);
                        bh->b_end_io = btrfs_end_buffer_write_sync;
+                       bh->b_private = device;
                }
 
                /*
@@ -2740,6 +2755,9 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
                }
                if (!bio_flagged(bio, BIO_UPTODATE)) {
                        ret = -EIO;
+                       if (!bio_flagged(bio, BIO_EOPNOTSUPP))
+                               btrfs_dev_stat_inc_and_print(device,
+                                       BTRFS_DEV_STAT_FLUSH_ERRS);
                }
 
                /* drop the reference from the wait == 0 run */
@@ -2902,19 +2920,6 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
        return ret;
 }
 
-/* Kill all outstanding I/O */
-void btrfs_abort_devices(struct btrfs_root *root)
-{
-       struct list_head *head;
-       struct btrfs_device *dev;
-       mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
-       head = &root->fs_info->fs_devices->devices;
-       list_for_each_entry_rcu(dev, head, dev_list) {
-               blk_abort_queue(dev->bdev->bd_disk->queue);
-       }
-       mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
-}
-
 void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
 {
        spin_lock(&fs_info->fs_roots_radix_lock);
@@ -3671,17 +3676,6 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)
        return 0;
 }
 
-static int btree_writepage_io_failed_hook(struct bio *bio, struct page *page,
-                                         u64 start, u64 end,
-                                         struct extent_state *state)
-{
-       struct super_block *sb = page->mapping->host->i_sb;
-       struct btrfs_fs_info *fs_info = btrfs_sb(sb);
-       btrfs_error(fs_info, -EIO,
-                   "Error occured while writing out btree at %llu", start);
-       return -EIO;
-}
-
 static struct extent_io_ops btree_extent_io_ops = {
        .write_cache_pages_lock_hook = btree_lock_page_hook,
        .readpage_end_io_hook = btree_readpage_end_io_hook,
@@ -3689,5 +3683,4 @@ static struct extent_io_ops btree_extent_io_ops = {
        .submit_bio_hook = btree_submit_bio_hook,
        /* note we're sharing with inode.c for the merge bio hook */
        .merge_bio_hook = btrfs_merge_bio_hook,
-       .writepage_io_failed_hook = btree_writepage_io_failed_hook,
 };
index ab1830a..05b3fab 100644 (file)
@@ -89,7 +89,6 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
 int btrfs_cleanup_transaction(struct btrfs_root *root);
 void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans,
                                  struct btrfs_root *root);
-void btrfs_abort_devices(struct btrfs_root *root);
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 void btrfs_init_lockdep(void);
index e887ee6..614f34a 100644 (file)
                                             parent_root_objectid) / 4)
 #define BTRFS_FID_SIZE_CONNECTABLE_ROOT (sizeof(struct btrfs_fid) / 4)
 
-static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
-                          int connectable)
+static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
+                          struct inode *parent)
 {
        struct btrfs_fid *fid = (struct btrfs_fid *)fh;
-       struct inode *inode = dentry->d_inode;
        int len = *max_len;
        int type;
 
-       if (connectable && (len < BTRFS_FID_SIZE_CONNECTABLE)) {
+       if (parent && (len < BTRFS_FID_SIZE_CONNECTABLE)) {
                *max_len = BTRFS_FID_SIZE_CONNECTABLE;
                return 255;
        } else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) {
@@ -36,19 +35,13 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
        fid->root_objectid = BTRFS_I(inode)->root->objectid;
        fid->gen = inode->i_generation;
 
-       if (connectable && !S_ISDIR(inode->i_mode)) {
-               struct inode *parent;
+       if (parent) {
                u64 parent_root_id;
 
-               spin_lock(&dentry->d_lock);
-
-               parent = dentry->d_parent->d_inode;
                fid->parent_objectid = BTRFS_I(parent)->location.objectid;
                fid->parent_gen = parent->i_generation;
                parent_root_id = BTRFS_I(parent)->root->objectid;
 
-               spin_unlock(&dentry->d_lock);
-
                if (parent_root_id != fid->root_objectid) {
                        fid->parent_root_objectid = parent_root_id;
                        len = BTRFS_FID_SIZE_CONNECTABLE_ROOT;
index 49fd7b6..4b5a1e1 100644 (file)
@@ -3578,7 +3578,7 @@ again:
        space_info->chunk_alloc = 0;
        spin_unlock(&space_info->lock);
 out:
-       mutex_unlock(&extent_root->fs_info->chunk_mutex);
+       mutex_unlock(&fs_info->chunk_mutex);
        return ret;
 }
 
@@ -4355,10 +4355,9 @@ static unsigned drop_outstanding_extent(struct inode *inode)
        BTRFS_I(inode)->outstanding_extents--;
 
        if (BTRFS_I(inode)->outstanding_extents == 0 &&
-           BTRFS_I(inode)->delalloc_meta_reserved) {
+           test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+                              &BTRFS_I(inode)->runtime_flags))
                drop_inode_space = 1;
-               BTRFS_I(inode)->delalloc_meta_reserved = 0;
-       }
 
        /*
         * If we have more or the same amount of outsanding extents than we have
@@ -4465,7 +4464,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
         * Add an item to reserve for updating the inode when we complete the
         * delalloc io.
         */
-       if (!BTRFS_I(inode)->delalloc_meta_reserved) {
+       if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+                     &BTRFS_I(inode)->runtime_flags)) {
                nr_extents++;
                extra_reserve = 1;
        }
@@ -4511,7 +4511,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 
        spin_lock(&BTRFS_I(inode)->lock);
        if (extra_reserve) {
-               BTRFS_I(inode)->delalloc_meta_reserved = 1;
+               set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+                       &BTRFS_I(inode)->runtime_flags);
                nr_extents--;
        }
        BTRFS_I(inode)->reserved_extents += nr_extents;
@@ -5217,7 +5218,7 @@ out:
 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root,
                           struct extent_buffer *buf,
-                          u64 parent, int last_ref, int for_cow)
+                          u64 parent, int last_ref)
 {
        struct btrfs_block_group_cache *cache = NULL;
        int ret;
@@ -5227,7 +5228,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
                                        buf->start, buf->len,
                                        parent, root->root_key.objectid,
                                        btrfs_header_level(buf),
-                                       BTRFS_DROP_DELAYED_REF, NULL, for_cow);
+                                       BTRFS_DROP_DELAYED_REF, NULL, 0);
                BUG_ON(ret); /* -ENOMEM */
        }
 
@@ -6249,7 +6250,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
                                        struct btrfs_root *root, u32 blocksize,
                                        u64 parent, u64 root_objectid,
                                        struct btrfs_disk_key *key, int level,
-                                       u64 hint, u64 empty_size, int for_cow)
+                                       u64 hint, u64 empty_size)
 {
        struct btrfs_key ins;
        struct btrfs_block_rsv *block_rsv;
@@ -6297,7 +6298,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
                                        ins.objectid,
                                        ins.offset, parent, root_objectid,
                                        level, BTRFS_ADD_DELAYED_EXTENT,
-                                       extent_op, for_cow);
+                                       extent_op, 0);
                BUG_ON(ret); /* -ENOMEM */
        }
        return buf;
@@ -6715,7 +6716,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
                               btrfs_header_owner(path->nodes[level + 1]));
        }
 
-       btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1, 0);
+       btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
 out:
        wc->refs[level] = 0;
        wc->flags[level] = 0;
index c9018a0..2c8f7b2 100644 (file)
@@ -186,7 +186,6 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
                        return parent;
        }
 
-       entry = rb_entry(node, struct tree_entry, rb_node);
        rb_link_node(node, parent, p);
        rb_insert_color(node, root);
        return NULL;
@@ -413,7 +412,7 @@ static struct extent_state *next_state(struct extent_state *state)
 
 /*
  * utility function to clear some bits in an extent state struct.
- * it will optionally wake up any one waiting on this state (wake == 1)
+ * it will optionally wake up any one waiting on this state (wake == 1).
  *
  * If no bits are set on the state struct after clearing things, the
  * struct is freed and removed from the tree
@@ -570,10 +569,8 @@ hit_next:
                if (err)
                        goto out;
                if (state->end <= end) {
-                       clear_state_bit(tree, state, &bits, wake);
-                       if (last_end == (u64)-1)
-                               goto out;
-                       start = last_end + 1;
+                       state = clear_state_bit(tree, state, &bits, wake);
+                       goto next;
                }
                goto search_again;
        }
@@ -781,7 +778,6 @@ hit_next:
         * Just lock what we found and keep going
         */
        if (state->start == start && state->end <= end) {
-               struct rb_node *next_node;
                if (state->state & exclusive_bits) {
                        *failed_start = state->start;
                        err = -EEXIST;
@@ -789,20 +785,15 @@ hit_next:
                }
 
                set_state_bits(tree, state, &bits);
-
                cache_state(state, cached_state);
                merge_state(tree, state);
                if (last_end == (u64)-1)
                        goto out;
-
                start = last_end + 1;
-               next_node = rb_next(&state->rb_node);
-               if (next_node && start < end && prealloc && !need_resched()) {
-                       state = rb_entry(next_node, struct extent_state,
-                                        rb_node);
-                       if (state->start == start)
-                               goto hit_next;
-               }
+               state = next_state(state);
+               if (start < end && state && state->start == start &&
+                   !need_resched())
+                       goto hit_next;
                goto search_again;
        }
 
@@ -845,6 +836,10 @@ hit_next:
                        if (last_end == (u64)-1)
                                goto out;
                        start = last_end + 1;
+                       state = next_state(state);
+                       if (start < end && state && state->start == start &&
+                           !need_resched())
+                               goto hit_next;
                }
                goto search_again;
        }
@@ -994,21 +989,14 @@ hit_next:
         * Just lock what we found and keep going
         */
        if (state->start == start && state->end <= end) {
-               struct rb_node *next_node;
-
                set_state_bits(tree, state, &bits);
-               clear_state_bit(tree, state, &clear_bits, 0);
+               state = clear_state_bit(tree, state, &clear_bits, 0);
                if (last_end == (u64)-1)
                        goto out;
-
                start = last_end + 1;
-               next_node = rb_next(&state->rb_node);
-               if (next_node && start < end && prealloc && !need_resched()) {
-                       state = rb_entry(next_node, struct extent_state,
-                                        rb_node);
-                       if (state->start == start)
-                               goto hit_next;
-               }
+               if (start < end && state && state->start == start &&
+                   !need_resched())
+                       goto hit_next;
                goto search_again;
        }
 
@@ -1042,10 +1030,13 @@ hit_next:
                        goto out;
                if (state->end <= end) {
                        set_state_bits(tree, state, &bits);
-                       clear_state_bit(tree, state, &clear_bits, 0);
+                       state = clear_state_bit(tree, state, &clear_bits, 0);
                        if (last_end == (u64)-1)
                                goto out;
                        start = last_end + 1;
+                       if (start < end && state && state->start == start &&
+                           !need_resched())
+                               goto hit_next;
                }
                goto search_again;
        }
@@ -1173,9 +1164,8 @@ int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
                              cached_state, mask);
 }
 
-static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
-                                u64 end, struct extent_state **cached_state,
-                                gfp_t mask)
+int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
+                         struct extent_state **cached_state, gfp_t mask)
 {
        return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
                                cached_state, mask);
@@ -1293,7 +1283,7 @@ out:
  * returned if we find something, and *start_ret and *end_ret are
  * set to reflect the state struct that was found.
  *
- * If nothing was found, 1 is returned, < 0 on error
+ * Returns 0 if a matching state was found, 1 if nothing was found.
  */
 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
                          u64 *start_ret, u64 *end_ret, int bits)
@@ -1923,6 +1913,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
        if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
                /* try to remap that extent elsewhere? */
                bio_put(bio);
+               btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
                return -EIO;
        }
 
@@ -2222,17 +2213,7 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
                        uptodate = 0;
        }
 
-       if (!uptodate && tree->ops &&
-           tree->ops->writepage_io_failed_hook) {
-               ret = tree->ops->writepage_io_failed_hook(NULL, page,
-                                                start, end, NULL);
-               /* Writeback already completed */
-               if (ret == 0)
-                       return 1;
-       }
-
        if (!uptodate) {
-               clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS);
                ClearPageUptodate(page);
                SetPageError(page);
        }
@@ -2347,10 +2328,23 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
                if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
                        ret = tree->ops->readpage_end_io_hook(page, start, end,
                                                              state, mirror);
-                       if (ret)
+                       if (ret) {
+                               /* no IO indicated but software detected errors
+                                * in the block, either checksum errors or
+                                * issues with the contents */
+                               struct btrfs_root *root =
+                                       BTRFS_I(page->mapping->host)->root;
+                               struct btrfs_device *device;
+
                                uptodate = 0;
-                       else
+                               device = btrfs_find_device_for_logical(
+                                               root, start, mirror);
+                               if (device)
+                                       btrfs_dev_stat_inc_and_print(device,
+                                               BTRFS_DEV_STAT_CORRUPTION_ERRS);
+                       } else {
                                clean_io_failure(start, page);
+                       }
                }
 
                if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) {
@@ -3164,7 +3158,7 @@ static int write_one_eb(struct extent_buffer *eb,
        u64 offset = eb->start;
        unsigned long i, num_pages;
        int rw = (epd->sync_io ? WRITE_SYNC : WRITE);
-       int ret;
+       int ret = 0;
 
        clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
        num_pages = num_extent_pages(eb->start, eb->len);
@@ -3930,6 +3924,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
        eb->start = start;
        eb->len = len;
        eb->tree = tree;
+       eb->bflags = 0;
        rwlock_init(&eb->lock);
        atomic_set(&eb->write_locks, 0);
        atomic_set(&eb->read_locks, 0);
@@ -3967,6 +3962,60 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
        return eb;
 }
 
+/*
+ * Create a stand-alone copy of @src.
+ *
+ * A new extent buffer with the same start/len is allocated with GFP_ATOMIC
+ * and without an extent_io_tree (tree is passed as NULL), backed by freshly
+ * allocated pages, filled with the contents of @src and flagged
+ * EXTENT_BUFFER_UPTODATE and EXTENT_BUFFER_DUMMY.
+ *
+ * Returns the copy, or NULL if the buffer or any of its pages could not be
+ * allocated.
+ */
+struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
+{
+       unsigned long i;
+       struct page *p;
+       struct extent_buffer *new;
+       unsigned long num_pages = num_extent_pages(src->start, src->len);
+
+       new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_ATOMIC);
+       if (new == NULL)
+               return NULL;
+
+       /*
+        * Allocate every page before attaching any of them.  GFP_ATOMIC
+        * allocations may fail, and failing here must not bring the machine
+        * down: callers already have to cope with a NULL return.  Pages that
+        * have only been allocated can be handed straight back with
+        * __free_page().
+        */
+       for (i = 0; i < num_pages; i++) {
+               new->pages[i] = alloc_page(GFP_ATOMIC);
+               if (!new->pages[i])
+                       goto err;
+       }
+
+       for (i = 0; i < num_pages; i++) {
+               p = new->pages[i];
+               attach_extent_buffer_page(new, p);
+               WARN_ON(PageDirty(p));
+               SetPageUptodate(p);
+       }
+
+       copy_extent_buffer(new, src, 0, 0, src->len);
+       set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
+       set_bit(EXTENT_BUFFER_DUMMY, &new->bflags);
+
+       return new;
+err:
+       /* pages[0 .. i-1] were allocated; nothing else has touched them yet */
+       while (i > 0)
+               __free_page(new->pages[--i]);
+       __free_extent_buffer(new);
+       return NULL;
+}
+
+/*
+ * Allocate an extent buffer that is not attached to any extent_io_tree
+ * (tree is passed as NULL) and back it with freshly allocated pages.
+ *
+ * @start: logical start recorded in the buffer
+ * @len:   length of the buffer in bytes
+ *
+ * On success the buffer is marked uptodate, its header item count is set to
+ * zero and EXTENT_BUFFER_DUMMY is set in its flags.
+ *
+ * Returns the new buffer, or NULL if the buffer or any page allocation
+ * failed; on failure everything allocated so far is released.
+ */
+struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len)
+{
+       struct extent_buffer *eb;
+       /*
+        * NOTE(review): the page count is computed for offset 0 rather than
+        * @start; presumably callers pass page-aligned starts -- confirm.
+        */
+       unsigned long num_pages = num_extent_pages(0, len);
+       unsigned long i;
+
+       eb = __alloc_extent_buffer(NULL, start, len, GFP_ATOMIC);
+       if (!eb)
+               return NULL;
+
+       for (i = 0; i < num_pages; i++) {
+               eb->pages[i] = alloc_page(GFP_ATOMIC);
+               if (!eb->pages[i])
+                       goto err;
+       }
+       set_extent_buffer_uptodate(eb);
+       btrfs_set_header_nritems(eb, 0);
+       set_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
+
+       return eb;
+err:
+       /*
+        * Pages [0, i) were allocated successfully; release exactly those.
+        * Testing "i > 0" before decrementing frees page 0 as well and cannot
+        * wrap the unsigned index when the very first allocation failed.
+        */
+       while (i > 0)
+               __free_page(eb->pages[--i]);
+       __free_extent_buffer(eb);
+       return NULL;
+}
+
 static int extent_buffer_under_io(struct extent_buffer *eb)
 {
        return (atomic_read(&eb->io_pages) ||
@@ -3981,18 +4030,21 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
                                                unsigned long start_idx)
 {
        unsigned long index;
+       unsigned long num_pages;
        struct page *page;
+       int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
 
        BUG_ON(extent_buffer_under_io(eb));
 
-       index = num_extent_pages(eb->start, eb->len);
+       num_pages = num_extent_pages(eb->start, eb->len);
+       index = start_idx + num_pages;
        if (start_idx >= index)
                return;
 
        do {
                index--;
                page = extent_buffer_page(eb, index);
-               if (page) {
+               if (page && mapped) {
                        spin_lock(&page->mapping->private_lock);
                        /*
                         * We do this since we'll remove the pages after we've
@@ -4017,6 +4069,8 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
                        }
                        spin_unlock(&page->mapping->private_lock);
 
+               }
+               if (page) {
                        /* One for when we alloced the page */
                        page_cache_release(page);
                }
@@ -4235,14 +4289,18 @@ static void release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
 {
        WARN_ON(atomic_read(&eb->refs) == 0);
        if (atomic_dec_and_test(&eb->refs)) {
-               struct extent_io_tree *tree = eb->tree;
+               if (test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) {
+                       spin_unlock(&eb->refs_lock);
+               } else {
+                       struct extent_io_tree *tree = eb->tree;
 
-               spin_unlock(&eb->refs_lock);
+                       spin_unlock(&eb->refs_lock);
 
-               spin_lock(&tree->buffer_lock);
-               radix_tree_delete(&tree->buffer,
-                                 eb->start >> PAGE_CACHE_SHIFT);
-               spin_unlock(&tree->buffer_lock);
+                       spin_lock(&tree->buffer_lock);
+                       radix_tree_delete(&tree->buffer,
+                                         eb->start >> PAGE_CACHE_SHIFT);
+                       spin_unlock(&tree->buffer_lock);
+               }
 
                /* Should be safe to release our pages at this point */
                btrfs_release_extent_buffer_page(eb, 0);
@@ -4260,6 +4318,10 @@ void free_extent_buffer(struct extent_buffer *eb)
 
        spin_lock(&eb->refs_lock);
        if (atomic_read(&eb->refs) == 2 &&
+           test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))
+               atomic_dec(&eb->refs);
+
+       if (atomic_read(&eb->refs) == 2 &&
            test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
            !extent_buffer_under_io(eb) &&
            test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
index b516c3b..25900af 100644 (file)
@@ -39,6 +39,7 @@
 #define EXTENT_BUFFER_STALE 6
 #define EXTENT_BUFFER_WRITEBACK 7
 #define EXTENT_BUFFER_IOERR 8
+#define EXTENT_BUFFER_DUMMY 9
 
 /* these are flags for extent_clear_unlock_delalloc */
 #define EXTENT_CLEAR_UNLOCK_PAGE 0x1
@@ -75,9 +76,6 @@ struct extent_io_ops {
                              unsigned long bio_flags);
        int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
        int (*readpage_io_failed_hook)(struct page *page, int failed_mirror);
-       int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
-                                       u64 start, u64 end,
-                                      struct extent_state *state);
        int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,
                                    struct extent_state *state, int mirror);
        int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
@@ -225,6 +223,8 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
                   struct extent_state **cached_state, gfp_t mask);
 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
                        struct extent_state **cached_state, gfp_t mask);
+int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
+                         struct extent_state **cached_state, gfp_t mask);
 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
                   gfp_t mask);
 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
@@ -265,6 +265,8 @@ void set_page_extent_mapped(struct page *page);
 
 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
                                          u64 start, unsigned long len);
+struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len);
+struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src);
 struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
                                         u64 start, unsigned long len);
 void free_extent_buffer(struct extent_buffer *eb);
index 53bf2d7..70dc8ca 100644 (file)
@@ -65,6 +65,21 @@ struct inode_defrag {
        int cycled;
 };
 
+/*
+ * Three-way comparison of two defrag records: ordered by ->root first and,
+ * when the roots are equal, by ->ino.
+ *
+ * Returns 1 if @defrag1 sorts after @defrag2, -1 if it sorts before, and 0
+ * if both fields are equal.
+ */
+static int __compare_inode_defrag(struct inode_defrag *defrag1,
+                                 struct inode_defrag *defrag2)
+{
+       if (defrag1->root != defrag2->root)
+               return defrag1->root > defrag2->root ? 1 : -1;
+
+       if (defrag1->ino != defrag2->ino)
+               return defrag1->ino > defrag2->ino ? 1 : -1;
+
+       return 0;
+}
+
 /* pop a record for an inode into the defrag tree.  The lock
  * must be held already
  *
@@ -81,15 +96,17 @@ static void __btrfs_add_inode_defrag(struct inode *inode,
        struct inode_defrag *entry;
        struct rb_node **p;
        struct rb_node *parent = NULL;
+       int ret;
 
        p = &root->fs_info->defrag_inodes.rb_node;
        while (*p) {
                parent = *p;
                entry = rb_entry(parent, struct inode_defrag, rb_node);
 
-               if (defrag->ino < entry->ino)
+               ret = __compare_inode_defrag(defrag, entry);
+               if (ret < 0)
                        p = &parent->rb_left;
-               else if (defrag->ino > entry->ino)
+               else if (ret > 0)
                        p = &parent->rb_right;
                else {
                        /* if we're reinserting an entry for
@@ -103,7 +120,7 @@ static void __btrfs_add_inode_defrag(struct inode *inode,