Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tiwai/sound-2.6
Linus Torvalds [Tue, 6 Jan 2009 02:34:12 +0000 (18:34 -0800)]
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tiwai/sound-2.6:
  ALSA: ice1724 - Fix a typo in IEC958 PCM name
  ASoC: fix davinci-sffsdr buglet
  ALSA: sound/usb: Use negated usb_endpoint_xfer_control, etc
  ALSA: hda - cxt5051 report jack state
  ALSA: hda - add basic jack reporting functions to patch_conexant.c
  ALSA: Use usb_set/get_intfdata
  ASoC: Clean up kerneldoc warnings
  ASoC: Fix pxa2xx-pcm checks for invalid DMA channels
  ALSA: hda - Add HP Acacia detection
  ALSA: hda - fix name for ALC1200
  ALSA: sound/usb: use USB API functions rather than constants
  ASoC: TWL4030: DAPM based capture implementation
  ASoC: TWL4030: Make the enum filter generic for twl4030

147 files changed:
Documentation/filesystems/ocfs2.txt
arch/powerpc/platforms/cell/spufs/inode.c
arch/s390/hypfs/inode.c
arch/um/Makefile
arch/um/include/asm/system.h
arch/x86/Kconfig.cpu
arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
drivers/cpufreq/cpufreq.c
drivers/infiniband/hw/ipath/ipath_fs.c
drivers/isdn/capi/capifs.c
drivers/message/i2o/exec-osm.c
drivers/message/i2o/i2o_config.c
drivers/message/i2o/iop.c
drivers/message/i2o/pci.c
drivers/misc/ibmasm/ibmasmfs.c
drivers/oprofile/oprofilefs.c
drivers/usb/core/inode.c
drivers/usb/gadget/file_storage.c
drivers/usb/gadget/inode.c
fs/Kconfig
fs/Makefile
fs/affs/inode.c
fs/autofs/inode.c
fs/autofs4/inode.c
fs/binfmt_misc.c
fs/cifs/inode.c
fs/coda/file.c
fs/configfs/inode.c
fs/cramfs/inode.c
fs/debugfs/inode.c
fs/devpts/inode.c
fs/dquot.c
fs/ecryptfs/file.c
fs/ecryptfs/inode.c
fs/exec.c
fs/ext3/super.c
fs/ext4/super.c
fs/filesystems.c
fs/hugetlbfs/inode.c
fs/inode.c
fs/isofs/inode.c
fs/jbd2/commit.c
fs/jbd2/journal.c
fs/jbd2/transaction.c
fs/jfs/jfs_imap.c
fs/libfs.c
fs/namei.c
fs/nfsd/vfs.c
fs/notify/inotify/inotify_user.c
fs/ntfs/inode.c
fs/ocfs2/Makefile
fs/ocfs2/acl.c [new file with mode: 0644]
fs/ocfs2/acl.h [new file with mode: 0644]
fs/ocfs2/alloc.c
fs/ocfs2/alloc.h
fs/ocfs2/aops.c
fs/ocfs2/blockcheck.c [new file with mode: 0644]
fs/ocfs2/blockcheck.h [new file with mode: 0644]
fs/ocfs2/buffer_head_io.c
fs/ocfs2/buffer_head_io.h
fs/ocfs2/cluster/masklog.c
fs/ocfs2/cluster/masklog.h
fs/ocfs2/dir.c
fs/ocfs2/dir.h
fs/ocfs2/dlm/dlmast.c
fs/ocfs2/dlm/dlmcommon.h
fs/ocfs2/dlm/dlmdebug.c
fs/ocfs2/dlm/dlmdomain.c
fs/ocfs2/dlm/dlmfs.c
fs/ocfs2/dlm/dlmmaster.c
fs/ocfs2/dlm/dlmthread.c
fs/ocfs2/dlmglue.c
fs/ocfs2/dlmglue.h
fs/ocfs2/extent_map.c
fs/ocfs2/extent_map.h
fs/ocfs2/file.c
fs/ocfs2/file.h
fs/ocfs2/inode.c
fs/ocfs2/inode.h
fs/ocfs2/journal.c
fs/ocfs2/journal.h
fs/ocfs2/localalloc.c
fs/ocfs2/namei.c
fs/ocfs2/ocfs2.h
fs/ocfs2/ocfs2_fs.h
fs/ocfs2/ocfs2_jbd_compat.h [deleted file]
fs/ocfs2/ocfs2_lockid.h
fs/ocfs2/quota.h [new file with mode: 0644]
fs/ocfs2/quota_global.c [new file with mode: 0644]
fs/ocfs2/quota_local.c [new file with mode: 0644]
fs/ocfs2/resize.c
fs/ocfs2/slot_map.c
fs/ocfs2/suballoc.c
fs/ocfs2/suballoc.h
fs/ocfs2/super.c
fs/ocfs2/symlink.c
fs/ocfs2/xattr.c
fs/ocfs2/xattr.h
fs/omfs/inode.c
fs/open.c
fs/openpromfs/inode.c
fs/proc/base.c
fs/proc/proc_sysctl.c
fs/quota.c
fs/quota_tree.c [new file with mode: 0644]
fs/quota_tree.h [new file with mode: 0644]
fs/quota_v1.c
fs/quota_v2.c
fs/quotaio_v1.h [moved from include/linux/quotaio_v1.h with 100% similarity]
fs/quotaio_v2.h [moved from include/linux/quotaio_v2.h with 68% similarity]
fs/ramfs/inode.c
fs/read_write.c
fs/reiserfs/inode.c
fs/reiserfs/super.c
fs/romfs/inode.c
fs/stat.c
fs/sync.c
fs/sysfs/inode.c
fs/xattr.c
include/linux/Kbuild
include/linux/cpufreq.h
include/linux/dqblk_qtree.h [new file with mode: 0644]
include/linux/dqblk_v1.h
include/linux/dqblk_v2.h
include/linux/fs.h
include/linux/jbd2.h
include/linux/journal-head.h
include/linux/quota.h
include/linux/quotaops.h
include/linux/radix-tree.h
include/linux/syscalls.h
ipc/mqueue.c
ipc/sem.c
kernel/cgroup.c
mm/filemap.c
mm/memory.c
mm/mmap.c
mm/mprotect.c
mm/mremap.c
mm/msync.c
mm/nommu.c
net/sunrpc/rpc_pipe.c
security/commoncap.c
security/inode.c
security/selinux/selinuxfs.c

index 67310fb..c2a0871 100644 (file)
@@ -31,7 +31,6 @@ Features which OCFS2 does not support yet:
        - quotas
        - Directory change notification (F_NOTIFY)
        - Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease)
-       - POSIX ACLs
 
 Mount options
 =============
@@ -79,3 +78,5 @@ inode64                       Indicates that Ocfs2 is allowed to create inodes at
                        bits of significance.
 user_xattr     (*)     Enables Extended User Attributes.
 nouser_xattr           Disables Extended User Attributes.
+acl                    Enables POSIX Access Control Lists support.
+noacl          (*)     Disables POSIX Access Control Lists support.
index 6296bfd..e309ef7 100644 (file)
@@ -97,7 +97,6 @@ spufs_new_inode(struct super_block *sb, int mode)
        inode->i_mode = mode;
        inode->i_uid = current_fsuid();
        inode->i_gid = current_fsgid();
-       inode->i_blocks = 0;
        inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 out:
        return inode;
index 9d4f8e6..5a805df 100644 (file)
@@ -106,7 +106,6 @@ static struct inode *hypfs_make_inode(struct super_block *sb, int mode)
                ret->i_mode = mode;
                ret->i_uid = hypfs_info->uid;
                ret->i_gid = hypfs_info->gid;
-               ret->i_blocks = 0;
                ret->i_atime = ret->i_mtime = ret->i_ctime = CURRENT_TIME;
                if (mode & S_IFDIR)
                        ret->i_nlink = 2;
index d944c34..0728def 100644 (file)
@@ -22,10 +22,11 @@ MODE_INCLUDE        += -I$(srctree)/$(ARCH_DIR)/include/shared/skas
 
 include $(srctree)/$(ARCH_DIR)/Makefile-skas
 
-ARCH_INCLUDE   := -I$(srctree)/$(ARCH_DIR)/include/shared
+SHARED_HEADERS := $(ARCH_DIR)/include/shared
+ARCH_INCLUDE   := -I$(srctree)/$(SHARED_HEADERS)
 ARCH_INCLUDE   += -I$(srctree)/$(ARCH_DIR)/sys-$(SUBARCH)/shared
 ifneq ($(KBUILD_SRC),)
-ARCH_INCLUDE   += -I$(ARCH_DIR)/include/shared # for two generated files
+ARCH_INCLUDE   += -I$(SHARED_HEADERS)
 endif
 KBUILD_CPPFLAGS += -I$(srctree)/$(ARCH_DIR)/sys-$(SUBARCH)
 
@@ -85,8 +86,8 @@ endef
 
 KBUILD_KCONFIG := arch/um/Kconfig.$(HEADER_ARCH)
 
-archprepare: $(ARCH_DIR)/include/shared/user_constants.h
-prepare: $(ARCH_DIR)/include/shared/kern_constants.h
+archprepare: $(SHARED_HEADERS)/user_constants.h
+archprepare: $(SHARED_HEADERS)/kern_constants.h
 
 LINK-$(CONFIG_LD_SCRIPT_STATIC) += -static
 LINK-$(CONFIG_LD_SCRIPT_DYN) += -Wl,-rpath,/lib
@@ -119,17 +120,13 @@ endef
 # When cleaning we don't include .config, so we don't include
 # TT or skas makefiles and don't clean skas_ptregs.h.
 CLEAN_FILES += linux x.i gmon.out \
-       $(ARCH_DIR)/include/shared/user_constants.h \
-       $(ARCH_DIR)/include/shared/kern_constants.h
+       $(SHARED_HEADERS)/user_constants.h \
+       $(SHARED_HEADERS)/kern_constants.h
 
 archclean:
        @find . \( -name '*.bb' -o -name '*.bbg' -o -name '*.da' \
                -o -name '*.gcov' \) -type f -print | xargs rm -f
 
-$(objtree)/$(ARCH_DIR)/include/shared:
-       @echo '  MKDIR $@'
-       $(Q)mkdir -p $@
-
 # Generated files
 
 $(ARCH_DIR)/sys-$(SUBARCH)/user-offsets.s: FORCE
@@ -148,11 +145,11 @@ define filechk_gen-asm-offsets
          echo ""; )
 endef
 
-$(ARCH_DIR)/include/shared/user_constants.h: $(ARCH_DIR)/sys-$(SUBARCH)/user-offsets.s
+$(SHARED_HEADERS)/user_constants.h: $(ARCH_DIR)/sys-$(SUBARCH)/user-offsets.s
        $(call filechk,gen-asm-offsets)
 
-$(ARCH_DIR)/include/shared/kern_constants.h: $(objtree)/$(ARCH_DIR)/include/shared
-       @echo '  SYMLINK $@'
-       $(Q)ln -sf ../../../../include/asm/asm-offsets.h $@
+$(SHARED_HEADERS)/kern_constants.h:
+       $(Q)mkdir -p $(dir $@)
+       $(Q)echo '#include "../../../../include/asm/asm-offsets.h"' >$@
 
 export SUBARCH USER_CFLAGS CFLAGS_NO_HARDENING OS HEADER_ARCH DEV_NULL_PATH
index ae5f94d..753346e 100644 (file)
@@ -11,21 +11,21 @@ extern int get_signals(void);
 extern void block_signals(void);
 extern void unblock_signals(void);
 
-#define raw_local_save_flags(flags) do { typecheck(unsigned long, flags); \
+#define local_save_flags(flags) do { typecheck(unsigned long, flags); \
                                     (flags) = get_signals(); } while(0)
-#define raw_local_irq_restore(flags) do { typecheck(unsigned long, flags); \
+#define local_irq_restore(flags) do { typecheck(unsigned long, flags); \
                                      set_signals(flags); } while(0)
 
-#define raw_local_irq_save(flags) do { raw_local_save_flags(flags); \
-                                   raw_local_irq_disable(); } while(0)
+#define local_irq_save(flags) do { local_save_flags(flags); \
+                                   local_irq_disable(); } while(0)
 
-#define raw_local_irq_enable() unblock_signals()
-#define raw_local_irq_disable() block_signals()
+#define local_irq_enable() unblock_signals()
+#define local_irq_disable() block_signals()
 
 #define irqs_disabled()                 \
 ({                                      \
         unsigned long flags;            \
-        raw_local_save_flags(flags);        \
+        local_save_flags(flags);        \
         (flags == 0);                   \
 })
 
index 85a7857..8078955 100644 (file)
@@ -408,7 +408,7 @@ config X86_MINIMUM_CPU_FAMILY
 
 config X86_DEBUGCTLMSR
        def_bool y
-       depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386)
+       depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386) && !UML
 
 menuconfig PROCESSOR_SELECT
        bool "Supported processor vendors" if EMBEDDED
index b8e05ee..beea446 100644 (file)
@@ -160,6 +160,7 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
                switch (c->x86_model) {
                case 0x0E: /* Core */
                case 0x0F: /* Core Duo */
+               case 0x16: /* Celeron Core */
                        p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
                        return speedstep_get_processor_frequency(SPEEDSTEP_PROCESSOR_PCORE);
                case 0x0D: /* Pentium M (Dothan) */
@@ -171,7 +172,9 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
        }
 
        if (c->x86 != 0xF) {
-               printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. Please send an e-mail to <cpufreq@vger.kernel.org>\n");
+               if (!cpu_has(c, X86_FEATURE_EST))
+                       printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. "
+                               "Please send an e-mail to <cpufreq@vger.kernel.org>\n");
                return 0;
        }
 
@@ -274,6 +277,7 @@ static struct cpufreq_driver p4clockmod_driver = {
        .name           = "p4-clockmod",
        .owner          = THIS_MODULE,
        .attr           = p4clockmod_attr,
+       .hide_interface = 1,
 };
 
 
index 3b5f064..f0ea6fa 100644 (file)
@@ -459,9 +459,7 @@ static int centrino_verify (struct cpufreq_policy *policy)
  * Sets a new CPUFreq policy.
  */
 struct allmasks {
-       cpumask_t               online_policy_cpus;
        cpumask_t               saved_mask;
-       cpumask_t               set_mask;
        cpumask_t               covered_cpus;
 };
 
@@ -475,9 +473,7 @@ static int centrino_target (struct cpufreq_policy *policy,
        int                     retval = 0;
        unsigned int            j, k, first_cpu, tmp;
        CPUMASK_ALLOC(allmasks);
-       CPUMASK_PTR(online_policy_cpus, allmasks);
        CPUMASK_PTR(saved_mask, allmasks);
-       CPUMASK_PTR(set_mask, allmasks);
        CPUMASK_PTR(covered_cpus, allmasks);
 
        if (unlikely(allmasks == NULL))
@@ -497,30 +493,28 @@ static int centrino_target (struct cpufreq_policy *policy,
                goto out;
        }
 
-#ifdef CONFIG_HOTPLUG_CPU
-       /* cpufreq holds the hotplug lock, so we are safe from here on */
-       cpus_and(*online_policy_cpus, cpu_online_map, policy->cpus);
-#else
-       *online_policy_cpus = policy->cpus;
-#endif
-
        *saved_mask = current->cpus_allowed;
        first_cpu = 1;
        cpus_clear(*covered_cpus);
-       for_each_cpu_mask_nr(j, *online_policy_cpus) {
+       for_each_cpu_mask_nr(j, policy->cpus) {
+               const cpumask_t *mask;
+
+               /* cpufreq holds the hotplug lock, so we are safe here */
+               if (!cpu_online(j))
+                       continue;
+
                /*
                 * Support for SMP systems.
                 * Make sure we are running on CPU that wants to change freq
                 */
-               cpus_clear(*set_mask);
                if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
-                       cpus_or(*set_mask, *set_mask, *online_policy_cpus);
+                       mask = &policy->cpus;
                else
-                       cpu_set(j, *set_mask);
+                       mask = &cpumask_of_cpu(j);
 
-               set_cpus_allowed_ptr(current, set_mask);
+               set_cpus_allowed_ptr(current, mask);
                preempt_disable();
-               if (unlikely(!cpu_isset(smp_processor_id(), *set_mask))) {
+               if (unlikely(!cpu_isset(smp_processor_id(), *mask))) {
                        dprintk("couldn't limit to CPUs in this domain\n");
                        retval = -EAGAIN;
                        if (first_cpu) {
@@ -548,7 +542,9 @@ static int centrino_target (struct cpufreq_policy *policy,
                        dprintk("target=%dkHz old=%d new=%d msr=%04x\n",
                                target_freq, freqs.old, freqs.new, msr);
 
-                       for_each_cpu_mask_nr(k, *online_policy_cpus) {
+                       for_each_cpu_mask_nr(k, policy->cpus) {
+                               if (!cpu_online(k))
+                                       continue;
                                freqs.cpu = k;
                                cpufreq_notify_transition(&freqs,
                                        CPUFREQ_PRECHANGE);
@@ -571,7 +567,9 @@ static int centrino_target (struct cpufreq_policy *policy,
                preempt_enable();
        }
 
-       for_each_cpu_mask_nr(k, *online_policy_cpus) {
+       for_each_cpu_mask_nr(k, policy->cpus) {
+               if (!cpu_online(k))
+                       continue;
                freqs.cpu = k;
                cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
        }
@@ -584,18 +582,17 @@ static int centrino_target (struct cpufreq_policy *policy,
                 * Best effort undo..
                 */
 
-               if (!cpus_empty(*covered_cpus))
-                       for_each_cpu_mask_nr(j, *covered_cpus) {
-                               set_cpus_allowed_ptr(current,
-                                                    &cpumask_of_cpu(j));
-                               wrmsr(MSR_IA32_PERF_CTL, oldmsr, h);
-                       }
+               for_each_cpu_mask_nr(j, *covered_cpus) {
+                       set_cpus_allowed_ptr(current, &cpumask_of_cpu(j));
+                       wrmsr(MSR_IA32_PERF_CTL, oldmsr, h);
+               }
 
                tmp = freqs.new;
                freqs.new = freqs.old;
                freqs.old = tmp;
-               for_each_cpu_mask_nr(j, *online_policy_cpus) {
-                       freqs.cpu = j;
+               for_each_cpu_mask_nr(j, policy->cpus) {
+                       if (!cpu_online(j))
+                               continue;
                        cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
                        cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
                }
index 98d4fdb..cdac7d6 100644 (file)
@@ -139,6 +139,15 @@ static unsigned int pentium_core_get_frequency(void)
        case 3:
                fsb = 166667;
                break;
+       case 2:
+               fsb = 200000;
+               break;
+       case 0:
+               fsb = 266667;
+               break;
+       case 4:
+               fsb = 333333;
+               break;
        default:
                printk(KERN_ERR "PCORE - MSR_FSB_FREQ undefined value");
        }
index 31d6f53..01dde80 100644 (file)
@@ -754,6 +754,11 @@ static struct kobj_type ktype_cpufreq = {
        .release        = cpufreq_sysfs_release,
 };
 
+static struct kobj_type ktype_empty_cpufreq = {
+       .sysfs_ops      = &sysfs_ops,
+       .release        = cpufreq_sysfs_release,
+};
+
 
 /**
  * cpufreq_add_dev - add a CPU device
@@ -822,8 +827,8 @@ static int cpufreq_add_dev(struct sys_device *sys_dev)
                dprintk("initialization failed\n");
                goto err_out;
        }
-       policy->user_policy.min = policy->cpuinfo.min_freq;
-       policy->user_policy.max = policy->cpuinfo.max_freq;
+       policy->user_policy.min = policy->min;
+       policy->user_policy.max = policy->max;
 
        blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
                                     CPUFREQ_START, policy);
@@ -876,26 +881,36 @@ static int cpufreq_add_dev(struct sys_device *sys_dev)
        memcpy(&new_policy, policy, sizeof(struct cpufreq_policy));
 
        /* prepare interface data */
-       ret = kobject_init_and_add(&policy->kobj, &ktype_cpufreq, &sys_dev->kobj,
-                                  "cpufreq");
-       if (ret)
-               goto err_out_driver_exit;
-
-       /* set up files for this cpu device */
-       drv_attr = cpufreq_driver->attr;
-       while ((drv_attr) && (*drv_attr)) {
-               ret = sysfs_create_file(&policy->kobj, &((*drv_attr)->attr));
+       if (!cpufreq_driver->hide_interface) {
+               ret = kobject_init_and_add(&policy->kobj, &ktype_cpufreq,
+                                          &sys_dev->kobj, "cpufreq");
                if (ret)
                        goto err_out_driver_exit;
-               drv_attr++;
-       }
-       if (cpufreq_driver->get) {
-               ret = sysfs_create_file(&policy->kobj, &cpuinfo_cur_freq.attr);
-               if (ret)
-                       goto err_out_driver_exit;
-       }
-       if (cpufreq_driver->target) {
-               ret = sysfs_create_file(&policy->kobj, &scaling_cur_freq.attr);
+
+               /* set up files for this cpu device */
+               drv_attr = cpufreq_driver->attr;
+               while ((drv_attr) && (*drv_attr)) {
+                       ret = sysfs_create_file(&policy->kobj,
+                                               &((*drv_attr)->attr));
+                       if (ret)
+                               goto err_out_driver_exit;
+                       drv_attr++;
+               }
+               if (cpufreq_driver->get) {
+                       ret = sysfs_create_file(&policy->kobj,
+                                               &cpuinfo_cur_freq.attr);
+                       if (ret)
+                               goto err_out_driver_exit;
+               }
+               if (cpufreq_driver->target) {
+                       ret = sysfs_create_file(&policy->kobj,
+                                               &scaling_cur_freq.attr);
+                       if (ret)
+                               goto err_out_driver_exit;
+               }
+       } else {
+               ret = kobject_init_and_add(&policy->kobj, &ktype_empty_cpufreq,
+                                          &sys_dev->kobj, "cpufreq");
                if (ret)
                        goto err_out_driver_exit;
        }
index 53912c3..8dc2bb7 100644 (file)
@@ -57,9 +57,6 @@ static int ipathfs_mknod(struct inode *dir, struct dentry *dentry,
        }
 
        inode->i_mode = mode;
-       inode->i_uid = 0;
-       inode->i_gid = 0;
-       inode->i_blocks = 0;
        inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
        inode->i_private = data;
        if ((mode & S_IFMT) == S_IFDIR) {
index 0aa66ec..b129409 100644 (file)
@@ -111,8 +111,6 @@ capifs_fill_super(struct super_block *s, void *data, int silent)
                goto fail;
        inode->i_ino = 1;
        inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
-       inode->i_blocks = 0;
-       inode->i_uid = inode->i_gid = 0;
        inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
        inode->i_op = &simple_dir_inode_operations;
        inode->i_fop = &simple_dir_operations;
index 56faef1..06c655c 100644 (file)
@@ -19,7 +19,7 @@
  *             Auvo Häkkinen <Auvo.Hakkinen@cs.Helsinki.FI>
  *             Deepak Saxena <deepak@plexity.net>
  *             Boji T Kannanthanam <boji.t.kannanthanam@intel.com>
- *             Alan Cox <alan@redhat.com>:
+ *             Alan Cox <alan@lxorguk.ukuu.org.uk>:
  *                     Ported to Linux 2.5.
  *             Markus Lidel <Markus.Lidel@shadowconnect.com>:
  *                     Minor fixes for 2.6.
index f3384c3..efba702 100644 (file)
@@ -19,7 +19,7 @@
  *             Changed ioctl_swdl(), implemented ioctl_swul() and ioctl_swdel()
  *     Deepak Saxena (11/18/1999):
 *             Added event management support
- *     Alan Cox <alan@redhat.com>:
+ *     Alan Cox <alan@lxorguk.ukuu.org.uk>:
  *             2.4 rewrite ported to 2.5
  *     Markus Lidel <Markus.Lidel@shadowconnect.com>:
  *             Added pass-thru support for Adaptec's raidutils
index 6e53a30..35c67d1 100644 (file)
@@ -19,7 +19,7 @@
  *             Auvo Häkkinen <Auvo.Hakkinen@cs.Helsinki.FI>
  *             Deepak Saxena <deepak@plexity.net>
  *             Boji T Kannanthanam <boji.t.kannanthanam@intel.com>
- *             Alan Cox <alan@redhat.com>:
+ *             Alan Cox <alan@lxorguk.ukuu.org.uk>:
  *                     Ported to Linux 2.5.
  *             Markus Lidel <Markus.Lidel@shadowconnect.com>:
  *                     Minor fixes for 2.6.
index 610ef12..25d6f23 100644 (file)
@@ -19,7 +19,7 @@
  *             Auvo Häkkinen <Auvo.Hakkinen@cs.Helsinki.FI>
  *             Deepak Saxena <deepak@plexity.net>
  *             Boji T Kannanthanam <boji.t.kannanthanam@intel.com>
- *             Alan Cox <alan@redhat.com>:
+ *             Alan Cox <alan@lxorguk.ukuu.org.uk>:
  *                     Ported to Linux 2.5.
  *             Markus Lidel <Markus.Lidel@shadowconnect.com>:
  *                     Minor fixes for 2.6.
index 22a7e8b..de966a6 100644 (file)
@@ -146,8 +146,6 @@ static struct inode *ibmasmfs_make_inode(struct super_block *sb, int mode)
 
        if (ret) {
                ret->i_mode = mode;
-               ret->i_uid = ret->i_gid = 0;
-               ret->i_blocks = 0;
                ret->i_atime = ret->i_mtime = ret->i_ctime = CURRENT_TIME;
        }
        return ret;
index ddc4c59..b7e4cee 100644 (file)
@@ -29,9 +29,6 @@ static struct inode *oprofilefs_get_inode(struct super_block *sb, int mode)
 
        if (inode) {
                inode->i_mode = mode;
-               inode->i_uid = 0;
-               inode->i_gid = 0;
-               inode->i_blocks = 0;
                inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
        }
        return inode;
index 185be76..2a129cb 100644 (file)
@@ -279,7 +279,6 @@ static struct inode *usbfs_get_inode (struct super_block *sb, int mode, dev_t de
                inode->i_mode = mode;
                inode->i_uid = current_fsuid();
                inode->i_gid = current_fsgid();
-               inode->i_blocks = 0;
                inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
                switch (mode & S_IFMT) {
                default:
index c4e62a6..2e71368 100644 (file)
@@ -1863,26 +1863,10 @@ static int do_write(struct fsg_dev *fsg)
 static int fsync_sub(struct lun *curlun)
 {
        struct file     *filp = curlun->filp;
-       struct inode    *inode;
-       int             rc, err;
 
        if (curlun->ro || !filp)
                return 0;
-       if (!filp->f_op->fsync)
-               return -EINVAL;
-
-       inode = filp->f_path.dentry->d_inode;
-       mutex_lock(&inode->i_mutex);
-       rc = filemap_fdatawrite(inode->i_mapping);
-       err = filp->f_op->fsync(filp, filp->f_path.dentry, 1);
-       if (!rc)
-               rc = err;
-       err = filemap_fdatawait(inode->i_mapping);
-       if (!rc)
-               rc = err;
-       mutex_unlock(&inode->i_mutex);
-       VLDBG(curlun, "fdatasync -> %d\n", rc);
-       return rc;
+       return vfs_fsync(filp, filp->f_path.dentry, 1);
 }
 
 static void fsync_all(struct fsg_dev *fsg)
index eeb26c0..317b48f 100644 (file)
@@ -2001,7 +2001,6 @@ gadgetfs_make_inode (struct super_block *sb,
                inode->i_mode = mode;
                inode->i_uid = default_uid;
                inode->i_gid = default_gid;
-               inode->i_blocks = 0;
                inode->i_atime = inode->i_mtime = inode->i_ctime
                                = CURRENT_TIME;
                inode->i_private = data;
index ff0e819..f9b6e29 100644 (file)
@@ -189,6 +189,8 @@ config OCFS2_FS
        select CONFIGFS_FS
        select JBD2
        select CRC32
+       select QUOTA
+       select QUOTA_TREE
        help
          OCFS2 is a general purpose extent based shared disk cluster file
          system with many similarities to ext3. It supports 64 bit inode
@@ -258,15 +260,14 @@ config OCFS2_DEBUG_FS
          this option for debugging only as it is likely to decrease
          performance of the filesystem.
 
-config OCFS2_COMPAT_JBD
-       bool "Use JBD for compatibility"
+config OCFS2_FS_POSIX_ACL
+       bool "OCFS2 POSIX Access Control Lists"
        depends on OCFS2_FS
+       select FS_POSIX_ACL
        default n
-       select JBD
        help
-         The ocfs2 filesystem now uses JBD2 for its journalling.  JBD2
-         is backwards compatible with JBD.  It is safe to say N here.
-         However, if you really want to use the original JBD, say Y here.
+         POSIX Access Control Lists (ACLs) support permissions for users and
+         groups beyond the owner/group/world scheme.
 
 endif # BLOCK
 
@@ -303,6 +304,10 @@ config PRINT_QUOTA_WARNING
          Note that this behavior is currently deprecated and may go away in
          future. Please use notification via netlink socket instead.
 
+# Generic support for tree structured quota files. Selected when needed.
+config QUOTA_TREE
+        tristate
+
 config QFMT_V1
        tristate "Old quota format support"
        depends on QUOTA
@@ -314,6 +319,7 @@ config QFMT_V1
 config QFMT_V2
        tristate "Quota format v2 support"
        depends on QUOTA
+       select QUOTA_TREE
        help
          This quota format allows using quotas with 32-bit UIDs/GIDs. If you
          need this functionality say Y here.
index e6f423d..c830611 100644 (file)
@@ -54,6 +54,7 @@ obj-$(CONFIG_GENERIC_ACL)     += generic_acl.o
 obj-$(CONFIG_QUOTA)            += dquot.o
 obj-$(CONFIG_QFMT_V1)          += quota_v1.o
 obj-$(CONFIG_QFMT_V2)          += quota_v2.o
+obj-$(CONFIG_QUOTA_TREE)       += quota_tree.o
 obj-$(CONFIG_QUOTACTL)         += quota.o
 
 obj-$(CONFIG_PROC_FS)          += proc/
index 415d9c6..3c4ec7d 100644 (file)
@@ -119,8 +119,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
                goto bad_inode;
 #else
                inode->i_mode |= S_IFDIR;
-               inode->i_op = NULL;
-               inode->i_fop = NULL;
+               /* ... and leave ->i_op and ->i_fop pointing to empty */
                break;
 #endif
        case ST_LINKFILE:
index c773680..e1734f2 100644 (file)
@@ -251,13 +251,11 @@ struct inode *autofs_iget(struct super_block *sb, unsigned long ino)
        inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
        inode->i_nlink = 2;
        inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
-       inode->i_blocks = 0;
 
        if (ino == AUTOFS_ROOT_INO) {
                inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
                inode->i_op = &autofs_root_inode_operations;
                inode->i_fop = &autofs_root_operations;
-               inode->i_uid = inode->i_gid = 0; /* Changed in read_super */
                goto done;
        } 
        
index 7b19802..cfc23e5 100644 (file)
@@ -455,11 +455,7 @@ struct inode *autofs4_get_inode(struct super_block *sb,
        if (sb->s_root) {
                inode->i_uid = sb->s_root->d_inode->i_uid;
                inode->i_gid = sb->s_root->d_inode->i_gid;
-       } else {
-               inode->i_uid = 0;
-               inode->i_gid = 0;
        }
-       inode->i_blocks = 0;
        inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 
        if (S_ISDIR(inf->mode)) {
index f2744ab..e1158cb 100644 (file)
@@ -496,9 +496,6 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode)
 
        if (inode) {
                inode->i_mode = mode;
-               inode->i_uid = 0;
-               inode->i_gid = 0;
-               inode->i_blocks = 0;
                inode->i_atime = inode->i_mtime = inode->i_ctime =
                        current_fs_time(inode->i_sb);
        }
index f247da9..5ab9896 100644 (file)
@@ -1641,7 +1641,7 @@ do_expand:
        i_size_write(inode, offset);
        spin_unlock(&inode->i_lock);
 out_truncate:
-       if (inode->i_op && inode->i_op->truncate)
+       if (inode->i_op->truncate)
                inode->i_op->truncate(inode);
        return 0;
 out_sig:
index 466303d..6a347fb 100644 (file)
@@ -201,8 +201,7 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
 int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync)
 {
        struct file *host_file;
-       struct dentry *host_dentry;
-       struct inode *host_inode, *coda_inode = coda_dentry->d_inode;
+       struct inode *coda_inode = coda_dentry->d_inode;
        struct coda_file_info *cfi;
        int err = 0;
 
@@ -214,14 +213,7 @@ int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync)
        BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
        host_file = cfi->cfi_container;
 
-       if (host_file->f_op && host_file->f_op->fsync) {
-               host_dentry = host_file->f_path.dentry;
-               host_inode = host_dentry->d_inode;
-               mutex_lock(&host_inode->i_mutex);
-               err = host_file->f_op->fsync(host_file, host_dentry, datasync);
-               mutex_unlock(&host_inode->i_mutex);
-       }
-
+       err = vfs_fsync(host_file, host_file->f_path.dentry, datasync);
        if ( !err && !datasync ) {
                lock_kernel();
                err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode));
index 4803ccc..5d349d3 100644 (file)
@@ -117,8 +117,6 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
 static inline void set_default_inode_attr(struct inode * inode, mode_t mode)
 {
        inode->i_mode = mode;
-       inode->i_uid = 0;
-       inode->i_gid = 0;
        inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 }
 
@@ -136,7 +134,6 @@ struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent * sd)
 {
        struct inode * inode = new_inode(configfs_sb);
        if (inode) {
-               inode->i_blocks = 0;
                inode->i_mapping->a_ops = &configfs_aops;
                inode->i_mapping->backing_dev_info = &configfs_backing_dev_info;
                inode->i_op = &configfs_inode_operations;
index f40423e..a07338d 100644 (file)
@@ -83,8 +83,6 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
                        inode->i_op = &page_symlink_inode_operations;
                        inode->i_data.a_ops = &cramfs_aops;
                } else {
-                       inode->i_size = 0;
-                       inode->i_blocks = 0;
                        init_special_inode(inode, inode->i_mode,
                                old_decode_dev(cramfs_inode->size));
                }
index 3dbe216..81ae9ea 100644 (file)
@@ -37,9 +37,6 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d
 
        if (inode) {
                inode->i_mode = mode;
-               inode->i_uid = 0;
-               inode->i_gid = 0;
-               inode->i_blocks = 0;
                inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
                switch (mode & S_IFMT) {
                default:
index fff96e1..5f3231b 100644 (file)
@@ -189,8 +189,6 @@ static int mknod_ptmx(struct super_block *sb)
        }
 
        inode->i_ino = 2;
-       inode->i_uid = inode->i_gid = 0;
-       inode->i_blocks = 0;
        inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 
        mode = S_IFCHR|opts->ptmxmode;
@@ -300,8 +298,6 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
                goto free_fsi;
        inode->i_ino = 1;
        inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
-       inode->i_blocks = 0;
-       inode->i_uid = inode->i_gid = 0;
        inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
        inode->i_op = &simple_dir_inode_operations;
        inode->i_fop = &simple_dir_operations;
index c237ccc..61bfff6 100644 (file)
@@ -211,8 +211,6 @@ static struct hlist_head *dquot_hash;
 
 struct dqstats dqstats;
 
-static void dqput(struct dquot *dquot);
-
 static inline unsigned int
 hashfn(const struct super_block *sb, unsigned int id, int type)
 {
@@ -415,6 +413,17 @@ out_dqlock:
        return ret;
 }
 
+void dquot_destroy(struct dquot *dquot)
+{
+       kmem_cache_free(dquot_cachep, dquot);
+}
+EXPORT_SYMBOL(dquot_destroy);
+
+static inline void do_destroy_dquot(struct dquot *dquot)
+{
+       dquot->dq_sb->dq_op->destroy_dquot(dquot);
+}
+
 /* Invalidate all dquots on the list. Note that this function is called after
  * quota is disabled and pointers from inodes removed so there cannot be new
  * quota users. There can still be some users of quotas due to inodes being
@@ -463,9 +472,44 @@ restart:
                remove_dquot_hash(dquot);
                remove_free_dquot(dquot);
                remove_inuse(dquot);
-               kmem_cache_free(dquot_cachep, dquot);
+               do_destroy_dquot(dquot);
+       }
+       spin_unlock(&dq_list_lock);
+}
+
+/* Call fn for every active dquot on the given sb; stop if fn returns < 0 */
+int dquot_scan_active(struct super_block *sb,
+                     int (*fn)(struct dquot *dquot, unsigned long priv),
+                     unsigned long priv)
+{
+       struct dquot *dquot, *old_dquot = NULL;
+       int ret = 0;
+
+       mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
+       spin_lock(&dq_list_lock);
+       list_for_each_entry(dquot, &inuse_list, dq_inuse) {
+               if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags))
+                       continue;
+               if (dquot->dq_sb != sb)
+                       continue;
+               /* Now we have active dquot so we can just increase use count */
+               atomic_inc(&dquot->dq_count);
+               dqstats.lookups++;
+               spin_unlock(&dq_list_lock);
+               dqput(old_dquot);
+               old_dquot = dquot;
+               ret = fn(dquot, priv);
+               if (ret < 0)
+                       goto out;
+               spin_lock(&dq_list_lock);
+               /* We are safe to continue now because our dquot could not
+                * be moved out of the inuse list while we hold the reference */
        }
        spin_unlock(&dq_list_lock);
+out:
+       dqput(old_dquot);
+       mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
+       return ret;
 }
 
 int vfs_quota_sync(struct super_block *sb, int type)
@@ -479,7 +523,7 @@ int vfs_quota_sync(struct super_block *sb, int type)
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                if (type != -1 && cnt != type)
                        continue;
-               if (!sb_has_quota_enabled(sb, cnt))
+               if (!sb_has_quota_active(sb, cnt))
                        continue;
                spin_lock(&dq_list_lock);
                dirty = &dqopt->info[cnt].dqi_dirty_list;
@@ -504,8 +548,8 @@ int vfs_quota_sync(struct super_block *sb, int type)
        }
 
        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
-               if ((cnt == type || type == -1) && sb_has_quota_enabled(sb, cnt)
-                       && info_dirty(&dqopt->info[cnt]))
+               if ((cnt == type || type == -1) && sb_has_quota_active(sb, cnt)
+                   && info_dirty(&dqopt->info[cnt]))
                        sb->dq_op->write_info(sb, cnt);
        spin_lock(&dq_list_lock);
        dqstats.syncs++;
@@ -527,7 +571,7 @@ static void prune_dqcache(int count)
                remove_dquot_hash(dquot);
                remove_free_dquot(dquot);
                remove_inuse(dquot);
-               kmem_cache_free(dquot_cachep, dquot);
+               do_destroy_dquot(dquot);
                count--;
                head = free_dquots.prev;
        }
@@ -558,7 +602,7 @@ static struct shrinker dqcache_shrinker = {
  * NOTE: If you change this function please check whether dqput_blocks() works right...
  * MUST be called with either dqptr_sem or dqonoff_mutex held
  */
-static void dqput(struct dquot *dquot)
+void dqput(struct dquot *dquot)
 {
        int ret;
 
@@ -584,7 +628,7 @@ we_slept:
                /* We have more than one user... nothing to do */
                atomic_dec(&dquot->dq_count);
                /* Releasing dquot during quotaoff phase? */
-               if (!sb_has_quota_enabled(dquot->dq_sb, dquot->dq_type) &&
+               if (!sb_has_quota_active(dquot->dq_sb, dquot->dq_type) &&
                    atomic_read(&dquot->dq_count) == 1)
                        wake_up(&dquot->dq_wait_unused);
                spin_unlock(&dq_list_lock);
@@ -625,11 +669,17 @@ we_slept:
        spin_unlock(&dq_list_lock);
 }
 
+struct dquot *dquot_alloc(struct super_block *sb, int type)
+{
+       return kmem_cache_zalloc(dquot_cachep, GFP_NOFS);
+}
+EXPORT_SYMBOL(dquot_alloc);
+
 static struct dquot *get_empty_dquot(struct super_block *sb, int type)
 {
        struct dquot *dquot;
 
-       dquot = kmem_cache_zalloc(dquot_cachep, GFP_NOFS);
+       dquot = sb->dq_op->alloc_dquot(sb, type);
        if(!dquot)
                return NODQUOT;
 
@@ -647,15 +697,33 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type)
 }
 
 /*
+ * Check whether dquot is in memory. Returns 1 if it is, 0 otherwise.
+ * MUST be called with either dqptr_sem or dqonoff_mutex held
+ */
+int dquot_is_cached(struct super_block *sb, unsigned int id, int type)
+{
+       unsigned int hashent = hashfn(sb, id, type);
+       int ret = 0;
+
+        if (!sb_has_quota_active(sb, type))
+               return 0;
+       spin_lock(&dq_list_lock);
+       if (find_dquot(hashent, sb, id, type) != NODQUOT)
+               ret = 1;
+       spin_unlock(&dq_list_lock);
+       return ret;
+}
+
+/*
  * Get reference to dquot
  * MUST be called with either dqptr_sem or dqonoff_mutex held
  */
-static struct dquot *dqget(struct super_block *sb, unsigned int id, int type)
+struct dquot *dqget(struct super_block *sb, unsigned int id, int type)
 {
        unsigned int hashent = hashfn(sb, id, type);
        struct dquot *dquot, *empty = NODQUOT;
 
-        if (!sb_has_quota_enabled(sb, type))
+        if (!sb_has_quota_active(sb, type))
                return NODQUOT;
 we_slept:
        spin_lock(&dq_list_lock);
@@ -682,7 +750,7 @@ we_slept:
                dqstats.lookups++;
                spin_unlock(&dq_list_lock);
                if (empty)
-                       kmem_cache_free(dquot_cachep, empty);
+                       do_destroy_dquot(empty);
        }
        /* Wait for dq_lock - after this we know that either dquot_release() is already
         * finished or it will be canceled due to dq_count > 1 test */
@@ -820,7 +888,7 @@ static void drop_dquot_ref(struct super_block *sb, int type)
        }
 }
 
-static inline void dquot_incr_inodes(struct dquot *dquot, unsigned long number)
+static inline void dquot_incr_inodes(struct dquot *dquot, qsize_t number)
 {
        dquot->dq_dqb.dqb_curinodes += number;
 }
@@ -830,9 +898,10 @@ static inline void dquot_incr_space(struct dquot *dquot, qsize_t number)
        dquot->dq_dqb.dqb_curspace += number;
 }
 
-static inline void dquot_decr_inodes(struct dquot *dquot, unsigned long number)
+static inline void dquot_decr_inodes(struct dquot *dquot, qsize_t number)
 {
-       if (dquot->dq_dqb.dqb_curinodes > number)
+       if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE ||
+           dquot->dq_dqb.dqb_curinodes >= number)
                dquot->dq_dqb.dqb_curinodes -= number;
        else
                dquot->dq_dqb.dqb_curinodes = 0;
@@ -843,11 +912,12 @@ static inline void dquot_decr_inodes(struct dquot *dquot, unsigned long number)
 
 static inline void dquot_decr_space(struct dquot *dquot, qsize_t number)
 {
-       if (dquot->dq_dqb.dqb_curspace > number)
+       if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE ||
+           dquot->dq_dqb.dqb_curspace >= number)
                dquot->dq_dqb.dqb_curspace -= number;
        else
                dquot->dq_dqb.dqb_curspace = 0;
-       if (toqb(dquot->dq_dqb.dqb_curspace) <= dquot->dq_dqb.dqb_bsoftlimit)
+       if (dquot->dq_dqb.dqb_curspace <= dquot->dq_dqb.dqb_bsoftlimit)
                dquot->dq_dqb.dqb_btime = (time_t) 0;
        clear_bit(DQ_BLKS_B, &dquot->dq_flags);
 }
@@ -1023,10 +1093,11 @@ static inline char ignore_hardlimit(struct dquot *dquot)
 }
 
 /* needs dq_data_lock */
-static int check_idq(struct dquot *dquot, ulong inodes, char *warntype)
+static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
 {
        *warntype = QUOTA_NL_NOWARN;
-       if (inodes <= 0 || test_bit(DQ_FAKE_B, &dquot->dq_flags))
+       if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) ||
+           test_bit(DQ_FAKE_B, &dquot->dq_flags))
                return QUOTA_OK;
 
        if (dquot->dq_dqb.dqb_ihardlimit &&
@@ -1058,11 +1129,12 @@ static int check_idq(struct dquot *dquot, ulong inodes, char *warntype)
 static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *warntype)
 {
        *warntype = QUOTA_NL_NOWARN;
-       if (space <= 0 || test_bit(DQ_FAKE_B, &dquot->dq_flags))
+       if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) ||
+           test_bit(DQ_FAKE_B, &dquot->dq_flags))
                return QUOTA_OK;
 
        if (dquot->dq_dqb.dqb_bhardlimit &&
-          toqb(dquot->dq_dqb.dqb_curspace + space) > dquot->dq_dqb.dqb_bhardlimit &&
+           dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bhardlimit &&
             !ignore_hardlimit(dquot)) {
                if (!prealloc)
                        *warntype = QUOTA_NL_BHARDWARN;
@@ -1070,7 +1142,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
        }
 
        if (dquot->dq_dqb.dqb_bsoftlimit &&
-          toqb(dquot->dq_dqb.dqb_curspace + space) > dquot->dq_dqb.dqb_bsoftlimit &&
+           dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bsoftlimit &&
            dquot->dq_dqb.dqb_btime && get_seconds() >= dquot->dq_dqb.dqb_btime &&
             !ignore_hardlimit(dquot)) {
                if (!prealloc)
@@ -1079,7 +1151,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
        }
 
        if (dquot->dq_dqb.dqb_bsoftlimit &&
-          toqb(dquot->dq_dqb.dqb_curspace + space) > dquot->dq_dqb.dqb_bsoftlimit &&
+           dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bsoftlimit &&
            dquot->dq_dqb.dqb_btime == 0) {
                if (!prealloc) {
                        *warntype = QUOTA_NL_BSOFTWARN;
@@ -1096,10 +1168,11 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
        return QUOTA_OK;
 }
 
-static int info_idq_free(struct dquot *dquot, ulong inodes)
+static int info_idq_free(struct dquot *dquot, qsize_t inodes)
 {
        if (test_bit(DQ_FAKE_B, &dquot->dq_flags) ||
-           dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit)
+           dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit ||
+           !sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type))
                return QUOTA_NL_NOWARN;
 
        if (dquot->dq_dqb.dqb_curinodes - inodes <= dquot->dq_dqb.dqb_isoftlimit)
@@ -1113,15 +1186,13 @@ static int info_idq_free(struct dquot *dquot, ulong inodes)
 static int info_bdq_free(struct dquot *dquot, qsize_t space)
 {
        if (test_bit(DQ_FAKE_B, &dquot->dq_flags) ||
-           toqb(dquot->dq_dqb.dqb_curspace) <= dquot->dq_dqb.dqb_bsoftlimit)
+           dquot->dq_dqb.dqb_curspace <= dquot->dq_dqb.dqb_bsoftlimit)
                return QUOTA_NL_NOWARN;
 
-       if (toqb(dquot->dq_dqb.dqb_curspace - space) <=
-           dquot->dq_dqb.dqb_bsoftlimit)
+       if (dquot->dq_dqb.dqb_curspace - space <= dquot->dq_dqb.dqb_bsoftlimit)
                return QUOTA_NL_BSOFTBELOW;
-       if (toqb(dquot->dq_dqb.dqb_curspace) >= dquot->dq_dqb.dqb_bhardlimit &&
-           toqb(dquot->dq_dqb.dqb_curspace - space) <
-                                               dquot->dq_dqb.dqb_bhardlimit)
+       if (dquot->dq_dqb.dqb_curspace >= dquot->dq_dqb.dqb_bhardlimit &&
+           dquot->dq_dqb.dqb_curspace - space < dquot->dq_dqb.dqb_bhardlimit)
                return QUOTA_NL_BHARDBELOW;
        return QUOTA_NL_NOWARN;
 }
@@ -1166,17 +1237,23 @@ out_err:
  *     Release all quotas referenced by inode
  *     Transaction must be started at an entry
  */
-int dquot_drop(struct inode *inode)
+int dquot_drop_locked(struct inode *inode)
 {
        int cnt;
 
-       down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                if (inode->i_dquot[cnt] != NODQUOT) {
                        dqput(inode->i_dquot[cnt]);
                        inode->i_dquot[cnt] = NODQUOT;
                }
        }
+       return 0;
+}
+
+int dquot_drop(struct inode *inode)
+{
+       down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
+       dquot_drop_locked(inode);
        up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
        return 0;
 }
@@ -1264,7 +1341,7 @@ warn_put_all:
 /*
  * This operation can block, but only after everything is updated
  */
-int dquot_alloc_inode(const struct inode *inode, unsigned long number)
+int dquot_alloc_inode(const struct inode *inode, qsize_t number)
 {
        int cnt, ret = NO_QUOTA;
        char warntype[MAXQUOTAS];
@@ -1349,7 +1426,7 @@ out_sub:
 /*
  * This operation can block, but only after everything is updated
  */
-int dquot_free_inode(const struct inode *inode, unsigned long number)
+int dquot_free_inode(const struct inode *inode, qsize_t number)
 {
        unsigned int cnt;
        char warntype[MAXQUOTAS];
@@ -1495,7 +1572,7 @@ warn_put_all:
 /* Wrapper for transferring ownership of an inode */
 int vfs_dq_transfer(struct inode *inode, struct iattr *iattr)
 {
-       if (sb_any_quota_enabled(inode->i_sb) && !IS_NOQUOTA(inode)) {
+       if (sb_any_quota_active(inode->i_sb) && !IS_NOQUOTA(inode)) {
                vfs_dq_init(inode);
                if (inode->i_sb->dq_op->transfer(inode, iattr) == NO_QUOTA)
                        return 1;
@@ -1533,54 +1610,27 @@ struct dquot_operations dquot_operations = {
        .acquire_dquot  = dquot_acquire,
        .release_dquot  = dquot_release,
        .mark_dirty     = dquot_mark_dquot_dirty,
-       .write_info     = dquot_commit_info
+       .write_info     = dquot_commit_info,
+       .alloc_dquot    = dquot_alloc,
+       .destroy_dquot  = dquot_destroy,
 };
 
-static inline void set_enable_flags(struct quota_info *dqopt, int type)
-{
-       switch (type) {
-               case USRQUOTA:
-                       dqopt->flags |= DQUOT_USR_ENABLED;
-                       dqopt->flags &= ~DQUOT_USR_SUSPENDED;
-                       break;
-               case GRPQUOTA:
-                       dqopt->flags |= DQUOT_GRP_ENABLED;
-                       dqopt->flags &= ~DQUOT_GRP_SUSPENDED;
-                       break;
-       }
-}
-
-static inline void reset_enable_flags(struct quota_info *dqopt, int type,
-                                     int remount)
-{
-       switch (type) {
-               case USRQUOTA:
-                       dqopt->flags &= ~DQUOT_USR_ENABLED;
-                       if (remount)
-                               dqopt->flags |= DQUOT_USR_SUSPENDED;
-                       else
-                               dqopt->flags &= ~DQUOT_USR_SUSPENDED;
-                       break;
-               case GRPQUOTA:
-                       dqopt->flags &= ~DQUOT_GRP_ENABLED;
-                       if (remount)
-                               dqopt->flags |= DQUOT_GRP_SUSPENDED;
-                       else
-                               dqopt->flags &= ~DQUOT_GRP_SUSPENDED;
-                       break;
-       }
-}
-
-
 /*
  * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount)
  */
-int vfs_quota_off(struct super_block *sb, int type, int remount)
+int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags)
 {
        int cnt, ret = 0;
        struct quota_info *dqopt = sb_dqopt(sb);
        struct inode *toputinode[MAXQUOTAS];
 
+       /* Cannot turn off usage accounting without turning off limits, or
+        * suspend quotas and simultaneously turn quotas off. */
+       if ((flags & DQUOT_USAGE_ENABLED && !(flags & DQUOT_LIMITS_ENABLED))
+           || (flags & DQUOT_SUSPENDED && flags & (DQUOT_LIMITS_ENABLED |
+           DQUOT_USAGE_ENABLED)))
+               return -EINVAL;
+
        /* We need to serialize quota_off() for device */
        mutex_lock(&dqopt->dqonoff_mutex);
 
@@ -1589,7 +1639,7 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
         * sometimes we are called when fill_super() failed and calling
         * sync_fs() in such cases does no good.
         */
-       if (!sb_any_quota_enabled(sb) && !sb_any_quota_suspended(sb)) {
+       if (!sb_any_quota_loaded(sb)) {
                mutex_unlock(&dqopt->dqonoff_mutex);
                return 0;
        }
@@ -1597,17 +1647,28 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
                toputinode[cnt] = NULL;
                if (type != -1 && cnt != type)
                        continue;
-               /* If we keep inodes of quota files after remount and quotaoff
-                * is called, drop kept inodes. */
-               if (!remount && sb_has_quota_suspended(sb, cnt)) {
-                       iput(dqopt->files[cnt]);
-                       dqopt->files[cnt] = NULL;
-                       reset_enable_flags(dqopt, cnt, 0);
+               if (!sb_has_quota_loaded(sb, cnt))
                        continue;
+
+               if (flags & DQUOT_SUSPENDED) {
+                       dqopt->flags |=
+                               dquot_state_flag(DQUOT_SUSPENDED, cnt);
+               } else {
+                       dqopt->flags &= ~dquot_state_flag(flags, cnt);
+                       /* Turning off suspended quotas? */
+                       if (!sb_has_quota_loaded(sb, cnt) &&
+                           sb_has_quota_suspended(sb, cnt)) {
+                               dqopt->flags &= ~dquot_state_flag(
+                                                       DQUOT_SUSPENDED, cnt);
+                               iput(dqopt->files[cnt]);
+                               dqopt->files[cnt] = NULL;
+                               continue;
+                       }
                }
-               if (!sb_has_quota_enabled(sb, cnt))
+
+               /* We still have to keep quota loaded? */
+               if (sb_has_quota_loaded(sb, cnt) && !(flags & DQUOT_SUSPENDED))
                        continue;
-               reset_enable_flags(dqopt, cnt, remount);
 
                /* Note: these are blocking operations */
                drop_dquot_ref(sb, cnt);
@@ -1623,7 +1684,7 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
                put_quota_format(dqopt->info[cnt].dqi_format);
 
                toputinode[cnt] = dqopt->files[cnt];
-               if (!remount)
+               if (!sb_has_quota_loaded(sb, cnt))
                        dqopt->files[cnt] = NULL;
                dqopt->info[cnt].dqi_flags = 0;
                dqopt->info[cnt].dqi_igrace = 0;
@@ -1631,6 +1692,11 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
                dqopt->ops[cnt] = NULL;
        }
        mutex_unlock(&dqopt->dqonoff_mutex);
+
+       /* Skip syncing and setting flags if quota files are hidden */
+       if (dqopt->flags & DQUOT_QUOTA_SYS_FILE)
+               goto put_inodes;
+
        /* Sync the superblock so that buffers with quota data are written to
         * disk (and so userspace sees correct data afterwards). */
        if (sb->s_op->sync_fs)
@@ -1646,7 +1712,7 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
                        mutex_lock(&dqopt->dqonoff_mutex);
                        /* If quota was reenabled in the meantime, we have
                         * nothing to do */
-                       if (!sb_has_quota_enabled(sb, cnt)) {
+                       if (!sb_has_quota_loaded(sb, cnt)) {
                                mutex_lock_nested(&toputinode[cnt]->i_mutex, I_MUTEX_QUOTA);
                                toputinode[cnt]->i_flags &= ~(S_IMMUTABLE |
                                  S_NOATIME | S_NOQUOTA);
@@ -1655,26 +1721,43 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
                                mark_inode_dirty(toputinode[cnt]);
                        }
                        mutex_unlock(&dqopt->dqonoff_mutex);
+               }
+       if (sb->s_bdev)
+               invalidate_bdev(sb->s_bdev);
+put_inodes:
+       for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+               if (toputinode[cnt]) {
                        /* On remount RO, we keep the inode pointer so that we
-                        * can reenable quota on the subsequent remount RW.
-                        * But we have better not keep inode pointer when there
-                        * is pending delete on the quota file... */
-                       if (!remount)
+                        * can reenable quota on the subsequent remount RW. We
+                        * have to check 'flags' variable and not use sb_has_
+                        * function because another quotaon / quotaoff could
+                        * change global state before we got here. We refuse
+                        * to suspend quotas when there is pending delete on
+                        * the quota file... */
+                       if (!(flags & DQUOT_SUSPENDED))
                                iput(toputinode[cnt]);
                        else if (!toputinode[cnt]->i_nlink)
                                ret = -EBUSY;
                }
-       if (sb->s_bdev)
-               invalidate_bdev(sb->s_bdev);
        return ret;
 }
 
+int vfs_quota_off(struct super_block *sb, int type, int remount)
+{
+       return vfs_quota_disable(sb, type, remount ? DQUOT_SUSPENDED :
+                                (DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED));
+}
+
 /*
  *     Turn quotas on on a device
  */
 
-/* Helper function when we already have the inode */
-static int vfs_quota_on_inode(struct inode *inode, int type, int format_id)
+/*
+ * Helper function to turn quotas on when we already have the inode of
+ * quota file and no quota information is loaded.
+ */
+static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
+       unsigned int flags)
 {
        struct quota_format_type *fmt = find_quota_format(format_id);
        struct super_block *sb = inode->i_sb;
@@ -1696,27 +1779,37 @@ static int vfs_quota_on_inode(struct inode *inode, int type, int format_id)
                error = -EINVAL;
                goto out_fmt;
        }
+       /* Usage always has to be set... */
+       if (!(flags & DQUOT_USAGE_ENABLED)) {
+               error = -EINVAL;
+               goto out_fmt;
+       }
 
-       /* As we bypass the pagecache we must now flush the inode so that
-        * we see all the changes from userspace... */
-       write_inode_now(inode, 1);
-       /* And now flush the block cache so that kernel sees the changes */
-       invalidate_bdev(sb->s_bdev);
+       if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) {
+               /* As we bypass the pagecache we must now flush the inode so
+                * that we see all the changes from userspace... */
+               write_inode_now(inode, 1);
+               /* And now flush the block cache so that kernel sees the
+                * changes */
+               invalidate_bdev(sb->s_bdev);
+       }
        mutex_lock(&inode->i_mutex);
        mutex_lock(&dqopt->dqonoff_mutex);
-       if (sb_has_quota_enabled(sb, type) ||
-                       sb_has_quota_suspended(sb, type)) {
+       if (sb_has_quota_loaded(sb, type)) {
                error = -EBUSY;
                goto out_lock;
        }
-       /* We don't want quota and atime on quota files (deadlocks possible)
-        * Also nobody should write to the file - we use special IO operations
-        * which ignore the immutable bit. */
-       down_write(&dqopt->dqptr_sem);
-       oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | S_NOQUOTA);
-       inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE;
-       up_write(&dqopt->dqptr_sem);
-       sb->dq_op->drop(inode);
+
+       if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) {
+               /* We don't want quota and atime on quota files (deadlocks
+                * possible). Also nobody should write to the file - we use
+                * special IO operations which ignore the immutable bit. */
+               down_write(&dqopt->dqptr_sem);
+               oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | S_NOQUOTA);
+               inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE;
+               up_write(&dqopt->dqptr_sem);
+               sb->dq_op->drop(inode);
+       }
 
        error = -EIO;
        dqopt->files[type] = igrab(inode);
@@ -1737,7 +1830,7 @@ static int vfs_quota_on_inode(struct inode *inode, int type, int format_id)
        }
        mutex_unlock(&dqopt->dqio_mutex);
        mutex_unlock(&inode->i_mutex);
-       set_enable_flags(dqopt, type);
+       dqopt->flags |= dquot_state_flag(flags, type);
 
        add_dquot_ref(sb, type);
        mutex_unlock(&dqopt->dqonoff_mutex);
@@ -1770,20 +1863,23 @@ static int vfs_quota_on_remount(struct super_block *sb, int type)
        struct quota_info *dqopt = sb_dqopt(sb);
        struct inode *inode;
        int ret;
+       unsigned int flags;
 
        mutex_lock(&dqopt->dqonoff_mutex);
        if (!sb_has_quota_suspended(sb, type)) {
                mutex_unlock(&dqopt->dqonoff_mutex);
                return 0;
        }
-       BUG_ON(sb_has_quota_enabled(sb, type));
-
        inode = dqopt->files[type];
        dqopt->files[type] = NULL;
-       reset_enable_flags(dqopt, type, 0);
+       flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED |
+                                               DQUOT_LIMITS_ENABLED, type);
+       dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, type);
        mutex_unlock(&dqopt->dqonoff_mutex);
 
-       ret = vfs_quota_on_inode(inode, type, dqopt->info[type].dqi_fmt_id);
+       flags = dquot_generic_flag(flags, type);
+       ret = vfs_load_quota_inode(inode, type, dqopt->info[type].dqi_fmt_id,
+                                  flags);
        iput(inode);
 
        return ret;
@@ -1799,12 +1895,12 @@ int vfs_quota_on_path(struct super_block *sb, int type, int format_id,
        if (path->mnt->mnt_sb != sb)
                error = -EXDEV;
        else
-               error = vfs_quota_on_inode(path->dentry->d_inode, type,
-                                          format_id);
+               error = vfs_load_quota_inode(path->dentry->d_inode, type,
+                                            format_id, DQUOT_USAGE_ENABLED |
+                                            DQUOT_LIMITS_ENABLED);
        return error;
 }
 
-/* Actual function called from quotactl() */
 int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name,
                 int remount)
 {
@@ -1823,6 +1919,50 @@ int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name,
 }
 
 /*
+ * More powerful function for turning on quotas allowing setting
+ * of individual quota flags
+ */
+int vfs_quota_enable(struct inode *inode, int type, int format_id,
+               unsigned int flags)
+{
+       int ret = 0;
+       struct super_block *sb = inode->i_sb;
+       struct quota_info *dqopt = sb_dqopt(sb);
+
+       /* Just unsuspend quotas? */
+       if (flags & DQUOT_SUSPENDED)
+               return vfs_quota_on_remount(sb, type);
+       if (!flags)
+               return 0;
+       /* Just updating flags needed? */
+       if (sb_has_quota_loaded(sb, type)) {
+               mutex_lock(&dqopt->dqonoff_mutex);
+               /* Now do a reliable test... */
+               if (!sb_has_quota_loaded(sb, type)) {
+                       mutex_unlock(&dqopt->dqonoff_mutex);
+                       goto load_quota;
+               }
+               if (flags & DQUOT_USAGE_ENABLED &&
+                   sb_has_quota_usage_enabled(sb, type)) {
+                       ret = -EBUSY;
+                       goto out_lock;
+               }
+               if (flags & DQUOT_LIMITS_ENABLED &&
+                   sb_has_quota_limits_enabled(sb, type)) {
+                       ret = -EBUSY;
+                       goto out_lock;
+               }
+               sb_dqopt(sb)->flags |= dquot_state_flag(flags, type);
+out_lock:
+               mutex_unlock(&dqopt->dqonoff_mutex);
+               return ret;
+       }
+
+load_quota:
+       return vfs_load_quota_inode(inode, type, format_id, flags);
+}
+
+/*
  * This function is used when filesystem needs to initialize quotas
  * during mount time.
  */
@@ -1843,7 +1983,8 @@ int vfs_quota_on_mount(struct super_block *sb, char *qf_name,
 
        error = security_quota_on(dentry);
        if (!error)
-               error = vfs_quota_on_inode(dentry->d_inode, type, format_id);
+               error = vfs_load_quota_inode(dentry->d_inode, type, format_id,
+                               DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
 
 out:
        dput(dentry);
@@ -1866,14 +2007,24 @@ int vfs_dq_quota_on_remount(struct super_block *sb)
        return ret;
 }
 
+static inline qsize_t qbtos(qsize_t blocks)
+{
+       return blocks << QIF_DQBLKSIZE_BITS;
+}
+
+static inline qsize_t stoqb(qsize_t space)
+{
+       return (space + QIF_DQBLKSIZE - 1) >> QIF_DQBLKSIZE_BITS;
+}
+
 /* Generic routine for getting common part of quota structure */
 static void do_get_dqblk(struct dquot *dquot, struct if_dqblk *di)
 {
        struct mem_dqblk *dm = &dquot->dq_dqb;
 
        spin_lock(&dq_data_lock);
-       di->dqb_bhardlimit = dm->dqb_bhardlimit;
-       di->dqb_bsoftlimit = dm->dqb_bsoftlimit;
+       di->dqb_bhardlimit = stoqb(dm->dqb_bhardlimit);
+       di->dqb_bsoftlimit = stoqb(dm->dqb_bsoftlimit);
        di->dqb_curspace = dm->dqb_curspace;
        di->dqb_ihardlimit = dm->dqb_ihardlimit;
        di->dqb_isoftlimit = dm->dqb_isoftlimit;
@@ -1918,28 +2069,36 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
        if (di->dqb_valid & QIF_SPACE) {
                dm->dqb_curspace = di->dqb_curspace;
                check_blim = 1;
+               __set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
        }
        if (di->dqb_valid & QIF_BLIMITS) {
-               dm->dqb_bsoftlimit = di->dqb_bsoftlimit;
-               dm->dqb_bhardlimit = di->dqb_bhardlimit;
+               dm->dqb_bsoftlimit = qbtos(di->dqb_bsoftlimit);
+               dm->dqb_bhardlimit = qbtos(di->dqb_bhardlimit);
                check_blim = 1;
+               __set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
        }
        if (di->dqb_valid & QIF_INODES) {
                dm->dqb_curinodes = di->dqb_curinodes;
                check_ilim = 1;
+               __set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
        }
        if (di->dqb_valid & QIF_ILIMITS) {
                dm->dqb_isoftlimit = di->dqb_isoftlimit;
                dm->dqb_ihardlimit = di->dqb_ihardlimit;
                check_ilim = 1;
+               __set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
        }
-       if (di->dqb_valid & QIF_BTIME)
+       if (di->dqb_valid & QIF_BTIME) {
                dm->dqb_btime = di->dqb_btime;
-       if (di->dqb_valid & QIF_ITIME)
+               __set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
+       }
+       if (di->dqb_valid & QIF_ITIME) {
                dm->dqb_itime = di->dqb_itime;
+               __set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
+       }
 
        if (check_blim) {
-               if (!dm->dqb_bsoftlimit || toqb(dm->dqb_curspace) < dm->dqb_bsoftlimit) {
+               if (!dm->dqb_bsoftlimit || dm->dqb_curspace < dm->dqb_bsoftlimit) {
                        dm->dqb_btime = 0;
                        clear_bit(DQ_BLKS_B, &dquot->dq_flags);
                }
@@ -1970,12 +2129,14 @@ int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *d
        int rc;
 
        mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
-       if (!(dquot = dqget(sb, id, type))) {
-               mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
-               return -ESRCH;
+       dquot = dqget(sb, id, type);
+       if (!dquot) {
+               rc = -ESRCH;
+               goto out;
        }
        rc = do_set_dqblk(dquot, di);
        dqput(dquot);
+out:
        mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
        return rc;
 }
@@ -1986,7 +2147,7 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
        struct mem_dqinfo *mi;
   
        mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
-       if (!sb_has_quota_enabled(sb, type)) {
+       if (!sb_has_quota_active(sb, type)) {
                mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
                return -ESRCH;
        }
@@ -2005,11 +2166,12 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
 int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
 {
        struct mem_dqinfo *mi;
+       int err = 0;
 
        mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
-       if (!sb_has_quota_enabled(sb, type)) {
-               mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
-               return -ESRCH;
+       if (!sb_has_quota_active(sb, type)) {
+               err = -ESRCH;
+               goto out;
        }
        mi = sb_dqopt(sb)->info + type;
        spin_lock(&dq_data_lock);
@@ -2023,8 +2185,9 @@ int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
        mark_info_dirty(sb, type);
        /* Force write to disk */
        sb->dq_op->write_info(sb, type);
+out:
        mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
-       return 0;
+       return err;
 }
 
 struct quotactl_ops vfs_quotactl_ops = {
@@ -2186,10 +2349,13 @@ EXPORT_SYMBOL(register_quota_format);
 EXPORT_SYMBOL(unregister_quota_format);
 EXPORT_SYMBOL(dqstats);
 EXPORT_SYMBOL(dq_data_lock);
+EXPORT_SYMBOL(vfs_quota_enable);
 EXPORT_SYMBOL(vfs_quota_on);
 EXPORT_SYMBOL(vfs_quota_on_path);
 EXPORT_SYMBOL(vfs_quota_on_mount);
+EXPORT_SYMBOL(vfs_quota_disable);
 EXPORT_SYMBOL(vfs_quota_off);
+EXPORT_SYMBOL(dquot_scan_active);
 EXPORT_SYMBOL(vfs_quota_sync);
 EXPORT_SYMBOL(vfs_get_dqinfo);
 EXPORT_SYMBOL(vfs_set_dqinfo);
@@ -2202,7 +2368,11 @@ EXPORT_SYMBOL(dquot_release);
 EXPORT_SYMBOL(dquot_mark_dquot_dirty);
 EXPORT_SYMBOL(dquot_initialize);
 EXPORT_SYMBOL(dquot_drop);
+EXPORT_SYMBOL(dquot_drop_locked);
 EXPORT_SYMBOL(vfs_dq_drop);
+EXPORT_SYMBOL(dqget);
+EXPORT_SYMBOL(dqput);
+EXPORT_SYMBOL(dquot_is_cached);
 EXPORT_SYMBOL(dquot_alloc_space);
 EXPORT_SYMBOL(dquot_alloc_inode);
 EXPORT_SYMBOL(dquot_free_space);
index eb3dc4c..7138343 100644 (file)
@@ -275,18 +275,9 @@ static int ecryptfs_release(struct inode *inode, struct file *file)
 static int
 ecryptfs_fsync(struct file *file, struct dentry *dentry, int datasync)
 {
-       struct file *lower_file = ecryptfs_file_to_lower(file);
-       struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
-       struct inode *lower_inode = lower_dentry->d_inode;
-       int rc = -EINVAL;
-
-       if (lower_inode->i_fop->fsync) {
-               mutex_lock(&lower_inode->i_mutex);
-               rc = lower_inode->i_fop->fsync(lower_file, lower_dentry,
-                                              datasync);
-               mutex_unlock(&lower_inode->i_mutex);
-       }
-       return rc;
+       return vfs_fsync(ecryptfs_file_to_lower(file),
+                        ecryptfs_dentry_to_lower(dentry),
+                        datasync);
 }
 
 static int ecryptfs_fasync(int fd, struct file *file, int flag)
index 5e78fc1..0111906 100644 (file)
@@ -612,8 +612,7 @@ ecryptfs_readlink(struct dentry *dentry, char __user * buf, int bufsiz)
        struct ecryptfs_crypt_stat *crypt_stat;
 
        lower_dentry = ecryptfs_dentry_to_lower(dentry);
-       if (!lower_dentry->d_inode->i_op ||
-           !lower_dentry->d_inode->i_op->readlink) {
+       if (!lower_dentry->d_inode->i_op->readlink) {
                rc = -EINVAL;
                goto out;
        }
index 3ef9cf9..9c33f54 100644 (file)
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -51,6 +51,7 @@
 #include <linux/audit.h>
 #include <linux/tracehook.h>
 #include <linux/kmod.h>
+#include <linux/fsnotify.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -132,6 +133,8 @@ asmlinkage long sys_uselib(const char __user * library)
        if (IS_ERR(file))
                goto out;
 
+       fsnotify_open(file->f_path.dentry);
+
        error = -ENOEXEC;
        if(file->f_op) {
                struct linux_binfmt * fmt;
@@ -684,6 +687,8 @@ struct file *open_exec(const char *name)
        if (IS_ERR(file))
                return file;
 
+       fsnotify_open(file->f_path.dentry);
+
        err = deny_write_access(file);
        if (err) {
                fput(file);
index f6c94f2..c22d014 100644 (file)
@@ -713,7 +713,9 @@ static struct dquot_operations ext3_quota_operations = {
        .acquire_dquot  = ext3_acquire_dquot,
        .release_dquot  = ext3_release_dquot,
        .mark_dirty     = ext3_mark_dquot_dirty,
-       .write_info     = ext3_write_info
+       .write_info     = ext3_write_info,
+       .alloc_dquot    = dquot_alloc,
+       .destroy_dquot  = dquot_destroy,
 };
 
 static struct quotactl_ops ext3_qctl_operations = {
@@ -1035,8 +1037,7 @@ static int parse_options (char *options, struct super_block *sb,
                case Opt_grpjquota:
                        qtype = GRPQUOTA;
 set_qf_name:
-                       if ((sb_any_quota_enabled(sb) ||
-                            sb_any_quota_suspended(sb)) &&
+                       if (sb_any_quota_loaded(sb) &&
                            !sbi->s_qf_names[qtype]) {
                                printk(KERN_ERR
                                        "EXT3-fs: Cannot change journaled "
@@ -1075,8 +1076,7 @@ set_qf_name:
                case Opt_offgrpjquota:
                        qtype = GRPQUOTA;
 clear_qf_name:
-                       if ((sb_any_quota_enabled(sb) ||
-                            sb_any_quota_suspended(sb)) &&
+                       if (sb_any_quota_loaded(sb) &&
                            sbi->s_qf_names[qtype]) {
                                printk(KERN_ERR "EXT3-fs: Cannot change "
                                        "journaled quota options when "
@@ -1095,8 +1095,7 @@ clear_qf_name:
                case Opt_jqfmt_vfsv0:
                        qfmt = QFMT_VFS_V0;
 set_qf_format:
-                       if ((sb_any_quota_enabled(sb) ||
-                            sb_any_quota_suspended(sb)) &&
+                       if (sb_any_quota_loaded(sb) &&
                            sbi->s_jquota_fmt != qfmt) {
                                printk(KERN_ERR "EXT3-fs: Cannot change "
                                        "journaled quota options when "
@@ -1115,8 +1114,7 @@ set_qf_format:
                        set_opt(sbi->s_mount_opt, GRPQUOTA);
                        break;
                case Opt_noquota:
-                       if (sb_any_quota_enabled(sb) ||
-                           sb_any_quota_suspended(sb)) {
+                       if (sb_any_quota_loaded(sb)) {
                                printk(KERN_ERR "EXT3-fs: Cannot change quota "
                                        "options when quota turned on.\n");
                                return 0;
index 04158ad..9494bb2 100644 (file)
@@ -803,7 +803,9 @@ static struct dquot_operations ext4_quota_operations = {
        .acquire_dquot  = ext4_acquire_dquot,
        .release_dquot  = ext4_release_dquot,
        .mark_dirty     = ext4_mark_dquot_dirty,
-       .write_info     = ext4_write_info
+       .write_info     = ext4_write_info,
+       .alloc_dquot    = dquot_alloc,
+       .destroy_dquot  = dquot_destroy,
 };
 
 static struct quotactl_ops ext4_qctl_operations = {
@@ -1142,8 +1144,7 @@ static int parse_options(char *options, struct super_block *sb,
                case Opt_grpjquota:
                        qtype = GRPQUOTA;
 set_qf_name:
-                       if ((sb_any_quota_enabled(sb) ||
-                            sb_any_quota_suspended(sb)) &&
+                       if (sb_any_quota_loaded(sb) &&
                            !sbi->s_qf_names[qtype]) {
                                printk(KERN_ERR
                                       "EXT4-fs: Cannot change journaled "
@@ -1182,8 +1183,7 @@ set_qf_name:
                case Opt_offgrpjquota:
                        qtype = GRPQUOTA;
 clear_qf_name:
-                       if ((sb_any_quota_enabled(sb) ||
-                            sb_any_quota_suspended(sb)) &&
+                       if (sb_any_quota_loaded(sb) &&
                            sbi->s_qf_names[qtype]) {
                                printk(KERN_ERR "EXT4-fs: Cannot change "
                                        "journaled quota options when "
@@ -1202,8 +1202,7 @@ clear_qf_name:
                case Opt_jqfmt_vfsv0:
                        qfmt = QFMT_VFS_V0;
 set_qf_format:
-                       if ((sb_any_quota_enabled(sb) ||
-                            sb_any_quota_suspended(sb)) &&
+                       if (sb_any_quota_loaded(sb) &&
                            sbi->s_jquota_fmt != qfmt) {
                                printk(KERN_ERR "EXT4-fs: Cannot change "
                                        "journaled quota options when "
@@ -1222,7 +1221,7 @@ set_qf_format:
                        set_opt(sbi->s_mount_opt, GRPQUOTA);
                        break;
                case Opt_noquota:
-                       if (sb_any_quota_enabled(sb)) {
+                       if (sb_any_quota_loaded(sb)) {
                                printk(KERN_ERR "EXT4-fs: Cannot change quota "
                                        "options when quota turned on.\n");
                                return 0;
index d0e20ce..d488dcd 100644 (file)
@@ -253,24 +253,27 @@ static int __init proc_filesystems_init(void)
 module_init(proc_filesystems_init);
 #endif
 
-struct file_system_type *get_fs_type(const char *name)
+static struct file_system_type *__get_fs_type(const char *name, int len)
 {
        struct file_system_type *fs;
-       const char *dot = strchr(name, '.');
-       unsigned len = dot ? dot - name : strlen(name);
 
        read_lock(&file_systems_lock);
        fs = *(find_filesystem(name, len));
        if (fs && !try_module_get(fs->owner))
                fs = NULL;
        read_unlock(&file_systems_lock);
-       if (!fs && (request_module("%.*s", len, name) == 0)) {
-               read_lock(&file_systems_lock);
-               fs = *(find_filesystem(name, len));
-               if (fs && !try_module_get(fs->owner))
-                       fs = NULL;
-               read_unlock(&file_systems_lock);
-       }
+       return fs;
+}
+
+struct file_system_type *get_fs_type(const char *name)
+{
+       struct file_system_type *fs;
+       const char *dot = strchr(name, '.');
+       int len = dot ? dot - name : strlen(name);
+
+       fs = __get_fs_type(name, len);
+       if (!fs && (request_module("%.*s", len, name) == 0))
+               fs = __get_fs_type(name, len);
 
        if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) {
                put_filesystem(fs);
index 7d479ce..0ab0c6f 100644 (file)
@@ -506,7 +506,6 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
                inode->i_mode = mode;
                inode->i_uid = uid;
                inode->i_gid = gid;
-               inode->i_blocks = 0;
                inode->i_mapping->a_ops = &hugetlbfs_aops;
                inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
                inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
index 7de1cda..bd48e5e 100644 (file)
@@ -131,6 +131,8 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
        inode->i_op = &empty_iops;
        inode->i_fop = &empty_fops;
        inode->i_nlink = 1;
+       inode->i_uid = 0;
+       inode->i_gid = 0;
        atomic_set(&inode->i_writecount, 0);
        inode->i_size = 0;
        inode->i_blocks = 0;
index 3f8af0f..6147ec3 100644 (file)
@@ -855,10 +855,6 @@ root_found:
        }
        sbi->s_joliet_level = joliet_level;
 
-       /* check the root inode */
-       if (!inode->i_op)
-               goto out_bad_root;
-
        /* Make sure the root inode is a directory */
        if (!S_ISDIR(inode->i_mode)) {
                printk(KERN_WARNING
@@ -886,8 +882,6 @@ root_found:
        /*
         * Display error messages and free resources.
         */
-out_bad_root:
-       printk(KERN_WARNING "%s: root inode not initialized\n", __func__);
 out_iput:
        iput(inode);
        goto out_no_inode;
index ebc667b..c8a1bac 100644 (file)
@@ -509,6 +509,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
                if (is_journal_aborted(journal)) {
                        clear_buffer_jbddirty(jh2bh(jh));
                        JBUFFER_TRACE(jh, "journal is aborting: refile");
+                       jbd2_buffer_abort_trigger(jh,
+                                                 jh->b_frozen_data ?
+                                                 jh->b_frozen_triggers :
+                                                 jh->b_triggers);
                        jbd2_journal_refile_buffer(journal, jh);
                        /* If that was the last one, we need to clean up
                         * any descriptor buffers which may have been
@@ -844,6 +848,9 @@ restart_loop:
                 * data.
                 *
                 * Otherwise, we can just throw away the frozen data now.
+                *
+                * We also know that the frozen data has already fired
+                * its triggers if they exist, so we can clear that too.
                 */
                if (jh->b_committed_data) {
                        jbd2_free(jh->b_committed_data, bh->b_size);
@@ -851,10 +858,12 @@ restart_loop:
                        if (jh->b_frozen_data) {
                                jh->b_committed_data = jh->b_frozen_data;
                                jh->b_frozen_data = NULL;
+                               jh->b_frozen_triggers = NULL;
                        }
                } else if (jh->b_frozen_data) {
                        jbd2_free(jh->b_frozen_data, bh->b_size);
                        jh->b_frozen_data = NULL;
+                       jh->b_frozen_triggers = NULL;
                }
 
                spin_lock(&journal->j_list_lock);
index e70d657..f6bff9d 100644 (file)
@@ -50,6 +50,7 @@ EXPORT_SYMBOL(jbd2_journal_unlock_updates);
 EXPORT_SYMBOL(jbd2_journal_get_write_access);
 EXPORT_SYMBOL(jbd2_journal_get_create_access);
 EXPORT_SYMBOL(jbd2_journal_get_undo_access);
+EXPORT_SYMBOL(jbd2_journal_set_triggers);
 EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
 EXPORT_SYMBOL(jbd2_journal_release_buffer);
 EXPORT_SYMBOL(jbd2_journal_forget);
@@ -290,6 +291,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
        struct page *new_page;
        unsigned int new_offset;
        struct buffer_head *bh_in = jh2bh(jh_in);
+       struct jbd2_buffer_trigger_type *triggers;
 
        /*
         * The buffer really shouldn't be locked: only the current committing
@@ -314,13 +316,23 @@ repeat:
                done_copy_out = 1;
                new_page = virt_to_page(jh_in->b_frozen_data);
                new_offset = offset_in_page(jh_in->b_frozen_data);
+               triggers = jh_in->b_frozen_triggers;
        } else {
                new_page = jh2bh(jh_in)->b_page;
                new_offset = offset_in_page(jh2bh(jh_in)->b_data);
+               triggers = jh_in->b_triggers;
        }
 
        mapped_data = kmap_atomic(new_page, KM_USER0);
        /*
+        * Fire any commit trigger.  Do this before checking for escaping,
+        * as the trigger may modify the magic offset.  If a copy-out
+        * happens afterwards, it will have the correct data in the buffer.
+        */
+       jbd2_buffer_commit_trigger(jh_in, mapped_data + new_offset,
+                                  triggers);
+
+       /*
         * Check for escaping
         */
        if (*((__be32 *)(mapped_data + new_offset)) ==
@@ -352,6 +364,13 @@ repeat:
                new_page = virt_to_page(tmp);
                new_offset = offset_in_page(tmp);
                done_copy_out = 1;
+
+               /*
+                * This isn't strictly necessary, as we're using frozen
+                * data for the escaping, but it keeps consistency with
+                * b_frozen_data usage.
+                */
+               jh_in->b_frozen_triggers = jh_in->b_triggers;
        }
 
        /*
index 39b7805..4f925a4 100644 (file)
@@ -741,6 +741,12 @@ done:
                source = kmap_atomic(page, KM_USER0);
                memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
                kunmap_atomic(source, KM_USER0);
+
+               /*
+                * Now that the frozen data is saved off, we need to store
+                * any matching triggers.
+                */
+               jh->b_frozen_triggers = jh->b_triggers;
        }
        jbd_unlock_bh_state(bh);
 
@@ -944,6 +950,47 @@ out:
 }
 
 /**
+ * void jbd2_journal_set_triggers() - Add triggers for commit writeout
+ * @bh: buffer to trigger on
+ * @type: struct jbd2_buffer_trigger_type containing the trigger(s).
+ *
+ * Set any triggers on this journal_head.  This is always safe, because
+ * triggers for a committing buffer will be saved off, and triggers for
+ * a running transaction will match the buffer in that transaction.
+ *
+ * Call with NULL to clear the triggers.
+ */
+void jbd2_journal_set_triggers(struct buffer_head *bh,
+                              struct jbd2_buffer_trigger_type *type)
+{
+       struct journal_head *jh = bh2jh(bh);
+
+       jh->b_triggers = type;
+}
+
+void jbd2_buffer_commit_trigger(struct journal_head *jh, void *mapped_data,
+                               struct jbd2_buffer_trigger_type *triggers)
+{
+       struct buffer_head *bh = jh2bh(jh);
+
+       if (!triggers || !triggers->t_commit)
+               return;
+
+       triggers->t_commit(triggers, bh, mapped_data, bh->b_size);
+}
+
+void jbd2_buffer_abort_trigger(struct journal_head *jh,
+                              struct jbd2_buffer_trigger_type *triggers)
+{
+       if (!triggers || !triggers->t_abort)
+               return;
+
+       triggers->t_abort(triggers, jh2bh(jh));
+}
+
+
+
+/**
  * int jbd2_journal_dirty_metadata() -  mark a buffer as containing dirty metadata
  * @handle: transaction to add buffer to.
  * @bh: buffer to mark
index d6363d8..0f94381 100644 (file)
@@ -58,9 +58,9 @@
 
 /*
  * __mark_inode_dirty expects inodes to be hashed.  Since we don't want
- * special inodes in the fileset inode space, we hash them to a dummy head
+ * special inodes in the fileset inode space, we make them appear hashed,
+ * but do not put them on any list.
  */
-static HLIST_HEAD(aggregate_hash);
 
 /*
  * imap locks
@@ -496,7 +496,11 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary)
        /* release the page */
        release_metapage(mp);
 
-       hlist_add_head(&ip->i_hash, &aggregate_hash);
+       /*
+        * Make ip look hashed without putting it on any list; hlist_del()
+        * will work fine and require no locking.
+        */
+       ip->i_hash.pprev = &ip->i_hash.next;
 
        return (ip);
 }
index bdaec17..49b4409 100644 (file)
@@ -231,7 +231,6 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name,
         */
        root->i_ino = 1;
        root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
-       root->i_uid = root->i_gid = 0;
        root->i_atime = root->i_mtime = root->i_ctime = CURRENT_TIME;
        dentry = d_alloc(NULL, &d_name);
        if (!dentry) {
@@ -436,8 +435,6 @@ int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files
         */
        inode->i_ino = 1;
        inode->i_mode = S_IFDIR | 0755;
-       inode->i_uid = inode->i_gid = 0;
-       inode->i_blocks = 0;
        inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
        inode->i_op = &simple_dir_inode_operations;
        inode->i_fop = &simple_dir_operations;
@@ -464,8 +461,6 @@ int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files
                if (!inode)
                        goto out;
                inode->i_mode = S_IFREG | files->mode;
-               inode->i_uid = inode->i_gid = 0;
-               inode->i_blocks = 0;
                inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
                inode->i_fop = files->ops;
                inode->i_ino = i;
index df2d3df..f05bed2 100644 (file)
@@ -257,7 +257,7 @@ int inode_permission(struct inode *inode, int mask)
                        return -EACCES;
        }
 
-       if (inode->i_op && inode->i_op->permission)
+       if (inode->i_op->permission)
                retval = inode->i_op->permission(inode, mask);
        else
                retval = generic_permission(inode, mask, NULL);
@@ -432,7 +432,7 @@ static int exec_permission_lite(struct inode *inode)
 {
        umode_t mode = inode->i_mode;
 
-       if (inode->i_op && inode->i_op->permission)
+       if (inode->i_op->permission)
                return -EAGAIN;
 
        if (current_fsuid() == inode->i_uid)
@@ -908,9 +908,6 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
                inode = next.dentry->d_inode;
                if (!inode)
                        goto out_dput;
-               err = -ENOTDIR; 
-               if (!inode->i_op)
-                       goto out_dput;
 
                if (inode->i_op->follow_link) {
                        err = do_follow_link(&next, nd);
@@ -920,9 +917,6 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
                        inode = nd->path.dentry->d_inode;
                        if (!inode)
                                break;
-                       err = -ENOTDIR; 
-                       if (!inode->i_op)
-                               break;
                } else
                        path_to_nameidata(&next, nd);
                err = -ENOTDIR; 
@@ -961,7 +955,7 @@ last_component:
                        break;
                inode = next.dentry->d_inode;
                if ((lookup_flags & LOOKUP_FOLLOW)
-                   && inode && inode->i_op && inode->i_op->follow_link) {
+                   && inode && inode->i_op->follow_link) {
                        err = do_follow_link(&next, nd);
                        if (err)
                                goto return_err;
@@ -973,7 +967,7 @@ last_component:
                        break;
                if (lookup_flags & LOOKUP_DIRECTORY) {
                        err = -ENOTDIR; 
-                       if (!inode->i_op || !inode->i_op->lookup)
+                       if (!inode->i_op->lookup)
                                break;
                }
                goto return_base;
@@ -1469,7 +1463,7 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
        if (error)
                return error;
 
-       if (!dir->i_op || !dir->i_op->create)
+       if (!dir->i_op->create)
                return -EACCES; /* shouldn't it be ENOSYS? */
        mode &= S_IALLUGO;
        mode |= S_IFREG;
@@ -1752,7 +1746,7 @@ do_last:
        error = -ENOENT;
        if (!path.dentry->d_inode)
                goto exit_dput;
-       if (path.dentry->d_inode->i_op && path.dentry->d_inode->i_op->follow_link)
+       if (path.dentry->d_inode->i_op->follow_link)
                goto do_link;
 
        path_to_nameidata(&path, &nd);
@@ -1933,7 +1927,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
        if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD))
                return -EPERM;
 
-       if (!dir->i_op || !dir->i_op->mknod)
+       if (!dir->i_op->mknod)
                return -EPERM;
 
        error = devcgroup_inode_mknod(mode, dev);
@@ -2035,7 +2029,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        if (error)
                return error;
 
-       if (!dir->i_op || !dir->i_op->mkdir)
+       if (!dir->i_op->mkdir)
                return -EPERM;
 
        mode &= (S_IRWXUGO|S_ISVTX);
@@ -2126,7 +2120,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
        if (error)
                return error;
 
-       if (!dir->i_op || !dir->i_op->rmdir)
+       if (!dir->i_op->rmdir)
                return -EPERM;
 
        DQUOT_INIT(dir);
@@ -2213,7 +2207,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
        if (error)
                return error;
 
-       if (!dir->i_op || !dir->i_op->unlink)
+       if (!dir->i_op->unlink)
                return -EPERM;
 
        DQUOT_INIT(dir);
@@ -2320,7 +2314,7 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
        if (error)
                return error;
 
-       if (!dir->i_op || !dir->i_op->symlink)
+       if (!dir->i_op->symlink)
                return -EPERM;
 
        error = security_inode_symlink(dir, dentry, oldname);
@@ -2401,7 +2395,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
         */
        if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
                return -EPERM;
-       if (!dir->i_op || !dir->i_op->link)
+       if (!dir->i_op->link)
                return -EPERM;
        if (S_ISDIR(inode->i_mode))
                return -EPERM;
@@ -2608,7 +2602,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        if (error)
                return error;
 
-       if (!old_dir->i_op || !old_dir->i_op->rename)
+       if (!old_dir->i_op->rename)
                return -EPERM;
 
        DQUOT_INIT(old_dir);
index d1c5f78..44aa92a 100644 (file)
@@ -744,45 +744,16 @@ nfsd_close(struct file *filp)
        fput(filp);
 }
 
-/*
- * Sync a file
- * As this calls fsync (not fdatasync) there is no need for a write_inode
- * after it.
- */
-static inline int nfsd_dosync(struct file *filp, struct dentry *dp,
-                             const struct file_operations *fop)
-{
-       struct inode *inode = dp->d_inode;
-       int (*fsync) (struct file *, struct dentry *, int);
-       int err;
-
-       err = filemap_fdatawrite(inode->i_mapping);
-       if (err == 0 && fop && (fsync = fop->fsync))
-               err = fsync(filp, dp, 0);
-       if (err == 0)
-               err = filemap_fdatawait(inode->i_mapping);
-
-       return err;
-}
-       
-
 static int
 nfsd_sync(struct file *filp)
 {
-        int err;
-       struct inode *inode = filp->f_path.dentry->d_inode;
-       dprintk("nfsd: sync file %s\n", filp->f_path.dentry->d_name.name);
-       mutex_lock(&inode->i_mutex);
-       err=nfsd_dosync(filp, filp->f_path.dentry, filp->f_op);
-       mutex_unlock(&inode->i_mutex);
-
-       return err;
+       return vfs_fsync(filp, filp->f_path.dentry, 0);
 }
 
 int
-nfsd_sync_dir(struct dentry *dp)
+nfsd_sync_dir(struct dentry *dentry)
 {
-       return nfsd_dosync(NULL, dp, dp->d_inode->i_fop);
+       return vfs_fsync(NULL, dentry, 0);
 }
 
 /*
@@ -1211,7 +1182,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
        dirp = dentry->d_inode;
 
        err = nfserr_notdir;
-       if(!dirp->i_op || !dirp->i_op->lookup)
+       if (!dirp->i_op->lookup)
                goto out;
        /*
         * Check whether the response file handle has been verified yet.
@@ -1347,7 +1318,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
        /* Get all the sanity checks out of the way before
         * we lock the parent. */
        err = nfserr_notdir;
-       if(!dirp->i_op || !dirp->i_op->lookup)
+       if (!dirp->i_op->lookup)
                goto out;
        fh_lock_nested(fhp, I_MUTEX_PARENT);
 
@@ -1482,7 +1453,7 @@ nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp)
        inode = dentry->d_inode;
 
        err = nfserr_inval;
-       if (!inode->i_op || !inode->i_op->readlink)
+       if (!inode->i_op->readlink)
                goto out;
 
        touch_atime(fhp->fh_export->ex_path.mnt, dentry);
@@ -2162,7 +2133,7 @@ nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl)
        size_t size;
        int error;
 
-       if (!IS_POSIXACL(inode) || !inode->i_op ||
+       if (!IS_POSIXACL(inode) ||
            !inode->i_op->setxattr || !inode->i_op->removexattr)
                return -EOPNOTSUPP;
        switch(type) {
index 400f806..81b8644 100644 (file)
@@ -704,7 +704,7 @@ fput_and_out:
        return ret;
 }
 
-asmlinkage long sys_inotify_rm_watch(int fd, u32 wd)
+asmlinkage long sys_inotify_rm_watch(int fd, __s32 wd)
 {
        struct file *filp;
        struct inotify_device *dev;
index e9da092..86bef15 100644 (file)
@@ -1406,9 +1406,6 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
                ni->allocated_size = sle64_to_cpu(
                                a->data.non_resident.allocated_size);
        }
-       /* Setup the operations for this attribute inode. */
-       vi->i_op = NULL;
-       vi->i_fop = NULL;
        if (NInoMstProtected(ni))
                vi->i_mapping->a_ops = &ntfs_mst_aops;
        else
index 589dcdf..0159607 100644 (file)
@@ -12,6 +12,7 @@ obj-$(CONFIG_OCFS2_FS_USERSPACE_CLUSTER) += ocfs2_stack_user.o
 ocfs2-objs := \
        alloc.o                 \
        aops.o                  \
+       blockcheck.o            \
        buffer_head_io.o        \
        dcache.o                \
        dir.o                   \
@@ -35,8 +36,14 @@ ocfs2-objs := \
        sysfile.o               \
        uptodate.o              \
        ver.o                   \
+       quota_local.o           \
+       quota_global.o          \
        xattr.o
 
+ifeq ($(CONFIG_OCFS2_FS_POSIX_ACL),y)
+ocfs2-objs += acl.o
+endif
+
 ocfs2_stackglue-objs := stackglue.o
 ocfs2_stack_o2cb-objs := stack_o2cb.o
 ocfs2_stack_user-objs := stack_user.o
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
new file mode 100644 (file)
index 0000000..12dfb44
--- /dev/null
@@ -0,0 +1,479 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * acl.c
+ *
+ * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
+ *
+ * CREDITS:
+ * Lots of code in this file is copied from linux/fs/ext3/acl.c.
+ * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#define MLOG_MASK_PREFIX ML_INODE
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+#include "alloc.h"
+#include "dlmglue.h"
+#include "file.h"
+#include "ocfs2_fs.h"
+
+#include "xattr.h"
+#include "acl.h"
+
+/*
+ * Convert from xattr value to acl struct.
+ *
+ * @value is parsed as a packed array of little-endian
+ * struct ocfs2_acl_entry records, so the entry count and the stride
+ * are computed from that structure rather than from the in-memory
+ * struct posix_acl_entry.
+ *
+ * Returns NULL when @value is NULL or yields no entries,
+ * ERR_PTR(-EINVAL) when @size cannot hold one entry or the entry count
+ * does not fit in an int, ERR_PTR(-ENOMEM) when the acl cannot be
+ * allocated, and the newly allocated acl otherwise.
+ */
+static struct posix_acl *ocfs2_acl_from_xattr(const void *value, size_t size)
+{
+       const struct ocfs2_acl_entry *entry = value;
+       struct posix_acl *acl;
+       int n, count;
+
+       if (!value)
+               return NULL;
+       if (size < sizeof(struct ocfs2_acl_entry))
+               return ERR_PTR(-EINVAL);
+
+       /* The size_t quotient is narrowed to int; reject a wrapped count. */
+       count = size / sizeof(struct ocfs2_acl_entry);
+       if (count < 0)
+               return ERR_PTR(-EINVAL);
+       if (count == 0)
+               return NULL;
+
+       acl = posix_acl_alloc(count, GFP_NOFS);
+       if (!acl)
+               return ERR_PTR(-ENOMEM);
+
+       for (n = 0; n < count; n++, entry++) {
+               acl->a_entries[n].e_tag  = le16_to_cpu(entry->e_tag);
+               acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
+               acl->a_entries[n].e_id   = le32_to_cpu(entry->e_id);
+       }
+       return acl;
+}
+
+/*
+ * Convert acl struct to xattr value.
+ *
+ * The result is a kmalloc()ed array of acl->a_count little-endian
+ * struct ocfs2_acl_entry records.  The buffer is sized from that same
+ * structure so the allocation always matches what the loop writes.
+ *
+ * *size is set to the byte length of the value (even when allocation
+ * fails).  Returns the buffer, or ERR_PTR(-ENOMEM) if it cannot be
+ * allocated.
+ */
+static void *ocfs2_acl_to_xattr(const struct posix_acl *acl, size_t *size)
+{
+       struct ocfs2_acl_entry *entry;
+       size_t n;
+
+       *size = acl->a_count * sizeof(struct ocfs2_acl_entry);
+
+       entry = kmalloc(*size, GFP_NOFS);
+       if (!entry)
+               return ERR_PTR(-ENOMEM);
+
+       for (n = 0; n < acl->a_count; n++) {
+               entry[n].e_tag  = cpu_to_le16(acl->a_entries[n].e_tag);
+               entry[n].e_perm = cpu_to_le16(acl->a_entries[n].e_perm);
+               entry[n].e_id   = cpu_to_le32(acl->a_entries[n].e_id);
+       }
+       return entry;
+}
+
+/*
+ * Fetch the ACL of the requested type for @inode without taking the
+ * inode lock; the caller passes the inode's buffer head in @di_bh.
+ *
+ * Returns NULL when ACLs are not enabled by the mount options or no
+ * ACL is stored, an ERR_PTR() on failure, and the decoded acl otherwise.
+ */
+static struct posix_acl *ocfs2_get_acl_nolock(struct inode *inode,
+                                             int type,
+                                             struct buffer_head *di_bh)
+{
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+       struct posix_acl *result;
+       char *buf = NULL;
+       int index;
+       int len;
+
+       if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+               return NULL;
+
+       if (type == ACL_TYPE_ACCESS)
+               index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
+       else if (type == ACL_TYPE_DEFAULT)
+               index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT;
+       else
+               return ERR_PTR(-EINVAL);
+
+       /*
+        * Probe with no buffer first; a positive result is used as the
+        * size of the buffer for the real read.
+        */
+       len = ocfs2_xattr_get_nolock(inode, di_bh, index, "", NULL, 0);
+       if (len > 0) {
+               buf = kmalloc(len, GFP_NOFS);
+               if (!buf)
+                       return ERR_PTR(-ENOMEM);
+               len = ocfs2_xattr_get_nolock(inode, di_bh, index,
+                                            "", buf, len);
+       }
+
+       if (len > 0)
+               result = ocfs2_acl_from_xattr(buf, len);
+       else if (len == 0 || len == -ENODATA)
+               result = NULL;
+       else
+               result = ERR_PTR(len);
+
+       kfree(buf);
+
+       return result;
+}
+
+
+/*
+ * Get posix acl.
+ *
+ * Takes the inode lock around ocfs2_get_acl_nolock() and drops it
+ * again before returning.  Returns NULL when ACLs are not enabled by
+ * the mount options, or ERR_PTR() if the lock cannot be taken.
+ */
+static struct posix_acl *ocfs2_get_acl(struct inode *inode, int type)
+{
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+       struct buffer_head *di_bh = NULL;
+       struct posix_acl *acl;
+       int status;
+
+       if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+               return NULL;
+
+       status = ocfs2_inode_lock(inode, &di_bh, 0);
+       if (status < 0) {
+               mlog_errno(status);
+               return ERR_PTR(status);
+       }
+
+       acl = ocfs2_get_acl_nolock(inode, type, di_bh);
+
+       ocfs2_inode_unlock(inode, 0);
+       brelse(di_bh);
+
+       return acl;
+}
+
+/*
+ * Set the access or default ACL of an inode.
+ *
+ * For an access ACL, inode->i_mode is updated from the ACL and, when
+ * posix_acl_equiv_mode() returns 0, no ACL xattr value is stored.
+ * A default ACL is only accepted on directories.  With a non-NULL
+ * @handle the xattr is written through ocfs2_xattr_set_handle(),
+ * otherwise through ocfs2_xattr_set().
+ */
+static int ocfs2_set_acl(handle_t *handle,
+                        struct inode *inode,
+                        struct buffer_head *di_bh,
+                        int type,
+                        struct posix_acl *acl,
+                        struct ocfs2_alloc_context *meta_ac,
+                        struct ocfs2_alloc_context *data_ac)
+{
+       void *xattr_val = NULL;
+       size_t xattr_len = 0;
+       int name_index;
+       int ret;
+
+       if (S_ISLNK(inode->i_mode))
+               return -EOPNOTSUPP;
+
+       switch (type) {
+       case ACL_TYPE_ACCESS:
+               name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
+               if (acl) {
+                       mode_t mode = inode->i_mode;
+
+                       ret = posix_acl_equiv_mode(acl, &mode);
+                       if (ret < 0)
+                               return ret;
+
+                       inode->i_mode = mode;
+                       if (ret == 0)
+                               acl = NULL;
+               }
+               break;
+       case ACL_TYPE_DEFAULT:
+               name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT;
+               if (!S_ISDIR(inode->i_mode)) {
+                       if (acl)
+                               return -EACCES;
+                       return 0;
+               }
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       if (acl) {
+               xattr_val = ocfs2_acl_to_xattr(acl, &xattr_len);
+               if (IS_ERR(xattr_val))
+                       return (int)PTR_ERR(xattr_val);
+       }
+
+       if (!handle)
+               ret = ocfs2_xattr_set(inode, name_index, "",
+                                     xattr_val, xattr_len, 0);
+       else
+               ret = ocfs2_xattr_set_handle(handle, inode, di_bh, name_index,
+                                            "", xattr_val, xattr_len, 0,
+                                            meta_ac, data_ac);
+
+       kfree(xattr_val);
+
+       return ret;
+}
+
+/*
+ * Check @mask against the inode's access ACL.  Returns the result of
+ * posix_acl_permission(), the lookup error, or -EAGAIN when the inode
+ * has no access ACL.
+ */
+int ocfs2_check_acl(struct inode *inode, int mask)
+{
+       struct posix_acl *acl;
+       int ret;
+
+       acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS);
+       if (IS_ERR(acl))
+               return PTR_ERR(acl);
+       if (!acl)
+               return -EAGAIN;
+
+       ret = posix_acl_permission(inode, acl, mask);
+       posix_acl_release(acl);
+
+       return ret;
+}
+
+/*
+ * Re-derive the access ACL from inode->i_mode: a clone of the current
+ * ACL is adjusted by posix_acl_chmod_masq() and written back.  Returns
+ * 0 when ACLs are not enabled or the inode has no access ACL.
+ */
+int ocfs2_acl_chmod(struct inode *inode)
+{
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+       struct posix_acl *cur, *copy;
+       int ret;
+
+       if (S_ISLNK(inode->i_mode))
+               return -EOPNOTSUPP;
+       if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+               return 0;
+
+       cur = ocfs2_get_acl(inode, ACL_TYPE_ACCESS);
+       if (IS_ERR(cur))
+               return PTR_ERR(cur);
+       if (!cur)
+               return 0;
+
+       copy = posix_acl_clone(cur, GFP_KERNEL);
+       posix_acl_release(cur);
+       if (!copy)
+               return -ENOMEM;
+
+       ret = posix_acl_chmod_masq(copy, inode->i_mode);
+       if (ret == 0)
+               ret = ocfs2_set_acl(NULL, inode, NULL, ACL_TYPE_ACCESS,
+                                   copy, NULL, NULL);
+
+       posix_acl_release(copy);
+       return ret;
+}
+
+/*
+ * Initialize the ACLs of a new inode. If parent directory has default ACL,
+ * then clone to new inode. Called from ocfs2_mknod.
+ *
+ * For a non-symlink inode with no default ACL to inherit, the current
+ * process umask is applied to inode->i_mode instead.  When a default
+ * ACL is found, a new directory also receives it as its own default
+ * ACL, and a clone adjusted by posix_acl_create_masq() updates
+ * inode->i_mode and is stored as the access ACL only if that helper
+ * returns a positive value.
+ *
+ * Returns 0 on success or a negative errno.  The reference on the
+ * parent's ACL is dropped on every path that reaches the cleanup label.
+ */
+int ocfs2_init_acl(handle_t *handle,
+                  struct inode *inode,
+                  struct inode *dir,
+                  struct buffer_head *di_bh,
+                  struct buffer_head *dir_bh,
+                  struct ocfs2_alloc_context *meta_ac,
+                  struct ocfs2_alloc_context *data_ac)
+{
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+       struct posix_acl *acl = NULL;
+       int ret = 0;
+
+       if (!S_ISLNK(inode->i_mode)) {
+               if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
+                       acl = ocfs2_get_acl_nolock(dir, ACL_TYPE_DEFAULT,
+                                                  dir_bh);
+                       if (IS_ERR(acl))
+                               return PTR_ERR(acl);
+               }
+               if (!acl)
+                       inode->i_mode &= ~current->fs->umask;
+       }
+       if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) {
+               struct posix_acl *clone;
+               mode_t mode;
+
+               if (S_ISDIR(inode->i_mode)) {
+                       ret = ocfs2_set_acl(handle, inode, di_bh,
+                                           ACL_TYPE_DEFAULT, acl,
+                                           meta_ac, data_ac);
+                       if (ret)
+                               goto cleanup;
+               }
+               clone = posix_acl_clone(acl, GFP_NOFS);
+               ret = -ENOMEM;
+               if (!clone)
+                       goto cleanup;
+
+               mode = inode->i_mode;
+               ret = posix_acl_create_masq(clone, &mode);
+               if (ret >= 0) {
+                       inode->i_mode = mode;
+                       if (ret > 0) {
+                               ret = ocfs2_set_acl(handle, inode,
+                                                   di_bh, ACL_TYPE_ACCESS,
+                                                   clone, meta_ac, data_ac);
+                       }
+               }
+               posix_acl_release(clone);
+       }
+cleanup:
+       posix_acl_release(acl);
+       return ret;
+}
+
+/*
+ * xattr ->list hook for the access ACL: copy the attribute name
+ * (including its trailing NUL) into @list when it fits, and return the
+ * number of bytes the name needs.  Returns 0 when ACLs are not enabled.
+ */
+static size_t ocfs2_xattr_list_acl_access(struct inode *inode,
+                                         char *list,
+                                         size_t list_len,
+                                         const char *name,
+                                         size_t name_len)
+{
+       const size_t needed = sizeof(POSIX_ACL_XATTR_ACCESS);
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+       if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+               return 0;
+
+       if (!list || list_len < needed)
+               return needed;
+
+       memcpy(list, POSIX_ACL_XATTR_ACCESS, needed);
+       return needed;
+}
+
+/*
+ * xattr ->list hook for the default ACL: copy the attribute name
+ * (including its trailing NUL) into @list when it fits, and return the
+ * number of bytes the name needs.  Returns 0 when ACLs are not enabled.
+ */
+static size_t ocfs2_xattr_list_acl_default(struct inode *inode,
+                                          char *list,
+                                          size_t list_len,
+                                          const char *name,
+                                          size_t name_len)
+{
+       const size_t needed = sizeof(POSIX_ACL_XATTR_DEFAULT);
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+       if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+               return 0;
+
+       if (!list || list_len < needed)
+               return needed;
+
+       memcpy(list, POSIX_ACL_XATTR_DEFAULT, needed);
+       return needed;
+}
+
+/*
+ * Common ->get implementation for both ACL xattrs: look up the ACL of
+ * @type and serialise it into @buffer with posix_acl_to_xattr().
+ * Returns -EOPNOTSUPP when ACLs are not enabled and -ENODATA when the
+ * inode has no ACL of that type.
+ */
+static int ocfs2_xattr_get_acl(struct inode *inode,
+                              int type,
+                              void *buffer,
+                              size_t size)
+{
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+       struct posix_acl *acl;
+       int len;
+
+       if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+               return -EOPNOTSUPP;
+
+       acl = ocfs2_get_acl(inode, type);
+       if (!acl)
+               return -ENODATA;
+       if (IS_ERR(acl))
+               return PTR_ERR(acl);
+
+       len = posix_acl_to_xattr(acl, buffer, size);
+       posix_acl_release(acl);
+
+       return len;
+}
+
+/* ->get hook for the access ACL; only the empty name suffix is valid. */
+static int ocfs2_xattr_get_acl_access(struct inode *inode,
+                                     const char *name,
+                                     void *buffer,
+                                     size_t size)
+{
+       if (strcmp(name, "") == 0)
+               return ocfs2_xattr_get_acl(inode, ACL_TYPE_ACCESS,
+                                          buffer, size);
+       return -EINVAL;
+}
+
+/* ->get hook for the default ACL; only the empty name suffix is valid. */
+static int ocfs2_xattr_get_acl_default(struct inode *inode,
+                                      const char *name,
+                                      void *buffer,
+                                      size_t size)
+{
+       if (strcmp(name, "") == 0)
+               return ocfs2_xattr_get_acl(inode, ACL_TYPE_DEFAULT,
+                                          buffer, size);
+       return -EINVAL;
+}
+
+/*
+ * Common ->set implementation for both ACL xattrs.  A NULL @value
+ * passes a NULL acl to ocfs2_set_acl(); otherwise the value is decoded
+ * with posix_acl_from_xattr() and validated before being stored.
+ * Returns -EOPNOTSUPP when ACLs are not enabled and -EPERM when
+ * is_owner_or_cap() rejects the caller.
+ */
+static int ocfs2_xattr_set_acl(struct inode *inode,
+                              int type,
+                              const void *value,
+                              size_t size)
+{
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+       struct posix_acl *acl = NULL;
+       int ret;
+
+       if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+               return -EOPNOTSUPP;
+
+       if (!is_owner_or_cap(inode))
+               return -EPERM;
+
+       if (value) {
+               acl = posix_acl_from_xattr(value, size);
+               if (IS_ERR(acl))
+                       return PTR_ERR(acl);
+               if (acl) {
+                       ret = posix_acl_valid(acl);
+                       if (ret)
+                               goto out_release;
+               }
+       }
+
+       ret = ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL);
+
+out_release:
+       posix_acl_release(acl);
+       return ret;
+}
+
+/* ->set hook for the access ACL; only the empty name suffix is valid. */
+static int ocfs2_xattr_set_acl_access(struct inode *inode,
+                                     const char *name,
+                                     const void *value,
+                                     size_t size,
+                                     int flags)
+{
+       if (strcmp(name, "") == 0)
+               return ocfs2_xattr_set_acl(inode, ACL_TYPE_ACCESS,
+                                          value, size);
+       return -EINVAL;
+}
+
+/* ->set hook for the default ACL; only the empty name suffix is valid. */
+static int ocfs2_xattr_set_acl_default(struct inode *inode,
+                                      const char *name,
+                                      const void *value,
+                                      size_t size,
+                                      int flags)
+{
+       if (strcmp(name, "") == 0)
+               return ocfs2_xattr_set_acl(inode, ACL_TYPE_DEFAULT,
+                                          value, size);
+       return -EINVAL;
+}
+
+/* xattr handler wiring the access ACL name to its get/set/list hooks. */
+struct xattr_handler ocfs2_xattr_acl_access_handler = {
+       .prefix = POSIX_ACL_XATTR_ACCESS,
+       .get    = ocfs2_xattr_get_acl_access,
+       .set    = ocfs2_xattr_set_acl_access,
+       .list   = ocfs2_xattr_list_acl_access,
+};
+
+/* xattr handler wiring the default ACL name to its get/set/list hooks. */
+struct xattr_handler ocfs2_xattr_acl_default_handler = {
+       .prefix = POSIX_ACL_XATTR_DEFAULT,
+       .get    = ocfs2_xattr_get_acl_default,
+       .set    = ocfs2_xattr_set_acl_default,
+       .list   = ocfs2_xattr_list_acl_default,
+};
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
new file mode 100644 (file)
index 0000000..8f6389e
--- /dev/null
@@ -0,0 +1,58 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * acl.h
+ *
+ * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef OCFS2_ACL_H
+#define OCFS2_ACL_H
+
+#include <linux/posix_acl_xattr.h>
+
+/*
+ * One ACL entry as stored in an ACL xattr value.  All fields are
+ * little-endian; acl.c converts them to and from the e_tag/e_perm/e_id
+ * fields of the in-memory posix_acl entries.
+ */
+struct ocfs2_acl_entry {
+       __le16 e_tag;
+       __le16 e_perm;
+       __le32 e_id;
+};
+
+#ifdef CONFIG_OCFS2_FS_POSIX_ACL
+
+extern int ocfs2_check_acl(struct inode *, int);
+extern int ocfs2_acl_chmod(struct inode *);
+extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
+                         struct buffer_head *, struct buffer_head *,
+                         struct ocfs2_alloc_context *,
+                         struct ocfs2_alloc_context *);
+
+#else /* CONFIG_OCFS2_FS_POSIX_ACL*/
+
+#define ocfs2_check_acl NULL
+/* Stub for builds without CONFIG_OCFS2_FS_POSIX_ACL: always succeeds. */
+static inline int ocfs2_acl_chmod(struct inode *inode)
+{
+       return 0;
+}
+/* Stub for builds without CONFIG_OCFS2_FS_POSIX_ACL: always succeeds. */
+static inline int ocfs2_init_acl(handle_t *handle,
+                                struct inode *inode,
+                                struct inode *dir,
+                                struct buffer_head *di_bh,
+                                struct buffer_head *dir_bh,
+                                struct ocfs2_alloc_context *meta_ac,
+                                struct ocfs2_alloc_context *data_ac)
+{
+       return 0;
+}
+
+#endif /* CONFIG_OCFS2_FS_POSIX_ACL*/
+
+#endif /* OCFS2_ACL_H */
index 0cc2deb..54ff4c7 100644 (file)
@@ -28,6 +28,7 @@
 #include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/swap.h>
+#include <linux/quotaops.h>
 
 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
 #include <cluster/masklog.h>
@@ -36,6 +37,7 @@
 
 #include "alloc.h"
 #include "aops.h"
+#include "blockcheck.h"
 #include "dlmglue.h"
 #include "extent_map.h"
 #include "inode.h"
@@ -46,6 +48,7 @@
 #include "file.h"
 #include "super.h"
 #include "uptodate.h"
+#include "xattr.h"
 
 #include "buffer_head_io.h"
 
@@ -187,20 +190,12 @@ static int ocfs2_dinode_insert_check(struct inode *inode,
 static int ocfs2_dinode_sanity_check(struct inode *inode,
                                     struct ocfs2_extent_tree *et)
 {
-       int ret = 0;
-       struct ocfs2_dinode *di;
+       struct ocfs2_dinode *di = et->et_object;
 
        BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
+       BUG_ON(!OCFS2_IS_VALID_DINODE(di));
 
-       di = et->et_object;
-       if (!OCFS2_IS_VALID_DINODE(di)) {
-               ret = -EIO;
-               ocfs2_error(inode->i_sb,
-                       "Inode %llu has invalid path root",
-                       (unsigned long long)OCFS2_I(inode)->ip_blkno);
-       }
-
-       return ret;
+       return 0;
 }
 
 static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
@@ -213,36 +208,33 @@ static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
 
 static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et)
 {
-       struct ocfs2_xattr_value_root *xv = et->et_object;
+       struct ocfs2_xattr_value_buf *vb = et->et_object;
 
-       et->et_root_el = &xv->xr_list;
+       et->et_root_el = &vb->vb_xv->xr_list;
 }
 
 static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
                                              u64 blkno)
 {
-       struct ocfs2_xattr_value_root *xv =
-               (struct ocfs2_xattr_value_root *)et->et_object;
+       struct ocfs2_xattr_value_buf *vb = et->et_object;
 
-       xv->xr_last_eb_blk = cpu_to_le64(blkno);
+       vb->vb_xv->xr_last_eb_blk = cpu_to_le64(blkno);
 }
 
 static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
 {
-       struct ocfs2_xattr_value_root *xv =
-               (struct ocfs2_xattr_value_root *) et->et_object;
+       struct ocfs2_xattr_value_buf *vb = et->et_object;
 
-       return le64_to_cpu(xv->xr_last_eb_blk);
+       return le64_to_cpu(vb->vb_xv->xr_last_eb_blk);
 }
 
 static void ocfs2_xattr_value_update_clusters(struct inode *inode,
                                              struct ocfs2_extent_tree *et,
                                              u32 clusters)
 {
-       struct ocfs2_xattr_value_root *xv =
-               (struct ocfs2_xattr_value_root *)et->et_object;
+       struct ocfs2_xattr_value_buf *vb = et->et_object;
 
-       le32_add_cpu(&xv->xr_clusters, clusters);
+       le32_add_cpu(&vb->vb_xv->xr_clusters, clusters);
 }
 
 static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
@@ -304,11 +296,13 @@ static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
 static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
                                     struct inode *inode,
                                     struct buffer_head *bh,
+                                    ocfs2_journal_access_func access,
                                     void *obj,
                                     struct ocfs2_extent_tree_operations *ops)
 {
        et->et_ops = ops;
        et->et_root_bh = bh;
+       et->et_root_journal_access = access;
        if (!obj)
                obj = (void *)bh->b_data;
        et->et_object = obj;
@@ -324,23 +318,23 @@ void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
                                   struct inode *inode,
                                   struct buffer_head *bh)
 {
-       __ocfs2_init_extent_tree(et, inode, bh, NULL, &ocfs2_dinode_et_ops);
+       __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_di,
+                                NULL, &ocfs2_dinode_et_ops);
 }
 
 void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
                                       struct inode *inode,
                                       struct buffer_head *bh)
 {
-       __ocfs2_init_extent_tree(et, inode, bh, NULL,
-                                &ocfs2_xattr_tree_et_ops);
+       __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_xb,
+                                NULL, &ocfs2_xattr_tree_et_ops);
 }
 
 void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
                                        struct inode *inode,
-                                       struct buffer_head *bh,
-                                       struct ocfs2_xattr_value_root *xv)
+                                       struct ocfs2_xattr_value_buf *vb)
 {
-       __ocfs2_init_extent_tree(et, inode, bh, xv,
+       __ocfs2_init_extent_tree(et, inode, vb->vb_bh, vb->vb_access, vb,
                                 &ocfs2_xattr_value_et_ops);
 }
 
@@ -362,6 +356,15 @@ static inline void ocfs2_et_update_clusters(struct inode *inode,
        et->et_ops->eo_update_clusters(inode, et, clusters);
 }
 
+/*
+ * Journal the root buffer of an extent tree using the access function
+ * recorded in the tree when it was initialised.
+ */
+static inline int ocfs2_et_root_journal_access(handle_t *handle,
+                                              struct inode *inode,
+                                              struct ocfs2_extent_tree *et,
+                                              int type)
+{
+       struct buffer_head *root_bh = et->et_root_bh;
+
+       return et->et_root_journal_access(handle, inode, root_bh, type);
+}
+
 static inline int ocfs2_et_insert_check(struct inode *inode,
                                        struct ocfs2_extent_tree *et,
                                        struct ocfs2_extent_rec *rec)
@@ -402,12 +405,14 @@ struct ocfs2_path_item {
 #define OCFS2_MAX_PATH_DEPTH   5
 
 struct ocfs2_path {
-       int                     p_tree_depth;
-       struct ocfs2_path_item  p_node[OCFS2_MAX_PATH_DEPTH];
+       int                             p_tree_depth;
+       ocfs2_journal_access_func       p_root_access;
+       struct ocfs2_path_item          p_node[OCFS2_MAX_PATH_DEPTH];
 };
 
 #define path_root_bh(_path) ((_path)->p_node[0].bh)
 #define path_root_el(_path) ((_path)->p_node[0].el)
+#define path_root_access(_path)((_path)->p_root_access)
 #define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
 #define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
 #define path_num_items(_path) ((_path)->p_tree_depth + 1)
@@ -440,6 +445,8 @@ static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
         */
        if (keep_root)
                depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
+       else
+               path_root_access(path) = NULL;
 
        path->p_tree_depth = depth;
 }
@@ -465,6 +472,7 @@ static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
 
        BUG_ON(path_root_bh(dest) != path_root_bh(src));
        BUG_ON(path_root_el(dest) != path_root_el(src));
+       BUG_ON(path_root_access(dest) != path_root_access(src));
 
        ocfs2_reinit_path(dest, 1);
 
@@ -486,6 +494,7 @@ static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
        int i;
 
        BUG_ON(path_root_bh(dest) != path_root_bh(src));
+       BUG_ON(path_root_access(dest) != path_root_access(src));
 
        for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
                brelse(dest->p_node[i].bh);
@@ -521,7 +530,8 @@ static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
 }
 
 static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
-                                        struct ocfs2_extent_list *root_el)
+                                        struct ocfs2_extent_list *root_el,
+                                        ocfs2_journal_access_func access)
 {
        struct ocfs2_path *path;
 
@@ -533,11 +543,48 @@ static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
                get_bh(root_bh);
                path_root_bh(path) = root_bh;
                path_root_el(path) = root_el;
+               path_root_access(path) = access;
        }
 
        return path;
 }
 
+/*
+ * Create a new path via ocfs2_new_path() that shares the root buffer,
+ * root extent list and root journal access function of @path.
+ */
+static struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path)
+{
+       struct buffer_head *root_bh = path_root_bh(path);
+       struct ocfs2_extent_list *root_el = path_root_el(path);
+
+       return ocfs2_new_path(root_bh, root_el, path_root_access(path));
+}
+
+/*
+ * Create a new path via ocfs2_new_path() rooted at the extent tree's
+ * root buffer, root extent list and root journal access function.
+ */
+static struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et)
+{
+       struct buffer_head *root_bh = et->et_root_bh;
+       struct ocfs2_extent_list *root_el = et->et_root_el;
+
+       return ocfs2_new_path(root_bh, root_el, et->et_root_journal_access);
+}
+
+/*
+ * Journal (for write) the buffer at depth idx of a path.
+ *
+ * Every idx > 0 is journalled with ocfs2_journal_access_eb().  The
+ * root (idx == 0) uses the path's root access function, falling back
+ * to plain ocfs2_journal_access() when the path has none.
+ */
+static int ocfs2_path_bh_journal_access(handle_t *handle,
+                                       struct inode *inode,
+                                       struct ocfs2_path *path,
+                                       int idx)
+{
+       ocfs2_journal_access_func journal_fn;
+
+       if (idx) {
+               journal_fn = ocfs2_journal_access_eb;
+       } else {
+               journal_fn = path_root_access(path);
+               if (!journal_fn)
+                       journal_fn = ocfs2_journal_access;
+       }
+
+       return journal_fn(handle, inode, path->p_node[idx].bh,
+                         OCFS2_JOURNAL_ACCESS_WRITE);
+}
+
 /*
  * Convenience function to journal all components in a path.
  */
@@ -550,8 +597,7 @@ static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
                goto out;
 
        for(i = 0; i < path_num_items(path); i++) {
-               ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh,
-                                          OCFS2_JOURNAL_ACCESS_WRITE);
+               ret = ocfs2_path_bh_journal_access(handle, inode, path, i);
                if (ret < 0) {
                        mlog_errno(ret);
                        goto out;
@@ -686,6 +732,80 @@ struct ocfs2_merge_ctxt {
        int                     c_split_covers_rec;
 };
 
+/*
+ * Validation callback for extent blocks.  A checksum failure is
+ * returned as-is after logging; a bad signature, block number or
+ * filesystem generation is reported through ocfs2_error() and yields
+ * -EINVAL.  Returns 0 when the block passes every check.
+ */
+static int ocfs2_validate_extent_block(struct super_block *sb,
+                                      struct buffer_head *bh)
+{
+       struct ocfs2_extent_block *eb;
+       int status;
+
+       eb = (struct ocfs2_extent_block *)bh->b_data;
+
+       mlog(0, "Validating extent block %llu\n",
+            (unsigned long long)bh->b_blocknr);
+
+       BUG_ON(!buffer_uptodate(bh));
+
+       /*
+        * An ecc failure is local to this block, so hand the error
+        * back but leave the filesystem running.
+        */
+       status = ocfs2_validate_meta_ecc(sb, bh->b_data, &eb->h_check);
+       if (status) {
+               mlog(ML_ERROR, "Checksum failed for extent block %llu\n",
+                    (unsigned long long)bh->b_blocknr);
+               return status;
+       }
+
+       /* Every failure from here on is fatal. */
+       if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+               ocfs2_error(sb,
+                           "Extent block #%llu has bad signature %.*s",
+                           (unsigned long long)bh->b_blocknr, 7,
+                           eb->h_signature);
+       } else if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) {
+               ocfs2_error(sb,
+                           "Extent block #%llu has an invalid h_blkno "
+                           "of %llu",
+                           (unsigned long long)bh->b_blocknr,
+                           (unsigned long long)le64_to_cpu(eb->h_blkno));
+       } else if (le32_to_cpu(eb->h_fs_generation) !=
+                  OCFS2_SB(sb)->fs_generation) {
+               ocfs2_error(sb,
+                           "Extent block #%llu has an invalid "
+                           "h_fs_generation of #%u",
+                           (unsigned long long)bh->b_blocknr,
+                           le32_to_cpu(eb->h_fs_generation));
+       } else {
+               return 0;
+       }
+
+       return -EINVAL;
+}
+
+/*
+ * Read an extent block through ocfs2_read_block(), validating it with
+ * ocfs2_validate_extent_block().  If the caller passed *bh == NULL, the
+ * buffer head obtained by the read is stored in *bh on success.
+ */
+int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno,
+                           struct buffer_head **bh)
+{
+       struct buffer_head *tmp = *bh;
+       int rc;
+
+       rc = ocfs2_read_block(inode, eb_blkno, &tmp,
+                             ocfs2_validate_extent_block);
+       if (rc)
+               return rc;
+
+       /* Only hand the buffer up when the caller did not supply one. */
+       if (!*bh)
+               *bh = tmp;
+
+       return 0;
+}
+
+
 /*
  * How many free extents have we got before we need more meta data?
  */
@@ -705,8 +825,7 @@ int ocfs2_num_free_extents(struct ocfs2_super *osb,
        last_eb_blk = ocfs2_et_get_last_eb_blk(et);
 
        if (last_eb_blk) {
-               retval = ocfs2_read_block(inode, last_eb_blk,
-                                         &eb_bh);
+               retval = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh);
                if (retval < 0) {
                        mlog_errno(retval);
                        goto bail;
@@ -768,8 +887,8 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
                        }
                        ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
 
-                       status = ocfs2_journal_access(handle, inode, bhs[i],
-                                                     OCFS2_JOURNAL_ACCESS_CREATE);
+                       status = ocfs2_journal_access_eb(handle, inode, bhs[i],
+                                                        OCFS2_JOURNAL_ACCESS_CREATE);
                        if (status < 0) {
                                mlog_errno(status);
                                goto bail;
@@ -908,15 +1027,12 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
        for(i = 0; i < new_blocks; i++) {
                bh = new_eb_bhs[i];
                eb = (struct ocfs2_extent_block *) bh->b_data;
-               if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-                       OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
-                       status = -EIO;
-                       goto bail;
-               }
+               /* ocfs2_create_new_meta_bhs() should create it right! */
+               BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
                eb_el = &eb->h_list;
 
-               status = ocfs2_journal_access(handle, inode, bh,
-                                             OCFS2_JOURNAL_ACCESS_CREATE);
+               status = ocfs2_journal_access_eb(handle, inode, bh,
+                                                OCFS2_JOURNAL_ACCESS_CREATE);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
@@ -955,21 +1071,21 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
         * journal_dirty erroring as it won't unless we've aborted the
         * handle (in which case we would never be here) so reserving
         * the write with journal_access is all we need to do. */
-       status = ocfs2_journal_access(handle, inode, *last_eb_bh,
-                                     OCFS2_JOURNAL_ACCESS_WRITE);
+       status = ocfs2_journal_access_eb(handle, inode, *last_eb_bh,
+                                        OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }
-       status = ocfs2_journal_access(handle, inode, et->et_root_bh,
-                                     OCFS2_JOURNAL_ACCESS_WRITE);
+       status = ocfs2_et_root_journal_access(handle, inode, et,
+                                             OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }
        if (eb_bh) {
-               status = ocfs2_journal_access(handle, inode, eb_bh,
-                                             OCFS2_JOURNAL_ACCESS_WRITE);
+               status = ocfs2_journal_access_eb(handle, inode, eb_bh,
+                                                OCFS2_JOURNAL_ACCESS_WRITE);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
@@ -1052,17 +1168,14 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
        }
 
        eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
-       if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-               OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
-               status = -EIO;
-               goto bail;
-       }
+       /* ocfs2_create_new_meta_bhs() should create it right! */
+       BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
 
        eb_el = &eb->h_list;
        root_el = et->et_root_el;
 
-       status = ocfs2_journal_access(handle, inode, new_eb_bh,
-                                     OCFS2_JOURNAL_ACCESS_CREATE);
+       status = ocfs2_journal_access_eb(handle, inode, new_eb_bh,
+                                        OCFS2_JOURNAL_ACCESS_CREATE);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
@@ -1080,8 +1193,8 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
                goto bail;
        }
 
-       status = ocfs2_journal_access(handle, inode, et->et_root_bh,
-                                     OCFS2_JOURNAL_ACCESS_WRITE);
+       status = ocfs2_et_root_journal_access(handle, inode, et,
+                                             OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
@@ -1176,18 +1289,13 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
                brelse(bh);
                bh = NULL;
 
-               status = ocfs2_read_block(inode, blkno, &bh);
+               status = ocfs2_read_extent_block(inode, blkno, &bh);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
                }
 
                eb = (struct ocfs2_extent_block *) bh->b_data;
-               if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-                       OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
-                       status = -EIO;
-                       goto bail;
-               }
                el = &eb->h_list;
 
                if (le16_to_cpu(el->l_next_free_rec) <
@@ -1540,7 +1648,7 @@ static int __ocfs2_find_path(struct inode *inode,
 
                brelse(bh);
                bh = NULL;
-               ret = ocfs2_read_block(inode, blkno, &bh);
+               ret = ocfs2_read_extent_block(inode, blkno, &bh);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
@@ -1548,11 +1656,6 @@ static int __ocfs2_find_path(struct inode *inode,
 
                eb = (struct ocfs2_extent_block *) bh->b_data;
                el = &eb->h_list;
-               if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-                       OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
-                       ret = -EIO;
-                       goto out;
-               }
 
                if (le16_to_cpu(el->l_next_free_rec) >
                    le16_to_cpu(el->l_count)) {
@@ -1860,25 +1963,23 @@ static int ocfs2_rotate_subtree_right(struct inode *inode,
        root_bh = left_path->p_node[subtree_index].bh;
        BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
 
-       ret = ocfs2_journal_access(handle, inode, root_bh,
-                                  OCFS2_JOURNAL_ACCESS_WRITE);
+       ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
+                                          subtree_index);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }
 
        for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
-               ret = ocfs2_journal_access(handle, inode,
-                                          right_path->p_node[i].bh,
-                                          OCFS2_JOURNAL_ACCESS_WRITE);
+               ret = ocfs2_path_bh_journal_access(handle, inode,
+                                                  right_path, i);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
                }
 
-               ret = ocfs2_journal_access(handle, inode,
-                                          left_path->p_node[i].bh,
-                                          OCFS2_JOURNAL_ACCESS_WRITE);
+               ret = ocfs2_path_bh_journal_access(handle, inode,
+                                                  left_path, i);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
@@ -2102,8 +2203,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
 
        *ret_left_path = NULL;
 
-       left_path = ocfs2_new_path(path_root_bh(right_path),
-                                  path_root_el(right_path));
+       left_path = ocfs2_new_path_from_path(right_path);
        if (!left_path) {
                ret = -ENOMEM;
                mlog_errno(ret);
@@ -2398,9 +2498,9 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
                        return -EAGAIN;
 
                if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
-                       ret = ocfs2_journal_access(handle, inode,
-                                                  path_leaf_bh(right_path),
-                                                  OCFS2_JOURNAL_ACCESS_WRITE);
+                       ret = ocfs2_journal_access_eb(handle, inode,
+                                                     path_leaf_bh(right_path),
+                                                     OCFS2_JOURNAL_ACCESS_WRITE);
                        if (ret) {
                                mlog_errno(ret);
                                goto out;
@@ -2417,8 +2517,8 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
                 * We have to update i_last_eb_blk during the meta
                 * data delete.
                 */
-               ret = ocfs2_journal_access(handle, inode, et_root_bh,
-                                          OCFS2_JOURNAL_ACCESS_WRITE);
+               ret = ocfs2_et_root_journal_access(handle, inode, et,
+                                                  OCFS2_JOURNAL_ACCESS_WRITE);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
@@ -2433,25 +2533,23 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
         */
        BUG_ON(right_has_empty && !del_right_subtree);
 
-       ret = ocfs2_journal_access(handle, inode, root_bh,
-                                  OCFS2_JOURNAL_ACCESS_WRITE);
+       ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
+                                          subtree_index);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }
 
        for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
-               ret = ocfs2_journal_access(handle, inode,
-                                          right_path->p_node[i].bh,
-                                          OCFS2_JOURNAL_ACCESS_WRITE);
+               ret = ocfs2_path_bh_journal_access(handle, inode,
+                                                  right_path, i);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
                }
 
-               ret = ocfs2_journal_access(handle, inode,
-                                          left_path->p_node[i].bh,
-                                          OCFS2_JOURNAL_ACCESS_WRITE);
+               ret = ocfs2_path_bh_journal_access(handle, inode,
+                                                  left_path, i);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
@@ -2596,16 +2694,17 @@ out:
 
 static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
                                            handle_t *handle,
-                                           struct buffer_head *bh,
-                                           struct ocfs2_extent_list *el)
+                                           struct ocfs2_path *path)
 {
        int ret;
+       struct buffer_head *bh = path_leaf_bh(path);
+       struct ocfs2_extent_list *el = path_leaf_el(path);
 
        if (!ocfs2_is_empty_extent(&el->l_recs[0]))
                return 0;
 
-       ret = ocfs2_journal_access(handle, inode, bh,
-                                  OCFS2_JOURNAL_ACCESS_WRITE);
+       ret = ocfs2_path_bh_journal_access(handle, inode, path,
+                                          path_num_items(path) - 1);
        if (ret) {
                mlog_errno(ret);
                goto out;
@@ -2644,8 +2743,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
                goto out;
        }
 
-       left_path = ocfs2_new_path(path_root_bh(path),
-                                  path_root_el(path));
+       left_path = ocfs2_new_path_from_path(path);
        if (!left_path) {
                ret = -ENOMEM;
                mlog_errno(ret);
@@ -2654,8 +2752,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
 
        ocfs2_cp_path(left_path, path);
 
-       right_path = ocfs2_new_path(path_root_bh(path),
-                                   path_root_el(path));
+       right_path = ocfs2_new_path_from_path(path);
        if (!right_path) {
                ret = -ENOMEM;
                mlog_errno(ret);
@@ -2689,9 +2786,8 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
                 * Caller might still want to make changes to the
                 * tree root, so re-add it to the journal here.
                 */
-               ret = ocfs2_journal_access(handle, inode,
-                                          path_root_bh(left_path),
-                                          OCFS2_JOURNAL_ACCESS_WRITE);
+               ret = ocfs2_path_bh_journal_access(handle, inode,
+                                                  left_path, 0);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
@@ -2785,8 +2881,7 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
                 * We have a path to the left of this one - it needs
                 * an update too.
                 */
-               left_path = ocfs2_new_path(path_root_bh(path),
-                                          path_root_el(path));
+               left_path = ocfs2_new_path_from_path(path);
                if (!left_path) {
                        ret = -ENOMEM;
                        mlog_errno(ret);
@@ -2875,8 +2970,7 @@ rightmost_no_delete:
                 * it up front.
                 */
                ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
-                                                      path_leaf_bh(path),
-                                                      path_leaf_el(path));
+                                                      path);
                if (ret)
                        mlog_errno(ret);
                goto out;
@@ -3027,8 +3121,7 @@ static int ocfs2_get_right_path(struct inode *inode,
        /* This function shouldn't be called for the rightmost leaf. */
        BUG_ON(right_cpos == 0);
 
-       right_path = ocfs2_new_path(path_root_bh(left_path),
-                                   path_root_el(left_path));
+       right_path = ocfs2_new_path_from_path(left_path);
        if (!right_path) {
                ret = -ENOMEM;
                mlog_errno(ret);
@@ -3111,8 +3204,8 @@ static int ocfs2_merge_rec_right(struct inode *inode,
                root_bh = left_path->p_node[subtree_index].bh;
                BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
 
-               ret = ocfs2_journal_access(handle, inode, root_bh,
-                                          OCFS2_JOURNAL_ACCESS_WRITE);
+               ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
+                                                  subtree_index);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
@@ -3120,17 +3213,15 @@ static int ocfs2_merge_rec_right(struct inode *inode,
 
                for (i = subtree_index + 1;
                     i < path_num_items(right_path); i++) {
-                       ret = ocfs2_journal_access(handle, inode,
-                                                  right_path->p_node[i].bh,
-                                                  OCFS2_JOURNAL_ACCESS_WRITE);
+                       ret = ocfs2_path_bh_journal_access(handle, inode,
+                                                          right_path, i);
                        if (ret) {
                                mlog_errno(ret);
                                goto out;
                        }
 
-                       ret = ocfs2_journal_access(handle, inode,
-                                                  left_path->p_node[i].bh,
-                                                  OCFS2_JOURNAL_ACCESS_WRITE);
+                       ret = ocfs2_path_bh_journal_access(handle, inode,
+                                                          left_path, i);
                        if (ret) {
                                mlog_errno(ret);
                                goto out;
@@ -3142,8 +3233,8 @@ static int ocfs2_merge_rec_right(struct inode *inode,
                right_rec = &el->l_recs[index + 1];
        }
 
-       ret = ocfs2_journal_access(handle, inode, bh,
-                                  OCFS2_JOURNAL_ACCESS_WRITE);
+       ret = ocfs2_path_bh_journal_access(handle, inode, left_path,
+                                          path_num_items(left_path) - 1);
        if (ret) {
                mlog_errno(ret);
                goto out;
@@ -3199,8 +3290,7 @@ static int ocfs2_get_left_path(struct inode *inode,
        /* This function shouldn't be called for the leftmost leaf. */
        BUG_ON(left_cpos == 0);
 
-       left_path = ocfs2_new_path(path_root_bh(right_path),
-                                  path_root_el(right_path));
+       left_path = ocfs2_new_path_from_path(right_path);
        if (!left_path) {
                ret = -ENOMEM;
                mlog_errno(ret);
@@ -3283,8 +3373,8 @@ static int ocfs2_merge_rec_left(struct inode *inode,
                root_bh = left_path->p_node[subtree_index].bh;
                BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
 
-               ret = ocfs2_journal_access(handle, inode, root_bh,
-                                          OCFS2_JOURNAL_ACCESS_WRITE);
+               ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
+                                                  subtree_index);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
@@ -3292,17 +3382,15 @@ static int ocfs2_merge_rec_left(struct inode *inode,
 
                for (i = subtree_index + 1;
                     i < path_num_items(right_path); i++) {
-                       ret = ocfs2_journal_access(handle, inode,
-                                                  right_path->p_node[i].bh,
-                                                  OCFS2_JOURNAL_ACCESS_WRITE);
+                       ret = ocfs2_path_bh_journal_access(handle, inode,
+                                                          right_path, i);
                        if (ret) {
                                mlog_errno(ret);
                                goto out;
                        }
 
-                       ret = ocfs2_journal_access(handle, inode,
-                                                  left_path->p_node[i].bh,
-                                                  OCFS2_JOURNAL_ACCESS_WRITE);
+                       ret = ocfs2_path_bh_journal_access(handle, inode,
+                                                          left_path, i);
                        if (ret) {
                                mlog_errno(ret);
                                goto out;
@@ -3314,8 +3402,8 @@ static int ocfs2_merge_rec_left(struct inode *inode,
                        has_empty_extent = 1;
        }
 
-       ret = ocfs2_journal_access(handle, inode, bh,
-                                  OCFS2_JOURNAL_ACCESS_WRITE);
+       ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
+                                          path_num_items(right_path) - 1);
        if (ret) {
                mlog_errno(ret);
                goto out;
@@ -3732,8 +3820,7 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
                 * leftmost leaf.
                 */
                if (left_cpos) {
-                       left_path = ocfs2_new_path(path_root_bh(right_path),
-                                                  path_root_el(right_path));
+                       left_path = ocfs2_new_path_from_path(right_path);
                        if (!left_path) {
                                ret = -ENOMEM;
                                mlog_errno(ret);
@@ -3958,8 +4045,8 @@ static int ocfs2_do_insert_extent(struct inode *inode,
 
        el = et->et_root_el;
 
-       ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
-                                  OCFS2_JOURNAL_ACCESS_WRITE);
+       ret = ocfs2_et_root_journal_access(handle, inode, et,
+                                          OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
                goto out;
@@ -3970,7 +4057,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
                goto out_update_clusters;
        }
 
-       right_path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
+       right_path = ocfs2_new_path_from_et(et);
        if (!right_path) {
                ret = -ENOMEM;
                mlog_errno(ret);
@@ -4020,8 +4107,8 @@ static int ocfs2_do_insert_extent(struct inode *inode,
                 * ocfs2_rotate_tree_right() might have extended the
                 * transaction without re-journaling our tree root.
                 */
-               ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
-                                          OCFS2_JOURNAL_ACCESS_WRITE);
+               ret = ocfs2_et_root_journal_access(handle, inode, et,
+                                                  OCFS2_JOURNAL_ACCESS_WRITE);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
@@ -4082,8 +4169,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
                        goto out;
 
                if (left_cpos != 0) {
-                       left_path = ocfs2_new_path(path_root_bh(path),
-                                                  path_root_el(path));
+                       left_path = ocfs2_new_path_from_path(path);
                        if (!left_path)
                                goto out;
 
@@ -4097,8 +4183,15 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
                            le16_to_cpu(new_el->l_count)) {
                                bh = path_leaf_bh(left_path);
                                eb = (struct ocfs2_extent_block *)bh->b_data;
-                               OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
-                                                                eb);
+                               ocfs2_error(inode->i_sb,
+                                           "Extent block #%llu has an "
+                                           "invalid l_next_free_rec of "
+                                           "%d.  It should have "
+                                           "matched the l_count of %d",
+                                           (unsigned long long)le64_to_cpu(eb->h_blkno),
+                                           le16_to_cpu(new_el->l_next_free_rec),
+                                           le16_to_cpu(new_el->l_count));
+                               status = -EINVAL;
                                goto out;
                        }
                        rec = &new_el->l_recs[
@@ -4132,8 +4225,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
                if (right_cpos == 0)
                        goto out;
 
-               right_path = ocfs2_new_path(path_root_bh(path),
-                                           path_root_el(path));
+               right_path = ocfs2_new_path_from_path(path);
                if (!right_path)
                        goto out;
 
@@ -4147,8 +4239,12 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
                        if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
                                bh = path_leaf_bh(right_path);
                                eb = (struct ocfs2_extent_block *)bh->b_data;
-                               OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
-                                                                eb);
+                               ocfs2_error(inode->i_sb,
+                                           "Extent block #%llu has an "
+                                           "invalid l_next_free_rec of %d",
+                                           (unsigned long long)le64_to_cpu(eb->h_blkno),
+                                           le16_to_cpu(new_el->l_next_free_rec));
+                               status = -EINVAL;
                                goto out;
                        }
                        rec = &new_el->l_recs[1];
@@ -4294,7 +4390,9 @@ static int ocfs2_figure_insert_type(struct inode *inode,
                 * ocfs2_figure_insert_type() and ocfs2_add_branch()
                 * may want it later.
                 */
-               ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), &bh);
+               ret = ocfs2_read_extent_block(inode,
+                                             ocfs2_et_get_last_eb_blk(et),
+                                             &bh);
                if (ret) {
                        mlog_exit(ret);
                        goto out;
@@ -4320,7 +4418,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
                return 0;
        }
 
-       path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
+       path = ocfs2_new_path_from_et(et);
        if (!path) {
                ret = -ENOMEM;
                mlog_errno(ret);
@@ -4531,9 +4629,9 @@ int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
 
        BUG_ON(num_bits > clusters_to_add);
 
-       /* reserve our write early -- insert_extent may update the inode */
-       status = ocfs2_journal_access(handle, inode, et->et_root_bh,
-                                     OCFS2_JOURNAL_ACCESS_WRITE);
+       /* reserve our write early -- insert_extent may update the tree root */
+       status = ocfs2_et_root_journal_access(handle, inode, et,
+                                             OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
@@ -4760,20 +4858,15 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
        if (path->p_tree_depth) {
                struct ocfs2_extent_block *eb;
 
-               ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et),
-                                      &last_eb_bh);
+               ret = ocfs2_read_extent_block(inode,
+                                             ocfs2_et_get_last_eb_blk(et),
+                                             &last_eb_bh);
                if (ret) {
                        mlog_exit(ret);
                        goto out;
                }
 
                eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
-               if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-                       OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
-                       ret = -EROFS;
-                       goto out;
-               }
-
                rightmost_el = &eb->h_list;
        } else
                rightmost_el = path_root_el(path);
@@ -4854,7 +4947,7 @@ int ocfs2_mark_extent_written(struct inode *inode,
        if (et->et_ops == &ocfs2_dinode_et_ops)
                ocfs2_extent_map_trunc(inode, 0);
 
-       left_path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
+       left_path = ocfs2_new_path_from_et(et);
        if (!left_path) {
                ret = -ENOMEM;
                mlog_errno(ret);
@@ -4918,8 +5011,9 @@ static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
 
        depth = path->p_tree_depth;
        if (depth > 0) {
-               ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et),
-                                      &last_eb_bh);
+               ret = ocfs2_read_extent_block(inode,
+                                             ocfs2_et_get_last_eb_blk(et),
+                                             &last_eb_bh);
                if (ret < 0) {
                        mlog_errno(ret);
                        goto out;
@@ -5025,8 +5119,7 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
                }
 
                if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
-                       left_path = ocfs2_new_path(path_root_bh(path),
-                                                  path_root_el(path));
+                       left_path = ocfs2_new_path_from_path(path);
                        if (!left_path) {
                                ret = -ENOMEM;
                                mlog_errno(ret);
@@ -5135,7 +5228,7 @@ int ocfs2_remove_extent(struct inode *inode,
 
        ocfs2_extent_map_trunc(inode, 0);
 
-       path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
+       path = ocfs2_new_path_from_et(et);
        if (!path) {
                ret = -ENOMEM;
                mlog_errno(ret);
@@ -5255,6 +5348,78 @@ out:
        return ret;
 }
 
+int ocfs2_remove_btree_range(struct inode *inode,
+                            struct ocfs2_extent_tree *et,
+                            u32 cpos, u32 phys_cpos, u32 len,
+                            struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+       int ret;
+       u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+       struct inode *tl_inode = osb->osb_tl_inode;
+       handle_t *handle;
+       struct ocfs2_alloc_context *meta_ac = NULL; /* freed at out: if set */
+
+       ret = ocfs2_lock_allocators(inode, et, 0, 1, NULL, &meta_ac);
+       if (ret) {
+               mlog_errno(ret);
+               return ret;     /* nothing locked or started yet */
+       }
+
+       mutex_lock(&tl_inode->i_mutex);         /* dropped at out: */
+
+       if (ocfs2_truncate_log_needs_flush(osb)) {
+               ret = __ocfs2_flush_truncate_log(osb);
+               if (ret < 0) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+       }
+
+       handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb));
+       if (IS_ERR(handle)) {
+               ret = PTR_ERR(handle);
+               mlog_errno(ret);
+               goto out;       /* no transaction was started */
+       }
+
+       ret = ocfs2_et_root_journal_access(handle, inode, et,
+                                          OCFS2_JOURNAL_ACCESS_WRITE);
+       if (ret) {
+               mlog_errno(ret);
+               goto out_commit; /* transaction is open: must be committed */
+       }
+
+       ret = ocfs2_remove_extent(inode, et, cpos, len, handle, meta_ac,
+                                 dealloc);
+       if (ret) {
+               mlog_errno(ret);
+               goto out_commit;
+       }
+
+       ocfs2_et_update_clusters(inode, et, -len);
+
+       ret = ocfs2_journal_dirty(handle, et->et_root_bh);
+       if (ret) {
+               mlog_errno(ret);
+               goto out_commit;
+       }
+
+       ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
+       if (ret)
+               mlog_errno(ret); /* falls through: commit, then return ret */
+
+out_commit:
+       ocfs2_commit_trans(osb, handle);
+out:
+       mutex_unlock(&tl_inode->i_mutex);
+
+       if (meta_ac)
+               ocfs2_free_alloc_context(meta_ac);
+
+       return ret;
+}
+
 int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
 {
        struct buffer_head *tl_bh = osb->osb_tl_bh;
@@ -5308,13 +5473,13 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
        start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
 
        di = (struct ocfs2_dinode *) tl_bh->b_data;
-       tl = &di->id2.i_dealloc;
-       if (!OCFS2_IS_VALID_DINODE(di)) {
-               OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
-               status = -EIO;
-               goto bail;
-       }
 
+       /* tl_bh is loaded from ocfs2_truncate_log_init().  It's validated
+        * by the underlying call to ocfs2_read_inode_block(), so any
+        * corruption is a code bug */
+       BUG_ON(!OCFS2_IS_VALID_DINODE(di));
+
+       tl = &di->id2.i_dealloc;
        tl_count = le16_to_cpu(tl->tl_count);
        mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
                        tl_count == 0,
@@ -5332,8 +5497,8 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
                goto bail;
        }
 
-       status = ocfs2_journal_access(handle, tl_inode, tl_bh,
-                                     OCFS2_JOURNAL_ACCESS_WRITE);
+       status = ocfs2_journal_access_di(handle, tl_inode, tl_bh,
+                                        OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
@@ -5394,8 +5559,8 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
        while (i >= 0) {
                /* Caller has given us at least enough credits to
                 * update the truncate log dinode */
-               status = ocfs2_journal_access(handle, tl_inode, tl_bh,
-                                             OCFS2_JOURNAL_ACCESS_WRITE);
+               status = ocfs2_journal_access_di(handle, tl_inode, tl_bh,
+                                                OCFS2_JOURNAL_ACCESS_WRITE);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
@@ -5464,13 +5629,13 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
        BUG_ON(mutex_trylock(&tl_inode->i_mutex));
 
        di = (struct ocfs2_dinode *) tl_bh->b_data;
-       tl = &di->id2.i_dealloc;
-       if (!OCFS2_IS_VALID_DINODE(di)) {
-               OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
-               status = -EIO;
-               goto out;
-       }
 
+       /* tl_bh is loaded from ocfs2_truncate_log_init().  It's validated
+        * by the underlying call to ocfs2_read_inode_block(), so any
+        * corruption is a code bug */
+       BUG_ON(!OCFS2_IS_VALID_DINODE(di));
+
+       tl = &di->id2.i_dealloc;
        num_to_flush = le16_to_cpu(tl->tl_used);
        mlog(0, "Flush %u records from truncate log #%llu\n",
             num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
@@ -5586,7 +5751,7 @@ static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
                goto bail;
        }
 
-       status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
+       status = ocfs2_read_inode_block(inode, &bh);
        if (status < 0) {
                iput(inode);
                mlog_errno(status);
@@ -5625,13 +5790,13 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
        }
 
        di = (struct ocfs2_dinode *) tl_bh->b_data;
-       tl = &di->id2.i_dealloc;
-       if (!OCFS2_IS_VALID_DINODE(di)) {
-               OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di);
-               status = -EIO;
-               goto bail;
-       }
 
+       /* tl_bh is loaded from ocfs2_get_truncate_log_info().  It's
+        * validated by the underlying call to ocfs2_read_inode_block(),
+        * so any corruption is a code bug */
+       BUG_ON(!OCFS2_IS_VALID_DINODE(di));
+
+       tl = &di->id2.i_dealloc;
        if (le16_to_cpu(tl->tl_used)) {
                mlog(0, "We'll have %u logs to recover\n",
                     le16_to_cpu(tl->tl_used));
@@ -5651,6 +5816,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
                 * tl_used. */
                tl->tl_used = 0;
 
+               ocfs2_compute_meta_ecc(osb->sb, tl_bh->b_data, &di->i_check);
                status = ocfs2_write_block(osb, tl_bh, tl_inode);
                if (status < 0) {
                        mlog_errno(status);
@@ -5800,7 +5966,10 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
  */
 
 /*
- * Describes a single block free from a suballocator
+ * Describe a single bit freed from a suballocator.  For the block
+ * suballocators, it represents one block.  For the global cluster
+ * allocator, it represents some clusters and free_bit indicates
+ * the number of clusters.
  */
 struct ocfs2_cached_block_free {
        struct ocfs2_cached_block_free          *free_next;
@@ -5815,10 +5984,10 @@ struct ocfs2_per_slot_free_list {
        struct ocfs2_cached_block_free          *f_first;
 };
 
-static int ocfs2_free_cached_items(struct ocfs2_super *osb,
-                                  int sysfile_type,
-                                  int slot,
-                                  struct ocfs2_cached_block_free *head)
+static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
+                                   int sysfile_type,
+                                   int slot,
+                                   struct ocfs2_cached_block_free *head)
 {
        int ret;
        u64 bg_blkno;
@@ -5893,6 +6062,82 @@ out:
        return ret;
 }
 
+int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
+                               u64 blkno, unsigned int bit)
+{
+       int ret = 0;
+       struct ocfs2_cached_block_free *item;
+
+       item = kmalloc(sizeof(*item), GFP_NOFS);
+       if (item == NULL) {
+               ret = -ENOMEM;
+               mlog_errno(ret);
+               return ret;
+       }
+
+       mlog(0, "Insert clusters: (bit %u, blk %llu)\n",
+            bit, (unsigned long long)blkno);
+
+       item->free_blk = blkno;
+       item->free_bit = bit;
+       item->free_next = ctxt->c_global_allocator;
+
+       ctxt->c_global_allocator = item;
+       return ret;
+}
+
+static int ocfs2_free_cached_clusters(struct ocfs2_super *osb,
+                                     struct ocfs2_cached_block_free *head)
+{
+       struct ocfs2_cached_block_free *tmp;
+       struct inode *tl_inode = osb->osb_tl_inode;
+       handle_t *handle;
+       int ret = 0;
+
+       mutex_lock(&tl_inode->i_mutex);
+
+       while (head) {
+               if (ocfs2_truncate_log_needs_flush(osb)) {
+                       ret = __ocfs2_flush_truncate_log(osb);
+                       if (ret < 0) {
+                               mlog_errno(ret);
+                               break;
+                       }
+               }
+
+               handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
+               if (IS_ERR(handle)) {
+                       ret = PTR_ERR(handle);
+                       mlog_errno(ret);
+                       break;
+               }
+
+               ret = ocfs2_truncate_log_append(osb, handle, head->free_blk,
+                                               head->free_bit);
+
+               ocfs2_commit_trans(osb, handle);
+               tmp = head;
+               head = head->free_next;
+               kfree(tmp);
+
+               if (ret < 0) {
+                       mlog_errno(ret);
+                       break;
+               }
+       }
+
+       mutex_unlock(&tl_inode->i_mutex);
+
+       while (head) {
+               /* Premature exit may have left some dangling items. */
+               tmp = head;
+               head = head->free_next;
+               kfree(tmp);
+       }
+
+       return ret;
+}
+
 int ocfs2_run_deallocs(struct ocfs2_super *osb,
                       struct ocfs2_cached_dealloc_ctxt *ctxt)
 {
@@ -5908,8 +6153,10 @@ int ocfs2_run_deallocs(struct ocfs2_super *osb,
                if (fl->f_first) {
                        mlog(0, "Free items: (type %u, slot %d)\n",
                             fl->f_inode_type, fl->f_slot);
-                       ret2 = ocfs2_free_cached_items(osb, fl->f_inode_type,
-                                                      fl->f_slot, fl->f_first);
+                       ret2 = ocfs2_free_cached_blocks(osb,
+                                                       fl->f_inode_type,
+                                                       fl->f_slot,
+                                                       fl->f_first);
                        if (ret2)
                                mlog_errno(ret2);
                        if (!ret)
@@ -5920,6 +6167,17 @@ int ocfs2_run_deallocs(struct ocfs2_super *osb,
                kfree(fl);
        }
 
+       if (ctxt->c_global_allocator) {
+               ret2 = ocfs2_free_cached_clusters(osb,
+                                                 ctxt->c_global_allocator);
+               if (ret2)
+                       mlog_errno(ret2);
+               if (!ret)
+                       ret = ret2;
+
+               ctxt->c_global_allocator = NULL;
+       }
+
        return ret;
 }
 
@@ -6075,11 +6333,10 @@ static int ocfs2_find_new_last_ext_blk(struct inode *inode,
 
        eb = (struct ocfs2_extent_block *) bh->b_data;
        el = &eb->h_list;
-       if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-               OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
-               ret = -EROFS;
-               goto out;
-       }
+
+       /* ocfs2_find_leaf() gets the eb from ocfs2_read_extent_block().
+        * Any corruption is a code bug. */
+       BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
 
        *new_last_eb = bh;
        get_bh(*new_last_eb);
@@ -6326,8 +6583,8 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
        }
 
        if (last_eb_bh) {
-               status = ocfs2_journal_access(handle, inode, last_eb_bh,
-                                             OCFS2_JOURNAL_ACCESS_WRITE);
+               status = ocfs2_journal_access_eb(handle, inode, last_eb_bh,
+                                                OCFS2_JOURNAL_ACCESS_WRITE);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
@@ -6350,6 +6607,8 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
                goto bail;
        }
 
+       vfs_dq_free_space_nodirty(inode,
+                       ocfs2_clusters_to_bytes(osb->sb, clusters_to_del));
        spin_lock(&OCFS2_I(inode)->ip_lock);
        OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
                                      clusters_to_del;
@@ -6436,11 +6695,6 @@ static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
                mlog_errno(ret);
        else if (ocfs2_should_order_data(inode)) {
                ret = ocfs2_jbd2_file_inode(handle, inode);
-#ifdef CONFIG_OCFS2_COMPAT_JBD
-               ret = walk_page_buffers(handle, page_buffers(page),
-                                       from, to, &partial,
-                                       ocfs2_journal_dirty_data);
-#endif
                if (ret < 0)
                        mlog_errno(ret);
        }
@@ -6663,6 +6917,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
        struct page **pages = NULL;
        loff_t end = osb->s_clustersize;
        struct ocfs2_extent_tree et;
+       int did_quota = 0;
 
        has_data = i_size_read(inode) ? 1 : 0;
 
@@ -6682,15 +6937,16 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
                }
        }
 
-       handle = ocfs2_start_trans(osb, OCFS2_INLINE_TO_EXTENTS_CREDITS);
+       handle = ocfs2_start_trans(osb,
+                                  ocfs2_inline_to_extents_credits(osb->sb));
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
                mlog_errno(ret);
                goto out_unlock;
        }
 
-       ret = ocfs2_journal_access(handle, inode, di_bh,
-                                  OCFS2_JOURNAL_ACCESS_WRITE);
+       ret = ocfs2_journal_access_di(handle, inode, di_bh,
+                                     OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
                goto out_commit;
@@ -6701,6 +6957,13 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
                unsigned int page_end;
                u64 phys;
 
+               if (vfs_dq_alloc_space_nodirty(inode,
+                                      ocfs2_clusters_to_bytes(osb->sb, 1))) {
+                       ret = -EDQUOT;
+                       goto out_commit;
+               }
+               did_quota = 1;
+
                ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
                                           &num);
                if (ret) {
@@ -6774,6 +7037,10 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
        }
 
 out_commit:
+       if (ret < 0 && did_quota)
+               vfs_dq_free_space_nodirty(inode,
+                                         ocfs2_clusters_to_bytes(osb->sb, 1));
+
        ocfs2_commit_trans(osb, handle);
 
 out_unlock:
@@ -6813,7 +7080,8 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
        new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
                                                     i_size_read(inode));
 
-       path = ocfs2_new_path(fe_bh, &di->id2.i_list);
+       path = ocfs2_new_path(fe_bh, &di->id2.i_list,
+                             ocfs2_journal_access_di);
        if (!path) {
                status = -ENOMEM;
                mlog_errno(status);
@@ -6984,20 +7252,14 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
        ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
 
        if (fe->id2.i_list.l_tree_depth) {
-               status = ocfs2_read_block(inode, le64_to_cpu(fe->i_last_eb_blk),
-                                         &last_eb_bh);
+               status = ocfs2_read_extent_block(inode,
+                                                le64_to_cpu(fe->i_last_eb_blk),
+                                                &last_eb_bh);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
                }
                eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
-               if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-                       OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
-
-                       brelse(last_eb_bh);
-                       status = -EIO;
-                       goto bail;
-               }
        }
 
        (*tc)->tc_last_eb_bh = last_eb_bh;
@@ -7052,8 +7314,8 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
                goto out;
        }
 
-       ret = ocfs2_journal_access(handle, inode, di_bh,
-                                  OCFS2_JOURNAL_ACCESS_WRITE);
+       ret = ocfs2_journal_access_di(handle, inode, di_bh,
+                                     OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
                goto out_commit;
index 70257c8..cceff5c 100644 (file)
@@ -45,7 +45,9 @@
  *
  * ocfs2_extent_tree contains info for the root of the b-tree, it must have a
  * root ocfs2_extent_list and a root_bh so that they can be used in the b-tree
- * functions.
+ * functions.  With metadata ecc, we now call different journal_access
+ * functions for each type of metadata, so it must have the
+ * root_journal_access function.
  * ocfs2_extent_tree_operations abstract the normal operations we do for
  * the root of extent b-tree.
  */
@@ -54,6 +56,7 @@ struct ocfs2_extent_tree {
        struct ocfs2_extent_tree_operations     *et_ops;
        struct buffer_head                      *et_root_bh;
        struct ocfs2_extent_list                *et_root_el;
+       ocfs2_journal_access_func               et_root_journal_access;
        void                                    *et_object;
        unsigned int                            et_max_leaf_clusters;
 };
@@ -68,10 +71,18 @@ void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
 void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
                                       struct inode *inode,
                                       struct buffer_head *bh);
+struct ocfs2_xattr_value_buf;
 void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
                                        struct inode *inode,
-                                       struct buffer_head *bh,
-                                       struct ocfs2_xattr_value_root *xv);
+                                       struct ocfs2_xattr_value_buf *vb);
+
+/*
+ * Read an extent block into *bh.  If *bh is NULL, a bh will be
+ * allocated.  This is a cached read.  The extent block will be validated
+ * with ocfs2_validate_extent_block().
+ */
+int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno,
+                           struct buffer_head **bh);
 
 struct ocfs2_alloc_context;
 int ocfs2_insert_extent(struct ocfs2_super *osb,
@@ -110,6 +121,11 @@ int ocfs2_remove_extent(struct inode *inode,
                        u32 cpos, u32 len, handle_t *handle,
                        struct ocfs2_alloc_context *meta_ac,
                        struct ocfs2_cached_dealloc_ctxt *dealloc);
+int ocfs2_remove_btree_range(struct inode *inode,
+                            struct ocfs2_extent_tree *et,
+                            u32 cpos, u32 phys_cpos, u32 len,
+                            struct ocfs2_cached_dealloc_ctxt *dealloc);
+
 int ocfs2_num_free_extents(struct ocfs2_super *osb,
                           struct inode *inode,
                           struct ocfs2_extent_tree *et);
@@ -167,10 +183,18 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb);
  */
 struct ocfs2_cached_dealloc_ctxt {
        struct ocfs2_per_slot_free_list         *c_first_suballocator;
+       struct ocfs2_cached_block_free          *c_global_allocator;
 };
 static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c)
 {
        c->c_first_suballocator = NULL;
+       c->c_global_allocator = NULL;
+}
+int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
+                               u64 blkno, unsigned int bit);
+static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c)
+{
+       return c->c_global_allocator != NULL;
 }
 int ocfs2_run_deallocs(struct ocfs2_super *osb,
                       struct ocfs2_cached_dealloc_ctxt *ctxt);
index c22543b..a067a6c 100644 (file)
@@ -27,6 +27,7 @@
 #include <linux/swap.h>
 #include <linux/pipe_fs_i.h>
 #include <linux/mpage.h>
+#include <linux/quotaops.h>
 
 #define MLOG_MASK_PREFIX ML_FILE_IO
 #include <cluster/masklog.h>
@@ -68,20 +69,13 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
                goto bail;
        }
 
-       status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
+       status = ocfs2_read_inode_block(inode, &bh);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }
        fe = (struct ocfs2_dinode *) bh->b_data;
 
-       if (!OCFS2_IS_VALID_DINODE(fe)) {
-               mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n",
-                    (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
-                    fe->i_signature);
-               goto bail;
-       }
-
        if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
                                                    le32_to_cpu(fe->i_clusters))) {
                mlog(ML_ERROR, "block offset is outside the allocated size: "
@@ -262,7 +256,7 @@ static int ocfs2_readpage_inline(struct inode *inode, struct page *page)
        BUG_ON(!PageLocked(page));
        BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
 
-       ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
+       ret = ocfs2_read_inode_block(inode, &di_bh);
        if (ret) {
                mlog_errno(ret);
                goto out;
@@ -481,12 +475,6 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
 
        if (ocfs2_should_order_data(inode)) {
                ret = ocfs2_jbd2_file_inode(handle, inode);
-#ifdef CONFIG_OCFS2_COMPAT_JBD
-               ret = walk_page_buffers(handle,
-                                       page_buffers(page),
-                                       from, to, NULL,
-                                       ocfs2_journal_dirty_data);
-#endif
                if (ret < 0)
                        mlog_errno(ret);
        }
@@ -1072,15 +1060,8 @@ static void ocfs2_write_failure(struct inode *inode,
                tmppage = wc->w_pages[i];
 
                if (page_has_buffers(tmppage)) {
-                       if (ocfs2_should_order_data(inode)) {
+                       if (ocfs2_should_order_data(inode))
                                ocfs2_jbd2_file_inode(wc->w_handle, inode);
-#ifdef CONFIG_OCFS2_COMPAT_JBD
-                               walk_page_buffers(wc->w_handle,
-                                                 page_buffers(tmppage),
-                                                 from, to, NULL,
-                                                 ocfs2_journal_dirty_data);
-#endif
-                       }
 
                        block_commit_write(tmppage, from, to);
                }
@@ -1531,8 +1512,8 @@ static int ocfs2_write_begin_inline(struct address_space *mapping,
                goto out;
        }
 
-       ret = ocfs2_journal_access(handle, inode, wc->w_di_bh,
-                                  OCFS2_JOURNAL_ACCESS_WRITE);
+       ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh,
+                                     OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                ocfs2_commit_trans(osb, handle);
 
@@ -1750,15 +1731,20 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 
        wc->w_handle = handle;
 
+       if (clusters_to_alloc && vfs_dq_alloc_space_nodirty(inode,
+                       ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc))) {
+               ret = -EDQUOT;
+               goto out_commit;
+       }
        /*
         * We don't want this to fail in ocfs2_write_end(), so do it
         * here.
         */
-       ret = ocfs2_journal_access(handle, inode, wc->w_di_bh,
-                                  OCFS2_JOURNAL_ACCESS_WRITE);
+       ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh,
+                                     OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
-               goto out_commit;
+               goto out_quota;
        }
 
        /*
@@ -1771,14 +1757,14 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
                                         mmap_page);
        if (ret) {
                mlog_errno(ret);
-               goto out_commit;
+               goto out_quota;
        }
 
        ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos,
                                          len);
        if (ret) {
                mlog_errno(ret);
-               goto out_commit;
+               goto out_quota;
        }
 
        if (data_ac)
@@ -1790,6 +1776,10 @@ success:
        *pagep = wc->w_target_page;
        *fsdata = wc;
        return 0;
+out_quota:
+       if (clusters_to_alloc)
+               vfs_dq_free_space(inode,
+                         ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc));
 out_commit:
        ocfs2_commit_trans(osb, handle);
 
@@ -1919,15 +1909,8 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
                }
 
                if (page_has_buffers(tmppage)) {
-                       if (ocfs2_should_order_data(inode)) {
+                       if (ocfs2_should_order_data(inode))
                                ocfs2_jbd2_file_inode(wc->w_handle, inode);
-#ifdef CONFIG_OCFS2_COMPAT_JBD
-                               walk_page_buffers(wc->w_handle,
-                                                 page_buffers(tmppage),
-                                                 from, to, NULL,
-                                                 ocfs2_journal_dirty_data);
-#endif
-                       }
                        block_commit_write(tmppage, from, to);
                }
        }
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
new file mode 100644 (file)
index 0000000..2a947c4
--- /dev/null
@@ -0,0 +1,477 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * blockcheck.c
+ *
+ * Checksum and ECC codes for the OCFS2 userspace library.
+ *
+ * Copyright (C) 2006, 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/crc32.h>
+#include <linux/buffer_head.h>
+#include <linux/bitops.h>
+#include <asm/byteorder.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "blockcheck.h"
+
+
+/*
+ * We use the following conventions:
+ *
+ * d = # data bits
+ * p = # parity bits
+ * c = # total code bits (d + p)
+ */
+
+
+/*
+ * Calculate the bit offset in the hamming code buffer based on the bit's
+ * offset in the data buffer.  Since the hamming code reserves all
+ * power-of-two bits for parity, the data bit number and the code bit
+ * number are offset by all the parity bits beforehand.
+ *
+ * Recall that bit numbers in hamming code are 1-based.  This function
+ * takes the 0-based data bit from the caller.
+ *
+ * An example.  Take bit 1 of the data buffer.  1 is a power of two (2^0),
+ * so it's a parity bit.  2 is a power of two (2^1), so it's a parity bit.
+ * 3 is not a power of two.  So bit 1 of the data buffer ends up as bit 3
+ * in the code buffer.
+ *
+ * The caller can pass in p_cache if it wants to keep track of the most
+ * recent number of parity bits added.  This allows the function to start
+ * the calculation at the last place.
+ */
+static unsigned int calc_code_bit(unsigned int i, unsigned int *p_cache)
+{
+       unsigned int b, p = 0;
+
+       /*
+        * Data bits are 0-based, but we're talking code bits, which
+        * are 1-based.
+        */
+       b = i + 1;
+
+       /* Use the cache if it is there */
+       if (p_cache)
+               p = *p_cache;
+        b += p;
+
+       /*
+        * For every power of two below our bit number, bump our bit.
+        *
+        * We compare with (b + 1) because we have to compare with what b
+        * would be _if_ it were bumped up by the parity bit.  Capice?
+        *
+        * p is set above.
+        */
+       for (; (1 << p) < (b + 1); p++)
+               b++;
+
+       if (p_cache)
+               *p_cache = p;
+
+       return b;
+}
+
+/*
+ * This is the low level encoder function.  It can be called across
+ * multiple hunks just like the crc32 code.  'd' is the number of bits
+ * _in_this_hunk_.  nr is the bit offset of this hunk.  So, if you had
+ * two 512B buffers, you would do it like so:
+ *
+ * parity = ocfs2_hamming_encode(0, buf1, 512 * 8, 0);
+ * parity = ocfs2_hamming_encode(parity, buf2, 512 * 8, 512 * 8);
+ *
+ * If you just have one buffer, use ocfs2_hamming_encode_block().
+ */
+u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d, unsigned int nr)
+{
+       unsigned int i, b, p = 0;
+
+       BUG_ON(!d);
+
+       /*
+        * b is the hamming code bit number.  Hamming code specifies a
+        * 1-based array, but C uses 0-based.  So 'i' is for C, and 'b' is
+        * for the algorithm.
+        *
+        * The i++ in the for loop is so that the start offset passed
+        * to ocfs2_find_next_bit() is one greater than the previously
+        * found bit.
+        */
+       for (i = 0; (i = ocfs2_find_next_bit(data, d, i)) < d; i++)
+       {
+               /*
+                * i is the offset in this hunk, nr + i is the total bit
+                * offset.
+                */
+               b = calc_code_bit(nr + i, &p);
+
+               /*
+                * Data bits in the resultant code are checked by
+                * parity bits that are part of the bit number
+                * representation.  Huh?
+                *
+                * <wikipedia href="http://en.wikipedia.org/wiki/Hamming_code">
+                * In other words, the parity bit at position 2^k
+                * checks bits in positions having bit k set in
+                * their binary representation.  Conversely, for
+                * instance, bit 13, i.e. 1101(2), is checked by
+                * bits 1000(2) = 8, 0100(2)=4 and 0001(2) = 1.
+                * </wikipedia>
+                *
+                * Note that 'k' is the _code_ bit number.  'b' in
+                * our loop.
+                */
+               parity ^= b;
+       }
+
+       /* While the data buffer was treated as little endian, the
+        * return value is in host endian. */
+       return parity;
+}
+
+u32 ocfs2_hamming_encode_block(void *data, unsigned int blocksize)
+{
+       return ocfs2_hamming_encode(0, data, blocksize * 8, 0);
+}
+
+/*
+ * Like ocfs2_hamming_encode(), this can handle hunks.  nr is the bit
+ * offset of the current hunk.  If bit to be fixed is not part of the
+ * current hunk, this does nothing.
+ *
+ * If you only have one hunk, use ocfs2_hamming_fix_block().
+ */
+void ocfs2_hamming_fix(void *data, unsigned int d, unsigned int nr,
+                      unsigned int fix)
+{
+       unsigned int i, b;
+
+       BUG_ON(!d);
+
+       /*
+        * If the bit to fix has an hweight of 1, it's a parity bit.  One
+        * busted parity bit is its own error.  Nothing to do here.
+        */
+       if (hweight32(fix) == 1)
+               return;
+
+       /*
+        * nr + d is the bit right past the data hunk we're looking at.
+        * If fix is after that, nothing to do
+        */
+       if (fix >= calc_code_bit(nr + d, NULL))
+               return;
+
+       /*
+        * nr is the offset in the data hunk we're starting at.  Let's
+        * start b at the offset in the code buffer.  See
+        * ocfs2_hamming_encode() for a more detailed description of 'b'.
+        */
+       b = calc_code_bit(nr, NULL);
+       /* If the fix is before this hunk, nothing to do */
+       if (fix < b)
+               return;
+
+       for (i = 0; i < d; i++, b++)
+       {
+               /* Skip past parity bits */
+               while (hweight32(b) == 1)
+                       b++;
+
+               /*
+                * i is the offset in this data hunk.
+                * nr + i is the offset in the total data buffer.
+                * b is the offset in the total code buffer.
+                *
+                * Thus, when b == fix, bit i in the current hunk needs
+                * fixing.
+                */
+               if (b == fix)
+               {
+                       if (ocfs2_test_bit(i, data))
+                               ocfs2_clear_bit(i, data);
+                       else
+                               ocfs2_set_bit(i, data);
+                       break;
+               }
+       }
+}
+
+void ocfs2_hamming_fix_block(void *data, unsigned int blocksize,
+                            unsigned int fix)
+{
+       ocfs2_hamming_fix(data, blocksize * 8, 0, fix);
+}
+
+/*
+ * This function generates check information for a block.
+ * data is the block to be checked.  bc is a pointer to the
+ * ocfs2_block_check structure describing the crc32 and the ecc.
+ *
+ * bc should be a pointer inside data, as the function will
+ * take care of zeroing it before calculating the check information.  If
+ * bc does not point inside data, the caller must make sure any inline
+ * ocfs2_block_check structures are zeroed.
+ *
+ * The data buffer must be in on-disk endian (little endian for ocfs2).
+ * bc will be filled with little-endian values and will be ready to go to
+ * disk.
+ */
+void ocfs2_block_check_compute(void *data, size_t blocksize,
+                              struct ocfs2_block_check *bc)
+{
+       u32 crc;
+       u32 ecc;
+
+       memset(bc, 0, sizeof(struct ocfs2_block_check));
+
+       crc = crc32_le(~0, data, blocksize);
+       ecc = ocfs2_hamming_encode_block(data, blocksize);
+
+       /*
+        * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
+        * larger than 16 bits.
+        */
+       BUG_ON(ecc > USHORT_MAX);
+
+       bc->bc_crc32e = cpu_to_le32(crc);
+       bc->bc_ecc = cpu_to_le16((u16)ecc);
+}
+
+/*
+ * This function validates existing check information.  Like _compute,
+ * the function will take care of zeroing bc before calculating check codes.
+ * If bc is not a pointer inside data, the caller must have zeroed any
+ * inline ocfs2_block_check structures.
+ *
+ * Again, the data passed in should be the on-disk endian.
+ *
+ * The stored crc32 and ecc are saved first and written back to bc before
+ * returning, on success and on failure alike.  If the crc32 does not
+ * match, the difference between the stored and recomputed ecc is handed
+ * to ocfs2_hamming_fix_block(), which may modify data, and the crc32 is
+ * then checked once more.
+ *
+ * Returns 0 if the crc32 matches (directly or after the ecc fix) and
+ * -EIO otherwise.
+ */
+int ocfs2_block_check_validate(void *data, size_t blocksize,
+                              struct ocfs2_block_check *bc)
+{
+       int rc = 0;
+       struct ocfs2_block_check check;
+       u32 crc, ecc;
+
+       check.bc_crc32e = le32_to_cpu(bc->bc_crc32e);
+       check.bc_ecc = le16_to_cpu(bc->bc_ecc);
+
+       memset(bc, 0, sizeof(struct ocfs2_block_check));
+
+       /* Fast path - if the crc32 validates, we're good to go */
+       crc = crc32_le(~0, data, blocksize);
+       if (crc == check.bc_crc32e)
+               goto out;
+
+       mlog(ML_ERROR,
+            "CRC32 failed: stored: %u, computed %u.  Applying ECC.\n",
+            (unsigned int)check.bc_crc32e, (unsigned int)crc);
+
+       /* Ok, try ECC fixups */
+       ecc = ocfs2_hamming_encode_block(data, blocksize);
+       ocfs2_hamming_fix_block(data, blocksize, ecc ^ check.bc_ecc);
+
+       /* And check the crc32 again */
+       crc = crc32_le(~0, data, blocksize);
+       if (crc == check.bc_crc32e)
+               goto out;
+
+       mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n",
+            (unsigned int)check.bc_crc32e, (unsigned int)crc);
+
+       rc = -EIO;
+
+out:
+       bc->bc_crc32e = cpu_to_le32(check.bc_crc32e);
+       bc->bc_ecc = cpu_to_le16(check.bc_ecc);
+
+       return rc;
+}
+
+/*
+ * This function generates check information for a list of buffer_heads.
+ * bhs is the blocks to be checked and nr is how many of them there are.
+ * bc is a pointer to the ocfs2_block_check structure describing the
+ * crc32 and the ecc.
+ *
+ * bc should be a pointer inside one of the buffers, as the function will
+ * take care of zeroing it before calculating the check information.  If
+ * bc does not point inside the buffers, the caller must make sure any
+ * inline ocfs2_block_check structures are zeroed.
+ *
+ * The buffers must be in on-disk endian (little endian for ocfs2).
+ * bc will be filled with little-endian values and will be ready to go to
+ * disk.
+ *
+ * nr must not be negative.  If nr is 0 the function returns without
+ * touching bc.
+ *
+ * NOTE(review): the loop truncates ecc with a (u16) cast on every pass,
+ * so the BUG_ON(ecc > USHORT_MAX) after it looks like it can never
+ * fire.  Confirm whether the cast is intended.
+ */
+void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
+                                  struct ocfs2_block_check *bc)
+{
+       int i;
+       u32 crc, ecc;
+
+       BUG_ON(nr < 0);
+
+       if (!nr)
+               return;
+
+       memset(bc, 0, sizeof(struct ocfs2_block_check));
+
+       for (i = 0, crc = ~0, ecc = 0; i < nr; i++) {
+               crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
+               /*
+                * The number of bits in a buffer is obviously b_size*8.
+                * The offset of this buffer is b_size*i, so the bit offset
+                * of this buffer is b_size*8*i.
+                */
+               ecc = (u16)ocfs2_hamming_encode(ecc, bhs[i]->b_data,
+                                               bhs[i]->b_size * 8,
+                                               bhs[i]->b_size * 8 * i);
+       }
+
+       /*
+        * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
+        * larger than 16 bits.
+        */
+       BUG_ON(ecc > USHORT_MAX);
+
+       bc->bc_crc32e = cpu_to_le32(crc);
+       bc->bc_ecc = cpu_to_le16((u16)ecc);
+}
+
+/*
+ * This function validates existing check information on a list of
+ * buffer_heads.  Like _compute_bhs, the function will take care of
+ * zeroing bc before calculating check codes.  If bc is not a pointer
+ * inside one of the buffers, the caller must have zeroed any inline
+ * ocfs2_block_check structures.
+ *
+ * Again, the data passed in should be the on-disk endian.
+ *
+ * nr must not be negative; with nr == 0 there is nothing to check and
+ * 0 is returned.  The stored crc32 and ecc are saved first and written
+ * back to bc before returning.  If the crc32 over all buffers does not
+ * match, the ecc is recomputed across the buffers and the resulting fix
+ * is offered to each buffer in turn before the crc32 is checked again.
+ *
+ * Returns 0 if the crc32 matches (directly or after the ecc fix) and
+ * -EIO otherwise.
+ */
+int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
+                                  struct ocfs2_block_check *bc)
+{
+       int i, rc = 0;
+       struct ocfs2_block_check check;
+       u32 crc, ecc, fix;
+
+       BUG_ON(nr < 0);
+
+       if (!nr)
+               return 0;
+
+       check.bc_crc32e = le32_to_cpu(bc->bc_crc32e);
+       check.bc_ecc = le16_to_cpu(bc->bc_ecc);
+
+       memset(bc, 0, sizeof(struct ocfs2_block_check));
+
+       /* Fast path - if the crc32 validates, we're good to go */
+       for (i = 0, crc = ~0; i < nr; i++)
+               crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
+       if (crc == check.bc_crc32e)
+               goto out;
+
+       mlog(ML_ERROR,
+            "CRC32 failed: stored: %u, computed %u.  Applying ECC.\n",
+            (unsigned int)check.bc_crc32e, (unsigned int)crc);
+
+       /* Ok, try ECC fixups */
+       for (i = 0, ecc = 0; i < nr; i++) {
+               /*
+                * The number of bits in a buffer is obviously b_size*8.
+                * The offset of this buffer is b_size*i, so the bit offset
+                * of this buffer is b_size*8*i.
+                */
+               ecc = (u16)ocfs2_hamming_encode(ecc, bhs[i]->b_data,
+                                               bhs[i]->b_size * 8,
+                                               bhs[i]->b_size * 8 * i);
+       }
+       fix = ecc ^ check.bc_ecc;
+       for (i = 0; i < nr; i++) {
+               /*
+                * Try the fix against each buffer.  It will only affect
+                * one of them.
+                */
+               ocfs2_hamming_fix(bhs[i]->b_data, bhs[i]->b_size * 8,
+                                 bhs[i]->b_size * 8 * i, fix);
+       }
+
+       /* And check the crc32 again */
+       for (i = 0, crc = ~0; i < nr; i++)
+               crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
+       if (crc == check.bc_crc32e)
+               goto out;
+
+       mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n",
+            (unsigned int)check.bc_crc32e, (unsigned int)crc);
+
+       rc = -EIO;
+
+out:
+       bc->bc_crc32e = cpu_to_le32(check.bc_crc32e);
+       bc->bc_ecc = cpu_to_le16(check.bc_ecc);
+
+       return rc;
+}
+
+/*
+ * These are the main API.  They check the superblock flag before
+ * calling the underlying operations.
+ *
+ * They expect the buffer(s) to be in disk format.
+ */
+void ocfs2_compute_meta_ecc(struct super_block *sb, void *data,
+                           struct ocfs2_block_check *bc)
+{
+       /* Nothing to compute unless ocfs2_meta_ecc() says so */
+       if (!ocfs2_meta_ecc(OCFS2_SB(sb)))
+               return;
+
+       ocfs2_block_check_compute(data, sb->s_blocksize, bc);
+}
+
+int ocfs2_validate_meta_ecc(struct super_block *sb, void *data,
+                           struct ocfs2_block_check *bc)
+{
+       /* Without metaecc there is nothing to check; report success */
+       if (!ocfs2_meta_ecc(OCFS2_SB(sb)))
+               return 0;
+
+       return ocfs2_block_check_validate(data, sb->s_blocksize, bc);
+}
+
+void ocfs2_compute_meta_ecc_bhs(struct super_block *sb,
+                               struct buffer_head **bhs, int nr,
+                               struct ocfs2_block_check *bc)
+{
+       /* Nothing to compute unless ocfs2_meta_ecc() says so */
+       if (!ocfs2_meta_ecc(OCFS2_SB(sb)))
+               return;
+
+       ocfs2_block_check_compute_bhs(bhs, nr, bc);
+}
+
+int ocfs2_validate_meta_ecc_bhs(struct super_block *sb,
+                               struct buffer_head **bhs, int nr,
+                               struct ocfs2_block_check *bc)
+{
+       /* Without metaecc there is nothing to check; report success */
+       if (!ocfs2_meta_ecc(OCFS2_SB(sb)))
+               return 0;
+
+       return ocfs2_block_check_validate_bhs(bhs, nr, bc);
+}
+
diff --git a/fs/ocfs2/blockcheck.h b/fs/ocfs2/blockcheck.h
new file mode 100644 (file)
index 0000000..70ec3fe
--- /dev/null
@@ -0,0 +1,82 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * blockcheck.h
+ *
+ * Checksum and ECC codes for the OCFS2 userspace library.
+ *
+ * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef OCFS2_BLOCKCHECK_H
+#define OCFS2_BLOCKCHECK_H
+
+
+/* High level block API */
+void ocfs2_compute_meta_ecc(struct super_block *sb, void *data,
+                           struct ocfs2_block_check *bc);
+int ocfs2_validate_meta_ecc(struct super_block *sb, void *data,
+                           struct ocfs2_block_check *bc);
+void ocfs2_compute_meta_ecc_bhs(struct super_block *sb,
+                               struct buffer_head **bhs, int nr,
+                               struct ocfs2_block_check *bc);
+int ocfs2_validate_meta_ecc_bhs(struct super_block *sb,
+                               struct buffer_head **bhs, int nr,
+                               struct ocfs2_block_check *bc);
+
+/* Lower level API */
+void ocfs2_block_check_compute(void *data, size_t blocksize,
+                              struct ocfs2_block_check *bc);
+int ocfs2_block_check_validate(void *data, size_t blocksize,
+                              struct ocfs2_block_check *bc);
+void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
+                                  struct ocfs2_block_check *bc);
+int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
+                                  struct ocfs2_block_check *bc);
+
+/*
+ * Hamming code functions
+ */
+
+/*
+ * Encoding hamming code parity bits for a buffer.
+ *
+ * This is the low level encoder function.  It can be called across
+ * multiple hunks just like the crc32 code.  'd' is the number of bits
+ * _in_this_hunk_.  nr is the bit offset of this hunk.  So, if you had
+ * two 512B buffers, you would do it like so:
+ *
+ * parity = ocfs2_hamming_encode(0, buf1, 512 * 8, 0);
+ * parity = ocfs2_hamming_encode(parity, buf2, 512 * 8, 512 * 8);
+ *
+ * If you just have one buffer, use ocfs2_hamming_encode_block().
+ */
+u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d,
+                        unsigned int nr);
+/*
+ * Fix a buffer with a bit error.  The 'fix' is the original parity
+ * xor'd with the parity calculated now.
+ *
+ * Like ocfs2_hamming_encode(), this can handle hunks.  nr is the bit
+ * offset of the current hunk.  If the bit to be fixed is not part of the
+ * current hunk, this does nothing.
+ *
+ * If you only have one buffer, use ocfs2_hamming_fix_block().
+ */
+void ocfs2_hamming_fix(void *data, unsigned int d, unsigned int nr,
+                      unsigned int fix);
+
+/* Convenience wrappers for a single buffer of data */
+extern u32 ocfs2_hamming_encode_block(void *data, unsigned int blocksize);
+extern void ocfs2_hamming_fix_block(void *data, unsigned int blocksize,
+                                   unsigned int fix);
+#endif
index 3a178ec..15c8e6d 100644 (file)
 
 #include "buffer_head_io.h"
 
+/*
+ * Bits on bh->b_state used by ocfs2.
+ *
+ * These MUST be after the JBD2 bits.  Hence, we use BH_JBDPrivateStart.
+ *
+ * BH_NeedsValidate marks a buffer whose read was submitted by
+ * ocfs2_read_blocks() with a validate callback; it is cleared right
+ * before that callback is run on the buffer.
+ */
+enum ocfs2_state_bits {
+       BH_NeedsValidate = BH_JBDPrivateStart,
+};
+
+/*
+ * Expand the magic b_state functions.  ocfs2_read_blocks() uses the
+ * resulting set_buffer_needs_validate(), buffer_needs_validate() and
+ * clear_buffer_needs_validate().
+ */
+BUFFER_FNS(NeedsValidate, needs_validate);
+
 int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
                      struct inode *inode)
 {
@@ -166,7 +178,9 @@ bail:
 }
 
 int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
-                     struct buffer_head *bhs[], int flags)
+                     struct buffer_head *bhs[], int flags,
+                     int (*validate)(struct super_block *sb,
+                                     struct buffer_head *bh))
 {
        int status = 0;
        int i, ignore_cache = 0;
@@ -298,6 +312,8 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
 
                        clear_buffer_uptodate(bh);
                        get_bh(bh); /* for end_buffer_read_sync() */
+                       if (validate)
+                               set_buffer_needs_validate(bh);
                        bh->b_end_io = end_buffer_read_sync;
                        submit_bh(READ, bh);
                        continue;
@@ -328,6 +344,20 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
                                bhs[i] = NULL;
                                continue;
                        }
+
+                       if (buffer_needs_validate(bh)) {
+                               /* We never set NeedsValidate if the
+                                * buffer was held by the journal, so
+                                * that better not have changed */
+                               BUG_ON(buffer_jbd(bh));
+                               clear_buffer_needs_validate(bh);
+                               status = validate(inode->i_sb, bh);
+                               if (status) {
+                                       put_bh(bh);
+                                       bhs[i] = NULL;
+                                       continue;
+                               }
+                       }
                }
 
                /* Always set the buffer in the cache, even if it was
index 75e1dcb..c75d682 100644 (file)
 void ocfs2_end_buffer_io_sync(struct buffer_head *bh,
                             int uptodate);
 
-static inline int ocfs2_read_block(struct inode               *inode,
-                                  u64                  off,
-                                  struct buffer_head **bh);
-
 int ocfs2_write_block(struct ocfs2_super          *osb,
                      struct buffer_head  *bh,
                      struct inode        *inode);
-int ocfs2_read_blocks(struct inode       *inode,
-                     u64                  block,
-                     int                  nr,
-                     struct buffer_head  *bhs[],
-                     int                  flags);
 int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
                           unsigned int nr, struct buffer_head *bhs[]);
 
+/*
+ * If not NULL, validate() will be called on a buffer that is freshly
+ * read from disk.  It will not be called if the buffer was in cache.
+ * Note that if validate() is being used for this buffer, it needs to
+ * be set even for a READAHEAD call, as it marks the buffer for later
+ * validation.
+ */
+int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
+                     struct buffer_head *bhs[], int flags,
+                     int (*validate)(struct super_block *sb,
+                                     struct buffer_head *bh));
+
 int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
                                struct buffer_head *bh);
 
@@ -53,7 +56,9 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
 #define OCFS2_BH_READAHEAD         8
 
 static inline int ocfs2_read_block(struct inode *inode, u64 off,
-                                  struct buffer_head **bh)
+                                  struct buffer_head **bh,
+                                  int (*validate)(struct super_block *sb,
+                                                  struct buffer_head *bh))
 {
        int status = 0;
 
@@ -63,7 +68,7 @@ static inline int ocfs2_read_block(struct inode *inode, u64 off,
                goto bail;
        }
 
-       status = ocfs2_read_blocks(inode, off, 1, bh, 0);
+       status = ocfs2_read_blocks(inode, off, 1, bh, 0, validate);
 
 bail:
        return status;
index d8a0cb9..96df541 100644 (file)
@@ -110,6 +110,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
        define_mask(QUORUM),
        define_mask(EXPORT),
        define_mask(XATTR),
+       define_mask(QUOTA),
        define_mask(ERROR),
        define_mask(NOTICE),
        define_mask(KTHREAD),
index 57670c6..7e72a81 100644 (file)
 #define ML_QUORUM      0x0000000008000000ULL /* net connection quorum */
 #define ML_EXPORT      0x0000000010000000ULL /* ocfs2 export operations */
 #define ML_XATTR       0x0000000020000000ULL /* ocfs2 extended attributes */
+#define ML_QUOTA       0x0000000040000000ULL /* ocfs2 quota operations */
 /* bits that are infrequently given and frequently matched in the high word */
 #define ML_ERROR       0x0000000100000000ULL /* sent to KERN_ERR */
 #define ML_NOTICE      0x0000000200000000ULL /* setn to KERN_NOTICE */
index 026e6eb..f2c4098 100644 (file)
@@ -40,6 +40,7 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
+#include <linux/quotaops.h>
 
 #define MLOG_MASK_PREFIX ML_NAMEI
 #include <cluster/masklog.h>
@@ -47,6 +48,7 @@
 #include "ocfs2.h"
 
 #include "alloc.h"
+#include "blockcheck.h"
 #include "dir.h"
 #include "dlmglue.h"
 #include "extent_map.h"
@@ -82,47 +84,72 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
                               struct ocfs2_alloc_context *meta_ac,
                               struct buffer_head **new_bh);
 
-static struct buffer_head *ocfs2_bread(struct inode *inode,
-                                      int block, int *err, int reada)
+/*
+ * These are distinct checks because future versions of the file system will
+ * want to have a trailing dirent structure independent of indexing.
+ */
+static int ocfs2_dir_has_trailer(struct inode *dir)
 {
-       struct buffer_head *bh = NULL;
-       int tmperr;
-       u64 p_blkno;
-       int readflags = 0;
+       if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+               return 0;
 
-       if (reada)
-               readflags |= OCFS2_BH_READAHEAD;
+       return ocfs2_meta_ecc(OCFS2_SB(dir->i_sb));
+}
 
-       if (((u64)block << inode->i_sb->s_blocksize_bits) >=
-           i_size_read(inode)) {
-               BUG_ON(!reada);
-               return NULL;
-       }
+/* Reports whether this volume can carry directory block trailers. */
+static int ocfs2_supports_dir_trailer(struct ocfs2_super *osb)
+{
+       /* Trailer support is tied directly to the metaecc check */
+       int has_metaecc = ocfs2_meta_ecc(osb);
+
+       return has_metaecc;
+}
 
-       down_read(&OCFS2_I(inode)->ip_alloc_sem);
-       tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
-                                            NULL);
-       up_read(&OCFS2_I(inode)->ip_alloc_sem);
-       if (tmperr < 0) {
-               mlog_errno(tmperr);
-               goto fail;
-       }
+/* Byte offset of the trailer within a directory block of this sb. */
+static inline unsigned int ocfs2_dir_trailer_blk_off(struct super_block *sb)
+{
+       /* The trailer occupies the last bytes of the block */
+       return sb->s_blocksize - sizeof(struct ocfs2_dir_block_trailer);
+}
 
-       tmperr = ocfs2_read_blocks(inode, p_blkno, 1, &bh, readflags);
-       if (tmperr < 0)
-               goto fail;
+#define ocfs2_trailer_from_bh(_bh, _sb) ((struct ocfs2_dir_block_trailer *) ((_bh)->b_data + ocfs2_dir_trailer_blk_off((_sb))))
 
-       tmperr = 0;
+/* XXX ocfs2_block_dqtrailer() is similar but not quite - can we make
+ * them more consistent? */
+struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize,
+                                                           void *data)
+{
+       char *p = data;
 
-       *err = 0;
-       return bh;
+       p += blocksize - sizeof(struct ocfs2_dir_block_trailer);
+       return (struct ocfs2_dir_block_trailer *)p;
+}
 
-fail:
-       brelse(bh);
-       bh = NULL;
+/*
+ * XXX: This is executed once on every dirent. We should consider optimizing
+ * it.
+ */
+static int ocfs2_skip_dir_trailer(struct inode *dir,
+                                 struct ocfs2_dir_entry *de,
+                                 unsigned long offset,
+                                 unsigned long blklen)
+{
+       unsigned long toff = blklen - sizeof(struct ocfs2_dir_block_trailer);
 
-       *err = -EIO;
-       return NULL;
+       if (!ocfs2_dir_has_trailer(dir))
+               return 0;
+
+       if (offset != toff)
+               return 0;
+
+       return 1;
+}
+
+/*
+ * Fill in the trailer at the end of a directory block: signature,
+ * compat rec_len, owning dinode and the block's own number.
+ */
+static void ocfs2_init_dir_trailer(struct inode *inode,
+                                  struct buffer_head *bh)
+{
+       struct ocfs2_dir_block_trailer *t =
+               ocfs2_trailer_from_bh(bh, inode->i_sb);
+
+       strcpy(t->db_signature, OCFS2_DIR_TRAILER_SIGNATURE);
+       t->db_compat_rec_len =
+                       cpu_to_le16(sizeof(struct ocfs2_dir_block_trailer));
+       t->db_blkno = cpu_to_le64(bh->b_blocknr);
+       t->db_parent_dinode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
 }
 
 /*
@@ -231,7 +258,7 @@ static struct buffer_head *ocfs2_find_entry_id(const char *name,
        struct ocfs2_dinode *di;
        struct ocfs2_inline_data *data;
 
-       ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh);
+       ret = ocfs2_read_inode_block(dir, &di_bh);
        if (ret) {
                mlog_errno(ret);
                goto out;
@@ -250,6 +277,108 @@ out:
        return NULL;
 }
 
+/*
+ * Validate callback for directory blocks; ocfs2_read_dir_block() hands
+ * it to ocfs2_read_virt_blocks().
+ *
+ * Only the block's checksum/ecc is verified here, via
+ * ocfs2_validate_meta_ecc() on the trailer's db_check.  Returns 0 on
+ * success or the error from ocfs2_validate_meta_ecc().
+ */
+static int ocfs2_validate_dir_block(struct super_block *sb,
+                                   struct buffer_head *bh)
+{
+       int rc;
+       struct ocfs2_dir_block_trailer *trailer =
+               ocfs2_trailer_from_bh(bh, sb);
+
+       /*
+        * We don't validate dirents here, that's handled
+        * in-place when the code walks them.
+        */
+       mlog(0, "Validating dirblock %llu\n",
+            (unsigned long long)bh->b_blocknr);
+
+       BUG_ON(!buffer_uptodate(bh));
+
+       /*
+        * If the ecc fails, we return the error but otherwise
+        * leave the filesystem running.  We know any error is
+        * local to this block.
+        *
+        * Note that we are safe to call this even if the directory
+        * doesn't have a trailer.  Filesystems without metaecc will do
+        * nothing, and filesystems with it will have one.
+        */
+       rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &trailer->db_check);
+       if (rc)
+               /* b_blocknr is the directory block, not a dinode */
+               mlog(ML_ERROR, "Checksum failed for dirblock %llu\n",
+                    (unsigned long long)bh->b_blocknr);
+
+       return rc;
+}
+
+/*
+ * Check the trailer of a directory block that was read for inode.
+ *
+ * This is done here rather than in ocfs2_validate_dir_block() because
+ * that function doesn't have the inode to test.  Returns 0 if the
+ * directory has no trailer or the trailer is consistent, and -EINVAL
+ * (after reporting through ocfs2_error()) otherwise.
+ */
+static int ocfs2_check_dir_trailer(struct inode *inode,
+                                  struct buffer_head *bh)
+{
+       struct ocfs2_dir_block_trailer *trailer;
+
+       if (!ocfs2_dir_has_trailer(inode))
+               return 0;
+
+       trailer = ocfs2_trailer_from_bh(bh, inode->i_sb);
+       if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) {
+               ocfs2_error(inode->i_sb,
+                           "Invalid dirblock #%llu: "
+                           "signature = %.*s\n",
+                           (unsigned long long)bh->b_blocknr, 7,
+                           trailer->db_signature);
+               return -EINVAL;
+       }
+       if (le64_to_cpu(trailer->db_blkno) != bh->b_blocknr) {
+               ocfs2_error(inode->i_sb,
+                           "Directory block #%llu has an invalid "
+                           "db_blkno of %llu",
+                           (unsigned long long)bh->b_blocknr,
+                           (unsigned long long)le64_to_cpu(trailer->db_blkno));
+               return -EINVAL;
+       }
+       if (le64_to_cpu(trailer->db_parent_dinode) !=
+           OCFS2_I(inode)->ip_blkno) {
+               /* Report the field that actually mismatched */
+               ocfs2_error(inode->i_sb,
+                           "Directory block #%llu on dinode "
+                           "#%llu has an invalid parent_dinode "
+                           "of %llu",
+                           (unsigned long long)bh->b_blocknr,
+                           (unsigned long long)OCFS2_I(inode)->ip_blkno,
+                           (unsigned long long)le64_to_cpu(trailer->db_parent_dinode));
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+/*
+ * Read one directory block of inode, at virtual block v_block, into *bh.
+ *
+ * *bh may be NULL, in which case a buffer obtained by
+ * ocfs2_read_virt_blocks() is passed up on success, or may already hold
+ * a buffer_head to read into.  flags are handed to
+ * ocfs2_read_virt_blocks(); the trailer is not checked when
+ * OCFS2_BH_READAHEAD is set.
+ *
+ * This function forces all errors to -EIO for consistency with its
+ * predecessor, ocfs2_bread().  We haven't audited what returning the
+ * real error codes would do to callers.  We log the real codes with
+ * mlog_errno() before we squash them.
+ */
+static int ocfs2_read_dir_block(struct inode *inode, u64 v_block,
+                               struct buffer_head **bh, int flags)
+{
+       int rc = 0;
+       struct buffer_head *tmp = *bh;
+
+       rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, flags,
+                                   ocfs2_validate_dir_block);
+       if (rc) {
+               mlog_errno(rc);
+               goto out;
+       }
+
+       if (!(flags & OCFS2_BH_READAHEAD))
+               rc = ocfs2_check_dir_trailer(inode, tmp);
+       if (rc) {
+               /*
+                * A bh that ocfs2_read_virt_blocks() handed us is never
+                * passed up on this path, so drop it here instead of
+                * leaking it.  A bh supplied by the caller stays theirs.
+                */
+               if (!*bh)
+                       brelse(tmp);
+               goto out;
+       }
+
+       /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */
+       if (!*bh)
+               *bh = tmp;
+
+out:
+       return rc ? -EIO : 0;
+}
+
 static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen,
                                               struct inode *dir,
                                               struct ocfs2_dir_entry **res_dir)
@@ -296,15 +425,17 @@ restart:
                                }
                                num++;
 
-                               bh = ocfs2_bread(dir, b++, &err, 1);
+                               bh = NULL;
+                               err = ocfs2_read_dir_block(dir, b++, &bh,
+                                                          OCFS2_BH_READAHEAD);
                                bh_use[ra_max] = bh;
                        }
                }
                if ((bh = bh_use[ra_ptr++]) == NULL)
                        goto next;
-               if (ocfs2_read_block(dir, block, &bh)) {
+               if (ocfs2_read_dir_block(dir, block, &bh, 0)) {
                        /* read error, skip block & hope for the best.
-                        * ocfs2_read_block() has released the bh. */
+                        * ocfs2_read_dir_block() has released the bh. */
                        ocfs2_error(dir->i_sb, "reading directory %llu, "
                                    "offset %lu\n",
                                    (unsigned long long)OCFS2_I(dir)->ip_blkno,
@@ -381,14 +512,18 @@ int ocfs2_update_entry(struct inode *dir, handle_t *handle,
                       struct inode *new_entry_inode)
 {
        int ret;
+       ocfs2_journal_access_func access = ocfs2_journal_access_db;
 
        /*
         * The same code works fine for both inline-data and extent
-        * based directories, so no need to split this up.
+        * based directories, so no need to split this up.  The only
+        * difference is the journal_access function.
         */
 
-       ret = ocfs2_journal_access(handle, dir, de_bh,
-                                  OCFS2_JOURNAL_ACCESS_WRITE);
+       if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+               access = ocfs2_journal_access_di;
+
+       ret = access(handle, dir, de_bh, OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
                goto out;
@@ -410,9 +545,13 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
 {
        struct ocfs2_dir_entry *de, *pde;
        int i, status = -ENOENT;
+       ocfs2_journal_access_func access = ocfs2_journal_access_db;
 
        mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh);
 
+       if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+               access = ocfs2_journal_access_di;
+
        i = 0;
        pde = NULL;
        de = (struct ocfs2_dir_entry *) first_de;
@@ -423,8 +562,8 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
                        goto bail;
                }
                if (de == de_del)  {
-                       status = ocfs2_journal_access(handle, dir, bh,
-                                                     OCFS2_JOURNAL_ACCESS_WRITE);
+                       status = access(handle, dir, bh,
+                                       OCFS2_JOURNAL_ACCESS_WRITE);
                        if (status < 0) {
                                status = -EIO;
                                mlog_errno(status);
@@ -458,7 +597,7 @@ static inline int ocfs2_delete_entry_id(handle_t *handle,
        struct ocfs2_dinode *di;
        struct ocfs2_inline_data *data;
 
-       ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh);
+       ret = ocfs2_read_inode_block(dir, &di_bh);
        if (ret) {
                mlog_errno(ret);
                goto out;
@@ -576,6 +715,16 @@ int __ocfs2_add_entry(handle_t *handle,
                        goto bail;
                }
 
+               /* We're guaranteed that we should have space, so we
+                * can't possibly have hit the trailer...right? */
+               mlog_bug_on_msg(ocfs2_skip_dir_trailer(dir, de, offset, size),
+                               "Hit dir trailer trying to insert %.*s "
+                               "(namelen %d) into directory %llu.  "
+                               "offset is %lu, trailer offset is %d\n",
+                               namelen, name, namelen,
+                               (unsigned long long)parent_fe_bh->b_blocknr,
+                               offset, ocfs2_dir_trailer_blk_off(dir->i_sb));
+
                if (ocfs2_dirent_would_fit(de, rec_len)) {
                        dir->i_mtime = dir->i_ctime = CURRENT_TIME;
                        retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
@@ -584,8 +733,14 @@ int __ocfs2_add_entry(handle_t *handle,
                                goto bail;
                        }
 
-                       status = ocfs2_journal_access(handle, dir, insert_bh,
-                                                     OCFS2_JOURNAL_ACCESS_WRITE);
+                       if (insert_bh == parent_fe_bh)
+                               status = ocfs2_journal_access_di(handle, dir,
+                                                                insert_bh,
+                                                                OCFS2_JOURNAL_ACCESS_WRITE);
+                       else
+                               status = ocfs2_journal_access_db(handle, dir,
+                                                                insert_bh,
+                                                                OCFS2_JOURNAL_ACCESS_WRITE);
                        /* By now the buffer is marked for journaling */
                        offset += le16_to_cpu(de->rec_len);
                        if (le64_to_cpu(de->inode)) {
@@ -611,6 +766,7 @@ int __ocfs2_add_entry(handle_t *handle,
                        retval = 0;
                        goto bail;
                }
+
                offset += le16_to_cpu(de->rec_len);
                de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len));
        }
@@ -636,7 +792,7 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode,
        struct ocfs2_inline_data *data;
        struct ocfs2_dir_entry *de;
 
-       ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
+       ret = ocfs2_read_inode_block(inode, &di_bh);
        if (ret) {
                mlog(ML_ERROR, "Unable to read inode block for dir %llu\n",
                     (unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -724,7 +880,6 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
        int i, stored;
        struct buffer_head * bh, * tmp;
        struct ocfs2_dir_entry * de;
-       int err;
        struct super_block * sb = inode->i_sb;
        unsigned int ra_sectors = 16;
 
@@ -735,12 +890,8 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
 
        while (!error && !stored && *f_pos < i_size_read(inode)) {
                blk = (*f_pos) >> sb->s_blocksize_bits;
-               bh = ocfs2_bread(inode, blk, &err, 0);
-               if (!bh) {
-                       mlog(ML_ERROR,
-                            "directory #%llu contains a hole at offset %lld\n",
-                            (unsigned long long)OCFS2_I(inode)->ip_blkno,
-                            *f_pos);
+               if (ocfs2_read_dir_block(inode, blk, &bh, 0)) {
+                       /* Skip the corrupt dirblock and keep trying */
                        *f_pos += sb->s_blocksize - offset;
                        continue;
                }
@@ -754,8 +905,10 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
                    || (((last_ra_blk - blk) << 9) <= (ra_sectors / 2))) {
                        for (i = ra_sectors >> (sb->s_blocksize_bits - 9);
                             i > 0; i--) {
-                               tmp = ocfs2_bread(inode, ++blk, &err, 1);
-                               brelse(tmp);
+                               tmp = NULL;
+                               if (!ocfs2_read_dir_block(inode, ++blk, &tmp,
+                                                         OCFS2_BH_READAHEAD))
+                                       brelse(tmp);
                        }
                        last_ra_blk = blk;
                        ra_sectors = 8;
@@ -828,6 +981,7 @@ revalidate:
                }
                offset = 0;
                brelse(bh);
+               bh = NULL;
        }
 
        stored = 0;
@@ -1050,9 +1204,15 @@ int ocfs2_empty_dir(struct inode *inode)
        return !priv.seen_other;
 }
 
-static void ocfs2_fill_initial_dirents(struct inode *inode,
-                                      struct inode *parent,
-                                      char *start, unsigned int size)
+/*
+ * Fills "." and ".." dirents in a new directory block. Returns dirent for
+ * "..", which might be used during creation of a directory with a trailing
+ * header. It is otherwise safe to ignore the return value.
+ */
+static struct ocfs2_dir_entry *ocfs2_fill_initial_dirents(struct inode *inode,
+                                                         struct inode *parent,
+                                                         char *start,
+                                                         unsigned int size)
 {
        struct ocfs2_dir_entry *de = (struct ocfs2_dir_entry *)start;
 
@@ -1069,6 +1229,8 @@ static void ocfs2_fill_initial_dirents(struct inode *inode,
        de->name_len = 2;
        strcpy(de->name, "..");
        ocfs2_set_de_type(de, S_IFDIR);
+
+       return de;
 }
 
 /*
@@ -1086,8 +1248,8 @@ static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb,
        struct ocfs2_inline_data *data = &di->id2.i_data;
        unsigned int size = le16_to_cpu(data->id_count);
 
-       ret = ocfs2_journal_access(handle, inode, di_bh,
-                                  OCFS2_JOURNAL_ACCESS_WRITE);
+       ret = ocfs2_journal_access_di(handle, inode, di_bh,
+                                     OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
                goto out;
@@ -1121,10 +1283,15 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
                                 struct ocfs2_alloc_context *data_ac)
 {
        int status;
+       unsigned int size = osb->sb->s_blocksize;
        struct buffer_head *new_bh = NULL;
+       struct ocfs2_dir_entry *de;
 
        mlog_entry_void();
 
+       if (ocfs2_supports_dir_trailer(osb))
+               size = ocfs2_dir_trailer_blk_off(parent->i_sb);
+
        status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh,
                                     data_ac, NULL, &new_bh);
        if (status < 0) {
@@ -1134,16 +1301,17 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
 
        ocfs2_set_new_buffer_uptodate(inode, new_bh);
 
-       status = ocfs2_journal_access(handle, inode, new_bh,
-                                     OCFS2_JOURNAL_ACCESS_CREATE);
+       status = ocfs2_journal_access_db(handle, inode, new_bh,
+                                        OCFS2_JOURNAL_ACCESS_CREATE);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }
        memset(new_bh->b_data, 0, osb->sb->s_blocksize);
 
-       ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data,
-                                  osb->sb->s_blocksize);
+       de = ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data, size);
+       if (ocfs2_supports_dir_trailer(osb))
+               ocfs2_init_dir_trailer(inode, new_bh);
 
        status = ocfs2_journal_dirty(handle, new_bh);
        if (status < 0) {
@@ -1184,13 +1352,27 @@ int ocfs2_fill_new_dir(struct ocfs2_super *osb,
                                     data_ac);
 }
 
+/*
+ * Expand rec_len of the rightmost dirent in a directory block so that it
+ * contains the end of our valid space for dirents. We do this during
+ * expansion from an inline directory to one with extents. The first dir block
+ * in that case is taken from the inline data portion of the inode block.
+ *
+ * We add the dir trailer if this filesystem wants it.
+ */
 static void ocfs2_expand_last_dirent(char *start, unsigned int old_size,
-                                    unsigned int new_size)
+                                    struct super_block *sb)
 {
        struct ocfs2_dir_entry *de;
        struct ocfs2_dir_entry *prev_de;
        char *de_buf, *limit;
-       unsigned int bytes = new_size - old_size;
+       unsigned int new_size = sb->s_blocksize;
+       unsigned int bytes;
+
+       if (ocfs2_supports_dir_trailer(OCFS2_SB(sb)))
+               new_size = ocfs2_dir_trailer_blk_off(sb);
+
+       bytes = new_size - old_size;
 
        limit = start + old_size;
        de_buf = start;
@@ -1216,9 +1398,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
                                   unsigned int blocks_wanted,
                                   struct buffer_head **first_block_bh)
 {
-       int ret, credits = OCFS2_INLINE_TO_EXTENTS_CREDITS;
        u32 alloc, bit_off, len;
        struct super_block *sb = dir->i_sb;
+       int ret, credits = ocfs2_inline_to_extents_credits(sb);
        u64 blkno, bytes = blocks_wanted << sb->s_blocksize_bits;
        struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
        struct ocfs2_inode_info *oi = OCFS2_I(dir);
@@ -1227,6 +1409,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
        handle_t *handle;
        struct ocfs2_extent_tree et;
+       int did_quota = 0;
 
        ocfs2_init_dinode_extent_tree(&et, dir, di_bh);