Merge branch 'write_inode2' of git://git.kernel.org/pub/scm/linux/kernel/git/viro...
Linus Torvalds [Fri, 5 Mar 2010 19:53:53 +0000 (11:53 -0800)]
* 'write_inode2' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs-2.6:
  pass writeback_control to ->write_inode
  make sure data is on disk before calling ->write_inode

69 files changed:
Documentation/kprobes.txt
arch/Kconfig
arch/x86/Kconfig
arch/x86/include/asm/alternative.h
arch/x86/include/asm/kprobes.h
arch/x86/kernel/alternative.c
arch/x86/kernel/cpu/mtrr/main.c
arch/x86/kernel/kprobes.c
crypto/ahash.c
crypto/authenc.c
crypto/md5.c
fs/ext4/balloc.c
fs/ext4/block_validity.c
fs/ext4/dir.c
fs/ext4/ext4.h
fs/ext4/ext4_jbd2.c
fs/ext4/ext4_jbd2.h
fs/ext4/extents.c
fs/ext4/file.c
fs/ext4/fsync.c
fs/ext4/ialloc.c
fs/ext4/inode.c
fs/ext4/ioctl.c
fs/ext4/mballoc.c
fs/ext4/mballoc.h
fs/ext4/migrate.c
fs/ext4/move_extent.c
fs/ext4/namei.c
fs/ext4/resize.c
fs/ext4/super.c
fs/ext4/xattr.c
fs/jbd2/checkpoint.c
fs/jbd2/commit.c
fs/jbd2/journal.c
fs/jbd2/transaction.c
fs/namei.c
fs/squashfs/Makefile
fs/squashfs/block.c
fs/squashfs/cache.c
fs/squashfs/decompressor.c [new file with mode: 0644]
fs/squashfs/decompressor.h [new file with mode: 0644]
fs/squashfs/dir.c
fs/squashfs/export.c
fs/squashfs/file.c
fs/squashfs/fragment.c
fs/squashfs/id.c
fs/squashfs/inode.c
fs/squashfs/namei.c
fs/squashfs/squashfs.h
fs/squashfs/squashfs_fs.h
fs/squashfs/squashfs_fs_sb.h
fs/squashfs/super.c
fs/squashfs/symlink.c
fs/squashfs/zlib_wrapper.c [new file with mode: 0644]
include/linux/jbd2.h
include/linux/kprobes.h
include/trace/events/ext4.h
include/trace/events/jbd2.h
kernel/kprobes.c
kernel/padata.c
kernel/sysctl.c
tools/perf/Documentation/perf-probe.txt
tools/perf/Makefile
tools/perf/builtin-probe.c
tools/perf/util/probe-event.c
tools/perf/util/probe-finder.c
tools/perf/util/probe-finder.h
tools/perf/util/string.c
tools/perf/util/string.h

index 053037a..2f9115c 100644 (file)
@@ -1,6 +1,7 @@
 Title  : Kernel Probes (Kprobes)
 Authors        : Jim Keniston <jkenisto@us.ibm.com>
-       : Prasanna S Panchamukhi <prasanna@in.ibm.com>
+       : Prasanna S Panchamukhi <prasanna.panchamukhi@gmail.com>
+       : Masami Hiramatsu <mhiramat@redhat.com>
 
 CONTENTS
 
@@ -15,6 +16,7 @@ CONTENTS
 9. Jprobes Example
 10. Kretprobes Example
 Appendix A: The kprobes debugfs interface
+Appendix B: The kprobes sysctl interface
 
 1. Concepts: Kprobes, Jprobes, Return Probes
 
@@ -42,13 +44,13 @@ registration/unregistration of a group of *probes. These functions
 can speed up unregistration process when you have to unregister
 a lot of probes at once.
 
-The next three subsections explain how the different types of
-probes work.  They explain certain things that you'll need to
-know in order to make the best use of Kprobes -- e.g., the
-difference between a pre_handler and a post_handler, and how
-to use the maxactive and nmissed fields of a kretprobe.  But
-if you're in a hurry to start using Kprobes, you can skip ahead
-to section 2.
+The next four subsections explain how the different types of
+probes work and how jump optimization works.  They explain certain
+things that you'll need to know in order to make the best use of
+Kprobes -- e.g., the difference between a pre_handler and
+a post_handler, and how to use the maxactive and nmissed fields of
+a kretprobe.  But if you're in a hurry to start using Kprobes, you
+can skip ahead to section 2.
 
 1.1 How Does a Kprobe Work?
 
@@ -161,13 +163,125 @@ In case probed function is entered but there is no kretprobe_instance
 object available, then in addition to incrementing the nmissed count,
 the user entry_handler invocation is also skipped.
 
+1.4 How Does Jump Optimization Work?
+
+If you configured your kernel with CONFIG_OPTPROBES=y (currently
+this option is supported on x86/x86-64, non-preemptive kernel) and
+the "debug.kprobes_optimization" kernel parameter is set to 1 (see
+sysctl(8)), Kprobes tries to reduce probe-hit overhead by using a jump
+instruction instead of a breakpoint instruction at each probepoint.
+
+1.4.1 Init a Kprobe
+
+When a probe is registered, before attempting this optimization,
+Kprobes inserts an ordinary, breakpoint-based kprobe at the specified
+address. So, even if it's not possible to optimize this particular
+probepoint, there'll be a probe there.
+
+1.4.2 Safety Check
+
+Before optimizing a probe, Kprobes performs the following safety checks:
+
+- Kprobes verifies that the region that will be replaced by the jump
+instruction (the "optimized region") lies entirely within one function.
+(A jump instruction is multiple bytes, and so may overlay multiple
+instructions.)
+
+- Kprobes analyzes the entire function and verifies that there is no
+jump into the optimized region.  Specifically:
+  - the function contains no indirect jump;
+  - the function contains no instruction that causes an exception (since
+  the fixup code triggered by the exception could jump back into the
+  optimized region -- Kprobes checks the exception tables to verify this);
+  and
+  - there is no near jump to the optimized region (other than to the first
+  byte).
+
+- For each instruction in the optimized region, Kprobes verifies that
+the instruction can be executed out of line.
+
+1.4.3 Preparing Detour Buffer
+
+Next, Kprobes prepares a "detour" buffer, which contains the following
+instruction sequence:
+- code to push the CPU's registers (emulating a breakpoint trap)
+- a call to the trampoline code which calls user's probe handlers.
+- code to restore registers
+- the instructions from the optimized region
+- a jump back to the original execution path.
+
+1.4.4 Pre-optimization
+
+After preparing the detour buffer, Kprobes verifies that none of the
+following situations exist:
+- The probe has either a break_handler (i.e., it's a jprobe) or a
+post_handler.
+- Other instructions in the optimized region are probed.
+- The probe is disabled.
+In any of the above cases, Kprobes won't start optimizing the probe.
+Since these are temporary situations, Kprobes tries to start
+optimizing it again if the situation is changed.
+
+If the kprobe can be optimized, Kprobes enqueues the kprobe to an
+optimizing list, and kicks the kprobe-optimizer workqueue to optimize
+it.  If the to-be-optimized probepoint is hit before being optimized,
+Kprobes returns control to the original instruction path by setting
+the CPU's instruction pointer to the copied code in the detour buffer
+-- thus at least avoiding the single-step.
+
+1.4.5 Optimization
+
+The Kprobe-optimizer doesn't insert the jump instruction immediately;
+rather, it calls synchronize_sched() for safety first, because it's
+possible for a CPU to be interrupted in the middle of executing the
+optimized region(*).  As you know, synchronize_sched() can ensure
+that all interruptions that were active when synchronize_sched()
+was called are done, but only if CONFIG_PREEMPT=n.  So, this version
+of kprobe optimization supports only kernels with CONFIG_PREEMPT=n.(**)
+
+After that, the Kprobe-optimizer calls stop_machine() to replace
+the optimized region with a jump instruction to the detour buffer,
+using text_poke_smp().
+
+1.4.6 Unoptimization
+
+When an optimized kprobe is unregistered, disabled, or blocked by
+another kprobe, it will be unoptimized.  If this happens before
+the optimization is complete, the kprobe is just dequeued from the
+optimized list.  If the optimization has been done, the jump is
+replaced with the original code (except for an int3 breakpoint in
+the first byte) by using text_poke_smp().
+
+(*)Please imagine that the 2nd instruction is interrupted and then
+the optimizer replaces the 2nd instruction with the jump *address*
+while the interrupt handler is running. When the interrupt
+returns to original address, there is no valid instruction,
+and it causes an unexpected result.
+
+(**)This optimization-safety checking may be replaced with the
+stop-machine method that ksplice uses for supporting a CONFIG_PREEMPT=y
+kernel.
+
+NOTE for geeks:
+The jump optimization changes the kprobe's pre_handler behavior.
+Without optimization, the pre_handler can change the kernel's execution
+path by changing regs->ip and returning 1.  However, when the probe
+is optimized, that modification is ignored.  Thus, if you want to
+tweak the kernel's execution path, you need to suppress optimization,
+using one of the following techniques:
+- Specify an empty function for the kprobe's post_handler or break_handler.
+ or
+- Config CONFIG_OPTPROBES=n.
+ or
+- Execute 'sysctl -w debug.kprobes_optimization=n'
+
 2. Architectures Supported
 
 Kprobes, jprobes, and return probes are implemented on the following
 architectures:
 
-- i386
-- x86_64 (AMD-64, EM64T)
+- i386 (Supports jump optimization)
+- x86_64 (AMD-64, EM64T) (Supports jump optimization)
 - ppc64
 - ia64 (Does not support probes on instruction slot1.)
 - sparc64 (Return probes not yet implemented.)
@@ -193,6 +307,10 @@ it useful to "Compile the kernel with debug info" (CONFIG_DEBUG_INFO),
 so you can use "objdump -d -l vmlinux" to see the source-to-object
 code mapping.
 
+If you want to reduce probing overhead, set "Kprobes jump optimization
+support" (CONFIG_OPTPROBES) to "y". You can find this option under the
+"Kprobes" line.
+
 4. API Reference
 
 The Kprobes API includes a "register" function and an "unregister"
@@ -389,7 +507,10 @@ the probe which has been registered.
 
 Kprobes allows multiple probes at the same address.  Currently,
 however, there cannot be multiple jprobes on the same function at
-the same time.
+the same time.  Also, a probepoint for which there is a jprobe or
+a post_handler cannot be optimized.  So if you install a jprobe,
+or a kprobe with a post_handler, at an optimized probepoint, the
+probepoint will be unoptimized automatically.
 
 In general, you can install a probe anywhere in the kernel.
 In particular, you can probe interrupt handlers.  Known exceptions
@@ -453,6 +574,38 @@ reason, Kprobes doesn't support return probes (or kprobes or jprobes)
 on the x86_64 version of __switch_to(); the registration functions
 return -EINVAL.
 
+On x86/x86-64, since the Jump Optimization of Kprobes modifies
+instructions widely, there are some limitations to optimization. To
+explain it, we introduce some terminology. Imagine a 3-instruction
+sequence consisting of a two 2-byte instructions and one 3-byte
+instruction.
+
+        IA
+         |
+[-2][-1][0][1][2][3][4][5][6][7]
+        [ins1][ins2][  ins3 ]
+       [<-     DCR       ->]
+          [<- JTPR ->]
+
+ins1: 1st Instruction
+ins2: 2nd Instruction
+ins3: 3rd Instruction
+IA:  Insertion Address
+JTPR: Jump Target Prohibition Region
+DCR: Detoured Code Region
+
+The instructions in DCR are copied to the out-of-line buffer
+of the kprobe, because the bytes in DCR are replaced by
+a 5-byte jump instruction. So there are several limitations.
+
+a) The instructions in DCR must be relocatable.
+b) The instructions in DCR must not include a call instruction.
+c) JTPR must not be targeted by any jump or call instruction.
+d) DCR must not straddle the border betweeen functions.
+
+Anyway, these limitations are checked by the in-kernel instruction
+decoder, so you don't need to worry about that.
+
 6. Probe Overhead
 
 On a typical CPU in use in 2005, a kprobe hit takes 0.5 to 1.0
@@ -476,6 +629,19 @@ k = 0.49 usec; j = 0.76; r = 0.80; kr = 0.82; jr = 1.07
 ppc64: POWER5 (gr), 1656 MHz (SMT disabled, 1 virtual CPU per physical CPU)
 k = 0.77 usec; j = 1.31; r = 1.26; kr = 1.45; jr = 1.99
 
+6.1 Optimized Probe Overhead
+
+Typically, an optimized kprobe hit takes 0.07 to 0.1 microseconds to
+process. Here are sample overhead figures (in usec) for x86 architectures.
+k = unoptimized kprobe, b = boosted (single-step skipped), o = optimized kprobe,
+r = unoptimized kretprobe, rb = boosted kretprobe, ro = optimized kretprobe.
+
+i386: Intel(R) Xeon(R) E5410, 2.33GHz, 4656.90 bogomips
+k = 0.80 usec; b = 0.33; o = 0.05; r = 1.10; rb = 0.61; ro = 0.33
+
+x86-64: Intel(R) Xeon(R) E5410, 2.33GHz, 4656.90 bogomips
+k = 0.99 usec; b = 0.43; o = 0.06; r = 1.24; rb = 0.68; ro = 0.30
+
 7. TODO
 
 a. SystemTap (http://sourceware.org/systemtap): Provides a simplified
@@ -523,7 +689,8 @@ is also specified. Following columns show probe status. If the probe is on
 a virtual address that is no longer valid (module init sections, module
 virtual addresses that correspond to modules that've been unloaded),
 such probes are marked with [GONE]. If the probe is temporarily disabled,
-such probes are marked with [DISABLED].
+such probes are marked with [DISABLED]. If the probe is optimized, it is
+marked with [OPTIMIZED].
 
 /sys/kernel/debug/kprobes/enabled: Turn kprobes ON/OFF forcibly.
 
@@ -533,3 +700,19 @@ registered probes will be disarmed, till such time a "1" is echoed to this
 file. Note that this knob just disarms and arms all kprobes and doesn't
 change each probe's disabling state. This means that disabled kprobes (marked
 [DISABLED]) will be not enabled if you turn ON all kprobes by this knob.
+
+
+Appendix B: The kprobes sysctl interface
+
+/proc/sys/debug/kprobes-optimization: Turn kprobes optimization ON/OFF.
+
+When CONFIG_OPTPROBES=y, this sysctl interface appears and it provides
+a knob to globally and forcibly turn jump optimization (see section
+1.4) ON or OFF. By default, jump optimization is allowed (ON).
+If you echo "0" to this file or set "debug.kprobes_optimization" to
+0 via sysctl, all optimized probes will be unoptimized, and any new
+probes registered after that will not be optimized.  Note that this
+knob *changes* the optimized state. This means that optimized probes
+(marked [OPTIMIZED]) will be unoptimized ([OPTIMIZED] tag will be
+removed). If the knob is turned on, they will be optimized again.
+
index 215e460..e5eb133 100644 (file)
@@ -41,6 +41,17 @@ config KPROBES
          for kernel debugging, non-intrusive instrumentation and testing.
          If in doubt, say "N".
 
+config OPTPROBES
+       bool "Kprobes jump optimization support (EXPERIMENTAL)"
+       default y
+       depends on KPROBES
+       depends on !PREEMPT
+       depends on HAVE_OPTPROBES
+       select KALLSYMS_ALL
+       help
+         This option will allow kprobes to optimize breakpoint to
+         a jump for reducing its overhead.
+
 config HAVE_EFFICIENT_UNALIGNED_ACCESS
        bool
        help
@@ -83,6 +94,8 @@ config HAVE_KPROBES
 config HAVE_KRETPROBES
        bool
 
+config HAVE_OPTPROBES
+       bool
 #
 # An arch should select this if it provides all these things:
 #
index 57ccdce..f15f37b 100644 (file)
@@ -31,6 +31,7 @@ config X86
        select ARCH_WANT_FRAME_POINTERS
        select HAVE_DMA_ATTRS
        select HAVE_KRETPROBES
+       select HAVE_OPTPROBES
        select HAVE_FTRACE_MCOUNT_RECORD
        select HAVE_DYNAMIC_FTRACE
        select HAVE_FUNCTION_TRACER
index f1e253c..b09ec55 100644 (file)
@@ -165,10 +165,12 @@ static inline void apply_paravirt(struct paravirt_patch_site *start,
  * invalid instruction possible) or if the instructions are changed from a
  * consistent state to another consistent state atomically.
  * More care must be taken when modifying code in the SMP case because of
- * Intel's errata.
+ * Intel's errata. text_poke_smp() takes care that errata, but still
+ * doesn't support NMI/MCE handler code modifying.
  * On the local CPU you need to be protected again NMI or MCE handlers seeing an
  * inconsistent instruction while you patch.
  */
 extern void *text_poke(void *addr, const void *opcode, size_t len);
+extern void *text_poke_smp(void *addr, const void *opcode, size_t len);
 
 #endif /* _ASM_X86_ALTERNATIVE_H */
index 4fe681d..4ffa345 100644 (file)
@@ -32,7 +32,10 @@ struct kprobe;
 
 typedef u8 kprobe_opcode_t;
 #define BREAKPOINT_INSTRUCTION 0xcc
-#define RELATIVEJUMP_INSTRUCTION 0xe9
+#define RELATIVEJUMP_OPCODE 0xe9
+#define RELATIVEJUMP_SIZE 5
+#define RELATIVECALL_OPCODE 0xe8
+#define RELATIVE_ADDR_SIZE 4
 #define MAX_INSN_SIZE 16
 #define MAX_STACK_SIZE 64
 #define MIN_STACK_SIZE(ADDR)                                          \
@@ -44,6 +47,17 @@ typedef u8 kprobe_opcode_t;
 
 #define flush_insn_slot(p)     do { } while (0)
 
+/* optinsn template addresses */
+extern kprobe_opcode_t optprobe_template_entry;
+extern kprobe_opcode_t optprobe_template_val;
+extern kprobe_opcode_t optprobe_template_call;
+extern kprobe_opcode_t optprobe_template_end;
+#define MAX_OPTIMIZED_LENGTH (MAX_INSN_SIZE + RELATIVE_ADDR_SIZE)
+#define MAX_OPTINSN_SIZE                               \
+       (((unsigned long)&optprobe_template_end -       \
+         (unsigned long)&optprobe_template_entry) +    \
+        MAX_OPTIMIZED_LENGTH + RELATIVEJUMP_SIZE)
+
 extern const int kretprobe_blacklist_size;
 
 void arch_remove_kprobe(struct kprobe *p);
@@ -64,6 +78,21 @@ struct arch_specific_insn {
        int boostable;
 };
 
+struct arch_optimized_insn {
+       /* copy of the original instructions */
+       kprobe_opcode_t copied_insn[RELATIVE_ADDR_SIZE];
+       /* detour code buffer */
+       kprobe_opcode_t *insn;
+       /* the size of instructions copied to detour code buffer */
+       size_t size;
+};
+
+/* Return true (!0) if optinsn is prepared for optimization. */
+static inline int arch_prepared_optinsn(struct arch_optimized_insn *optinsn)
+{
+       return optinsn->size;
+}
+
 struct prev_kprobe {
        struct kprobe *kp;
        unsigned long status;
index e6ea034..3a4bf35 100644 (file)
@@ -7,6 +7,7 @@
 #include <linux/mm.h>
 #include <linux/vmalloc.h>
 #include <linux/memory.h>
+#include <linux/stop_machine.h>
 #include <asm/alternative.h>
 #include <asm/sections.h>
 #include <asm/pgtable.h>
@@ -572,3 +573,62 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
        local_irq_restore(flags);
        return addr;
 }
+
+/*
+ * Cross-modifying kernel text with stop_machine().
+ * This code originally comes from immediate value.
+ */
+static atomic_t stop_machine_first;
+static int wrote_text;
+
+struct text_poke_params {
+       void *addr;
+       const void *opcode;
+       size_t len;
+};
+
+static int __kprobes stop_machine_text_poke(void *data)
+{
+       struct text_poke_params *tpp = data;
+
+       if (atomic_dec_and_test(&stop_machine_first)) {
+               text_poke(tpp->addr, tpp->opcode, tpp->len);
+               smp_wmb();      /* Make sure other cpus see that this has run */
+               wrote_text = 1;
+       } else {
+               while (!wrote_text)
+                       cpu_relax();
+               smp_mb();       /* Load wrote_text before following execution */
+       }
+
+       flush_icache_range((unsigned long)tpp->addr,
+                          (unsigned long)tpp->addr + tpp->len);
+       return 0;
+}
+
+/**
+ * text_poke_smp - Update instructions on a live kernel on SMP
+ * @addr: address to modify
+ * @opcode: source of the copy
+ * @len: length to copy
+ *
+ * Modify multi-byte instruction by using stop_machine() on SMP. This allows
+ * user to poke/set multi-byte text on SMP. Only non-NMI/MCE code modifying
+ * should be allowed, since stop_machine() does _not_ protect code against
+ * NMI and MCE.
+ *
+ * Note: Must be called under get_online_cpus() and text_mutex.
+ */
+void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len)
+{
+       struct text_poke_params tpp;
+
+       tpp.addr = addr;
+       tpp.opcode = opcode;
+       tpp.len = len;
+       atomic_set(&stop_machine_first, 1);
+       wrote_text = 0;
+       stop_machine(stop_machine_text_poke, (void *)&tpp, NULL);
+       return addr;
+}
+
index fe4622e..79556bd 100644 (file)
@@ -145,6 +145,7 @@ struct set_mtrr_data {
 
 /**
  * ipi_handler - Synchronisation handler. Executed by "other" CPUs.
+ * @info: pointer to mtrr configuration data
  *
  * Returns nothing.
  */
index 5de9f4a..b43bbae 100644 (file)
@@ -49,6 +49,7 @@
 #include <linux/module.h>
 #include <linux/kdebug.h>
 #include <linux/kallsyms.h>
+#include <linux/ftrace.h>
 
 #include <asm/cacheflush.h>
 #include <asm/desc.h>
@@ -106,16 +107,22 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = {
 };
 const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
 
-/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
-static void __kprobes set_jmp_op(void *from, void *to)
+static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op)
 {
-       struct __arch_jmp_op {
-               char op;
+       struct __arch_relative_insn {
+               u8 op;
                s32 raddr;
-       } __attribute__((packed)) * jop;
-       jop = (struct __arch_jmp_op *)from;
-       jop->raddr = (s32)((long)(to) - ((long)(from) + 5));
-       jop->op = RELATIVEJUMP_INSTRUCTION;
+       } __attribute__((packed)) *insn;
+
+       insn = (struct __arch_relative_insn *)from;
+       insn->raddr = (s32)((long)(to) - ((long)(from) + 5));
+       insn->op = op;
+}
+
+/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
+static void __kprobes synthesize_reljump(void *from, void *to)
+{
+       __synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE);
 }
 
 /*
@@ -202,7 +209,7 @@ static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
        /*
         *  Basically, kp->ainsn.insn has an original instruction.
         *  However, RIP-relative instruction can not do single-stepping
-        *  at different place, fix_riprel() tweaks the displacement of
+        *  at different place, __copy_instruction() tweaks the displacement of
         *  that instruction. In that case, we can't recover the instruction
         *  from the kp->ainsn.insn.
         *
@@ -284,21 +291,37 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
 }
 
 /*
- * Adjust the displacement if the instruction uses the %rip-relative
- * addressing mode.
+ * Copy an instruction and adjust the displacement if the instruction
+ * uses the %rip-relative addressing mode.
  * If it does, Return the address of the 32-bit displacement word.
  * If not, return null.
  * Only applicable to 64-bit x86.
  */
-static void __kprobes fix_riprel(struct kprobe *p)
+static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover)
 {
-#ifdef CONFIG_X86_64
        struct insn insn;
-       kernel_insn_init(&insn, p->ainsn.insn);
+       int ret;
+       kprobe_opcode_t buf[MAX_INSN_SIZE];
 
+       kernel_insn_init(&insn, src);
+       if (recover) {
+               insn_get_opcode(&insn);
+               if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
+                       ret = recover_probed_instruction(buf,
+                                                        (unsigned long)src);
+                       if (ret)
+                               return 0;
+                       kernel_insn_init(&insn, buf);
+               }
+       }
+       insn_get_length(&insn);
+       memcpy(dest, insn.kaddr, insn.length);
+
+#ifdef CONFIG_X86_64
        if (insn_rip_relative(&insn)) {
                s64 newdisp;
                u8 *disp;
+               kernel_insn_init(&insn, dest);
                insn_get_displacement(&insn);
                /*
                 * The copied instruction uses the %rip-relative addressing
@@ -312,20 +335,23 @@ static void __kprobes fix_riprel(struct kprobe *p)
                 * extension of the original signed 32-bit displacement would
                 * have given.
                 */
-               newdisp = (u8 *) p->addr + (s64) insn.displacement.value -
-                         (u8 *) p->ainsn.insn;
+               newdisp = (u8 *) src + (s64) insn.displacement.value -
+                         (u8 *) dest;
                BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check.  */
-               disp = (u8 *) p->ainsn.insn + insn_offset_displacement(&insn);
+               disp = (u8 *) dest + insn_offset_displacement(&insn);
                *(s32 *) disp = (s32) newdisp;
        }
 #endif
+       return insn.length;
 }
 
 static void __kprobes arch_copy_kprobe(struct kprobe *p)
 {
-       memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
-
-       fix_riprel(p);
+       /*
+        * Copy an instruction without recovering int3, because it will be
+        * put by another subsystem.
+        */
+       __copy_instruction(p->ainsn.insn, p->addr, 0);
 
        if (can_boost(p->addr))
                p->ainsn.boostable = 0;
@@ -406,18 +432,6 @@ static void __kprobes restore_btf(void)
                update_debugctlmsr(current->thread.debugctlmsr);
 }
 
-static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
-{
-       clear_btf();
-       regs->flags |= X86_EFLAGS_TF;
-       regs->flags &= ~X86_EFLAGS_IF;
-       /* single step inline if the instruction is an int3 */
-       if (p->opcode == BREAKPOINT_INSTRUCTION)
-               regs->ip = (unsigned long)p->addr;
-       else
-               regs->ip = (unsigned long)p->ainsn.insn;
-}
-
 void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
                                      struct pt_regs *regs)
 {
@@ -429,20 +443,50 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
        *sara = (unsigned long) &kretprobe_trampoline;
 }
 
+#ifdef CONFIG_OPTPROBES
+static int  __kprobes setup_detour_execution(struct kprobe *p,
+                                            struct pt_regs *regs,
+                                            int reenter);
+#else
+#define setup_detour_execution(p, regs, reenter) (0)
+#endif
+
 static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs,
-                                      struct kprobe_ctlblk *kcb)
+                                      struct kprobe_ctlblk *kcb, int reenter)
 {
+       if (setup_detour_execution(p, regs, reenter))
+               return;
+
 #if !defined(CONFIG_PREEMPT)
        if (p->ainsn.boostable == 1 && !p->post_handler) {
                /* Boost up -- we can execute copied instructions directly */
-               reset_current_kprobe();
+               if (!reenter)
+                       reset_current_kprobe();
+               /*
+                * Reentering boosted probe doesn't reset current_kprobe,
+                * nor set current_kprobe, because it doesn't use single
+                * stepping.
+                */
                regs->ip = (unsigned long)p->ainsn.insn;
                preempt_enable_no_resched();
                return;
        }
 #endif
-       prepare_singlestep(p, regs);
-       kcb->kprobe_status = KPROBE_HIT_SS;
+       if (reenter) {
+               save_previous_kprobe(kcb);
+               set_current_kprobe(p, regs, kcb);
+               kcb->kprobe_status = KPROBE_REENTER;
+       } else
+               kcb->kprobe_status = KPROBE_HIT_SS;
+       /* Prepare real single stepping */
+       clear_btf();
+       regs->flags |= X86_EFLAGS_TF;
+       regs->flags &= ~X86_EFLAGS_IF;
+       /* single step inline if the instruction is an int3 */
+       if (p->opcode == BREAKPOINT_INSTRUCTION)
+               regs->ip = (unsigned long)p->addr;
+       else
+               regs->ip = (unsigned long)p->ainsn.insn;
 }
 
 /*
@@ -456,11 +500,8 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
        switch (kcb->kprobe_status) {
        case KPROBE_HIT_SSDONE:
        case KPROBE_HIT_ACTIVE:
-               save_previous_kprobe(kcb);
-               set_current_kprobe(p, regs, kcb);
                kprobes_inc_nmissed_count(p);
-               prepare_singlestep(p, regs);
-               kcb->kprobe_status = KPROBE_REENTER;
+               setup_singlestep(p, regs, kcb, 1);
                break;
        case KPROBE_HIT_SS:
                /* A probe has been hit in the codepath leading up to, or just
@@ -535,13 +576,13 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
                         * more here.
                         */
                        if (!p->pre_handler || !p->pre_handler(p, regs))
-                               setup_singlestep(p, regs, kcb);
+                               setup_singlestep(p, regs, kcb, 0);
                        return 1;
                }
        } else if (kprobe_running()) {
                p = __get_cpu_var(current_kprobe);
                if (p->break_handler && p->break_handler(p, regs)) {
-                       setup_singlestep(p, regs, kcb);
+                       setup_singlestep(p, regs, kcb, 0);
                        return 1;
                }
        } /* else: not a kprobe fault; let the kernel handle it */
@@ -550,6 +591,69 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
        return 0;
 }
 
+#ifdef CONFIG_X86_64
+#define SAVE_REGS_STRING               \
+       /* Skip cs, ip, orig_ax. */     \
+       "       subq $24, %rsp\n"       \
+       "       pushq %rdi\n"           \
+       "       pushq %rsi\n"           \
+       "       pushq %rdx\n"           \
+       "       pushq %rcx\n"           \
+       "       pushq %rax\n"           \
+       "       pushq %r8\n"            \
+       "       pushq %r9\n"            \
+       "       pushq %r10\n"           \
+       "       pushq %r11\n"           \
+       "       pushq %rbx\n"           \
+       "       pushq %rbp\n"           \
+       "       pushq %r12\n"           \
+       "       pushq %r13\n"           \
+       "       pushq %r14\n"           \
+       "       pushq %r15\n"
+#define RESTORE_REGS_STRING            \
+       "       popq %r15\n"            \
+       "       popq %r14\n"            \
+       "       popq %r13\n"            \
+       "       popq %r12\n"            \
+       "       popq %rbp\n"            \
+       "       popq %rbx\n"            \
+       "       popq %r11\n"            \
+       "       popq %r10\n"            \
+       "       popq %r9\n"             \
+       "       popq %r8\n"             \
+       "       popq %rax\n"            \
+       "       popq %rcx\n"            \
+       "       popq %rdx\n"            \
+       "       popq %rsi\n"            \
+       "       popq %rdi\n"            \
+       /* Skip orig_ax, ip, cs */      \
+       "       addq $24, %rsp\n"
+#else
+#define SAVE_REGS_STRING               \
+       /* Skip cs, ip, orig_ax and gs. */      \
+       "       subl $16, %esp\n"       \
+       "       pushl %fs\n"            \
+       "       pushl %ds\n"            \
+       "       pushl %es\n"            \
+       "       pushl %eax\n"           \
+       "       pushl %ebp\n"           \
+       "       pushl %edi\n"           \
+       "       pushl %esi\n"           \
+       "       pushl %edx\n"           \
+       "       pushl %ecx\n"           \
+       "       pushl %ebx\n"
+#define RESTORE_REGS_STRING            \
+       "       popl %ebx\n"            \
+       "       popl %ecx\n"            \
+       "       popl %edx\n"            \
+       "       popl %esi\n"            \
+       "       popl %edi\n"            \
+       "       popl %ebp\n"            \
+       "       popl %eax\n"            \
+       /* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\
+       "       addl $24, %esp\n"
+#endif
+
 /*
  * When a retprobed function returns, this code saves registers and
  * calls trampoline_handler() runs, which calls the kretprobe's handler.
@@ -563,65 +667,16 @@ static void __used __kprobes kretprobe_trampoline_holder(void)
                        /* We don't bother saving the ss register */
                        "       pushq %rsp\n"
                        "       pushfq\n"
-                       /*
-                        * Skip cs, ip, orig_ax.
-                        * trampoline_handler() will plug in these values
-                        */
-                       "       subq $24, %rsp\n"
-                       "       pushq %rdi\n"
-                       "       pushq %rsi\n"
-                       "       pushq %rdx\n"
-                       "       pushq %rcx\n"
-                       "       pushq %rax\n"
-                       "       pushq %r8\n"
-                       "       pushq %r9\n"
-                       "       pushq %r10\n"
-                       "       pushq %r11\n"
-                       "       pushq %rbx\n"
-                       "       pushq %rbp\n"
-                       "       pushq %r12\n"
-                       "       pushq %r13\n"
-                       "       pushq %r14\n"
-                       "       pushq %r15\n"
+                       SAVE_REGS_STRING
                        "       movq %rsp, %rdi\n"
                        "       call trampoline_handler\n"
                        /* Replace saved sp with true return address. */
                        "       movq %rax, 152(%rsp)\n"
-                       "       popq %r15\n"
-                       "       popq %r14\n"
-                       "       popq %r13\n"
-                       "       popq %r12\n"
-                       "       popq %rbp\n"
-                       "       popq %rbx\n"
-                       "       popq %r11\n"
-                       "       popq %r10\n"
-                       "       popq %r9\n"
-                       "       popq %r8\n"
-                       "       popq %rax\n"
-                       "       popq %rcx\n"
-                       "       popq %rdx\n"
-                       "       popq %rsi\n"
-                       "       popq %rdi\n"
-                       /* Skip orig_ax, ip, cs */
-                       "       addq $24, %rsp\n"
+                       RESTORE_REGS_STRING
                        "       popfq\n"
 #else
                        "       pushf\n"
-                       /*
-                        * Skip cs, ip, orig_ax and gs.
-                        * trampoline_handler() will plug in these values
-                        */
-                       "       subl $16, %esp\n"
-                       "       pushl %fs\n"
-                       "       pushl %es\n"
-                       "       pushl %ds\n"
-                       "       pushl %eax\n"
-                       "       pushl %ebp\n"
-                       "       pushl %edi\n"
-                       "       pushl %esi\n"
-                       "       pushl %edx\n"
-                       "       pushl %ecx\n"
-                       "       pushl %ebx\n"
+                       SAVE_REGS_STRING
                        "       movl %esp, %eax\n"
                        "       call trampoline_handler\n"
                        /* Move flags to cs */
@@ -629,15 +684,7 @@ static void __used __kprobes kretprobe_trampoline_holder(void)
                        "       movl %edx, 52(%esp)\n"
                        /* Replace saved flags with true return address. */
                        "       movl %eax, 56(%esp)\n"
-                       "       popl %ebx\n"
-                       "       popl %ecx\n"
-                       "       popl %edx\n"
-                       "       popl %esi\n"
-                       "       popl %edi\n"
-                       "       popl %ebp\n"
-                       "       popl %eax\n"
-                       /* Skip ds, es, fs, gs, orig_ax and ip */
-                       "       addl $24, %esp\n"
+                       RESTORE_REGS_STRING
                        "       popf\n"
 #endif
                        "       ret\n");
@@ -805,8 +852,8 @@ static void __kprobes resume_execution(struct kprobe *p,
                         * These instructions can be executed directly if it
                         * jumps back to correct address.
                         */
-                       set_jmp_op((void *)regs->ip,
-                                  (void *)orig_ip + (regs->ip - copy_ip));
+                       synthesize_reljump((void *)regs->ip,
+                               (void *)orig_ip + (regs->ip - copy_ip));
                        p->ainsn.boostable = 1;
                } else {
                        p->ainsn.boostable = -1;
@@ -1033,6 +1080,358 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
        return 0;
 }
 
+
+#ifdef CONFIG_OPTPROBES
+
+/* Insert a call instruction at address 'from', which calls address 'to'.*/
+static void __kprobes synthesize_relcall(void *from, void *to)
+{
+       __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE);
+}
+
+/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */
+static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr,
+                                         unsigned long val)
+{
+#ifdef CONFIG_X86_64
+       *addr++ = 0x48;
+       *addr++ = 0xbf;
+#else
+       *addr++ = 0xb8;
+#endif
+       *(unsigned long *)addr = val;
+}
+
+void __kprobes kprobes_optinsn_template_holder(void)
+{
+       asm volatile (
+                       ".global optprobe_template_entry\n"
+                       "optprobe_template_entry: \n"
+#ifdef CONFIG_X86_64
+                       /* We don't bother saving the ss register */
+                       "       pushq %rsp\n"
+                       "       pushfq\n"
+                       SAVE_REGS_STRING
+                       "       movq %rsp, %rsi\n"
+                       ".global optprobe_template_val\n"
+                       "optprobe_template_val: \n"
+                       ASM_NOP5
+                       ASM_NOP5
+                       ".global optprobe_template_call\n"
+                       "optprobe_template_call: \n"
+                       ASM_NOP5
+                       /* Move flags to rsp */
+                       "       movq 144(%rsp), %rdx\n"
+                       "       movq %rdx, 152(%rsp)\n"
+                       RESTORE_REGS_STRING
+                       /* Skip flags entry */
+                       "       addq $8, %rsp\n"
+                       "       popfq\n"
+#else /* CONFIG_X86_32 */
+                       "       pushf\n"
+                       SAVE_REGS_STRING
+                       "       movl %esp, %edx\n"
+                       ".global optprobe_template_val\n"
+                       "optprobe_template_val: \n"
+                       ASM_NOP5
+                       ".global optprobe_template_call\n"
+                       "optprobe_template_call: \n"
+                       ASM_NOP5
+                       RESTORE_REGS_STRING
+                       "       addl $4, %esp\n"        /* skip cs */
+                       "       popf\n"
+#endif
+                       ".global optprobe_template_end\n"
+                       "optprobe_template_end: \n");
+}
+
+#define TMPL_MOVE_IDX \
+       ((long)&optprobe_template_val - (long)&optprobe_template_entry)
+#define TMPL_CALL_IDX \
+       ((long)&optprobe_template_call - (long)&optprobe_template_entry)
+#define TMPL_END_IDX \
+       ((long)&optprobe_template_end - (long)&optprobe_template_entry)
+
+#define INT3_SIZE sizeof(kprobe_opcode_t)
+
+/* Optimized kprobe call back function: called from optinsn */
+static void __kprobes optimized_callback(struct optimized_kprobe *op,
+                                        struct pt_regs *regs)
+{
+       struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+       preempt_disable();
+       if (kprobe_running()) {
+               kprobes_inc_nmissed_count(&op->kp);
+       } else {
+               /* Save skipped registers */
+#ifdef CONFIG_X86_64
+               regs->cs = __KERNEL_CS;
+#else
+               regs->cs = __KERNEL_CS | get_kernel_rpl();
+               regs->gs = 0;
+#endif
+               regs->ip = (unsigned long)op->kp.addr + INT3_SIZE;
+               regs->orig_ax = ~0UL;
+
+               __get_cpu_var(current_kprobe) = &op->kp;
+               kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+               opt_pre_handler(&op->kp, regs);
+               __get_cpu_var(current_kprobe) = NULL;
+       }
+       preempt_enable_no_resched();
+}
+
+static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
+{
+       int len = 0, ret;
+
+       while (len < RELATIVEJUMP_SIZE) {
+               ret = __copy_instruction(dest + len, src + len, 1);
+               if (!ret || !can_boost(dest + len))
+                       return -EINVAL;
+               len += ret;
+       }
+       /* Check whether the address range is reserved */
+       if (ftrace_text_reserved(src, src + len - 1) ||
+           alternatives_text_reserved(src, src + len - 1))
+               return -EBUSY;
+
+       return len;
+}
+
+/* Check whether insn is indirect jump */
+static int __kprobes insn_is_indirect_jump(struct insn *insn)
+{
+       return ((insn->opcode.bytes[0] == 0xff &&
+               (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */
+               insn->opcode.bytes[0] == 0xea); /* Segment based jump */
+}
+
+/* Check whether insn jumps into specified address range */
+static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
+{
+       unsigned long target = 0;
+
+       switch (insn->opcode.bytes[0]) {
+       case 0xe0:      /* loopne */
+       case 0xe1:      /* loope */
+       case 0xe2:      /* loop */
+       case 0xe3:      /* jcxz */
+       case 0xe9:      /* near relative jump */
+       case 0xeb:      /* short relative jump */
+               break;
+       case 0x0f:
+               if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */
+                       break;
+               return 0;
+       default:
+               if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */
+                       break;
+               return 0;
+       }
+       target = (unsigned long)insn->next_byte + insn->immediate.value;
+
+       return (start <= target && target <= start + len);
+}
+
+/* Decode whole function to ensure any instructions don't jump into target */
+static int __kprobes can_optimize(unsigned long paddr)
+{
+       int ret;
+       unsigned long addr, size = 0, offset = 0;
+       struct insn insn;
+       kprobe_opcode_t buf[MAX_INSN_SIZE];
+       /* Dummy buffers for lookup_symbol_attrs */
+       static char __dummy_buf[KSYM_NAME_LEN];
+
+       /* Lookup symbol including addr */
+       if (!kallsyms_lookup(paddr, &size, &offset, NULL, __dummy_buf))
+               return 0;
+
+       /* Check there is enough space for a relative jump. */
+       if (size - offset < RELATIVEJUMP_SIZE)
+               return 0;
+
+       /* Decode instructions */
+       addr = paddr - offset;
+       while (addr < paddr - offset + size) { /* Decode until function end */
+               if (search_exception_tables(addr))
+                       /*
+                        * Since some fixup code will jumps into this function,
+                        * we can't optimize kprobe in this function.
+                        */
+                       return 0;
+               kernel_insn_init(&insn, (void *)addr);
+               insn_get_opcode(&insn);
+               if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
+                       ret = recover_probed_instruction(buf, addr);
+                       if (ret)
+                               return 0;
+                       kernel_insn_init(&insn, buf);
+               }
+               insn_get_length(&insn);
+               /* Recover address */
+               insn.kaddr = (void *)addr;
+               insn.next_byte = (void *)(addr + insn.length);
+               /* Check any instructions don't jump into target */
+               if (insn_is_indirect_jump(&insn) ||
+                   insn_jump_into_range(&insn, paddr + INT3_SIZE,
+                                        RELATIVE_ADDR_SIZE))
+                       return 0;
+               addr += insn.length;
+       }
+
+       return 1;
+}
+
+/* Check optimized_kprobe can actually be optimized. */
+int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op)
+{
+       int i;
+       struct kprobe *p;
+
+       for (i = 1; i < op->optinsn.size; i++) {
+               p = get_kprobe(op->kp.addr + i);
+               if (p && !kprobe_disabled(p))
+                       return -EEXIST;
+       }
+
+       return 0;
+}
+
+/* Check the addr is within the optimized instructions. */
+int __kprobes arch_within_optimized_kprobe(struct optimized_kprobe *op,
+                                          unsigned long addr)
+{
+       return ((unsigned long)op->kp.addr <= addr &&
+               (unsigned long)op->kp.addr + op->optinsn.size > addr);
+}
+
+/* Free optimized instruction slot */
+static __kprobes
+void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
+{
+       if (op->optinsn.insn) {
+               free_optinsn_slot(op->optinsn.insn, dirty);
+               op->optinsn.insn = NULL;
+               op->optinsn.size = 0;
+       }
+}
+
+void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op)
+{
+       __arch_remove_optimized_kprobe(op, 1);
+}
+
+/*
+ * Copy replacing target instructions
+ * Target instructions MUST be relocatable (checked inside)
+ */
+int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
+{
+       u8 *buf;
+       int ret;
+       long rel;
+
+       if (!can_optimize((unsigned long)op->kp.addr))
+               return -EILSEQ;
+
+       op->optinsn.insn = get_optinsn_slot();
+       if (!op->optinsn.insn)
+               return -ENOMEM;
+
+       /*
+        * Verify if the address gap is in 2GB range, because this uses
+        * a relative jump.
+        */
+       rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE;
+       if (abs(rel) > 0x7fffffff)
+               return -ERANGE;
+
+       buf = (u8 *)op->optinsn.insn;
+
+       /* Copy instructions into the out-of-line buffer */
+       ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr);
+       if (ret < 0) {
+               __arch_remove_optimized_kprobe(op, 0);
+               return ret;
+       }
+       op->optinsn.size = ret;
+
+       /* Copy arch-dep-instance from template */
+       memcpy(buf, &optprobe_template_entry, TMPL_END_IDX);
+
+       /* Set probe information */
+       synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op);
+
+       /* Set probe function call */
+       synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback);
+
+       /* Set returning jmp instruction at the tail of out-of-line buffer */
+       synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size,
+                          (u8 *)op->kp.addr + op->optinsn.size);
+
+       flush_icache_range((unsigned long) buf,
+                          (unsigned long) buf + TMPL_END_IDX +
+                          op->optinsn.size + RELATIVEJUMP_SIZE);
+       return 0;
+}
+
+/* Replace a breakpoint (int3) with a relative jump.  */
+int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op)
+{
+       unsigned char jmp_code[RELATIVEJUMP_SIZE];
+       s32 rel = (s32)((long)op->optinsn.insn -
+                       ((long)op->kp.addr + RELATIVEJUMP_SIZE));
+
+       /* Backup instructions which will be replaced by jump address */
+       memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
+              RELATIVE_ADDR_SIZE);
+
+       jmp_code[0] = RELATIVEJUMP_OPCODE;
+       *(s32 *)(&jmp_code[1]) = rel;
+
+       /*
+        * text_poke_smp doesn't support NMI/MCE code modifying.
+        * However, since kprobes itself also doesn't support NMI/MCE
+        * code probing, it's not a problem.
+        */
+       text_poke_smp(op->kp.addr, jmp_code, RELATIVEJUMP_SIZE);
+       return 0;
+}
+
+/* Replace a relative jump with a breakpoint (int3).  */
+void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op)
+{
+       u8 buf[RELATIVEJUMP_SIZE];
+
+       /* Set int3 to first byte for kprobes */
+       buf[0] = BREAKPOINT_INSTRUCTION;
+       memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
+       text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE);
+}
+
+static int  __kprobes setup_detour_execution(struct kprobe *p,
+                                            struct pt_regs *regs,
+                                            int reenter)
+{
+       struct optimized_kprobe *op;
+
+       if (p->flags & KPROBE_FLAG_OPTIMIZED) {
+               /* This kprobe is really able to run optimized path. */
+               op = container_of(p, struct optimized_kprobe, kp);
+               /* Detour through copied instructions */
+               regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX;
+               if (!reenter)
+                       reset_current_kprobe();
+               preempt_enable_no_resched();
+               return 1;
+       }
+       return 0;
+}
+#endif
+
 int __init arch_init_kprobes(void)
 {
        return 0;
index 33a4ff4..b8c59b8 100644 (file)
@@ -78,7 +78,6 @@ int crypto_hash_walk_done(struct crypto_hash_walk *walk, int err)
        walk->data -= walk->offset;
 
        if (nbytes && walk->offset & alignmask && !err) {
-               walk->offset += alignmask - 1;
                walk->offset = ALIGN(walk->offset, alignmask + 1);
                walk->data += walk->offset;
 
index 1887090..2bb7348 100644 (file)
@@ -386,11 +386,13 @@ static int crypto_authenc_encrypt(struct aead_request *req)
 {
        struct crypto_aead *authenc = crypto_aead_reqtfm(req);
        struct crypto_authenc_ctx *ctx = crypto_aead_ctx(authenc);
-       struct ablkcipher_request *abreq = aead_request_ctx(req);
+       struct authenc_request_ctx *areq_ctx = aead_request_ctx(req);
        struct crypto_ablkcipher *enc = ctx->enc;
        struct scatterlist *dst = req->dst;
        unsigned int cryptlen = req->cryptlen;
-       u8 *iv = (u8 *)(abreq + 1) + crypto_ablkcipher_reqsize(enc);
+       struct ablkcipher_request *abreq = (void *)(areq_ctx->tail
+                                                   + ctx->reqoff);
+       u8 *iv = (u8 *)abreq - crypto_ablkcipher_ivsize(enc);
        int err;
 
        ablkcipher_request_set_tfm(abreq, enc);
@@ -454,7 +456,7 @@ static int crypto_authenc_verify(struct aead_request *req,
        unsigned int authsize;
 
        areq_ctx->complete = authenc_verify_ahash_done;
-       areq_ctx->complete = authenc_verify_ahash_update_done;
+       areq_ctx->update_complete = authenc_verify_ahash_update_done;
 
        ohash = authenc_ahash_fn(req, CRYPTO_TFM_REQ_MAY_SLEEP);
        if (IS_ERR(ohash))
@@ -546,10 +548,6 @@ static int crypto_authenc_init_tfm(struct crypto_tfm *tfm)
        if (IS_ERR(auth))
                return PTR_ERR(auth);
 
-       ctx->reqoff = ALIGN(2 * crypto_ahash_digestsize(auth) +
-                           crypto_ahash_alignmask(auth),
-                           crypto_ahash_alignmask(auth) + 1);
-
        enc = crypto_spawn_skcipher(&ictx->enc);
        err = PTR_ERR(enc);
        if (IS_ERR(enc))
@@ -558,13 +556,18 @@ static int crypto_authenc_init_tfm(struct crypto_tfm *tfm)
        ctx->auth = auth;
        ctx->enc = enc;
 
-       tfm->crt_aead.reqsize = max_t(unsigned int,
-                               crypto_ahash_reqsize(auth) + ctx->reqoff +
-                               sizeof(struct authenc_request_ctx) +
+       ctx->reqoff = ALIGN(2 * crypto_ahash_digestsize(auth) +
+                           crypto_ahash_alignmask(auth),
+                           crypto_ahash_alignmask(auth) + 1) +
+                     crypto_ablkcipher_ivsize(enc);
+
+       tfm->crt_aead.reqsize = sizeof(struct authenc_request_ctx) +
+                               ctx->reqoff +
+                               max_t(unsigned int,
+                               crypto_ahash_reqsize(auth) +
                                sizeof(struct ahash_request),
                                sizeof(struct skcipher_givcrypt_request) +
-                               crypto_ablkcipher_reqsize(enc) +
-                               crypto_ablkcipher_ivsize(enc));
+                               crypto_ablkcipher_reqsize(enc));
 
        return 0;
 
index 9fda213..30efc7d 100644 (file)
@@ -234,6 +234,7 @@ static struct shash_alg alg = {
        .export         =       md5_export,
        .import         =       md5_import,
        .descsize       =       sizeof(struct md5_state),
+       .statesize      =       sizeof(struct md5_state),
        .base           =       {
                .cra_name       =       "md5",
                .cra_flags      =       CRYPTO_ALG_TYPE_SHASH,
index 22bc743..d2f37a5 100644 (file)
@@ -97,8 +97,8 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
                /* If checksum is bad mark all blocks used to prevent allocation
                 * essentially implementing a per-group read-only flag. */
                if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
-                       ext4_error(sb, __func__,
-                                 "Checksum bad for group %u", block_group);
+                       ext4_error(sb, "Checksum bad for group %u",
+                                       block_group);
                        ext4_free_blks_set(sb, gdp, 0);
                        ext4_free_inodes_set(sb, gdp, 0);
                        ext4_itable_unused_set(sb, gdp, 0);
@@ -130,8 +130,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
                 * to make sure we calculate the right free blocks
                 */
                group_blocks = ext4_blocks_count(sbi->s_es) -
-                       le32_to_cpu(sbi->s_es->s_first_data_block) -
-                       (EXT4_BLOCKS_PER_GROUP(sb) * (ngroups - 1));
+                       ext4_group_first_block_no(sb, ngroups - 1);
        } else {
                group_blocks = EXT4_BLOCKS_PER_GROUP(sb);
        }
@@ -189,9 +188,6 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
  * when a file system is mounted (see ext4_fill_super).
  */
 
-
-#define in_range(b, first, len)        ((b) >= (first) && (b) <= (first) + (len) - 1)
-
 /**
  * ext4_get_group_desc() -- load group descriptor from disk
  * @sb:                        super block
@@ -210,10 +206,8 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
        struct ext4_sb_info *sbi = EXT4_SB(sb);
 
        if (block_group >= ngroups) {
-               ext4_error(sb, "ext4_get_group_desc",
-                          "block_group >= groups_count - "
-                          "block_group = %u, groups_count = %u",
-                          block_group, ngroups);
+               ext4_error(sb, "block_group >= groups_count - block_group = %u,"
+                          " groups_count = %u", block_group, ngroups);
 
                return NULL;
        }
@@ -221,8 +215,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
        group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
        offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
        if (!sbi->s_group_desc[group_desc]) {
-               ext4_error(sb, "ext4_get_group_desc",
-                          "Group descriptor not loaded - "
+               ext4_error(sb, "Group descriptor not loaded - "
                           "block_group = %u, group_desc = %u, desc = %u",
                           block_group, group_desc, offset);
                return NULL;
@@ -282,9 +275,7 @@ static int ext4_valid_block_bitmap(struct super_block *sb,
                return 1;
 
 err_out:
-       ext4_error(sb, __func__,
-                       "Invalid block bitmap - "
-                       "block_group = %d, block = %llu",
+       ext4_error(sb, "Invalid block bitmap - block_group = %d, block = %llu",
                        block_group, bitmap_blk);
        return 0;
 }
@@ -311,8 +302,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
        bitmap_blk = ext4_block_bitmap(sb, desc);
        bh = sb_getblk(sb, bitmap_blk);
        if (unlikely(!bh)) {
-               ext4_error(sb, __func__,
-                           "Cannot read block bitmap - "
+               ext4_error(sb, "Cannot read block bitmap - "
                            "block_group = %u, block_bitmap = %llu",
                            block_group, bitmap_blk);
                return NULL;
@@ -354,8 +344,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
        set_bitmap_uptodate(bh);
        if (bh_submit_read(bh) < 0) {
                put_bh(bh);
-               ext4_error(sb, __func__,
-                           "Cannot read block bitmap - "
+               ext4_error(sb, "Cannot read block bitmap - "
                            "block_group = %u, block_bitmap = %llu",
                            block_group, bitmap_blk);
                return NULL;
@@ -419,8 +408,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
            in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
            in_range(block + count - 1, ext4_inode_table(sb, desc),
                     sbi->s_itb_per_group)) {
-               ext4_error(sb, __func__,
-                          "Adding blocks in system zones - "
+               ext4_error(sb, "Adding blocks in system zones - "
                           "Block = %llu, count = %lu",
                           block, count);
                goto error_return;
@@ -453,8 +441,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
                BUFFER_TRACE(bitmap_bh, "clear bit");
                if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
                                                bit + i, bitmap_bh->b_data)) {
-                       ext4_error(sb, __func__,
-                                  "bit already cleared for block %llu",
+                       ext4_error(sb, "bit already cleared for block %llu",
                                   (ext4_fsblk_t)(block + i));
                        BUFFER_TRACE(bitmap_bh, "bit already cleared");
                } else {
index a60ab9a..983f0e1 100644 (file)
@@ -205,14 +205,14 @@ void ext4_release_system_zone(struct super_block *sb)
                entry = rb_entry(n, struct ext4_system_zone, node);
                kmem_cache_free(ext4_system_zone_cachep, entry);
                if (!parent)
-                       EXT4_SB(sb)->system_blks.rb_node = NULL;
+                       EXT4_SB(sb)->system_blks = RB_ROOT;
                else if (parent->rb_left == n)
                        parent->rb_left = NULL;
                else if (parent->rb_right == n)
                        parent->rb_right = NULL;
                n = parent;
        }
-       EXT4_SB(sb)->system_blks.rb_node = NULL;
+       EXT4_SB(sb)->system_blks = RB_ROOT;
 }
 
 /*
index 9dc9316..86cb6d8 100644 (file)
@@ -83,10 +83,12 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
                error_msg = "inode out of bounds";
 
        if (error_msg != NULL)
-               ext4_error(dir->i_sb, function,
-                       "bad entry in directory #%lu: %s - "
-                       "offset=%u, inode=%u, rec_len=%d, name_len=%d",
-                       dir->i_ino, error_msg, offset,
+               __ext4_error(dir->i_sb, function,
+                       "bad entry in directory #%lu: %s - block=%llu"
+                       "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d",
+                       dir->i_ino, error_msg, 
+                       (unsigned long long) bh->b_blocknr,     
+                       (unsigned) (offset%bh->b_size), offset,
                        le32_to_cpu(de->inode),
                        rlen, de->name_len);
        return error_msg == NULL ? 1 : 0;
@@ -150,7 +152,7 @@ static int ext4_readdir(struct file *filp,
                 */
                if (!bh) {
                        if (!dir_has_error) {
-                               ext4_error(sb, __func__, "directory #%lu "
+                               ext4_error(sb, "directory #%lu "
                                           "contains a hole at offset %Lu",
                                           inode->i_ino,
                                           (unsigned long long) filp->f_pos);
@@ -303,7 +305,7 @@ static void free_rb_tree_fname(struct rb_root *root)
                        kfree(old);
                }
                if (!parent)
-                       root->rb_node = NULL;
+                       *root = RB_ROOT;
                else if (parent->rb_left == n)
                        parent->rb_left = NULL;
                else if (parent->rb_right == n)
index 50af1a2..bf938cf 100644 (file)
 #define ext4_debug(f, a...)    do {} while (0)
 #endif
 
+#define EXT4_ERROR_INODE(inode, fmt, a...) \
+       ext4_error_inode(__func__, (inode), (fmt), ## a);
+
+#define EXT4_ERROR_FILE(file, fmt, a...)       \
+       ext4_error_file(__func__, (file), (fmt), ## a);
+
 /* data type for block offset of block group */
 typedef int ext4_grpblk_t;
 
@@ -133,14 +139,14 @@ struct mpage_da_data {
        int pages_written;
        int retval;
 };
-#define        DIO_AIO_UNWRITTEN       0x1
+#define        EXT4_IO_UNWRITTEN       0x1
 typedef struct ext4_io_end {
        struct list_head        list;           /* per-file finished AIO list */
        struct inode            *inode;         /* file being written to */
        unsigned int            flag;           /* unwritten or not */
-       int                     error;          /* I/O error code */
-       ext4_lblk_t             offset;         /* offset in the file */
-       size_t                  size;           /* size of the extent */
+       struct page             *page;          /* page struct for buffer write */
+       loff_t                  offset;         /* offset in the file */
+       ssize_t                 size;           /* size of the extent */
        struct work_struct      work;           /* data work queue */
 } ext4_io_end_t;
 
@@ -284,10 +290,12 @@ struct flex_groups {
 #define EXT4_TOPDIR_FL                 0x00020000 /* Top of directory hierarchies*/
 #define EXT4_HUGE_FILE_FL               0x00040000 /* Set to each huge file */
 #define EXT4_EXTENTS_FL                        0x00080000 /* Inode uses extents */
+#define EXT4_EA_INODE_FL               0x00200000 /* Inode used for large EA */
+#define EXT4_EOFBLOCKS_FL              0x00400000 /* Blocks allocated beyond EOF */
 #define EXT4_RESERVED_FL               0x80000000 /* reserved for ext4 lib */
 
-#define EXT4_FL_USER_VISIBLE           0x000BDFFF /* User visible flags */
-#define EXT4_FL_USER_MODIFIABLE                0x000B80FF /* User modifiable flags */
+#define EXT4_FL_USER_VISIBLE           0x004BDFFF /* User visible flags */
+#define EXT4_FL_USER_MODIFIABLE                0x004B80FF /* User modifiable flags */
 
 /* Flags that should be inherited by new inodes from their parent. */
 #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
@@ -313,17 +321,6 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
                return flags & EXT4_OTHER_FLMASK;
 }
 
-/*
- * Inode dynamic state flags
- */
-#define EXT4_STATE_JDATA               0x00000001 /* journaled data exists */
-#define EXT4_STATE_NEW                 0x00000002 /* inode is newly created */
-#define EXT4_STATE_XATTR               0x00000004 /* has in-inode xattrs */
-#define EXT4_STATE_NO_EXPAND           0x00000008 /* No space for expansion */
-#define EXT4_STATE_DA_ALLOC_CLOSE      0x00000010 /* Alloc DA blks on close */
-#define EXT4_STATE_EXT_MIGRATE         0x00000020 /* Inode is migrating */
-#define EXT4_STATE_DIO_UNWRITTEN       0x00000040 /* need convert on dio done*/
-
 /* Used to pass group descriptor data when online resize is done */
 struct ext4_new_group_input {
        __u32 group;            /* Group number for this data */
@@ -364,19 +361,20 @@ struct ext4_new_group_data {
        /* caller is from the direct IO path, request to creation of an
        unitialized extents if not allocated, split the uninitialized
        extent if blocks has been preallocated already*/
-#define EXT4_GET_BLOCKS_DIO                    0x0008
+#define EXT4_GET_BLOCKS_PRE_IO                 0x0008
 #define EXT4_GET_BLOCKS_CONVERT                        0x0010
-#define EXT4_GET_BLOCKS_DIO_CREATE_EXT         (EXT4_GET_BLOCKS_DIO|\
+#define EXT4_GET_BLOCKS_IO_CREATE_EXT          (EXT4_GET_BLOCKS_PRE_IO|\
+                                        EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
+       /* Convert extent to initialized after IO complete */
+#define EXT4_GET_BLOCKS_IO_CONVERT_EXT         (EXT4_GET_BLOCKS_CONVERT|\
                                         EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
-       /* Convert extent to initialized after direct IO complete */
-#define EXT4_GET_BLOCKS_DIO_CONVERT_EXT                (EXT4_GET_BLOCKS_CONVERT|\
-                                        EXT4_GET_BLOCKS_DIO_CREATE_EXT)
 
 /*
  * Flags used by ext4_free_blocks
  */
 #define EXT4_FREE_BLOCKS_METADATA      0x0001
 #define EXT4_FREE_BLOCKS_FORGET                0x0002
+#define EXT4_FREE_BLOCKS_VALIDATED     0x0004
 
 /*
  * ioctl commands
@@ -630,7 +628,7 @@ struct ext4_inode_info {
         * near to their parent directory's inode.
         */
        ext4_group_t    i_block_group;
-       __u32   i_state;                /* Dynamic state flags for ext4 */
+       unsigned long   i_state_flags;          /* Dynamic state flags */
 
        ext4_lblk_t             i_dir_start_lookup;
 #ifdef CONFIG_EXT4_FS_XATTR
@@ -708,8 +706,9 @@ struct ext4_inode_info {
        qsize_t i_reserved_quota;
 #endif
 
-       /* completed async DIOs that might need unwritten extents handling */
-       struct list_head i_aio_dio_complete_list;
+       /* completed IOs that might need unwritten extents handling */
+       struct list_head i_completed_io_list;
+       spinlock_t i_completed_io_lock;
        /* current io_end structure for async DIO write*/
        ext4_io_end_t *cur_aio_dio;
 
@@ -760,6 +759,7 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_QUOTA               0x80000 /* Some quota option set */
 #define EXT4_MOUNT_USRQUOTA            0x100000 /* "old" user quota */
 #define EXT4_MOUNT_GRPQUOTA            0x200000 /* "old" group quota */
+#define EXT4_MOUNT_DIOREAD_NOLOCK      0x400000 /* Enable support for dio read nolocking */
 #define EXT4_MOUNT_JOURNAL_CHECKSUM    0x800000 /* Journal checksums */
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT        0x1000000 /* Journal Async Commit */
 #define EXT4_MOUNT_I_VERSION            0x2000000 /* i_version support */
@@ -1050,6 +1050,34 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
                (ino >= EXT4_FIRST_INO(sb) &&
                 ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
 }
+
+/*
+ * Inode dynamic state flags
+ */
+enum {
+       EXT4_STATE_JDATA,               /* journaled data exists */
+       EXT4_STATE_NEW,                 /* inode is newly created */
+       EXT4_STATE_XATTR,               /* has in-inode xattrs */
+       EXT4_STATE_NO_EXPAND,           /* No space for expansion */
+       EXT4_STATE_DA_ALLOC_CLOSE,      /* Alloc DA blks on close */
+       EXT4_STATE_EXT_MIGRATE,         /* Inode is migrating */
+       EXT4_STATE_DIO_UNWRITTEN,       /* need convert on dio done*/
+};
+
+static inline int ext4_test_inode_state(struct inode *inode, int bit)
+{
+       return test_bit(bit, &EXT4_I(inode)->i_state_flags);
+}
+
+static inline void ext4_set_inode_state(struct inode *inode, int bit)
+{
+       set_bit(bit, &EXT4_I(inode)->i_state_flags);
+}
+
+static inline void ext4_clear_inode_state(struct inode *inode, int bit)
+{
+       clear_bit(bit, &EXT4_I(inode)->i_state_flags);
+}
 #else
 /* Assume that user mode programs are passing in an ext4fs superblock, not
  * a kernel struct super_block.  This will allow us to call the feature-test
@@ -1126,6 +1154,8 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 #define EXT4_FEATURE_INCOMPAT_64BIT            0x0080
 #define EXT4_FEATURE_INCOMPAT_MMP               0x0100
 #define EXT4_FEATURE_INCOMPAT_FLEX_BG          0x0200
+#define EXT4_FEATURE_INCOMPAT_EA_INODE         0x0400 /* EA in inode */
+#define EXT4_FEATURE_INCOMPAT_DIRDATA          0x1000 /* data in dirent */
 
 #define EXT4_FEATURE_COMPAT_SUPP       EXT2_FEATURE_COMPAT_EXT_ATTR
 #define EXT4_FEATURE_INCOMPAT_SUPP     (EXT4_FEATURE_INCOMPAT_FILETYPE| \
@@ -1439,7 +1469,7 @@ extern int ext4_block_truncate_page(handle_t *handle,
                struct address_space *mapping, loff_t from);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
-extern int flush_aio_dio_completed_IO(struct inode *inode);
+extern int flush_completed_IO(struct inode *inode);
 extern void ext4_da_update_reserve_space(struct inode *inode,
                                        int used, int quota_claim);
 /* ioctl.c */
@@ -1465,13 +1495,20 @@ extern int ext4_group_extend(struct super_block *sb,
                                ext4_fsblk_t n_blocks_count);
 
 /* super.c */
-extern void ext4_error(struct super_block *, const char *, const char *, ...)
+extern void __ext4_error(struct super_block *, const char *, const char *, ...)
+       __attribute__ ((format (printf, 3, 4)));
+#define ext4_error(sb, message...)     __ext4_error(sb, __func__, ## message)
+extern void ext4_error_inode(const char *, struct inode *, const char *, ...)
+       __attribute__ ((format (printf, 3, 4)));
+extern void ext4_error_file(const char *, struct file *, const char *, ...)
        __attribute__ ((format (printf, 3, 4)));
 extern void __ext4_std_error(struct super_block *, const char *, int);
 extern void ext4_abort(struct super_block *, const char *, const char *, ...)
        __attribute__ ((format (printf, 3, 4)));
-extern void ext4_warning(struct super_block *, const char *, const char *, ...)
+extern void __ext4_warning(struct super_block *, const char *,
+                         const char *, ...)
        __attribute__ ((format (printf, 3, 4)));
+#define ext4_warning(sb, message...)   __ext4_warning(sb, __func__, ## message)
 extern void ext4_msg(struct super_block *, const char *, const char *, ...)
        __attribute__ ((format (printf, 3, 4)));
 extern void ext4_grp_locked_error(struct super_block *, ext4_group_t,
@@ -1744,7 +1781,7 @@ extern void ext4_ext_release(struct super_block *);
 extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
                          loff_t len);
 extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
-                         loff_t len);
+                         ssize_t len);
 extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
                           sector_t block, unsigned int max_blocks,
                           struct buffer_head *bh, int flags);
@@ -1756,6 +1793,15 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
                             __u64 len, __u64 *moved_len);
 
 
+/* BH_Uninit flag: blocks are allocated but uninitialized on disk */
+enum ext4_state_bits {
+       BH_Uninit       /* blocks are allocated but uninitialized on disk */
+         = BH_JBDPrivateStart,
+};
+
+BUFFER_FNS(Uninit, uninit)
+TAS_BUFFER_FNS(Uninit, uninit)
+
 /*
  * Add new method to test wether block and inode bitmaps are properly
  * initialized. With uninit_bg reading the block from disk is not enough
@@ -1773,6 +1819,8 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
        set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
 }
 
+#define in_range(b, first, len)        ((b) >= (first) && (b) <= (first) + (len) - 1)
+
 #endif /* __KERNEL__ */
 
 #endif /* _EXT4_H */
index b57e5c7..53d2764 100644 (file)
@@ -125,14 +125,14 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
                        ext4_journal_abort_handle(where, __func__, bh,
                                                  handle, err);
        } else {
-               if (inode && bh)
+               if (inode)
                        mark_buffer_dirty_inode(bh, inode);
                else
                        mark_buffer_dirty(bh);
                if (inode && inode_needs_sync(inode)) {
                        sync_dirty_buffer(bh);
                        if (buffer_req(bh) && !buffer_uptodate(bh)) {
-                               ext4_error(inode->i_sb, __func__,
+                               ext4_error(inode->i_sb,
                                           "IO error syncing inode, "
                                           "inode=%lu, block=%llu",
                                           inode->i_ino,
index 05eca81..b79ad51 100644 (file)
@@ -304,4 +304,28 @@ static inline int ext4_should_writeback_data(struct inode *inode)
        return 0;
 }
 
+/*
+ * This function controls whether or not we should try to go down the
+ * dioread_nolock code paths, which makes it safe to avoid taking
+ * i_mutex for direct I/O reads.  This only works for extent-based
+ * files, and it doesn't work for nobh or if data journaling is
+ * enabled, since the dioread_nolock code uses b_private to pass
+ * information back to the I/O completion handler, and this conflicts
+ * with the jbd's use of b_private.
+ */
+static inline int ext4_should_dioread_nolock(struct inode *inode)
+{
+       if (!test_opt(inode->i_sb, DIOREAD_NOLOCK))
+               return 0;
+       if (test_opt(inode->i_sb, NOBH))
+               return 0;
+       if (!S_ISREG(inode->i_mode))
+               return 0;
+       if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+               return 0;
+       if (ext4_should_journal_data(inode))
+               return 0;
+       return 1;
+}
+
 #endif /* _EXT4_JBD2_H */
index 765a482..94c8ee8 100644 (file)
@@ -195,8 +195,7 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
                if (S_ISREG(inode->i_mode))
                        block_group++;
        }
-       bg_start = (block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
-               le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
+       bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
        last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
 
        /*
@@ -440,7 +439,7 @@ static int __ext4_ext_check(const char *function, struct inode *inode,
        return 0;
 
 corrupted:
-       ext4_error(inode->i_sb, function,
+       __ext4_error(inode->i_sb, function,
                        "bad header/extent in inode #%lu: %s - magic %x, "
                        "entries %u, max %u(%u), depth %u(%u)",
                        inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic),
@@ -703,7 +702,12 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
                }
                eh = ext_block_hdr(bh);
                ppos++;
-               BUG_ON(ppos > depth);
+               if (unlikely(ppos > depth)) {
+                       put_bh(bh);
+                       EXT4_ERROR_INODE(inode,
+                                        "ppos %d > depth %d", ppos, depth);
+                       goto err;
+               }
                path[ppos].p_bh = bh;
                path[ppos].p_hdr = eh;
                i--;
@@ -749,7 +753,12 @@ int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
        if (err)
                return err;
 
-       BUG_ON(logical == le32_to_cpu(curp->p_idx->ei_block));
+       if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
+               EXT4_ERROR_INODE(inode,
+                                "logical %d == ei_block %d!",
+                                logical, le32_to_cpu(curp->p_idx->ei_block));
+               return -EIO;
+       }
        len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
        if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
                /* insert after */
@@ -779,9 +788,17 @@ int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
        ext4_idx_store_pblock(ix, ptr);
        le16_add_cpu(&curp->p_hdr->eh_entries, 1);
 
-       BUG_ON(le16_to_cpu(curp->p_hdr->eh_entries)
-                            > le16_to_cpu(curp->p_hdr->eh_max));
-       BUG_ON(ix > EXT_LAST_INDEX(curp->p_hdr));
+       if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
+                            > le16_to_cpu(curp->p_hdr->eh_max))) {
+               EXT4_ERROR_INODE(inode,
+                                "logical %d == ei_block %d!",
+                                logical, le32_to_cpu(curp->p_idx->ei_block));
+               return -EIO;
+       }
+       if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
+               EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
+               return -EIO;
+       }
 
        err = ext4_ext_dirty(handle, inode, curp);
        ext4_std_error(inode->i_sb, err);
@@ -819,7 +836,10 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 
        /* if current leaf will be split, then we should use
         * border from split point */
-       BUG_ON(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr));
+       if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
+               EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
+               return -EIO;
+       }
        if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
                border = path[depth].p_ext[1].ee_block;
                ext_debug("leaf will be split."
@@ -860,7 +880,11 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 
        /* initialize new leaf */
        newblock = ablocks[--a];
-       BUG_ON(newblock == 0);
+       if (unlikely(newblock == 0)) {
+               EXT4_ERROR_INODE(inode, "newblock == 0!");
+               err = -EIO;
+               goto cleanup;
+       }
        bh = sb_getblk(inode->i_sb, newblock);
        if (!bh) {
                err = -EIO;
@@ -880,7 +904,14 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
        ex = EXT_FIRST_EXTENT(neh);
 
        /* move remainder of path[depth] to the new leaf */
-       BUG_ON(path[depth].p_hdr->eh_entries != path[depth].p_hdr->eh_max);
+       if (unlikely(path[depth].p_hdr->eh_entries !=
+                    path[depth].p_hdr->eh_max)) {
+               EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
+                                path[depth].p_hdr->eh_entries,
+                                path[depth].p_hdr->eh_max);
+               err = -EIO;
+               goto cleanup;
+       }
        /* start copy from next extent */
        /* TODO: we could do it by single memmove */
        m = 0;
@@ -927,7 +958,11 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 
        /* create intermediate indexes */
        k = depth - at - 1;
-       BUG_ON(k < 0);
+       if (unlikely(k < 0)) {
+               EXT4_ERROR_INODE(inode, "k %d < 0!", k);
+               err = -EIO;
+               goto cleanup;
+       }
        if (k)
                ext_debug("create %d intermediate indices\n", k);
        /* insert new index into current index block */
@@ -964,8 +999,14 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 
                ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
                                EXT_MAX_INDEX(path[i].p_hdr));
-               BUG_ON(EXT_MAX_INDEX(path[i].p_hdr) !=
-                               EXT_LAST_INDEX(path[i].p_hdr));
+               if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
+                                       EXT_LAST_INDEX(path[i].p_hdr))) {
+                       EXT4_ERROR_INODE(inode,
+                                        "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
+                                        le32_to_cpu(path[i].p_ext->ee_block));
+                       err = -EIO;
+                       goto cleanup;
+               }
                while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
                        ext_debug("%d: move %d:%llu in new index %llu\n", i,
                                        le32_to_cpu(path[i].p_idx->ei_block),
@@ -1203,7 +1244,10 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
        struct ext4_extent *ex;
        int depth, ee_len;
 
-       BUG_ON(path == NULL);
+       if (unlikely(path == NULL)) {
+               EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
+               return -EIO;
+       }
        depth = path->p_depth;
        *phys = 0;
 
@@ -1217,15 +1261,33 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
        ex = path[depth].p_ext;
        ee_len = ext4_ext_get_actual_len(ex);
        if (*logical < le32_to_cpu(ex->ee_block)) {
-               BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex);
+               if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
+                       EXT4_ERROR_INODE(inode,
+                                        "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
+                                        *logical, le32_to_cpu(ex->ee_block));
+                       return -EIO;
+               }
                while (--depth >= 0) {
                        ix = path[depth].p_idx;
-                       BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr));
+                       if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
+                               EXT4_ERROR_INODE(inode,
+                                 "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
+                                 ix != NULL ? ix->ei_block : 0,
+                                 EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
+                                   EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block : 0,
+                                 depth);
+                               return -EIO;
+                       }
                }
                return 0;
        }
 
-       BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len));
+       if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
+               EXT4_ERROR_INODE(inode,
+                                "logical %d < ee_block %d + ee_len %d!",
+                                *logical, le32_to_cpu(ex->ee_block), ee_len);
+               return -EIO;
+       }
 
        *logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
        *phys = ext_pblock(ex) + ee_len - 1;
@@ -1251,7 +1313,10 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
        int depth;      /* Note, NOT eh_depth; depth from top of tree */
        int ee_len;
 
-       BUG_ON(path == NULL);
+       if (unlikely(path == NULL)) {
+               EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
+               return -EIO;
+       }
        depth = path->p_depth;
        *phys = 0;
 
@@ -1265,17 +1330,32 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
        ex = path[depth].p_ext;
        ee_len = ext4_ext_get_actual_len(ex);
        if (*logical < le32_to_cpu(ex->ee_block)) {
-               BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex);
+               if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
+                       EXT4_ERROR_INODE(inode,
+                                        "first_extent(path[%d].p_hdr) != ex",
+                                        depth);
+                       return -EIO;
+               }
                while (--depth >= 0) {
                        ix = path[depth].p_idx;
-                       BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr));
+                       if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
+                               EXT4_ERROR_INODE(inode,
+                                                "ix != EXT_FIRST_INDEX *logical %d!",
+                                                *logical);
+                               return -EIO;
+                       }
                }
                *logical = le32_to_cpu(ex->ee_block);
                *phys = ext_pblock(ex);
                return 0;
        }
 
-       BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len));
+       if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
+               EXT4_ERROR_INODE(inode,
+                                "logical %d < ee_block %d + ee_len %d!",
+                                *logical, le32_to_cpu(ex->ee_block), ee_len);
+               return -EIO;
+       }
 
        if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
                /* next allocated block in this leaf */
@@ -1414,8 +1494,12 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
 
        eh = path[depth].p_hdr;
        ex = path[depth].p_ext;
-       BUG_ON(ex == NULL);
-       BUG_ON(eh == NULL);
+
+       if (unlikely(ex == NULL || eh == NULL)) {
+               EXT4_ERROR_INODE(inode,
+                                "ex %p == NULL or eh %p == NULL", ex, eh);
+               return -EIO;
+       }
 
        if (depth == 0) {
                /* there is no tree at all */
@@ -1538,8 +1622,9 @@ int ext4_ext_try_to_merge(struct inode *inode,
                merge_done = 1;
                WARN_ON(eh->eh_entries == 0);
                if (!eh->eh_entries)
-                       ext4_error(inode->i_sb, "ext4_ext_try_to_merge",
-                          "inode#%lu, eh->eh_entries = 0!", inode->i_ino);
+                       ext4_error(inode->i_sb,
+                                  "inode#%lu, eh->eh_entries = 0!",
+                                  inode->i_ino);
        }
 
        return merge_done;
@@ -1612,13 +1697,19 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
        ext4_lblk_t next;
        unsigned uninitialized = 0;
 
-       BUG_ON(ext4_ext_get_actual_len(newext) == 0);
+       if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
+               EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
+               return -EIO;
+       }
        depth = ext_depth(inode);
        ex = path[depth].p_ext;
-       BUG_ON(path[depth].p_hdr == NULL);
+       if (unlikely(path[depth].p_hdr == NULL)) {
+               EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
+               return -EIO;
+       }
 
        /* try to insert block into found extent and return */
-       if (ex && (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
+       if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
                && ext4_can_extents_be_merged(inode, ex, newext)) {
                ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
                                ext4_ext_is_uninitialized(newext),
@@ -1739,7 +1830,7 @@ has_space:
 
 merge:
        /* try to merge extents to the right */
-       if (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
+       if (!(flag & EXT4_GET_BLOCKS_PRE_IO))
                ext4_ext_try_to_merge(inode, path, nearex);
 
        /* try to merge extents to the left */
@@ -1787,7 +1878,11 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
                }
 
                depth = ext_depth(inode);
-               BUG_ON(path[depth].p_hdr == NULL);
+               if (unlikely(path[depth].p_hdr == NULL)) {
+                       EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
+                       err = -EIO;
+                       break;
+               }
                ex = path[depth].p_ext;
                next = ext4_ext_next_allocated_block(path);
 
@@ -1838,7 +1933,11 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
                        cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
                }
 
-               BUG_ON(cbex.ec_len == 0);
+               if (unlikely(cbex.ec_len == 0)) {
+                       EXT4_ERROR_INODE(inode, "cbex.ec_len == 0");
+                       err = -EIO;
+                       break;
+               }
                err = func(inode, path, &cbex, ex, cbdata);
                ext4_ext_drop_refs(path);
 
@@ -1952,7 +2051,7 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
 
        BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP &&
                        cex->ec_type != EXT4_EXT_CACHE_EXTENT);
-       if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) {
+       if (in_range(block, cex->ec_block, cex->ec_len)) {
                ex->ee_block = cpu_to_le32(cex->ec_block);
                ext4_ext_store_pblock(ex, cex->ec_start);
                ex->ee_len = cpu_to_le16(cex->ec_len);
@@ -1981,7 +2080,10 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
        /* free index block */
        path--;
        leaf = idx_pblock(path->p_idx);
-       BUG_ON(path->p_hdr->eh_entries == 0);
+       if (unlikely(path->p_hdr->eh_entries == 0)) {
+               EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
+               return -EIO;
+       }
        err = ext4_ext_get_access(handle, inode, path);
        if (err)
                return err;
@@ -2119,8 +2221,10 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
        if (!path[depth].p_hdr)
                path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
        eh = path[depth].p_hdr;
-       BUG_ON(eh == NULL);
-
+       if (unlikely(path[depth].p_hdr == NULL)) {
+               EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
+               return -EIO;
+       }
        /* find where to start removing */
        ex = EXT_LAST_EXTENT(eh);
 
@@ -2983,7 +3087,7 @@ fix_extent_len:
        ext4_ext_dirty(handle, inode, path + depth);
        return err;
 }
-static int ext4_convert_unwritten_extents_dio(handle_t *handle,
+static int ext4_convert_unwritten_extents_endio(handle_t *handle,
                                              struct inode *inode,
                                              struct ext4_ext_path *path)
 {
@@ -3063,8 +3167,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
                  flags, allocated);
        ext4_ext_show_leaf(inode, path);
 
-       /* DIO get_block() before submit the IO, split the extent */
-       if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) {
+       /* get_block() before submit the IO, split the extent */
+       if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
                ret = ext4_split_unwritten_extents(handle,
                                                inode, path, iblock,
                                                max_blocks, flags);
@@ -3074,14 +3178,16 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
                 * completed
                 */
                if (io)
-                       io->flag = DIO_AIO_UNWRITTEN;
+                       io->flag = EXT4_IO_UNWRITTEN;
                else
-                       EXT4_I(inode)->i_state |= EXT4_STATE_DIO_UNWRITTEN;
+                       ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
+               if (ext4_should_dioread_nolock(inode))
+                       set_buffer_uninit(bh_result);
                goto out;
        }
-       /* async DIO end_io complete, convert the filled extent to written */
-       if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) {
-               ret = ext4_convert_unwritten_extents_dio(handle, inode,
+       /* IO end_io complete, convert the filled extent to written */
+       if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
+               ret = ext4_convert_unwritten_extents_endio(handle, inode,
                                                        path);
                if (ret >= 0)
                        ext4_update_inode_fsync_trans(handle, inode, 1);
@@ -3185,7 +3291,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 {
        struct ext4_ext_path *path = NULL;
        struct ext4_extent_header *eh;
-       struct ext4_extent newex, *ex;
+       struct ext4_extent newex, *ex, *last_ex;
        ext4_fsblk_t newblock;
        int err = 0, depth, ret, cache_type;
        unsigned int allocated = 0;
@@ -3237,10 +3343,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
         * this situation is possible, though, _during_ tree modification;
         * this is why assert can't be put in ext4_ext_find_extent()
         */
-       if (path[depth].p_ext == NULL && depth != 0) {
-               ext4_error(inode->i_sb, __func__, "bad extent address "
-                          "inode: %lu, iblock: %d, depth: %d",
-                          inode->i_ino, iblock, depth);
+       if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
+               EXT4_ERROR_INODE(inode, "bad extent address "
+                                "iblock: %d, depth: %d pblock %lld",
+                                iblock, depth, path[depth].p_block);
                err = -EIO;
                goto out2;
        }
@@ -3258,7 +3364,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
                 */
                ee_len = ext4_ext_get_actual_len(ex);
                /* if found extent covers block, simply return it */
-               if (iblock >= ee_block && iblock < ee_block + ee_len) {
+               if (in_range(iblock, ee_block, ee_len)) {
                        newblock = iblock - ee_block + ee_start;
                        /* number of remaining blocks in the extent */
                        allocated = ee_len - (iblock - ee_block);
@@ -3350,21 +3456,35 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
        if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
                ext4_ext_mark_uninitialized(&newex);
                /*
-                * io_end structure was created for every async
-                * direct IO write to the middle of the file.
-                * To avoid unecessary convertion for every aio dio rewrite
-                * to the mid of file, here we flag the IO that is really
-                * need the convertion.
+                * io_end structure was created for every IO write to an
+                * uninitialized extent. To avoid unecessary conversion,
+                * here we flag the IO that really needs the conversion.
                 * For non asycn direct IO case, flag the inode state
                 * that we need to perform convertion when IO is done.
                 */
-               if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) {
+               if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
                        if (io)
-                               io->flag = DIO_AIO_UNWRITTEN;
+                               io->flag = EXT4_IO_UNWRITTEN;
                        else
-                               EXT4_I(inode)->i_state |=
-                                       EXT4_STATE_DIO_UNWRITTEN;;
+                               ext4_set_inode_state(inode,
+                                                    EXT4_STATE_DIO_UNWRITTEN);
+               }
+               if (ext4_should_dioread_nolock(inode))
+                       set_buffer_uninit(bh_result);
+       }
+
+       if (unlikely(EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) {
+               if (unlikely(!eh->eh_entries)) {
+                       EXT4_ERROR_INODE(inode,
+                                        "eh->eh_entries == 0 ee_block %d",
+                                        ex->ee_block);
+                       err = -EIO;
+                       goto out2;
                }
+               last_ex = EXT_LAST_EXTENT(eh);
+               if (iblock + ar.len > le32_to_cpu(last_ex->ee_block)
+                   + ext4_ext_get_actual_len(last_ex))
+                       EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL;
        }
        err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
        if (err) {
@@ -3499,6 +3619,13 @@ static void ext4_falloc_update_inode(struct inode *inode,
                        i_size_write(inode, new_size);
                if (new_size > EXT4_I(inode)->i_disksize)
                        ext4_update_i_disksize(inode, new_size);
+       } else {
+               /*
+                * Mark that we allocate beyond EOF so the subsequent truncate
+                * can proceed even if the new size is the same as i_size.
+                */
+               if (new_size > i_size_read(inode))
+                       EXT4_I(inode)->i_flags |= EXT4_EOFBLOCKS_FL;
        }
 
 }
@@ -3603,7 +3730,7 @@ retry:
  * Returns 0 on success.
  */
 int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
-                                   loff_t len)
+                                   ssize_t len)
 {
        handle_t *handle;
        ext4_lblk_t block;
@@ -3635,7 +3762,7 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
                map_bh.b_state = 0;
                ret = ext4_get_blocks(handle, inode, block,
                                      max_blocks, &map_bh,
-                                     EXT4_GET_BLOCKS_DIO_CONVERT_EXT);
+                                     EXT4_GET_BLOCKS_IO_CONVERT_EXT);
                if (ret <= 0) {
                        WARN_ON(ret <= 0);
                        printk(KERN_ERR "%s: ext4_ext_get_blocks "
@@ -3739,7 +3866,7 @@ static int ext4_xattr_fiemap(struct inode *inode,
        int error = 0;
 
        /* in-inode? */
-       if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) {
+       if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
                struct ext4_iloc iloc;
                int offset;     /* offset of xattr in inode */
 
@@ -3767,7 +3894,6 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                __u64 start, __u64 len)
 {
        ext4_lblk_t start_blk;
-       ext4_lblk_t len_blks;
        int error = 0;
 
        /* fallback to generic here if not in extents fmt */
@@ -3781,8 +3907,14 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
        if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
                error = ext4_xattr_fiemap(inode, fieinfo);
        } else {
+               ext4_lblk_t len_blks;
+               __u64 last_blk;
+
                start_blk = start >> inode->i_sb->s_blocksize_bits;
-               len_blks = len >> inode->i_sb->s_blocksize_bits;
+               last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
+               if (last_blk >= EXT_MAX_BLOCK)
+                       last_blk = EXT_MAX_BLOCK-1;
+               len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
 
                /*
                 * Walk the extent tree gathering extent information.
index 56eee3d..503a489 100644 (file)
@@ -35,9 +35,9 @@
  */
 static int ext4_release_file(struct inode *inode, struct file *filp)
 {
-       if (EXT4_I(inode)->i_state & EXT4_STATE_DA_ALLOC_CLOSE) {
+       if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) {
                ext4_alloc_da_blocks(inode);
-               EXT4_I(inode)->i_state &= ~EXT4_STATE_DA_ALLOC_CLOSE;
+               ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
        }
        /* if we are the last writer on the inode, drop the block reservation */
        if ((filp->f_mode & FMODE_WRITE) &&
index 98bd140..0d0c323 100644 (file)
@@ -63,7 +63,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
        if (inode->i_sb->s_flags & MS_RDONLY)
                return 0;
 
-       ret = flush_aio_dio_completed_IO(inode);
+       ret = flush_completed_IO(inode);
        if (ret < 0)
                return ret;
        
index f3624ea..004c9da 100644 (file)
@@ -76,8 +76,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
        /* If checksum is bad mark all blocks and inodes use to prevent
         * allocation, essentially implementing a per-group read-only flag. */
        if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
-               ext4_error(sb, __func__, "Checksum bad for group %u",
-                          block_group);
+               ext4_error(sb, "Checksum bad for group %u", block_group);
                ext4_free_blks_set(sb, gdp, 0);
                ext4_free_inodes_set(sb, gdp, 0);
                ext4_itable_unused_set(sb, gdp, 0);
@@ -111,8 +110,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
        bitmap_blk = ext4_inode_bitmap(sb, desc);
        bh = sb_getblk(sb, bitmap_blk);
        if (unlikely(!bh)) {
-               ext4_error(sb, __func__,
-                           "Cannot read inode bitmap - "
+               ext4_error(sb, "Cannot read inode bitmap - "
                            "block_group = %u, inode_bitmap = %llu",
                            block_group, bitmap_blk);
                return NULL;
@@ -153,8 +151,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
        set_bitmap_uptodate(bh);
        if (bh_submit_read(bh) < 0) {
                put_bh(bh);
-               ext4_error(sb, __func__,
-                           "Cannot read inode bitmap - "
+               ext4_error(sb, "Cannot read inode bitmap - "
                            "block_group = %u, inode_bitmap = %llu",
                            block_group, bitmap_blk);
                return NULL;
@@ -229,8 +226,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 
        es = EXT4_SB(sb)->s_es;
        if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
-               ext4_error(sb, "ext4_free_inode",
-                          "reserved or nonexistent inode %lu", ino);
+               ext4_error(sb, "reserved or nonexistent inode %lu", ino);
                goto error_return;
        }
        block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
@@ -248,8 +244,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
        cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
                                        bit, bitmap_bh->b_data);
        if (!cleared)
-               ext4_error(sb, "ext4_free_inode",
-                          "bit already cleared for inode %lu", ino);
+               ext4_error(sb, "bit already cleared for inode %lu", ino);
        else {
                gdp = ext4_get_group_desc(sb, block_group, &bh2);
 
@@ -736,8 +731,7 @@ static int ext4_claim_inode(struct super_block *sb,
        if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
                        ino > EXT4_INODES_PER_GROUP(sb)) {
                ext4_unlock_group(sb, group);
-               ext4_error(sb, __func__,
-                          "reserved inode or inode > inodes count - "
+               ext4_error(sb, "reserved inode or inode > inodes count - "
                           "block_group = %u, inode=%lu", group,
                           ino + group * EXT4_INODES_PER_GROUP(sb));
                return 1;
@@ -904,7 +898,7 @@ repeat_in_this_group:
                                BUFFER_TRACE(inode_bitmap_bh,
                                        "call ext4_handle_dirty_metadata");
                                err = ext4_handle_dirty_metadata(handle,
-                                                                inode,
+                                                                NULL,
                                                        inode_bitmap_bh);
                                if (err)
                                        goto fail;
@@ -1029,7 +1023,8 @@ got:
        inode->i_generation = sbi->s_next_generation++;
        spin_unlock(&sbi->s_next_gen_lock);
 
-       ei->i_state = EXT4_STATE_NEW;
+       ei->i_state_flags = 0;
+       ext4_set_inode_state(inode, EXT4_STATE_NEW);
 
        ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
 
@@ -1098,8 +1093,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
 
        /* Error cases - e2fsck has already cleaned up for us */
        if (ino > max_ino) {
-               ext4_warning(sb, __func__,
-                            "bad orphan ino %lu!  e2fsck was run?", ino);
+               ext4_warning(sb, "bad orphan ino %lu!  e2fsck was run?", ino);
                goto error;
        }
 
@@ -1107,8 +1101,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
        bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
        bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
        if (!bitmap_bh) {
-               ext4_warning(sb, __func__,
-                            "inode bitmap error for orphan %lu", ino);
+               ext4_warning(sb, "inode bitmap error for orphan %lu", ino);
                goto error;
        }
 
@@ -1140,8 +1133,7 @@ iget_failed:
        err = PTR_ERR(inode);
        inode = NULL;
 bad_orphan:
-       ext4_warning(sb, __func__,
-                    "bad orphan inode %lu!  e2fsck was run?", ino);
+       ext4_warning(sb, "bad orphan inode %lu!  e2fsck was run?", ino);
        printk(KERN_NOTICE "ext4_test_bit(bit=%d, block=%llu) = %d\n",
               bit, (unsigned long long)bitmap_bh->b_blocknr,
               ext4_test_bit(bit, bitmap_bh->b_data));
index d01a6cd..f977aad 100644 (file)
@@ -38,6 +38,7 @@
 #include <linux/uio.h>
 #include <linux/bio.h>
 #include <linux/workqueue.h>
+#include <linux/kernel.h>
 
 #include "ext4_jbd2.h"
 #include "xattr.h"
@@ -194,7 +195,7 @@ void ext4_delete_inode(struct inode *inode)
        inode->i_size = 0;
        err = ext4_mark_inode_dirty(handle, inode);
        if (err) {
-               ext4_warning(inode->i_sb, __func__,
+               ext4_warning(inode->i_sb,
                             "couldn't mark inode dirty (err %d)", err);
                goto stop_handle;
        }
@@ -212,7 +213,7 @@ void ext4_delete_inode(struct inode *inode)
                if (err > 0)
                        err = ext4_journal_restart(handle, 3);
                if (err != 0) {
-                       ext4_warning(inode->i_sb, __func__,
+                       ext4_warning(inode->i_sb,
                                     "couldn't extend journal (err %d)", err);
                stop_handle:
                        ext4_journal_stop(handle);
@@ -323,8 +324,7 @@ static int ext4_block_to_path(struct inode *inode,
                offsets[n++] = i_block & (ptrs - 1);
                final = ptrs;
        } else {
-               ext4_warning(inode->i_sb, "ext4_block_to_path",
-                            "block %lu > max in inode %lu",
+               ext4_warning(inode->i_sb, "block %lu > max in inode %lu",
                             i_block + direct_blocks +
                             indirect_blocks + double_blocks, inode->i_ino);
        }
@@ -344,7 +344,7 @@ static int __ext4_check_blockref(const char *function, struct inode *inode,
                if (blk &&
                    unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
                                                    blk, 1))) {
-                       ext4_error(inode->i_sb, function,
+                       __ext4_error(inode->i_sb, function,
                                   "invalid block reference %u "
                                   "in inode #%lu", blk, inode->i_ino);
                        return -EIO;
@@ -607,7 +607,14 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
                if (*err)
                        goto failed_out;
 
-               BUG_ON(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS);
+               if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) {
+                       EXT4_ERROR_INODE(inode,
+                                        "current_block %llu + count %lu > %d!",
+                                        current_block, count,
+                                        EXT4_MAX_BLOCK_FILE_PHYS);
+                       *err = -EIO;
+                       goto failed_out;
+               }
 
                target -= count;
                /* allocate blocks for indirect blocks */
@@ -643,7 +650,14 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
                ar.flags = EXT4_MB_HINT_DATA;
 
        current_block = ext4_mb_new_blocks(handle, &ar, err);
-       BUG_ON(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS);
+       if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) {
+               EXT4_ERROR_INODE(inode,
+                                "current_block %llu + ar.len %d > %d!",
+                                current_block, ar.len,
+                                EXT4_MAX_BLOCK_FILE_PHYS);
+               *err = -EIO;
+               goto failed_out;
+       }
 
        if (*err && (target == blks)) {
                /*
@@ -1061,6 +1075,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
        int mdb_free = 0, allocated_meta_blocks = 0;
 
        spin_lock(&ei->i_block_reservation_lock);
+       trace_ext4_da_update_reserve_space(inode, used);
        if (unlikely(used > ei->i_reserved_data_blocks)) {
                ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
                         "with only %d reserved data blocks\n",
@@ -1124,7 +1139,7 @@ static int check_block_validity(struct inode *inode, const char *msg,
                                sector_t logical, sector_t phys, int len)
 {
        if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) {
-               ext4_error(inode->i_sb, msg,
+               __ext4_error(inode->i_sb, msg,
                           "inode #%lu logical block %llu mapped to %llu "
                           "(size %d)", inode->i_ino,
                           (unsigned long long) logical,
@@ -1306,7 +1321,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
                         * i_data's format changing.  Force the migrate
                         * to fail by clearing migrate flags
                         */
-                       EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
+                       ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
                }
 
                /*
@@ -1534,6 +1549,8 @@ static void ext4_truncate_failed_write(struct inode *inode)
        ext4_truncate(inode);
 }
 
+static int ext4_get_block_write(struct inode *inode, sector_t iblock,
+                  struct buffer_head *bh_result, int create);
 static int ext4_write_begin(struct file *file, struct address_space *mapping,
                            loff_t pos, unsigned len, unsigned flags,
                            struct page **pagep, void **fsdata)
@@ -1575,8 +1592,12 @@ retry:
        }
        *pagep = page;
 
-       ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
-                               ext4_get_block);
+       if (ext4_should_dioread_nolock(inode))
+               ret = block_write_begin(file, mapping, pos, len, flags, pagep,
+                               fsdata, ext4_get_block_write);
+       else
+               ret = block_write_begin(file, mapping, pos, len, flags, pagep,
+                               fsdata, ext4_get_block);
 
        if (!ret && ext4_should_journal_data(inode)) {
                ret = walk_page_buffers(handle, page_buffers(page),
@@ -1793,7 +1814,7 @@ static int ext4_journalled_write_end(struct file *file,
        new_i_size = pos + copied;
        if (new_i_size > inode->i_size)
                i_size_write(inode, pos+copied);
-       EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
+       ext4_set_inode_state(inode, EXT4_STATE_JDATA);
        if (new_i_size > EXT4_I(inode)->i_disksize) {
                ext4_update_i_disksize(inode, new_i_size);
                ret2 = ext4_mark_inode_dirty(handle, inode);
@@ -1846,6 +1867,7 @@ repeat:
        spin_lock(&ei->i_block_reservation_lock);
        md_reserved = ei->i_reserved_meta_blocks;
        md_needed = ext4_calc_metadata_amount(inode, lblock);
+       trace_ext4_da_reserve_space(inode, md_needed);
        spin_unlock(&ei->i_block_reservation_lock);
 
        /*
@@ -2091,6 +2113,8 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
                                } else if (buffer_mapped(bh))
                                        BUG_ON(bh->b_blocknr != pblock);
 
+                               if (buffer_uninit(exbh))
+                                       set_buffer_uninit(bh);
                                cur_logical++;
                                pblock++;
                        } while ((bh = bh->b_this_page) != head);
@@ -2133,17 +2157,16 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
                        break;
                for (i = 0; i < nr_pages; i++) {
                        struct page *page = pvec.pages[i];
-                       index = page->index;
-                       if (index > end)
+                       if (page->index > end)
                                break;
-                       index++;
-
                        BUG_ON(!PageLocked(page));
                        BUG_ON(PageWriteback(page));
                        block_invalidatepage(page, 0);
                        ClearPageUptodate(page);
                        unlock_page(page);
                }
+               index = pvec.pages[nr_pages - 1]->index + 1;
+               pagevec_release(&pvec);
        }
        return;
 }
@@ -2220,6 +2243,8 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
         */
        new.b_state = 0;
        get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
+       if (ext4_should_dioread_nolock(mpd->inode))
+               get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
        if (mpd->b_state & (1 << BH_Delay))
                get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
 
@@ -2630,11 +2655,14 @@ static int __ext4_journalled_writepage(struct page *page,
                ret = err;
 
        walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one);
-       EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
+       ext4_set_inode_state(inode, EXT4_STATE_JDATA);
 out:
        return ret;
 }
 
+static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
+static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
+
 /*
  * Note that we don't need to start a transaction unless we're journaling data
  * because we should have holes filled from ext4_page_mkwrite(). We even don't
@@ -2682,7 +2710,7 @@ static int ext4_writepage(struct page *page,
        int ret = 0;
        loff_t size;
        unsigned int len;
-       struct buffer_head *page_bufs;
+       struct buffer_head *page_bufs = NULL;
        struct inode *inode = page->mapping->host;
 
        trace_ext4_writepage(inode, page);
@@ -2758,7 +2786,11 @@ static int ext4_writepage(struct page *page,
 
        if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
                ret = nobh_writepage(page, noalloc_get_block_write, wbc);
-       else
+       else if (page_bufs && buffer_uninit(page_bufs)) {
+               ext4_set_bh_endio(page_bufs, inode);
+               ret = block_write_full_page_endio(page, noalloc_get_block_write,
+                                           wbc, ext4_end_io_buffer_write);
+       } else
                ret = block_write_full_page(page, noalloc_get_block_write,
                                            wbc);
 
@@ -3301,7 +3333,8 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
                filemap_write_and_wait(mapping);
        }
 
-       if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
+       if (EXT4_JOURNAL(inode) &&
+           ext4_test_inode_state(inode, EXT4_STATE_JDATA)) {
                /*
                 * This is a REALLY heavyweight approach, but the use of
                 * bmap on dirty files is expected to be extremely rare:
@@ -3320,7 +3353,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
                 * everything they get.
                 */
 
-               EXT4_I(inode)->i_state &= ~EXT4_STATE_JDATA;
+               ext4_clear_inode_state(inode, EXT4_STATE_JDATA);
                journal = EXT4_JOURNAL(inode);
                jbd2_journal_lock_updates(journal);
                err = jbd2_journal_flush(journal);
@@ -3345,11 +3378,45 @@ ext4_readpages(struct file *file, struct address_space *mapping,
        return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
 }
 
+static void ext4_free_io_end(ext4_io_end_t *io)
+{
+       BUG_ON(!io);
+       if (io->page)
+               put_page(io->page);
+       iput(io->inode);
+       kfree(io);
+}
+
+static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
+{
+       struct buffer_head *head, *bh;
+       unsigned int curr_off = 0;
+
+       if (!page_has_buffers(page))
+               return;
+       head = bh = page_buffers(page);
+       do {
+               if (offset <= curr_off && test_clear_buffer_uninit(bh)
+                                       && bh->b_private) {
+                       ext4_free_io_end(bh->b_private);
+                       bh->b_private = NULL;
+                       bh->b_end_io = NULL;
+               }
+               curr_off = curr_off + bh->b_size;
+               bh = bh->b_this_page;
+       } while (bh != head);
+}
+
 static void ext4_invalidatepage(struct page *page, unsigned long offset)
 {
        journal_t *journal = EXT4_JOURNAL(page->mapping->host);
 
        /*
+        * free any io_end structure allocated for buffers to be discarded
+        */
+       if (ext4_should_dioread_nolock(page->mapping->host))
+               ext4_invalidatepage_free_endio(page, offset);
+       /*
         * If it's a full truncate we just forget about the pending dirtying
         */
        if (offset == 0)
@@ -3420,7 +3487,14 @@ static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
        }
 
 retry:
-       ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
+       if (rw == READ && ext4_should_dioread_nolock(inode))
+               ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
+                                inode->i_sb->s_bdev, iov,
+                                offset, nr_segs,
+                                ext4_get_block, NULL);
+       else
+               ret = blockdev_direct_IO(rw, iocb, inode,
+                                inode->i_sb->s_bdev, iov,
                                 offset, nr_segs,
                                 ext4_get_block, NULL);
        if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -3436,6 +3510,9 @@ retry:
                         * but cannot extend i_size. Bail out and pretend
                         * the write failed... */
                        ret = PTR_ERR(handle);
+                       if (inode->i_nlink)
+                               ext4_orphan_del(NULL, inode);
+
                        goto out;
                }
                if (inode->i_nlink)
@@ -3463,75 +3540,63 @@ out:
        return ret;
 }
 
-static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock,
+static int ext4_get_block_write(struct inode *inode, sector_t iblock,
                   struct buffer_head *bh_result, int create)
 {
-       handle_t *handle = NULL;
+       handle_t *handle = ext4_journal_current_handle();
        int ret = 0;
        unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
        int dio_credits;
+       int started = 0;
 
-       ext4_debug("ext4_get_block_dio_write: inode %lu, create flag %d\n",
+       ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
                   inode->i_ino, create);
        /*
-        * DIO VFS code passes create = 0 flag for write to
-        * the middle of file. It does this to avoid block
-        * allocation for holes, to prevent expose stale data
-        * out when there is parallel buffered read (which does
-        * not hold the i_mutex lock) while direct IO write has
-        * not completed. DIO request on holes finally falls back
-        * to buffered IO for this reason.
-        *
-        * For ext4 extent based file, since we support fallocate,
-        * new allocated extent as uninitialized, for holes, we
-        * could fallocate blocks for holes, thus parallel
-        * buffered IO read will zero out the page when read on
-        * a hole while parallel DIO write to the hole has not completed.
-        *
-        * when we come here, we know it's a direct IO write to
-        * to the middle of file (<i_size)
-        * so it's safe to override the create flag from VFS.
+        * ext4_get_block in prepare for a DIO write or buffer write.
+        * We allocate an uinitialized extent if blocks haven't been allocated.
+        * The extent will be converted to initialized after IO complete.
         */
-       create = EXT4_GET_BLOCKS_DIO_CREATE_EXT;
+       create = EXT4_GET_BLOCKS_IO_CREATE_EXT;
 
-       if (max_blocks > DIO_MAX_BLOCKS)
-               max_blocks = DIO_MAX_BLOCKS;
-       dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
-       handle = ext4_journal_start(inode, dio_credits);
-       if (IS_ERR(handle)) {
-               ret = PTR_ERR(handle);
-               goto out;
+       if (!handle) {
+               if (max_blocks > DIO_MAX_BLOCKS)
+                       max_blocks = DIO_MAX_BLOCKS;
+               dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
+               handle = ext4_journal_start(inode, dio_credits);
+               if (IS_ERR(handle)) {
+                       ret = PTR_ERR(handle);
+                       goto out;
+               }
+               started = 1;
        }
+
        ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
                              create);
        if (ret > 0) {
                bh_result->b_size = (ret << inode->i_blkbits);
                ret = 0;
        }
-       ext4_journal_stop(handle);
+       if (started)
+               ext4_journal_stop(handle);
 out:
        return ret;
 }
 
-static void ext4_free_io_end(ext4_io_end_t *io)
-{
-       BUG_ON(!io);
-       iput(io->inode);
-       kfree(io);
-}
-static void dump_aio_dio_list(struct inode * inode)
+static void dump_completed_IO(struct inode * inode)
 {
 #ifdef EXT4_DEBUG
        struct list_head *cur, *before, *after;
        ext4_io_end_t *io, *io0, *io1;
+       unsigned long flags;
 
-       if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
-               ext4_debug("inode %lu aio dio list is empty\n", inode->i_ino);
+       if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
+               ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
                return;
        }
 
-       ext4_debug("Dump inode %lu aio_dio_completed_IO list \n", inode->i_ino);
-       list_for_each_entry(io, &EXT4_I(inode)->i_aio_dio_complete_list, list){
+       ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
+       spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
+       list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
                cur = &io->list;
                before = cur->prev;
                io0 = container_of(before, ext4_io_end_t, list);
@@ -3541,32 +3606,31 @@ static void dump_aio_dio_list(struct inode * inode)
                ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
                            io, inode->i_ino, io0, io1);
        }
+       spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
 #endif
 }
 
 /*
  * check a range of space and convert unwritten extents to written.
  */
-static int ext4_end_aio_dio_nolock(ext4_io_end_t *io)
+static int ext4_end_io_nolock(ext4_io_end_t *io)
 {
        struct inode *inode = io->inode;
        loff_t offset = io->offset;
-       size_t size = io->size;
+       ssize_t size = io->size;
        int ret = 0;
 
-       ext4_debug("end_aio_dio_onlock: io 0x%p from inode %lu,list->next 0x%p,"
+       ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
                   "list->prev 0x%p\n",
                   io, inode->i_ino, io->list.next, io->list.prev);
 
        if (list_empty(&io->list))
                return ret;
 
-       if (io->flag != DIO_AIO_UNWRITTEN)
+       if (io->flag != EXT4_IO_UNWRITTEN)
                return ret;
 
-       if (offset + size <= i_size_read(inode))
-               ret = ext4_convert_unwritten_extents(inode, offset, size);
-
+       ret = ext4_convert_unwritten_extents(inode, offset, size);
        if (ret < 0) {
                printk(KERN_EMERG "%s: failed to convert unwritten"
                        "extents to written extents, error is %d"
@@ -3579,50 +3643,64 @@ static int ext4_end_aio_dio_nolock(ext4_io_end_t *io)
        io->flag = 0;
        return ret;
 }
+
 /*
  * work on completed aio dio IO, to convert unwritten extents to extents
  */
-static void ext4_end_aio_dio_work(struct work_struct *work)
+static void ext4_end_io_work(struct work_struct *work)
 {
-       ext4_io_end_t *io  = container_of(work, ext4_io_end_t, work);
-       struct inode *inode = io->inode;
-       int ret = 0;
+       ext4_io_end_t           *io = container_of(work, ext4_io_end_t, work);
+       struct inode            *inode = io->inode;
+       struct ext4_inode_info  *ei = EXT4_I(inode);
+       unsigned long           flags;
+       int                     ret;
 
        mutex_lock(&inode->i_mutex);
-       ret = ext4_end_aio_dio_nolock(io);
-       if (ret >= 0) {
-               if (!list_empty(&io->list))
-                       list_del_init(&io->list);
-               ext4_free_io_end(io);
+       ret = ext4_end_io_nolock(io);
+       if (ret < 0) {
+               mutex_unlock(&inode->i_mutex);
+               return;
        }
+
+       spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+       if (!list_empty(&io->list))
+               list_del_init(&io->list);
+       spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
        mutex_unlock(&inode->i_mutex);
+       ext4_free_io_end(io);
 }
+
 /*
  * This function is called from ext4_sync_file().
  *
- * When AIO DIO IO is completed, the work to convert unwritten
- * extents to written is queued on workqueue but may not get immediately
+ * When IO is completed, the work to convert unwritten extents to
+ * written is queued on workqueue but may not get immediately
  * scheduled. When fsync is called, we need to ensure the
  * conversion is complete before fsync returns.
- * The inode keeps track of a list of completed AIO from DIO path
- * that might needs to do the conversion. This function walks through
- * the list and convert the related unwritten extents to written.
+ * The inode keeps track of a list of pending/completed IO that
+ * might needs to do the conversion. This function walks through
+ * the list and convert the related unwritten extents for completed IO
+ * to written.
+ * The function return the number of pending IOs on success.
  */
-int flush_aio_dio_completed_IO(struct inode *inode)
+int flush_completed_IO(struct inode *inode)
 {
        ext4_io_end_t *io;
+       struct ext4_inode_info *ei = EXT4_I(inode);
+       unsigned long flags;
        int ret = 0;
        int ret2 = 0;
 
-       if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list))
+       if (list_empty(&ei->i_completed_io_list))
                return ret;
 
-       dump_aio_dio_list(inode);
-       while (!list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
-               io = list_entry(EXT4_I(inode)->i_aio_dio_complete_list.next,
+       dump_completed_IO(inode);
+       spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+       while (!list_empty(&ei->i_completed_io_list)){
+               io = list_entry(ei->i_completed_io_list.next,
                                ext4_io_end_t, list);
                /*
-                * Calling ext4_end_aio_dio_nolock() to convert completed
+                * Calling ext4_end_io_nolock() to convert completed
                 * IO to written.
                 *
                 * When ext4_sync_file() is called, run_queue() may already
@@ -3635,20 +3713,23 @@ int flush_aio_dio_completed_IO(struct inode *inode)
                 * avoid double converting from both fsync and background work
                 * queue work.
                 */
-               ret = ext4_end_aio_dio_nolock(io);
+               spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+               ret = ext4_end_io_nolock(io);
+               spin_lock_irqsave(&ei->i_completed_io_lock, flags);
                if (ret < 0)
                        ret2 = ret;
                else
                        list_del_init(&io->list);
        }
+       spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
        return (ret2 < 0) ? ret2 : 0;
 }
 
-static ext4_io_end_t *ext4_init_io_end (struct inode *inode)
+static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags)
 {
        ext4_io_end_t *io = NULL;
 
-       io = kmalloc(sizeof(*io), GFP_NOFS);
+       io = kmalloc(sizeof(*io), flags);
 
        if (io) {
                igrab(inode);
@@ -3656,8 +3737,8 @@ static ext4_io_end_t *ext4_init_io_end (struct inode *inode)
                io->flag = 0;
                io->offset = 0;
                io->size = 0;
-               io->error = 0;
-               INIT_WORK(&io->work, ext4_end_aio_dio_work);
+               io->page = NULL;
+               INIT_WORK(&io->work, ext4_end_io_work);
                INIT_LIST_HEAD(&io->list);
        }
 
@@ -3669,6 +3750,8 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
 {
         ext4_io_end_t *io_end = iocb->private;
        struct workqueue_struct *wq;
+       unsigned long flags;
+       struct ext4_inode_info *ei;
 
        /* if not async direct IO or dio with 0 bytes write, just return */
        if (!io_end || !size)
@@ -3680,7 +3763,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
                  size);
 
        /* if not aio dio with unwritten extents, just free io and return */
-       if (io_end->flag != DIO_AIO_UNWRITTEN){
+       if (io_end->flag != EXT4_IO_UNWRITTEN){
                ext4_free_io_end(io_end);
                iocb->private = NULL;
                return;
@@ -3688,16 +3771,85 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
 
        io_end->offset = offset;
        io_end->size = size;
+       io_end->flag = EXT4_IO_UNWRITTEN;
        wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
 
        /* queue the work to convert unwritten extents to written */
        queue_work(wq, &io_end->work);
 
        /* Add the io_end to per-inode completed aio dio list*/
-       list_add_tail(&io_end->list,
-                &EXT4_I(io_end->inode)->i_aio_dio_complete_list);
+       ei = EXT4_I(io_end->inode);
+       spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+       list_add_tail(&io_end->list, &ei->i_completed_io_list);
+       spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
        iocb->private = NULL;
 }
+
+static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
+{
+       ext4_io_end_t *io_end = bh->b_private;
+       struct workqueue_struct *wq;
+       struct inode *inode;
+       unsigned long flags;
+
+       if (!test_clear_buffer_uninit(bh) || !io_end)
+               goto out;
+
+       if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) {
+               printk("sb umounted, discard end_io request for inode %lu\n",
+                       io_end->inode->i_ino);
+               ext4_free_io_end(io_end);
+               goto out;
+       }
+
+       io_end->flag = EXT4_IO_UNWRITTEN;
+       inode = io_end->inode;
+
+       /* Add the io_end to per-inode completed io list*/
+       spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
+       list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
+       spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
+
+       wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
+       /* queue the work to convert unwritten extents to written */
+       queue_work(wq, &io_end->work);
+out:
+       bh->b_private = NULL;
+       bh->b_end_io = NULL;
+       clear_buffer_uninit(bh);
+       end_buffer_async_write(bh, uptodate);
+}
+
+static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
+{
+       ext4_io_end_t *io_end;
+       struct page *page = bh->b_page;
+       loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT;
+       size_t size = bh->b_size;
+
+retry:
+       io_end = ext4_init_io_end(inode, GFP_ATOMIC);
+       if (!io_end) {
+               if (printk_ratelimit())
+                       printk(KERN_WARNING "%s: allocation fail\n", __func__);
+               schedule();
+               goto retry;
+       }
+       io_end->offset = offset;
+       io_end->size = size;
+       /*
+        * We need to hold a reference to the page to make sure it
+        * doesn't get evicted before ext4_end_io_work() has a chance
+        * to convert the extent from written to unwritten.
+        */
+       io_end->page = page;
+       get_page(io_end->page);
+
+       bh->b_private = io_end;
+       bh->b_end_io = ext4_end_io_buffer_write;
+       return 0;
+}
+
 /*
  * For ext4 extent files, ext4 will do direct-io write to holes,
  * preallocated extents, and those write extend the file, no need to
@@ -3751,7 +3903,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
                iocb->private = NULL;
                EXT4_I(inode)->cur_aio_dio = NULL;
                if (!is_sync_kiocb(iocb)) {
-                       iocb->private = ext4_init_io_end(inode);
+                       iocb->private = ext4_init_io_end(inode, GFP_NOFS);
                        if (!iocb->private)
                                return -ENOMEM;
                        /*
@@ -3767,7 +3919,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
                ret = blockdev_direct_IO(rw, iocb, inode,
                                         inode->i_sb->s_bdev, iov,
                                         offset, nr_segs,
-                                        ext4_get_block_dio_write,
+                                        ext4_get_block_write,
                                         ext4_end_io_dio);
                if (iocb->private)
                        EXT4_I(inode)->cur_aio_dio = NULL;
@@ -3788,8 +3940,8 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
                if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
                        ext4_free_io_end(iocb->private);
                        iocb->private = NULL;
-               } else if (ret > 0 && (EXT4_I(inode)->i_state &
-                                      EXT4_STATE_DIO_UNWRITTEN)) {
+               } else if (ret > 0 && ext4_test_inode_state(inode,
+                                               EXT4_STATE_DIO_UNWRITTEN)) {
                        int err;
                        /*
                         * for non AIO case, since the IO is already
@@ -3799,7 +3951,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
                                                             offset, ret);
                        if (err < 0)
                                ret = err;
-                       EXT4_I(inode)->i_state &= ~EXT4_STATE_DIO_UNWRITTEN;
+                       ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
                }
                return ret;
        }
@@ -4130,18 +4282,27 @@ no_top:
  * We release `count' blocks on disk, but (last - first) may be greater
  * than `count' because there can be holes in there.
  */
-static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
-                             struct buffer_head *bh,
-                             ext4_fsblk_t block_to_free,
-                             unsigned long count, __le32 *first,
-                             __le32 *last)
+static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
+                            struct buffer_head *bh,
+                            ext4_fsblk_t block_to_free,
+                            unsigned long count, __le32 *first,
+                            __le32 *last)
 {
        __le32 *p;
-       int     flags = EXT4_FREE_BLOCKS_FORGET;
+       int     flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
 
        if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
                flags |= EXT4_FREE_BLOCKS_METADATA;
 
+       if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
+                                  count)) {
+               ext4_error(inode->i_sb, "inode #%lu: "
+                          "attempt to clear blocks %llu len %lu, invalid",
+                          inode->i_ino, (unsigned long long) block_to_free,
+                          count);
+               return 1;
+       }
+
        if (try_to_extend_transaction(handle, inode)) {
                if (bh) {
                        BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
@@ -4160,6 +4321,7 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
                *p = 0;
 
        ext4_free_blocks(handle, inode, 0, block_to_free, count, flags);
+       return 0;
 }
 
 /**
@@ -4215,9 +4377,10 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
                        } else if (nr == block_to_free + count) {
                                count++;
                        } else {
-                               ext4_clear_blocks(handle, inode, this_bh,
-                                                 block_to_free,
-                                                 count, block_to_free_p, p);
+                               if (ext4_clear_blocks(handle, inode, this_bh,
+                                                     block_to_free, count,
+                                                     block_to_free_p, p))
+                                       break;
                                block_to_free = nr;
                                block_to_free_p = p;
                                count = 1;
@@ -4241,7 +4404,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
                if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
                        ext4_handle_dirty_metadata(handle, inode, this_bh);
                else
-                       ext4_error(inode->i_sb, __func__,
+                       ext4_error(inode->i_sb,
                                   "circular indirect block detected, "
                                   "inode=%lu, block=%llu",
                                   inode->i_ino,
@@ -4281,6 +4444,16 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
                        if (!nr)
                                continue;               /* A hole */
 
+                       if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
+                                                  nr, 1)) {
+                               ext4_error(inode->i_sb,
+                                          "indirect mapped block in inode "
+                                          "#%lu invalid (level %d, blk #%lu)",
+                                          inode->i_ino, depth,
+                                          (unsigned long) nr);
+                               break;
+                       }
+
                        /* Go read the buffer for the next level down */
                        bh = sb_bread(inode->i_sb, nr);
 
@@ -4289,7 +4462,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
                         * (should be rare).
                         */
                        if (!bh) {
-                               ext4_error(inode->i_sb, "ext4_free_branches",
+                               ext4_error(inode->i_sb,
                                           "Read failure, inode=%lu, block=%llu",
                                           inode->i_ino, nr);
                                continue;
@@ -4433,8 +4606,10 @@ void ext4_truncate(struct inode *inode)
        if (!ext4_can_truncate(inode))
                return;
 
+       EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL;
+
        if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
-               ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
+               ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
 
        if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
                ext4_ext_truncate(inode);
@@ -4604,9 +4779,8 @@ static int __ext4_get_inode_loc(struct inode *inode,
 
        bh = sb_getblk(sb, block);
        if (!bh) {
-               ext4_error(sb, "ext4_get_inode_loc", "unable to read "
-                          "inode block - inode=%lu, block=%llu",
-                          inode->i_ino, block);
+               ext4_error(sb, "unable to read inode block - "
+                          "inode=%lu, block=%llu", inode->i_ino, block);
                return -EIO;
        }
        if (!buffer_uptodate(bh)) {
@@ -4704,9 +4878,8 @@ make_io:
                submit_bh(READ_META, bh);
                wait_on_buffer(bh);
                if (!buffer_uptodate(bh)) {
-                       ext4_error(sb, __func__,
-                                  "unable to read inode block - inode=%lu, "
-                                  "block=%llu", inode->i_ino, block);
+                       ext4_error(sb, "unable to read inode block - inode=%lu,"
+                                  " block=%llu", inode->i_ino, block);
                        brelse(bh);
                        return -EIO;
                }
@@ -4720,7 +4893,7 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
 {
        /* We have all inode data except xattrs in memory here. */
        return __ext4_get_inode_loc(inode, iloc,
-               !(EXT4_I(inode)->i_state & EXT4_STATE_XATTR));
+               !ext4_test_inode_state(inode, EXT4_STATE_XATTR));
 }
 
 void ext4_set_inode_flags(struct inode *inode)
@@ -4814,7 +4987,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
        }
        inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
 
-       ei->i_state = 0;
+       ei->i_state_flags = 0;
        ei->i_dir_start_lookup = 0;
        ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
        /* We now have enough fields to check if the inode was active or not.
@@ -4897,7 +5070,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
                                        EXT4_GOOD_OLD_INODE_SIZE +
                                        ei->i_extra_isize;
                        if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
-                               ei->i_state |= EXT4_STATE_XATTR;
+                               ext4_set_inode_state(inode, EXT4_STATE_XATTR);
                }
        } else
                ei->i_extra_isize = 0;
@@ -4917,8 +5090,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
        ret = 0;
        if (ei->i_file_acl &&
            !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
-               ext4_error(sb, __func__,
-                          "bad extended attribute block %llu in inode #%lu",
+               ext4_error(sb, "bad extended attribute block %llu inode #%lu",
                           ei->i_file_acl, inode->i_ino);
                ret = -EIO;
                goto bad_inode;
@@ -4964,8 +5136,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
                           new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
        } else {
                ret = -EIO;
-               ext4_error(inode->i_sb, __func__,
-                          "bogus i_mode (%o) for inode=%lu",
+               ext4_error(inode->i_sb, "bogus i_mode (%o) for inode=%lu",
                           inode->i_mode, inode->i_ino);
                goto bad_inode;
        }
@@ -5037,7 +5208,7 @@ static int ext4_do_update_inode(handle_t *handle,
 
        /* For fields not not tracking in the in-memory inode,
         * initialise them to zero for new inodes. */
-       if (ei->i_state & EXT4_STATE_NEW)
+       if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
                memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
 
        ext4_get_inode_flags(ei);
@@ -5101,7 +5272,7 @@ static int ext4_do_update_inode(handle_t *handle,
                                        EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
                        sb->s_dirt = 1;
                        ext4_handle_sync(handle);
-                       err = ext4_handle_dirty_metadata(handle, inode,
+                       err = ext4_handle_dirty_metadata(handle, NULL,
                                        EXT4_SB(sb)->s_sbh);
                }
        }
@@ -5130,10 +5301,10 @@ static int ext4_do_update_inode(handle_t *handle,
        }
 
        BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
-       rc = ext4_handle_dirty_metadata(handle, inode, bh);
+       rc = ext4_handle_dirty_metadata(handle, NULL, bh);
        if (!err)
                err = rc;
-       ei->i_state &= ~EXT4_STATE_NEW;
+       ext4_clear_inode_state(inode, EXT4_STATE_NEW);
 
        ext4_update_inode_fsync_trans(handle, inode, 0);
 out_brelse:
@@ -5204,10 +5375,8 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
                if (wbc->sync_mode == WB_SYNC_ALL)
                        sync_dirty_buffer(iloc.bh);
                if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
-                       ext4_error(inode->i_sb, __func__,
-                                  "IO error syncing inode, "
-                                  "inode=%lu, block=%llu",
-                                  inode->i_ino,
+                       ext4_error(inode->i_sb, "IO error syncing inode, "
+                                  "inode=%lu, block=%llu", inode->i_ino,
                                   (unsigned long long)iloc.bh->b_blocknr);
                        err = -EIO;
                }
@@ -5288,7 +5457,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
        }
 
        if (S_ISREG(inode->i_mode) &&
-           attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
+           attr->ia_valid & ATTR_SIZE &&
+           (attr->ia_size < inode->i_size ||
+            (EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))) {
                handle_t *handle;
 
                handle = ext4_journal_start(inode, 3);
@@ -5319,6 +5490,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
                                goto err_out;
                        }
                }
+               /* ext4_truncate will clear the flag */
+               if ((EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))
+                       ext4_truncate(inode);
        }
 
        rc = inode_setattr(inode, attr);
@@ -5557,8 +5731,8 @@ static int ext4_expand_extra_isize(struct inode *inode,
        entry = IFIRST(header);
 
        /* No extended attributes present */
-       if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR) ||
-               header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
+       if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
+           header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
                memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,
                        new_extra_isize);
                EXT4_I(inode)->i_extra_isize = new_extra_isize;
@@ -5602,7 +5776,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
        err = ext4_reserve_inode_write(handle, inode, &iloc);
        if (ext4_handle_valid(handle) &&
            EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
-           !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) {
+           !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
                /*
                 * We need extra buffer credits since we may write into EA block
                 * with this same handle. If journal_extend fails, then it will
@@ -5616,10 +5790,11 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
                                                      sbi->s_want_extra_isize,
                                                      iloc, handle);
                        if (ret) {
-                               EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND;
+                               ext4_set_inode_state(inode,
+                                                    EXT4_STATE_NO_EXPAND);
                                if (mnt_count !=
                                        le16_to_cpu(sbi->s_es->s_mnt_count)) {
-                                       ext4_warning(inode->i_sb, __func__,
+                                       ext4_warning(inode->i_sb,
                                        "Unable to expand inode %lu. Delete"
                                        " some EAs or run e2fsck.",
                                        inode->i_ino);
@@ -5683,7 +5858,7 @@ static int ext4_pin_inode(handle_t *handle, struct inode *inode)
                        err = jbd2_journal_get_write_access(handle, iloc.bh);
                        if (!err)
                                err = ext4_handle_dirty_metadata(handle,
-                                                                inode,
+                                                                NULL,
                                                                 iloc.bh);
                        brelse(iloc.bh);
                }
index b63d193..016d024 100644 (file)
@@ -92,6 +92,15 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
                        flags &= ~EXT4_EXTENTS_FL;
                }
 
+               if (flags & EXT4_EOFBLOCKS_FL) {
+                       /* we don't support adding EOFBLOCKS flag */
+                       if (!(oldflags & EXT4_EOFBLOCKS_FL)) {
+                               err = -EOPNOTSUPP;
+                               goto flags_out;
+                       }
+               } else if (oldflags & EXT4_EOFBLOCKS_FL)
+                       ext4_truncate(inode);
+
                handle = ext4_journal_start(inode, 1);
                if (IS_ERR(handle)) {
                        err = PTR_ERR(handle);
@@ -249,7 +258,8 @@ setversion_out:
                if (me.moved_len > 0)
                        file_remove_suid(donor_filp);
 
-               if (copy_to_user((struct move_extent *)arg, &me, sizeof(me)))
+               if (copy_to_user((struct move_extent __user *)arg, 
+                                &me, sizeof(me)))
                        err = -EFAULT;
 mext_out:
                fput(donor_filp);
index d34afad..abb11e3 100644 (file)
@@ -441,10 +441,9 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
        for (i = 0; i < count; i++) {
                if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
                        ext4_fsblk_t blocknr;
-                       blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb);
+
+                       blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
                        blocknr += first + i;
-                       blocknr +=
-                           le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
                        ext4_grp_locked_error(sb, e4b->bd_group,
                                   __func__, "double-free of inode"
                                   " %lu's block %llu(bit %u in group %u)",
@@ -1255,10 +1254,9 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
 
                if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) {
                        ext4_fsblk_t blocknr;
-                       blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb);
+
+                       blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
                        blocknr += block;
-                       blocknr +=
-                           le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
                        ext4_grp_locked_error(sb, e4b->bd_group,
                                   __func__, "double-free of inode"
                                   " %lu's block %llu(bit %u in group %u)",
@@ -1631,7 +1629,6 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
        int max;
        int err;
        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
-       struct ext4_super_block *es = sbi->s_es;
        struct ext4_free_extent ex;
 
        if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
@@ -1648,8 +1645,8 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
        if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
                ext4_fsblk_t start;
 
-               start = (e4b->bd_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb)) +
-                       ex.fe_start + le32_to_cpu(es->s_first_data_block);
+               start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) +
+                       ex.fe_start;
                /* use do_div to get remainder (would be 64-bit modulo) */
                if (do_div(start, sbi->s_stripe) == 0) {
                        ac->ac_found++;
@@ -1803,8 +1800,8 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
        BUG_ON(sbi->s_stripe == 0);
 
        /* find first stripe-aligned block in group */
-       first_group_block = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb)
-               + le32_to_cpu(sbi->s_es->s_first_data_block);
+       first_group_block = ext4_group_first_block_no(sb, e4b->bd_group);
+
        a = first_group_block + sbi->s_stripe - 1;
        do_div(a, sbi->s_stripe);
        i = (a * sbi->s_stripe) - first_group_block;
@@ -2256,7 +2253,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
 
        INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
        init_rwsem(&meta_group_info[i]->alloc_sem);
-       meta_group_info[i]->bb_free_root.rb_node = NULL;
+       meta_group_info[i]->bb_free_root = RB_ROOT;
 
 #ifdef DOUBLE_CHECK
        {
@@ -2560,12 +2557,9 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
                ext4_unlock_group(sb, entry->group);
                if (test_opt(sb, DISCARD)) {
                        ext4_fsblk_t discard_block;
-                       struct ext4_super_block *es = EXT4_SB(sb)->s_es;
 
-                       discard_block = (ext4_fsblk_t)entry->group *
-                                               EXT4_BLOCKS_PER_GROUP(sb)
-                                       + entry->start_blk
-                                       + le32_to_cpu(es->s_first_data_block);
+                       discard_block = entry->start_blk +
+                               ext4_group_first_block_no(sb, entry->group);
                        trace_ext4_discard_blocks(sb,
                                        (unsigned long long)discard_block,
                                        entry->count);
@@ -2703,14 +2697,11 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
        if (err)
                goto out_err;
 
-       block = ac->ac_b_ex.fe_group * EXT4_BLOCKS_PER_GROUP(sb)
-               + ac->ac_b_ex.fe_start
-               + le32_to_cpu(es->s_first_data_block);
+       block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
 
        len = ac->ac_b_ex.fe_len;
        if (!ext4_data_block_valid(sbi, block, len)) {
-               ext4_error(sb, __func__,
-                          "Allocating blocks %llu-%llu which overlap "
+               ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
                           "fs metadata\n", block, block+len);
                /* File system mounted not to panic on error
                 * Fix the bitmap and repeat the block allocation
@@ -3161,9 +3152,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
                /* The max size of hash table is PREALLOC_TB_SIZE */
                order = PREALLOC_TB_SIZE - 1;
 
-       goal_block = ac->ac_g_ex.fe_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb) +
-                    ac->ac_g_ex.fe_start +
-                    le32_to_cpu(EXT4_SB(ac->ac_sb)->s_es->s_first_data_block);
+       goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex);
        /*
         * search for the prealloc space that is having
         * minimal distance from the goal block.
@@ -3526,8 +3515,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
                if (bit >= end)
                        break;
                next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
-               start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit +
-                               le32_to_cpu(sbi->s_es->s_first_data_block);
+               start = ext4_group_first_block_no(sb, group) + bit;
                mb_debug(1, "    free preallocated %u/%u in group %u\n",
                                (unsigned) start, (unsigned) next - bit,
                                (unsigned) group);
@@ -3623,15 +3611,13 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
 
        bitmap_bh = ext4_read_block_bitmap(sb, group);
        if (bitmap_bh == NULL) {
-               ext4_error(sb, __func__, "Error in reading block "
-                               "bitmap for %u", group);
+               ext4_error(sb, "Error reading block bitmap for %u", group);
                return 0;
        }
 
        err = ext4_mb_load_buddy(sb, group, &e4b);
        if (err) {
-               ext4_error(sb, __func__, "Error in loading buddy "
-                               "information for %u", group);
+               ext4_error(sb, "Error loading buddy information for %u", group);
                put_bh(bitmap_bh);
                return 0;
        }
@@ -3804,15 +3790,15 @@ repeat:
 
                err = ext4_mb_load_buddy(sb, group, &e4b);
                if (err) {
-                       ext4_error(sb, __func__, "Error in loading buddy "
-                                       "information for %u", group);
+                       ext4_error(sb, "Error loading buddy information for %u",
+                                       group);
                        continue;
                }
 
                bitmap_bh = ext4_read_block_bitmap(sb, group);
                if (bitmap_bh == NULL) {
-                       ext4_error(sb, __func__, "Error in reading block "
-                                       "bitmap for %u", group);
+                       ext4_error(sb, "Error reading block bitmap for %u",
+                                       group);
                        ext4_mb_release_desc(&e4b);
                        continue;
                }
@@ -3938,7 +3924,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
 
        /* don't use group allocation for large files */
        size = max(size, isize);
-       if (size >= sbi->s_mb_stream_request) {
+       if (size > sbi->s_mb_stream_request) {
                ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
                return;
        }
@@ -4077,8 +4063,8 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
 
                ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
                if (ext4_mb_load_buddy(sb, group, &e4b)) {
-                       ext4_error(sb, __func__, "Error in loading buddy "
-                                       "information for %u", group);
+                       ext4_error(sb, "Error loading buddy information for %u",
+                                       group);
                        continue;
                }
                ext4_lock_group(sb, group);
@@ -4476,10 +4462,10 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 
        sbi = EXT4_SB(sb);
        es = EXT4_SB(sb)->s_es;
-       if (!ext4_data_block_valid(sbi, block, count)) {
-               ext4_error(sb, __func__,
-                           "Freeing blocks not in datazone - "
-                           "block = %llu, count = %lu", block, count);
+       if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
+           !ext4_data_block_valid(sbi, block, count)) {
+               ext4_error(sb, "Freeing blocks not in datazone - "
+                          "block = %llu, count = %lu", block, count);
                goto error_return;
        }
 
@@ -4547,8 +4533,7 @@ do_more:
            in_range(block + count - 1, ext4_inode_table(sb, gdp),
                      EXT4_SB(sb)->s_itb_per_group)) {
 
-               ext4_error(sb, __func__,
-                          "Freeing blocks in system zone - "
+               ext4_error(sb, "Freeing blocks in system zone - "
                           "Block = %llu, count = %lu", block, count);
                /* err = 0. ext4_std_error should be a no op */
                goto error_return;
index 436521c..b619322 100644 (file)
@@ -220,16 +220,9 @@ struct ext4_buddy {
 #define EXT4_MB_BITMAP(e4b)    ((e4b)->bd_bitmap)
 #define EXT4_MB_BUDDY(e4b)     ((e4b)->bd_buddy)
 
-#define in_range(b, first, len)        ((b) >= (first) && (b) <= (first) + (len) - 1)
-
 static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
                                        struct ext4_free_extent *fex)
 {
-       ext4_fsblk_t block;
-
-       block = (ext4_fsblk_t) fex->fe_group * EXT4_BLOCKS_PER_GROUP(sb)
-                       + fex->fe_start
-                       + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
-       return block;
+       return ext4_group_first_block_no(sb, fex->fe_group) + fex->fe_start;
 }
 #endif
index 8141581..8b87bd0 100644 (file)
@@ -365,12 +365,12 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
         * happened after we started the migrate. We need to
         * fail the migrate
         */
-       if (!(EXT4_I(inode)->i_state & EXT4_STATE_EXT_MIGRATE)) {
+       if (!ext4_test_inode_state(inode, EXT4_STATE_EXT_MIGRATE)) {
                retval = -EAGAIN;
                up_write(&EXT4_I(inode)->i_data_sem);
                goto err_out;
        } else
-               EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
+               ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
        /*
         * We have the extent map build with the tmp inode.
         * Now copy the i_data across
@@ -503,14 +503,10 @@ int ext4_ext_migrate(struct inode *inode)
        }
        i_size_write(tmp_inode, i_size_read(inode));
        /*
-        * We don't want the inode to be reclaimed
-        * if we got interrupted in between. We have
-        * this tmp inode carrying reference to the
-        * data blocks of the original file. We set
-        * the i_nlink to zero at the last stage after
-        * switching the original file to extent format
+        * Set the i_nlink to zero so it will be deleted later
+        * when we drop inode reference.
         */
-       tmp_inode->i_nlink = 1;
+       tmp_inode->i_nlink = 0;
 
        ext4_ext_tree_init(handle, tmp_inode);
        ext4_orphan_add(handle, tmp_inode);
@@ -533,10 +529,20 @@ int ext4_ext_migrate(struct inode *inode)
         * allocation.
         */
        down_read((&EXT4_I(inode)->i_data_sem));
-       EXT4_I(inode)->i_state |= EXT4_STATE_EXT_MIGRATE;
+       ext4_set_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
        up_read((&EXT4_I(inode)->i_data_sem));
 
        handle = ext4_journal_start(inode, 1);
+       if (IS_ERR(handle)) {
+               /*
+                * It is impossible to update on-disk structures without
+                * a handle, so just rollback in-core changes and live other
+                * work to orphan_list_cleanup()
+                */
+               ext4_orphan_del(NULL, tmp_inode);
+               retval = PTR_ERR(handle);
+               goto out;
+       }
 
        ei = EXT4_I(inode);
        i_data = ei->i_data;
@@ -618,15 +624,8 @@ err_out:
 
        /* Reset the extent details */
        ext4_ext_tree_init(handle, tmp_inode);
-
-       /*
-        * Set the i_nlink to zero so that
-        * generic_drop_inode really deletes the
-        * inode
-        */
-       tmp_inode->i_nlink = 0;
-
        ext4_journal_stop(handle);
+out:
        unlock_new_inode(tmp_inode);
        iput(tmp_inode);
 
index 82c415b..aa5fe28 100644 (file)
@@ -152,12 +152,12 @@ mext_check_null_inode(struct inode *inode1, struct inode *inode2,
        int ret = 0;
 
        if (inode1 == NULL) {
-               ext4_error(inode2->i_sb, function,
+               __ext4_error(inode2->i_sb, function,
                        "Both inodes should not be NULL: "
                        "inode1 NULL inode2 %lu", inode2->i_ino);
                ret = -EIO;
        } else if (inode2 == NULL) {
-               ext4_error(inode1->i_sb, function,
+               __ext4_error(inode1->i_sb, function,
                        "Both inodes should not be NULL: "
                        "inode1 %lu inode2 NULL", inode1->i_ino);
                ret = -EIO;
@@ -252,6 +252,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
                }
 
                o_start->ee_len = start_ext->ee_len;
+               eblock = le32_to_cpu(start_ext->ee_block);
                new_flag = 1;
 
        } else if (start_ext->ee_len && new_ext->ee_len &&
@@ -262,6 +263,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
                 * orig  |------------------------------|
                 */
                o_start->ee_len = start_ext->ee_len;
+               eblock = le32_to_cpu(start_ext->ee_block);
                new_flag = 1;
 
        } else if (!start_ext->ee_len && new_ext->ee_len &&
@@ -475,7 +477,6 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
        struct ext4_extent *oext, *o_start, *o_end, *prev_ext;
        struct ext4_extent new_ext, start_ext, end_ext;
        ext4_lblk_t new_ext_end;
-       ext4_fsblk_t new_phys_end;
        int oext_alen, new_ext_alen, end_ext_alen;
        int depth = ext_depth(orig_inode);
        int ret;
@@ -489,7 +490,6 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
        new_ext.ee_len = dext->ee_len;
        new_ext_alen = ext4_ext_get_actual_len(&new_ext);
        new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1;
-       new_phys_end = ext_pblock(&new_ext) + new_ext_alen - 1;
 
        /*
         * Case: original extent is first
@@ -502,6 +502,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
                le32_to_cpu(oext->ee_block) + oext_alen) {
                start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) -
                                               le32_to_cpu(oext->ee_block));
+               start_ext.ee_block = oext->ee_block;
                copy_extent_status(oext, &start_ext);
        } else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) {
                prev_ext = oext - 1;
@@ -515,6 +516,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
                        start_ext.ee_len = cpu_to_le16(
                                ext4_ext_get_actual_len(prev_ext) +
                                new_ext_alen);
+                       start_ext.ee_block = oext->ee_block;
                        copy_extent_status(prev_ext, &start_ext);
                        new_ext.ee_len = 0;
                }
@@ -526,7 +528,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
         * new_ext       |-------|
         */
        if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) {
-               ext4_error(orig_inode->i_sb, __func__,
+               ext4_error(orig_inode->i_sb,
                        "new_ext_end(%u) should be less than or equal to "
                        "oext->ee_block(%u) + oext_alen(%d) - 1",
                        new_ext_end, le32_to_cpu(oext->ee_block),
@@ -689,12 +691,12 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
        while (1) {
                /* The extent for donor must be found. */
                if (!dext) {
-                       ext4_error(donor_inode->i_sb, __func__,
+                       ext4_error(donor_inode->i_sb,
                                   "The extent for donor must be found");
                        *err = -EIO;
                        goto out;
                } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
-                       ext4_error(donor_inode->i_sb, __func__,
+                       ext4_error(donor_inode->i_sb,
                                "Donor offset(%u) and the first block of donor "
                                "extent(%u) should be equal",
                                donor_off,
@@ -928,7 +930,7 @@ out2:
 }
 
 /**
- * mext_check_argumants - Check whether move extent can be done
+ * mext_check_arguments - Check whether move extent can be done
  *
  * @orig_inode:                original inode
  * @donor_inode:       donor inode
@@ -949,14 +951,6 @@ mext_check_arguments(struct inode *orig_inode,
        unsigned int blkbits = orig_inode->i_blkbits;
        unsigned int blocksize = 1 << blkbits;
 
-       /* Regular file check */
-       if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
-               ext4_debug("ext4 move extent: The argument files should be "
-                       "regular file [ino:orig %lu, donor %lu]\n",
-                       orig_inode->i_ino, donor_inode->i_ino);
-               return -EINVAL;
-       }
-
        if (donor_inode->i_mode & (S_ISUID|S_ISGID)) {
                ext4_debug("ext4 move extent: suid or sgid is set"
                           " to donor file [ino:orig %lu, donor %lu]\n",
@@ -1204,6 +1198,14 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
                return -EINVAL;
        }
 
+       /* Regular file check */
+       if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
+               ext4_debug("ext4 move extent: The argument files should be "
+                       "regular file [ino:orig %lu, donor %lu]\n",
+                       orig_inode->i_ino, donor_inode->i_ino);
+               return -EINVAL;
+       }
+
        /* Protect orig and donor inodes against a truncate */
        ret1 = mext_inode_double_lock(orig_inode, donor_inode);
        if (ret1 < 0)
@@ -1351,7 +1353,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
                        if (ret1 < 0)
                                break;
                        if (*moved_len > len) {
-                               ext4_error(orig_inode->i_sb, __func__,
+                               ext4_error(orig_inode->i_sb,
                                        "We replaced blocks too much! "
                                        "sum of replaced: %llu requested: %llu",
                                        *moved_len, len);
index 17a17e1..608d21f 100644 (file)
@@ -383,8 +383,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
        if (root->info.hash_version != DX_HASH_TEA &&
            root->info.hash_version != DX_HASH_HALF_MD4 &&
            root->info.hash_version != DX_HASH_LEGACY) {
-               ext4_warning(dir->i_sb, __func__,
-                            "Unrecognised inode hash code %d",
+               ext4_warning(dir->i_sb, "Unrecognised inode hash code %d",
                             root->info.hash_version);
                brelse(bh);
                *err = ERR_BAD_DX_DIR;
@@ -399,8 +398,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
        hash = hinfo->hash;
 
        if (root->info.unused_flags & 1) {
-               ext4_warning(dir->i_sb, __func__,
-                            "Unimplemented inode hash flags: %#06x",
+               ext4_warning(dir->i_sb, "Unimplemented inode hash flags: %#06x",
                             root->info.unused_flags);
                brelse(bh);
                *err = ERR_BAD_DX_DIR;
@@ -408,8 +406,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
        }
 
        if ((indirect = root->info.indirect_levels) > 1) {
-               ext4_warning(dir->i_sb, __func__,
-                            "Unimplemented inode hash depth: %#06x",
+               ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x",
                             root->info.indirect_levels);
                brelse(bh);
                *err = ERR_BAD_DX_DIR;
@@ -421,8 +418,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
 
        if (dx_get_limit(entries) != dx_root_limit(dir,
                                                   root->info.info_length)) {
-               ext4_warning(dir->i_sb, __func__,
-                            "dx entry: limit != root limit");
+               ext4_warning(dir->i_sb, "dx entry: limit != root limit");
                brelse(bh);
                *err = ERR_BAD_DX_DIR;
                goto fail;
@@ -433,7 +429,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
        {
                count = dx_get_count(entries);
                if (!count || count > dx_get_limit(entries)) {
-                       ext4_warning(dir->i_sb, __func__,
+                       ext4_warning(dir->i_sb,
                                     "dx entry: no count or count > limit");
                        brelse(bh);
                        *err = ERR_BAD_DX_DIR;
@@ -478,7 +474,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
                        goto fail2;
                at = entries = ((struct dx_node *) bh->b_data)->entries;
                if (dx_get_limit(entries) != dx_node_limit (dir)) {
-                       ext4_warning(dir->i_sb, __func__,
+                       ext4_warning(dir->i_sb,
                                     "dx entry: limit != node limit");
                        brelse(bh);
                        *err = ERR_BAD_DX_DIR;
@@ -494,7 +490,7 @@ fail2:
        }
 fail:
        if (*err == ERR_BAD_DX_DIR)
-               ext4_warning(dir->i_sb, __func__,
+               ext4_warning(dir->i_sb,
                             "Corrupt dir inode %ld, running e2fsck is "
                             "recommended.", dir->i_ino);
        return NULL;
@@ -947,9 +943,8 @@ restart:
                wait_on_buffer(bh);
                if (!buffer_uptodate(bh)) {
                        /* read error, skip block & hope for the best */
-                       ext4_error(sb, __func__, "reading directory #%lu "
-                                  "offset %lu", dir->i_ino,
-                                  (unsigned long)block);
+                       ext4_error(sb, "reading directory #%lu offset %lu",
+                                  dir->i_ino, (unsigned long)block);
                        brelse(bh);
                        goto next;
                }
@@ -1041,7 +1036,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
                retval = ext4_htree_next_block(dir, hash, frame,
                                               frames, NULL);
                if (retval < 0) {
-                       ext4_warning(sb, __func__,
+                       ext4_warning(sb,
                             "error reading index page in directory #%lu",
                             dir->i_ino);
                        *err = retval;
@@ -1071,14 +1066,13 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
                __u32 ino = le32_to_cpu(de->inode);
                brelse(bh);
                if (!ext4_valid_inum(dir->i_sb, ino)) {
-                       ext4_error(dir->i_sb, "ext4_lookup",
-                                  "bad inode number: %u", ino);
+                       ext4_error(dir->i_sb, "bad inode number: %u", ino);
                        return ERR_PTR(-EIO);
                }
                inode = ext4_iget(dir->i_sb, ino);
                if (unlikely(IS_ERR(inode))) {
                        if (PTR_ERR(inode) == -ESTALE) {
-                               ext4_error(dir->i_sb, __func__,
+                               ext4_error(dir->i_sb,
                                                "deleted inode referenced: %u",
                                                ino);
                                return ERR_PTR(-EIO);
@@ -1110,7 +1104,7 @@ struct dentry *ext4_get_parent(struct dentry *child)
        brelse(bh);
 
        if (!ext4_valid_inum(child->d_inode->i_sb, ino)) {
-               ext4_error(child->d_inode->i_sb, "ext4_get_parent",
+               ext4_error(child->d_inode->i_sb,
                           "bad inode number: %u", ino);
                return ERR_PTR(-EIO);
        }
@@ -1410,7 +1404,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
        de = (struct ext4_dir_entry_2 *)((char *)fde +
                ext4_rec_len_from_disk(fde->rec_len, blocksize));
        if ((char *) de >= (((char *) root) + blocksize)) {
-               ext4_error(dir->i_sb, __func__,
+               ext4_error(dir->i_sb,
                           "invalid rec_len for '..' in inode %lu",
                           dir->i_ino);
                brelse(bh);
@@ -1575,8 +1569,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 
                if (levels && (dx_get_count(frames->entries) ==
                               dx_get_limit(frames->entries))) {
-                       ext4_warning(sb, __func__,
-                                    "Directory index full!");
+                       ext4_warning(sb, "Directory index full!");
                        err = -ENOSPC;
                        goto cleanup;
                }
@@ -1916,11 +1909,11 @@ static int empty_dir(struct inode *inode)
        if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
            !(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
                if (err)
-                       ext4_error(inode->i_sb, __func__,
+                       ext4_error(inode->i_sb,
                                   "error %d reading directory #%lu offset 0",
                                   err, inode->i_ino);
                else
-                       ext4_warning(inode->i_sb, __func__,
+                       ext4_warning(inode->i_sb,
                                     "bad directory (dir #%lu) - no data block",
                                     inode->i_ino);
                return 1;
@@ -1931,7 +1924,7 @@ static int empty_dir(struct inode *inode)
                        !le32_to_cpu(de1->inode) ||
                        strcmp(".", de->name) ||
                        strcmp("..", de1->name)) {
-               ext4_warning(inode->i_sb, "empty_dir",
+               ext4_warning(inode->i_sb,
                             "bad directory (dir #%lu) - no `.' or `..'",
                             inode->i_ino);
                brelse(bh);
@@ -1949,7 +1942,7 @@ static int empty_dir(struct inode *inode)
                                offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err);
                        if (!bh) {
                                if (err)
-                                       ext4_error(sb, __func__,
+                                       ext4_error(sb,
                                                   "error %d reading directory"
                                                   " #%lu offset %u",
                                                   err, inode->i_ino, offset);
@@ -2020,11 +2013,18 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
        err = ext4_reserve_inode_write(handle, inode, &iloc);
        if (err)
                goto out_unlock;
+       /*
+        * Due to previous errors inode may be already a part of on-disk
+        * orphan list. If so skip on-disk list modification.
+        */
+       if (NEXT_ORPHAN(inode) && NEXT_ORPHAN(inode) <=
+               (le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)))
+                       goto mem_insert;
 
        /* Insert this inode at the head of the on-disk orphan list... */
        NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
        EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
-       err = ext4_handle_dirty_metadata(handle, inode, EXT4_SB(sb)->s_sbh);
+       err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
        rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
        if (!err)
                err = rc;
@@ -2037,6 +2037,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
         *
         * This is safe: on error we're going to ignore the orphan list
         * anyway on the next recovery. */
+mem_insert:
        if (!err)
                list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
 
@@ -2096,7 +2097,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
                if (err)
                        goto out_brelse;
                sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
-               err = ext4_handle_dirty_metadata(handle, inode, sbi->s_sbh);
+               err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
        } else {
                struct ext4_iloc iloc2;
                struct inode *i_prev =
@@ -2163,7 +2164,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
        if (retval)
                goto end_rmdir;
        if (!EXT4_DIR_LINK_EMPTY(inode))
-               ext4_warning(inode->i_sb, "ext4_rmdir",
+               ext4_warning(inode->i_sb,
                             "empty directory has too many links (%d)",
                             inode->i_nlink);
        inode->i_version++;
@@ -2215,7 +2216,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
                goto end_unlink;
 
        if (!inode->i_nlink) {
-               ext4_warning(inode->i_sb, "ext4_unlink",
+               ext4_warning(inode->i_sb,
                             "Deleting nonexistent file (%lu), %d",
                             inode->i_ino, inode->i_nlink);
                inode->i_nlink = 1;
@@ -2462,7 +2463,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
                }
        }
        if (retval) {
-               ext4_warning(old_dir->i_sb, "ext4_rename",
+               ext4_warning(old_dir->i_sb,
                                "Deleting old file (%lu), %d, error=%d",
                                old_dir->i_ino, old_dir->i_nlink, retval);
        }
index 3b2c554..5692c48 100644 (file)
@@ -48,65 +48,54 @@ static int verify_group_input(struct super_block *sb,
 
        ext4_get_group_no_and_offset(sb, start, NULL, &offset);
        if (group != sbi->s_groups_count)
-               ext4_warning(sb, __func__,
-                            "Cannot add at group %u (only %u groups)",
+               ext4_warning(sb, "Cannot add at group %u (only %u groups)",
                             input->group, sbi->s_groups_count);
        else if (offset != 0)
-                       ext4_warning(sb, __func__, "Last group not full");
+                       ext4_warning(sb, "Last group not full");
        else if (input->reserved_blocks > input->blocks_count / 5)
-               ext4_warning(sb, __func__, "Reserved blocks too high (%u)",
+               ext4_warning(sb, "Reserved blocks too high (%u)",
                             input->reserved_blocks);
        else if (free_blocks_count < 0)
-               ext4_warning(sb, __func__, "Bad blocks count %u",
+               ext4_warning(sb, "Bad blocks count %u",
                             input->blocks_count);
        else if (!(bh = sb_bread(sb, end - 1)))
-               ext4_warning(sb, __func__,
-                            "Cannot read last block (%llu)",
+               ext4_warning(sb, "Cannot read last block (%llu)",
                             end - 1);
        else if (outside(input->block_bitmap, start, end))
-               ext4_warning(sb, __func__,
-                            "Block bitmap not in group (block %llu)",
+               ext4_warning(sb, "Block bitmap not in group (block %llu)",
                             (unsigned long long)input->block_bitmap);
        else if (outside(input->inode_bitmap, start, end))
-               ext4_warning(sb, __func__,
-                            "Inode bitmap not in group (block %llu)",
+               ext4_warning(sb, "Inode bitmap not in group (block %llu)",
                             (unsigned long long)input->inode_bitmap);
        else if (outside(input->inode_table, start, end) ||
                 outside(itend - 1, start, end))
-               ext4_warning(sb, __func__,
-                            "Inode table not in group (blocks %llu-%llu)",
+               ext4_warning(sb, "Inode table not in group (blocks %llu-%llu)",
                             (unsigned long long)input->inode_table, itend - 1);
        else if (input->inode_bitmap == input->block_bitmap)
-               ext4_warning(sb, __func__,
-                            "Block bitmap same as inode bitmap (%llu)",
+               ext4_warning(sb, "Block bitmap same as inode bitmap (%llu)",
                             (unsigned long long)input->block_bitmap);
        else if (inside(input->block_bitmap, input->inode_table, itend))
-               ext4_warning(sb, __func__,
-                            "Block bitmap (%llu) in inode table (%llu-%llu)",
+               ext4_warning(sb, "Block bitmap (%llu) in inode table "
+                            "(%llu-%llu)",
                             (unsigned long long)input->block_bitmap,
                             (unsigned long long)input->inode_table, itend - 1);
        else if (inside(input->inode_bitmap, input->inode_table, itend))
-               ext4_warning(sb, __func__,
-                            "Inode bitmap (%llu) in inode table (%llu-%llu)",
+               ext4_warning(sb, "Inode bitmap (%llu) in inode table "
+                            "(%llu-%llu)",
                             (unsigned long long)input->inode_bitmap,
                             (unsigned long long)input->inode_table, itend - 1);
        else if (inside(input->block_bitmap, start, metaend))
-               ext4_warning(sb, __func__,
-                            "Block bitmap (%llu) in GDT table"
-                            " (%llu-%llu)",
+               ext4_warning(sb, "Block bitmap (%llu) in GDT table (%llu-%llu)",
                             (unsigned long long)input->block_bitmap,
                             start, metaend - 1);
        else if (inside(input->inode_bitmap, start, metaend))
-               ext4_warning(sb, __func__,
-                            "Inode bitmap (%llu) in GDT table"
-                            " (%llu-%llu)",
+               ext4_warning(sb, "Inode bitmap (%llu) in GDT table (%llu-%llu)",
                             (unsigned long long)input->inode_bitmap,
                             start, metaend - 1);
        else if (inside(input->inode_table, start, metaend) ||
                 inside(itend - 1, start, metaend))
-               ext4_warning(sb, __func__,
-                            "Inode table (%llu-%llu) overlaps"
-                            "GDT table (%llu-%llu)",
+               ext4_warning(sb, "Inode table (%llu-%llu) overlaps GDT table "
+                            "(%llu-%llu)",
                             (unsigned long long)input->inode_table,
                             itend - 1, start, metaend - 1);
        else
@@ -364,8 +353,7 @@ static int verify_reserved_gdb(struct super_block *sb,
        while ((grp = ext4_list_backups(sb, &three, &five, &seven)) < end) {
                if (le32_to_cpu(*p++) !=
                    grp * EXT4_BLOCKS_PER_GROUP(sb) + blk){
-                       ext4_warning(sb, __func__,
-                                    "reserved GDT %llu"
+                       ext4_warning(sb, "reserved GDT %llu"
                                     " missing grp %d (%llu)",
                                     blk, grp,
                                     grp *
@@ -420,8 +408,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
          */
        if (EXT4_SB(sb)->s_sbh->b_blocknr !=
            le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) {
-               ext4_warning(sb, __func__,
-                       "won't resize using backup superblock at %llu",
+               ext4_warning(sb, "won't resize using backup superblock at %llu",
                        (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr);
                return -EPERM;
        }
@@ -444,8 +431,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
 
        data = (__le32 *)dind->b_data;
        if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) {
-               ext4_warning(sb, __func__,
-                            "new group %u GDT block %llu not reserved",
+               ext4_warning(sb, "new group %u GDT block %llu not reserved",
                             input->group, gdblock);
                err = -EINVAL;
                goto exit_dind;
@@ -468,7 +454,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
                        GFP_NOFS);
        if (!n_group_desc) {
                err = -ENOMEM;
-               ext4_warning(sb, __func__,
+               ext4_warning(sb,
                              "not enough memory for %lu groups", gdb_num + 1);
                goto exit_inode;
        }
@@ -567,8 +553,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
        /* Get each reserved primary GDT block and verify it holds backups */
        for (res = 0; res < reserved_gdb; res++, blk++) {
                if (le32_to_cpu(*data) != blk) {
-                       ext4_warning(sb, __func__,
-                                    "reserved block %llu"
+                       ext4_warning(sb, "reserved block %llu"
                                     " not at offset %ld",
                                     blk,
                                     (long)(data - (__le32 *)dind->b_data));
@@ -713,8 +698,7 @@ static void update_backups(struct super_block *sb,
         */
 exit_err:
        if (err) {
-               ext4_warning(sb, __func__,
-                            "can't update backup for group %u (err %d), "
+               ext4_warning(sb, "can't update backup for group %u (err %d), "
                             "forcing fsck on next reboot", group, err);
                sbi->s_mount_state &= ~EXT4_VALID_FS;
                sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
@@ -753,20 +737,19 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 
        if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb,
                                        EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
-               ext4_warning(sb, __func__,
-                            "Can't resize non-sparse filesystem further");
+               ext4_warning(sb, "Can't resize non-sparse filesystem further");
                return -EPERM;
        }
 
        if (ext4_blocks_count(es) + input->blocks_count <
            ext4_blocks_count(es)) {
-               ext4_warning(sb, __func__, "blocks_count overflow");
+               ext4_warning(sb, "blocks_count overflow");
                return -EINVAL;
        }
 
        if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) <
            le32_to_cpu(es->s_inodes_count)) {
-               ext4_warning(sb, __func__, "inodes_count overflow");
+               ext4_warning(sb, "inodes_count overflow");
                return -EINVAL;
        }
 
@@ -774,14 +757,13 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
                if (!EXT4_HAS_COMPAT_FEATURE(sb,
                                             EXT4_FEATURE_COMPAT_RESIZE_INODE)
                    || !le16_to_cpu(es->s_reserved_gdt_blocks)) {
-                       ext4_warning(sb, __func__,
+                       ext4_warning(sb,
                                     "No reserved GDT blocks, can't resize");
                        return -EPERM;
                }
                inode = ext4_iget(sb, EXT4_RESIZE_INO);
                if (IS_ERR(inode)) {
-                       ext4_warning(sb, __func__,
-                                    "Error opening resize inode");
+                       ext4_warning(sb, "Error opening resize inode");
                        return PTR_ERR(inode);
                }
        }
@@ -810,8 +792,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 
        mutex_lock(&sbi->s_resize_lock);
        if (input->group != sbi->s_groups_count) {
-               ext4_warning(sb, __func__,
-                            "multiple resizers run on filesystem!");
+               ext4_warning(sb, "multiple resizers run on filesystem!");
                err = -EBUSY;
                goto exit_journal;
        }
@@ -997,13 +978,12 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
                        " too large to resize to %llu blocks safely\n",
                        sb->s_id, n_blocks_count);
                if (sizeof(sector_t) < 8)
-                       ext4_warning(sb, __func__, "CONFIG_LBDAF not enabled");
+                       ext4_warning(sb, "CONFIG_LBDAF not enabled");
                return -EINVAL;
        }
 
        if (n_blocks_count < o_blocks_count) {
-               ext4_warning(sb, __func__,
-                            "can't shrink FS - resize aborted");
+               ext4_warning(sb, "can't shrink FS - resize aborted");
                return -EBUSY;
        }
 
@@ -1011,15 +991,14 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
        ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
 
        if (last == 0) {
-               ext4_warning(sb, __func__,
-                            "need to use ext2online to resize further");
+               ext4_warning(sb, "need to use ext2online to resize further");
                return -EPERM;
        }
 
        add = EXT4_BLOCKS_PER_GROUP(sb) - last;
 
        if (o_blocks_count + add < o_blocks_count) {
-               ext4_warning(sb, __func__, "blocks_count overflow");
+               ext4_warning(sb, "blocks_count overflow");
                return -EINVAL;
        }
 
@@ -1027,16 +1006,13 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
                add = n_blocks_count - o_blocks_count;
 
        if (o_blocks_count + add < n_blocks_count)
-               ext4_warning(sb, __func__,
-                            "will only finish group (%llu"
-                            " blocks, %u new)",
+               ext4_warning(sb, "will only finish group (%llu blocks, %u new)",
                             o_blocks_count + add, add);
 
        /* See if the device is actually as big as what was requested */
        bh = sb_bread(sb, o_blocks_count + add - 1);
        if (!bh) {
-               ext4_warning(sb, __func__,
-                            "can't read last block, resize aborted");
+               ext4_warning(sb, "can't read last block, resize aborted");
                return -ENOSPC;
        }
        brelse(bh);
@@ -1047,14 +1023,13 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
        handle = ext4_journal_start_sb(sb, 3);
        if (IS_ERR(handle)) {
                err = PTR_ERR(handle);
-               ext4_warning(sb, __func__, "error %d on journal start", err);
+               ext4_warning(sb, "error %d on journal start", err);
                goto exit_put;
        }
 
        mutex_lock(&EXT4_SB(sb)->s_resize_lock);
        if (o_blocks_count != ext4_blocks_count(es)) {
-               ext4_warning(sb, __func__,
-                            "multiple resizers run on filesystem!");
+               ext4_warning(sb, "multiple resizers run on filesystem!");
                mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
                ext4_journal_stop(handle);
                err = -EBUSY;
@@ -1063,8 +1038,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 
        if ((err = ext4_journal_get_write_access(handle,
                                                 EXT4_SB(sb)->s_sbh))) {
-               ext4_warning(sb, __func__,
-                            "error %d on journal write access", err);
+               ext4_warning(sb, "error %d on journal write access", err);
                mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
                ext4_journal_stop(handle);
                goto exit_put;
index 735c20d..ad1ee5f 100644 (file)
@@ -333,7 +333,7 @@ static void ext4_handle_error(struct super_block *sb)
                        sb->s_id);
 }
 
-void ext4_error(struct super_block *sb, const char *function,
+void __ext4_error(struct super_block *sb, const char *function,
                const char *fmt, ...)
 {
        va_list args;
@@ -347,6 +347,42 @@ void ext4_error(struct super_block *sb, const char *function,
        ext4_handle_error(sb);
 }
 
+void ext4_error_inode(const char *function, struct inode *inode,
+                     const char *fmt, ...)
+{
+       va_list args;
+
+       va_start(args, fmt);
+       printk(KERN_CRIT "EXT4-fs error (device %s): %s: inode #%lu: (comm %s) ",
+              inode->i_sb->s_id, function, inode->i_ino, current->comm);
+       vprintk(fmt, args);
+       printk("\n");
+       va_end(args);
+
+       ext4_handle_error(inode->i_sb);
+}
+
+void ext4_error_file(const char *function, struct file *file,
+                    const char *fmt, ...)
+{
+       va_list args;
+       struct inode *inode = file->f_dentry->d_inode;
+       char pathname[80], *path;
+
+       va_start(args, fmt);
+       path = d_path(&(file->f_path), pathname, sizeof(pathname));
+       if (!path)
+               path = "(unknown)";
+       printk(KERN_CRIT
+              "EXT4-fs error (device %s): %s: inode #%lu (comm %s path %s): ",
+              inode->i_sb->s_id, function, inode->i_ino, current->comm, path);
+       vprintk(fmt, args);
+       printk("\n");
+       va_end(args);
+
+       ext4_handle_error(inode->i_sb);
+}
+
 static const char *ext4_decode_error(struct super_block *sb, int errno,
                                     char nbuf[16])
 {
@@ -450,7 +486,7 @@ void ext4_msg (struct super_block * sb, const char *prefix,
        va_end(args);
 }
 
-void ext4_warning(struct super_block *sb, const char *function,
+void __ext4_warning(struct super_block *sb, const char *function,
                  const char *fmt, ...)
 {
        va_list args;
@@ -507,7 +543,7 @@ void ext4_update_dynamic_rev(struct super_block *sb)
        if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV)
                return;
 
-       ext4_warning(sb, __func__,
+       ext4_warning(sb,
                     "updating to rev %d because of new feature flag, "
                     "running e2fsck is recommended",
                     EXT4_DYNAMIC_REV);
@@ -708,7 +744,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 #ifdef CONFIG_QUOTA
        ei->i_reserved_quota = 0;
 #endif
-       INIT_LIST_HEAD(&ei->i_aio_dio_complete_list);
+       INIT_LIST_HEAD(&ei->i_completed_io_list);
+       spin_lock_init(&ei->i_completed_io_lock);
        ei->cur_aio_dio = NULL;
        ei->i_sync_tid = 0;
        ei->i_datasync_tid = 0;
@@ -796,10 +833,10 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
        if (sbi->s_qf_names[GRPQUOTA])
                seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
 
-       if (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA)
+       if (test_opt(sb, USRQUOTA))
                seq_puts(seq, ",usrquota");
 
-       if (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA)
+       if (test_opt(sb, GRPQUOTA))
                seq_puts(seq, ",grpquota");
 #endif
 }
@@ -926,6 +963,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
        if (test_opt(sb, NOLOAD))
                seq_puts(seq, ",norecovery");
 
+       if (test_opt(sb, DIOREAD_NOLOCK))
+               seq_puts(seq, ",dioread_nolock");
+
        ext4_show_quota_options(seq, sb);
 
        return 0;
@@ -1109,6 +1149,7 @@ enum {
        Opt_stripe, Opt_delalloc, Opt_nodelalloc,
        Opt_block_validity, Opt_noblock_validity,
        Opt_inode_readahead_blks, Opt_journal_ioprio,
+       Opt_dioread_nolock, Opt_dioread_lock,
        Opt_discard, Opt_nodiscard,
 };
 
@@ -1176,6 +1217,8 @@ static const match_table_t tokens = {
        {Opt_auto_da_alloc, "auto_da_alloc=%u"},
        {Opt_auto_da_alloc, "auto_da_alloc"},
        {Opt_noauto_da_alloc, "noauto_da_alloc"},
+       {Opt_dioread_nolock, "dioread_nolock"},
+       {Opt_dioread_lock, "dioread_lock"},
        {Opt_discard, "discard"},
        {Opt_nodiscard, "nodiscard"},
        {Opt_err, NULL},
@@ -1205,6 +1248,66 @@ static ext4_fsblk_t get_sb_block(void **data)
 }
 
 #define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
+static char deprecated_msg[] = "Mount option \"%s\" will be removed by %s\n"
+       "Contact linux-ext4@vger.kernel.org if you think we should keep it.\n";
+
+#ifdef CONFIG_QUOTA
+static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       char *qname;
+
+       if (sb_any_quota_loaded(sb) &&
+               !sbi->s_qf_names[qtype]) {
+               ext4_msg(sb, KERN_ERR,
+                       "Cannot change journaled "
+                       "quota options when quota turned on");
+               return 0;
+       }
+       qname = match_strdup(args);
+       if (!qname) {
+               ext4_msg(sb, KERN_ERR,
+                       "Not enough memory for storing quotafile name");
+               return 0;
+       }
+       if (sbi->s_qf_names[qtype] &&
+               strcmp(sbi->s_qf_names[qtype], qname)) {
+               ext4_msg(sb, KERN_ERR,
+                       "%s quota file already specified", QTYPE2NAME(qtype));
+               kfree(qname);
+               return 0;
+       }
+       sbi->s_qf_names[qtype] = qname;
+       if (strchr(sbi->s_qf_names[qtype], '/')) {
+               ext4_msg(sb, KERN_ERR,
+                       "quotafile must be on filesystem root");
+               kfree(sbi->s_qf_names[qtype]);
+               sbi->s_qf_names[qtype] = NULL;
+               return 0;
+       }
+       set_opt(sbi->s_mount_opt, QUOTA);
+       return 1;
+}
+
+static int clear_qf_name(struct super_block *sb, int qtype)
+{
+
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+       if (sb_any_quota_loaded(sb) &&
+               sbi->s_qf_names[qtype]) {
+               ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options"
+                       " when quota turned on");
+               return 0;
+       }
+       /*
+        * The space will be released later when all options are confirmed
+        * to be correct
+        */
+       sbi->s_qf_names[qtype] = NULL;
+       return 1;
+}
+#endif
 
 static int parse_options(char *options, struct super_block *sb,
                         unsigned long *journal_devnum,
@@ -1217,8 +1320,7 @@ static int parse_options(char *options, struct super_block *sb,
        int data_opt = 0;
        int option;
 #ifdef CONFIG_QUOTA
-       int qtype, qfmt;
-       char *qname;
+       int qfmt;
 #endif
 
        if (!options)
@@ -1229,19 +1331,31 @@ static int parse_options(char *options, struct super_block *sb,
                if (!*p)
                        continue;
 
+               /*
+                * Initialize args struct so we know whether arg was
+                * found; some options take optional arguments.
+                */
+               args[0].to = args[0].from = 0;
                token = match_token(p, tokens, args);
                switch (token) {
                case Opt_bsd_df:
+                       ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
                        clear_opt(sbi->s_mount_opt, MINIX_DF);
                        break;
                case Opt_minix_df:
+                       ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
                        set_opt(sbi->s_mount_opt, MINIX_DF);
+
                        break;
                case Opt_grpid:
+                       ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
                        set_opt(sbi->s_mount_opt, GRPID);
+
                        break;
                case Opt_nogrpid:
+                       ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
                        clear_opt(sbi->s_mount_opt, GRPID);
+
                        break;
                case Opt_resuid:
                        if (match_int(&args[0], &option))
@@ -1378,14 +1492,13 @@ static int parse_options(char *options, struct super_block *sb,
                        data_opt = EXT4_MOUNT_WRITEBACK_DATA;
                datacheck:
                        if (is_remount) {
-                               if ((sbi->s_mount_opt & EXT4_MOUNT_DATA_FLAGS)
-                                               != data_opt) {
+                               if (test_opt(sb, DATA_FLAGS) != data_opt) {
                                        ext4_msg(sb, KERN_ERR,
                                                "Cannot change data mode on remount");
                                        return 0;
                                }
                        } else {
-                               sbi->s_mount_opt &= ~EXT4_MOUNT_DATA_FLAGS;
+                               clear_opt(sbi->s_mount_opt, DATA_FLAGS);
                                sbi->s_mount_opt |= data_opt;
                        }
                        break;
@@ -1397,63 +1510,22 @@ static int parse_options(char *options, struct super_block *sb,
                        break;
 #ifdef CONFIG_QUOTA
                case Opt_usrjquota:
-                       qtype = USRQUOTA;
-                       goto set_qf_name;
-               case Opt_grpjquota:
-                       qtype = GRPQUOTA;
-set_qf_name:
-                       if (sb_any_quota_loaded(sb) &&
-                           !sbi->s_qf_names[qtype]) {
-                               ext4_msg(sb, KERN_ERR,
-                                      "Cannot change journaled "
-                                      "quota options when quota turned on");
+                       if (!set_qf_name(sb, USRQUOTA, &args[0]))
                                return 0;
-                       }
-                       qname = match_strdup(&args[0]);
-                       if (!qname) {
-                               ext4_msg(sb, KERN_ERR,
-                                       "Not enough memory for "
-                                       "storing quotafile name");
-                               return 0;
-                       }
-                       if (sbi->s_qf_names[qtype] &&
-                           strcmp(sbi->s_qf_names[qtype], qname)) {
-                               ext4_msg(sb, KERN_ERR,
-                                       "%s quota file already "
-                                       "specified", QTYPE2NAME(qtype));
-                               kfree(qname);
-                               return 0;
-                       }
-                       sbi->s_qf_names[qtype] = qname;
-                       if (strchr(sbi->s_qf_names[qtype], '/')) {
-                               ext4_msg(sb, KERN_ERR,
-                                       "quotafile must be on "
-                                       "filesystem root");
-                               kfree(sbi->s_qf_names[qtype]);
-                               sbi->s_qf_names[qtype] = NULL;
+                       break;
+               case Opt_grpjquota:
+                       if (!set_qf_name(sb, GRPQUOTA, &args[0]))
                                return 0;
-                       }
-                       set_opt(sbi->s_mount_opt, QUOTA);
                        break;
                case Opt_offusrjquota:
-                       qtype = USRQUOTA;
-                       goto clear_qf_name;
+                       if (!clear_qf_name(sb, USRQUOTA))
+                               return 0;
+                       break;
                case Opt_offgrpjquota:
-                       qtype = GRPQUOTA;
-clear_qf_name:
-                       if (sb_any_quota_loaded(sb) &&
-                           sbi->s_qf_names[qtype]) {
-                               ext4_msg(sb, KERN_ERR, "Cannot change "
-                                       "journaled quota options when "
-                                       "quota turned on");
+                       if (!clear_qf_name(sb, GRPQUOTA))
                                return 0;
-                       }
-                       /*
-                        * The space will be released later when all options
-                        * are confirmed to be correct
-                        */
-                       sbi->s_qf_names[qtype] = NULL;
                        break;
+
                case Opt_jqfmt_vfsold:
                        qfmt = QFMT_VFS_OLD;
                        goto set_qf_format;
@@ -1518,10 +1590,11 @@ set_qf_format:
                        clear_opt(sbi->s_mount_opt, BARRIER);
                        break;
                case Opt_barrier:
-                       if (match_int(&args[0], &option)) {
-                               set_opt(sbi->s_mount_opt, BARRIER);
-                               break;
-                       }
+                       if (args[0].from) {
+                               if (match_int(&args[0], &option))
+                                       return 0;
+                       } else
+                               option = 1;     /* No argument, default to 1 */
                        if (option)
                                set_opt(sbi->s_mount_opt, BARRIER);
                        else
@@ -1594,10 +1667,11 @@ set_qf_format:
                        set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
                        break;
                case Opt_auto_da_alloc:
-                       if (match_int(&args[0], &option)) {
-                               clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
-                               break;
-                       }
+                       if (args[0].from) {
+                               if (match_int(&args[0], &option))
+                                       return 0;
+                       } else
+                               option = 1;     /* No argument, default to 1 */
                        if (option)
                                clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
                        else
@@ -1609,6 +1683,12 @@ set_qf_format:
                case Opt_nodiscard:
                        clear_opt(sbi->s_mount_opt, DISCARD);
                        break;
+               case Opt_dioread_nolock:
+                       set_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
+                       break;
+               case Opt_dioread_lock:
+                       clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
+                       break;
                default:
                        ext4_msg(sb, KERN_ERR,
                               "Unrecognized mount option \"%s\" "
@@ -1618,18 +1698,13 @@ set_qf_format:
        }
 #ifdef CONFIG_QUOTA
        if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
-               if ((sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA) &&
-                    sbi->s_qf_names[USRQUOTA])
+               if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
                        clear_opt(sbi->s_mount_opt, USRQUOTA);
 
-               if ((sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA) &&
-                    sbi->s_qf_names[GRPQUOTA])
+               if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA])
                        clear_opt(sbi->s_mount_opt, GRPQUOTA);
 
-               if ((sbi->s_qf_names[USRQUOTA] &&
-                               (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA)) ||
-                   (sbi->s_qf_names[GRPQUOTA] &&
-                               (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA))) {
+               if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
                        ext4_msg(sb, KERN_ERR, "old and new quota "
                                        "format mixing");
                        return 0;
@@ -2432,8 +2507,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
        if (def_mount_opts & EXT4_DEFM_DEBUG)
                set_opt(sbi->s_mount_opt, DEBUG);
-       if (def_mount_opts & EXT4_DEFM_BSDGROUPS)
+       if (def_mount_opts & EXT4_DEFM_BSDGROUPS) {
+               ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups",
+                       "2.6.38");
                set_opt(sbi->s_mount_opt, GRPID);
+       }
        if (def_mount_opts & EXT4_DEFM_UID16)
                set_opt(sbi->s_mount_opt, NO_UID32);
 #ifdef CONFIG_EXT4_FS_XATTR
@@ -2445,11 +2523,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                set_opt(sbi->s_mount_opt, POSIX_ACL);
 #endif
        if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
-               sbi->s_mount_opt |= EXT4_MOUNT_JOURNAL_DATA;
+               set_opt(sbi->s_mount_opt, JOURNAL_DATA);
        else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
-               sbi->s_mount_opt |= EXT4_MOUNT_ORDERED_DATA;
+               set_opt(sbi->s_mount_opt, ORDERED_DATA);
        else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
-               sbi->s_mount_opt |= EXT4_MOUNT_WRITEBACK_DATA;
+               set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
 
        if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
                set_opt(sbi->s_mount_opt, ERRORS_PANIC);
@@ -2477,7 +2555,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                goto failed_mount;
 
        sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
-               ((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
+               (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
 
        if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
            (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) ||
@@ -2766,7 +2844,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
              EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
                ext4_msg(sb, KERN_ERR, "required journal recovery "
                       "suppressed and not mounted read-only");
-               goto failed_mount4;
+               goto failed_mount_wq;
        } else {
                clear_opt(sbi->s_mount_opt, DATA_FLAGS);
                set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
@@ -2779,7 +2857,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
            !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
                                       JBD2_FEATURE_INCOMPAT_64BIT)) {
                ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
-               goto failed_mount4;
+               goto failed_mount_wq;
        }
 
        if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
@@ -2818,7 +2896,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                    (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
                        ext4_msg(sb, KERN_ERR, "Journal does not support "
                               "requested data journaling mode");
-                       goto failed_mount4;
+                       goto failed_mount_wq;
                }
        default:
                break;
@@ -2826,13 +2904,17 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
 
 no_journal:
-
        if (test_opt(sb, NOBH)) {
                if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
                        ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - "
                                "its supported only with writeback mode");
                        clear_opt(sbi->s_mount_opt, NOBH);
                }
+               if (test_opt(sb, DIOREAD_NOLOCK)) {
+                       ext4_msg(sb, KERN_WARNING, "dioread_nolock option is "
+                               "not supported with nobh mode");
+                       goto failed_mount_wq;
+               }
        }
        EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
        if (!EXT4_SB(sb)->dio_unwritten_wq) {
@@ -2897,6 +2979,18 @@ no_journal:
                         "requested data journaling mode");
                clear_opt(sbi->s_mount_opt, DELALLOC);
        }
+       if (test_opt(sb, DIOREAD_NOLOCK)) {
+               if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+                       ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
+                               "option - requested data journaling mode");
+                       clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
+               }
+               if (sb->s_blocksize < PAGE_SIZE) {
+                       ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
+                               "option - block size is too small");
+                       clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
+               }
+       }
 
        err = ext4_setup_system_zone(sb);
        if (err) {
@@ -3360,10 +3454,9 @@ static void ext4_clear_journal_err(struct super_block *sb,
                char nbuf[16];
 
                errstr = ext4_decode_error(sb, j_errno, nbuf);
-               ext4_warning(sb, __func__, "Filesystem error recorded "
+               ext4_warning(sb, "Filesystem error recorded "
                             "from previous mount: %s", errstr);
-               ext4_warning(sb, __func__, "Marking fs in need of "
-                            "filesystem check.");
+               ext4_warning(sb, "Marking fs in need of filesystem check.");
 
                EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
                es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
@@ -3514,7 +3607,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
                ext4_abort(sb, __func__, "Abort forced by user");
 
        sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
-               ((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
+               (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
 
        es = sbi->s_es;
 
@@ -3917,9 +4010,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
        ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
        int err = 0;
        int offset = off & (sb->s_blocksize - 1);
-       int tocopy;
        int journal_quota = EXT4_SB(sb)->s_qf_names[type] != NULL;
-       size_t towrite = len;
        struct buffer_head *bh;
        handle_t *handle = journal_current_handle();
 
@@ -3929,52 +4020,53 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
                        (unsigned long long)off, (unsigned long long)len);
                return -EIO;
        }
+       /*
+        * Since we account only one data block in transaction credits,
+        * then it is impossible to cross a block boundary.
+        */
+       if (sb->s_blocksize - offset < len) {
+               ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
+                       " cancelled because not block aligned",
+                       (unsigned long long)off, (unsigned long long)len);
+               return -EIO;
+       }
+
        mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
-       while (towrite > 0) {
-               tocopy = sb->s_blocksize - offset < towrite ?
-                               sb->s_blocksize - offset : towrite;
-               bh = ext4_bread(handle, inode, blk, 1, &err);
-               if (!bh)
+       bh = ext4_bread(handle, inode, blk, 1, &err);
+       if (!bh)
+               goto out;
+       if (journal_quota) {
+               err = ext4_journal_get_write_access(handle, bh);
+               if (err) {
+                       brelse(bh);
                        goto out;
-               if (journal_quota) {
-                       err = ext4_journal_get_write_access(handle, bh);
-                       if (err) {
-                               brelse(bh);
-                               goto out;
-                       }
                }
-               lock_buffer(bh);
-               memcpy(bh->b_data+offset, data, tocopy);
-               flush_dcache_page(bh->b_page);
-               unlock_buffer(bh);
-               if (journal_quota)
-                       err = ext4_handle_dirty_metadata(handle, NULL, bh);
-               else {
-                       /* Always do at least ordered writes for quotas */
-                       err = ext4_jbd2_file_inode(handle, inode);
-                       mark_buffer_dirty(bh);
-               }
-               brelse(bh);
-               if (err)
-                       goto out;
-               offset = 0;
-               towrite -= tocopy;
-               data += tocopy;
-               blk++;
        }
+       lock_buffer(bh);
+       memcpy(bh->b_data+offset, data, len);
+       flush_dcache_page(bh->b_page);
+       unlock_buffer(bh);
+       if (journal_quota)
+               err = ext4_handle_dirty_metadata(handle, NULL, bh);
+       else {
+               /* Always do at least ordered writes for quotas */
+               err = ext4_jbd2_file_inode(handle, inode);
+               mark_buffer_dirty(bh);
+       }
+       brelse(bh);
 out:
-       if (len == towrite) {
+       if (err) {
                mutex_unlock(&inode->i_mutex);
                return err;
        }
-       if (inode->i_size < off+len-towrite) {
-               i_size_write(inode, off+len-towrite);
+       if (inode->i_size < off + len) {
+               i_size_write(inode, off + len);
                EXT4_I(inode)->i_disksize = inode->i_size;
        }
        inode->i_mtime = inode->i_ctime = CURRENT_TIME;
        ext4_mark_inode_dirty(handle, inode);
        mutex_unlock(&inode->i_mutex);
-       return len - towrite;
+       return len;
 }
 
 #endif
index f3a2f7e..efc16a4 100644 (file)
@@ -227,7 +227,8 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
        ea_bdebug(bh, "b_count=%d, refcount=%d",
                atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
        if (ext4_xattr_check_block(bh)) {
-bad_block:     ext4_error(inode->i_sb, __func__,
+bad_block:
+               ext4_error(inode->i_sb,
                           "inode %lu: bad block %llu", inode->i_ino,
                           EXT4_I(inode)->i_file_acl);
                error = -EIO;
@@ -267,7 +268,7 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
        void *end;
        int error;
 
-       if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR))
+       if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
                return -ENODATA;
        error = ext4_get_inode_loc(inode, &iloc);
        if (error)
@@ -371,7 +372,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
        ea_bdebug(bh, "b_count=%d, refcount=%d",
                atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
        if (ext4_xattr_check_block(bh)) {
-               ext4_error(inode->i_sb, __func__,
+               ext4_error(inode->i_sb,
                           "inode %lu: bad block %llu", inode->i_ino,
                           EXT4_I(inode)->i_file_acl);
                error = -EIO;
@@ -396,7 +397,7 @@ ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
        void *end;
        int error;
 
-       if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR))
+       if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
                return 0;
        error = ext4_get_inode_loc(inode, &iloc);
        if (error)
@@ -665,9 +666,8 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
                        atomic_read(&(bs->bh->b_count)),
                        le32_to_cpu(BHDR(bs->bh)->h_refcount));
                if (ext4_xattr_check_block(bs->bh)) {
-                       ext4_error(sb, __func__,
-                               "inode %lu: bad block %llu", inode->i_ino,
-                               EXT4_I(inode)->i_file_acl);
+                       ext4_error(sb, "inode %lu: bad block %llu",
+                                  inode->i_ino, EXT4_I(inode)->i_file_acl);
                        error = -EIO;
                        goto cleanup;
                }
@@ -880,9 +880,8 @@ cleanup_dquot:
        goto cleanup;
 
 bad_block:
-       ext4_error(inode->i_sb, __func__,
-                  "inode %lu: bad block %llu", inode->i_ino,
-                  EXT4_I(inode)->i_file_acl);
+       ext4_error(inode->i_sb, "inode %lu: bad block %llu",
+                  inode->i_ino, EXT4_I(inode)->i_file_acl);
        goto cleanup;
 
 #undef header
@@ -908,7 +907,7 @@ ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
        is->s.base = is->s.first = IFIRST(header);
        is->s.here = is->s.first;
        is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
-       if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) {
+       if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
                error = ext4_xattr_check_names(IFIRST(header), is->s.end);
                if (error)
                        return error;
@@ -940,10 +939,10 @@ ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
        header = IHDR(inode, ext4_raw_inode(&is->iloc));
        if (!IS_LAST_ENTRY(s->first)) {
                header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
-               EXT4_I(inode)->i_state |= EXT4_STATE_XATTR;
+               ext4_set_inode_state(inode, EXT4_STATE_XATTR);
        } else {
                header->h_magic = cpu_to_le32(0);
-               EXT4_I(inode)->i_state &= ~EXT4_STATE_XATTR;
+               ext4_clear_inode_state(inode, EXT4_STATE_XATTR);
        }
        return 0;
 }
@@ -986,8 +985,8 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
        if (strlen(name) > 255)
                return -ERANGE;
        down_write(&EXT4_I(inode)->xattr_sem);
-       no_expand = EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND;
-       EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND;
+       no_expand = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND);
+       ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND);
 
        error = ext4_get_inode_loc(inode, &is.iloc);
        if (error)
@@ -997,10 +996,10 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
        if (error)
                goto cleanup;
 
-       if (EXT4_I(inode)->i_state & EXT4_STATE_NEW) {
+       if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) {
                struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc);
                memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
-               EXT4_I(inode)->i_state &= ~EXT4_STATE_NEW;
+               ext4_clear_inode_state(inode, EXT4_STATE_NEW);
        }
 
        error = ext4_xattr_ibody_find(inode, &i, &is);
@@ -1052,7 +1051,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
                ext4_xattr_update_super_block(handle, inode->i_sb);
                inode->i_ctime = ext4_current_time(inode);
                if (!value)
-                       EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND;
+                       ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
                error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
                /*
                 * The bh is consumed by ext4_mark_iloc_dirty, even with
@@ -1067,7 +1066,7 @@ cleanup:
        brelse(is.iloc.bh);
        brelse(bs.bh);
        if (no_expand == 0)
-               EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND;
+               ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
        up_write(&EXT4_I(inode)->xattr_sem);
        return error;
 }
@@ -1195,9 +1194,8 @@ retry:
                if (!bh)
                        goto cleanup;
                if (ext4_xattr_check_block(bh)) {
-                       ext4_error(inode->i_sb, __func__,
-                               "inode %lu: bad block %llu", inode->i_ino,
-                               EXT4_I(inode)->i_file_acl);
+                       ext4_error(inode->i_sb, "inode %lu: bad block %llu",
+                                  inode->i_ino, EXT4_I(inode)->i_file_acl);
                        error = -EIO;
                        goto cleanup;
                }
@@ -1302,6 +1300,8 @@ retry:
 
                /* Remove the chosen entry from the inode */
                error = ext4_xattr_ibody_set(handle, inode, &i, is);
+               if (error)
+                       goto cleanup;
 
                entry = IFIRST(header);
                if (entry_size + EXT4_XATTR_SIZE(size) >= new_extra_isize)
@@ -1372,16 +1372,14 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
                goto cleanup;
        bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
        if (!bh) {
-               ext4_error(inode->i_sb, __func__,
-                       "inode %lu: block %llu read error", inode->i_ino,
-                       EXT4_I(inode)->i_file_acl);
+               ext4_error(inode->i_sb, "inode %lu: block %llu read error",
+                          inode->i_ino, EXT4_I(inode)->i_file_acl);
                goto cleanup;
        }
        if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
            BHDR(bh)->h_blocks != cpu_to_le32(1)) {
-               ext4_error(inode->i_sb, __func__,
-                       "inode %lu: bad block %llu", inode->i_ino,
-                       EXT4_I(inode)->i_file_acl);
+               ext4_error(inode->i_sb, "inode %lu: bad block %llu",
+                          inode->i_ino, EXT4_I(inode)->i_file_acl);
                goto cleanup;
        }
        ext4_xattr_release_block(handle, inode, bh);
@@ -1506,7 +1504,7 @@ again:
                }
                bh = sb_bread(inode->i_sb, ce->e_block);
                if (!bh) {
-                       ext4_error(inode->i_sb, __func__,
+                       ext4_error(inode->i_sb,
                                "inode %lu: block %lu read error",
                                inode->i_ino, (unsigned long) ce->e_block);
                } else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
index 8868493..30beb11 100644 (file)
@@ -507,6 +507,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
        if (blocknr < journal->j_tail)
                freed = freed + journal->j_last - journal->j_first;
 
+       trace_jbd2_cleanup_journal_tail(journal, first_tid, blocknr, freed);
        jbd_debug(1,
                  "Cleaning journal tail from %d to %d (offset %lu), "
                  "freeing %lu\n",
index 1bc74b6..671da7f 100644 (file)
@@ -883,8 +883,7 @@ restart_loop:
                spin_unlock(&journal->j_list_lock);
                bh = jh2bh(jh);
                jbd_lock_bh_state(bh);
-               J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
-                       jh->b_transaction == journal->j_running_transaction);
+               J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);
 
                /*
                 * If there is undo-protected committed data against
@@ -930,12 +929,12 @@ restart_loop:
                /* A buffer which has been freed while still being
                 * journaled by a previous transaction may end up still
                 * being dirty here, but we want to avoid writing back
-                * that buffer in the future now that the last use has
-                * been committed.  That's not only a performance gain,
-                * it also stops aliasing problems if the buffer is left
-                * behind for writeback and gets reallocated for another
+                * that buffer in the future after the "add to orphan"
+                * operation been committed,  That's not only a performance
+                * gain, it also stops aliasing problems if the buffer is
+                * left behind for writeback and gets reallocated for another
                 * use in a different page. */
-               if (buffer_freed(bh)) {
+               if (buffer_freed(bh) && !jh->b_next_transaction) {
                        clear_buffer_freed(bh);
                        clear_buffer_jbddirty(bh);
                }
index ac0d027..c03d4dc 100644 (file)
@@ -39,6 +39,8 @@
 #include <linux/seq_file.h>
 #include <linux/math64.h>
 #include <linux/hash.h>
+#include <linux/log2.h>
+#include <linux/vmalloc.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/jbd2.h>
@@ -93,6 +95,7 @@ EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
 
 static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
 static void __journal_abort_soft (journal_t *journal, int errno);
+static int jbd2_journal_create_slab(size_t slab_size);
 
 /*
  * Helper function used to manage commit timeouts
@@ -1248,6 +1251,13 @@ int jbd2_journal_load(journal_t *journal)
                }
        }
 
+       /*
+        * Create a slab for this blocksize
+        */
+       err = jbd2_journal_create_slab(be32_to_cpu(sb->s_blocksize));
+       if (err)
+               return err;
+
        /* Let the recovery code check whether it needs to recover any
         * data from the journal. */
        if (jbd2_journal_recover(journal))
@@ -1807,6 +1817,127 @@ size_t journal_tag_bytes(journal_t *journal)
 }
 
 /*
+ * JBD memory management
+ *
+ * These functions are used to allocate block-sized chunks of memory
+ * used for making copies of buffer_head data.  Very often it will be
+ * page-sized chunks of data, but sometimes it will be in
+ * sub-page-size chunks.  (For example, 16k pages on Power systems
+ * with a 4k block file system.)  For blocks smaller than a page, we
+ * use a SLAB allocator.  There are slab caches for each block size,
+ * which are allocated at mount time, if necessary, and we only free
+ * (all of) the slab caches when/if the jbd2 module is unloaded.  For
+ * this reason we don't need to a mutex to protect access to
+ * jbd2_slab[] allocating or releasing memory; only in
+ * jbd2_journal_create_slab().
+ */
+#define JBD2_MAX_SLABS 8
+static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS];
+static DECLARE_MUTEX(jbd2_slab_create_sem);
+
+static const char *jbd2_slab_names[JBD2_MAX_SLABS] = {
+       "jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k",
+       "jbd2_16k", "jbd2_32k", "jbd2_64k", "jbd2_128k"
+};
+
+
+static void jbd2_journal_destroy_slabs(void)
+{
+       int i;
+
+       for (i = 0; i < JBD2_MAX_SLABS; i++) {
+               if (jbd2_slab[i])
+                       kmem_cache_destroy(jbd2_slab[i]);
+               jbd2_slab[i] = NULL;
+       }
+}
+
+static int jbd2_journal_create_slab(size_t size)
+{
+       int i = order_base_2(size) - 10;
+       size_t slab_size;
+
+       if (size == PAGE_SIZE)
+               return 0;
+
+       if (i >= JBD2_MAX_SLABS)
+               return -EINVAL;
+
+       if (unlikely(i < 0))
+               i = 0;
+       down(&jbd2_slab_create_sem);
+       if (jbd2_slab[i]) {
+               up(&jbd2_slab_create_sem);
+               return 0;       /* Already created */
+       }
+
+       slab_size = 1 << (i+10);
+       jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size,
+                                        slab_size, 0, NULL);
+       up(&jbd2_slab_create_sem);
+       if (!jbd2_slab[i]) {
+               printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n");
+               return -ENOMEM;
+       }
+       return 0;
+}
+
+static struct kmem_cache *get_slab(size_t size)
+{
+       int i = order_base_2(size) - 10;
+
+       BUG_ON(i >= JBD2_MAX_SLABS);
+       if (unlikely(i < 0))
+               i = 0;
+       BUG_ON(jbd2_slab[i] == 0);
+       return jbd2_slab[i];
+}
+
+void *jbd2_alloc(size_t size, gfp_t flags)
+{
+       void *ptr;
+
+       BUG_ON(size & (size-1)); /* Must be a power of 2 */
+
+       flags |= __GFP_REPEAT;
+       if (size == PAGE_SIZE)
+               ptr = (void *)__get_free_pages(flags, 0);
+       else if (size > PAGE_SIZE) {
+               int order = get_order(size);
+
+               if (order < 3)
+                       ptr = (void *)__get_free_pages(flags, order);
+               else
+                       ptr = vmalloc(size);
+       } else
+               ptr = kmem_cache_alloc(get_slab(size), flags);
+
+       /* Check alignment; SLUB has gotten this wrong in the past,
+        * and this can lead to user data corruption! */
+       BUG_ON(((unsigned long) ptr) & (size-1));
+
+       return ptr;
+}
+
+void jbd2_free(void *ptr, size_t size)
+{
+       if (size == PAGE_SIZE) {
+               free_pages((unsigned long)ptr, 0);
+               return;
+       }
+       if (size > PAGE_SIZE) {
+               int order = get_order(size);
+
+               if (order < 3)
+                       free_pages((unsigned long)ptr, order);
+               else
+                       vfree(ptr);
+               return;
+       }
+       kmem_cache_free(get_slab(size), ptr);
+};
+
+/*
  * Journal_head storage management
  */
 static struct kmem_cache *jbd2_journal_head_cache;
@@ -2204,6 +2335,7 @@ static void jbd2_journal_destroy_caches(void)
        jbd2_journal_destroy_revoke_caches();
        jbd2_journal_destroy_jbd2_journal_head_cache();
        jbd2_journal_destroy_handle_cache();
+       jbd2_journal_destroy_slabs();
 }
 
 static int __init journal_init(void)
index a051270..bfc70f5 100644 (file)
@@ -1727,6 +1727,21 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
        if (!jh)
                goto zap_buffer_no_jh;
 
+       /*
+        * We cannot remove the buffer from checkpoint lists until the
+        * transaction adding inode to orphan list (let's call it T)
+        * is committed.  Otherwise if the transaction changing the
+        * buffer would be cleaned from the journal before T is
+        * committed, a crash will cause that the correct contents of
+        * the buffer will be lost.  On the other hand we have to
+        * clear the buffer dirty bit at latest at the moment when the
+        * transaction marking the buffer as freed in the filesystem
+        * structures is committed because from that moment on the
+        * buffer can be reallocated and used by a different page.
+        * Since the block hasn't been freed yet but the inode has
+        * already been added to orphan list, it is safe for us to add
+        * the buffer to BJ_Forget list of the newest transaction.
+        */
        transaction = jh->b_transaction;
        if (transaction == NULL) {
                /* First case: not on any transaction.  If it
@@ -1783,16 +1798,15 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
        } else if (transaction == journal->j_committing_transaction) {
                JBUFFER_TRACE(jh, "on committing transaction");
                /*
-                * If it is committing, we simply cannot touch it.  We
-                * can remove it's next_transaction pointer from the
-                * running transaction if that is set, but nothing
-                * else. */
+                * The buffer is committing, we simply cannot touch
+                * it. So we just set j_next_transaction to the
+                * running transaction (if there is one) and mark
+                * buffer as freed so that commit code knows it should
+                * clear dirty bits when it is done with the buffer.
+                */
                set_buffer_freed(bh);
-               if (jh->b_next_transaction) {
-                       J_ASSERT(jh->b_next_transaction ==
-                                       journal->j_running_transaction);
-                       jh->b_next_transaction = NULL;
-               }
+               if (journal->j_running_transaction && buffer_jbddirty(bh))
+                       jh->b_next_transaction = journal->j_running_transaction;
                jbd2_journal_put_journal_head(jh);
                spin_unlock(&journal->j_list_lock);
                jbd_unlock_bh_state(bh);
@@ -1969,7 +1983,7 @@ void jbd2_journal_file_buffer(struct journal_head *jh,
  */
 void __jbd2_journal_refile_buffer(struct journal_head *jh)
 {
-       int was_dirty;
+       int was_dirty, jlist;
        struct buffer_head *bh = jh2bh(jh);
 
        J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
@@ -1991,8 +2005,13 @@ void __jbd2_journal_refile_buffer(struct journal_head *jh)
        __jbd2_journal_temp_unlink_buffer(jh);
        jh->b_transaction = jh->b_next_transaction;
        jh->b_next_transaction = NULL;
-       __jbd2_journal_file_buffer(jh, jh->b_transaction,
-                               jh->b_modified ? BJ_Metadata : BJ_Reserved);
+       if (buffer_freed(bh))
+               jlist = BJ_Forget;
+       else if (jh->b_modified)
+               jlist = BJ_Metadata;
+       else
+               jlist = BJ_Reserved;
+       __jbd2_journal_file_buffer(jh, jh->b_transaction, jlist);
        J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
 
        if (was_dirty)
index 0741c69..9a64560 100644 (file)
@@ -498,8 +498,6 @@ static int link_path_walk(const char *, struct nameidata *);
 
 static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
 {
-       int res = 0;
-       char *name;
        if (IS_ERR(link))
                goto fail;
 
@@ -510,22 +508,7 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l
                path_get(&nd->root);
        }
 
-       res = link_path_walk(link, nd);
-       if (nd->depth || res || nd->last_type!=LAST_NORM)
-               return res;
-       /*
-        * If it is an iterative symlinks resolution in open_namei() we
-        * have to copy the last component. And all that crap because of
-        * bloody create() on broken symlinks. Furrfu...
-        */
-       name = __getname();
-       if (unlikely(!name)) {
-               path_put(&nd->path);
-               return -ENOMEM;
-       }
-       strcpy(name, nd->last.name);
-       nd->last.name = name;
-       return 0;
+       return link_path_walk(link, nd);
 fail:
        path_put(&nd->path);
        return PTR_ERR(link);
@@ -547,10 +530,10 @@ static inline void path_to_nameidata(struct path *path, struct nameidata *nd)
        nd->path.dentry = path->dentry;
 }
 
-static __always_inline int __do_follow_link(struct path *path, struct nameidata *nd)
+static __always_inline int
+__do_follow_link(struct path *path, struct nameidata *nd, void **p)
 {
        int error;
-       void *cookie;
        struct dentry *dentry = path->dentry;
 
        touch_atime(path->mnt, dentry);
@@ -562,9 +545,9 @@ static __always_inline int __do_follow_link(struct path *path, struct nameidata
        }
        mntget(path->mnt);
        nd->last_type = LAST_BIND;
-       cookie = dentry->d_inode->i_op->follow_link(dentry, nd);
-       error = PTR_ERR(cookie);
-       if (!IS_ERR(cookie)) {
+       *p = dentry->d_inode->i_op->follow_link(dentry, nd);
+       error = PTR_ERR(*p);
+       if (!IS_ERR(*p)) {
                char *s = nd_get_link(nd);
                error = 0;
                if (s)
@@ -574,8 +557,6 @@ static __always_inline int __do_follow_link(struct path *path, struct nameidata
                        if (error)
                                path_put(&nd->path);
                }
-               if (dentry->d_inode->i_op->put_link)
-                       dentry->d_inode->i_op->put_link(dentry, nd, cookie);
        }
        return error;
 }
@@ -589,6 +570,7 @@ static __always_inline int __do_follow_link(struct path *path, struct nameidata
  */
 static inline int do_follow_link(struct path *path, struct nameidata *nd)
 {
+       void *cookie;
        int err = -ELOOP;
        if (current->link_count >= MAX_NESTED_LINKS)
                goto loop;
@@ -602,7 +584,9 @@ static inline int do_follow_link(struct path *path, struct nameidata *nd)
        current->link_count++;
        current->total_link_count++;
        nd->depth++;
-       err = __do_follow_link(path, nd);
+       err = __do_follow_link(path, nd, &cookie);
+       if (!IS_ERR(cookie) && path->dentry->d_inode->i_op->put_link)
+               path->dentry->d_inode->i_op->put_link(path->dentry, nd, cookie);
        path_put(path);
        current->link_count--;
        nd->depth--;
@@ -1375,22 +1359,6 @@ static inline int may_create(struct inode *dir, struct dentry *child)
        return inode_permission(dir, MAY_WRITE | MAY_EXEC);
 }
 
-/* 
- * O_DIRECTORY translates into forcing a directory lookup.
- */
-static inline int lookup_flags(unsigned int f)
-{
-       unsigned long retval = LOOKUP_FOLLOW;
-
-       if (f & O_NOFOLLOW)
-               retval &= ~LOOKUP_FOLLOW;
-       
-       if (f & O_DIRECTORY)
-               retval |= LOOKUP_DIRECTORY;
-
-       return retval;
-}
-
 /*
  * p1 and p2 should be directories on the same fs.
  */
@@ -1590,129 +1558,135 @@ static int open_will_truncate(int flag, struct inode *inode)
        return (flag & O_TRUNC);
 }
 
-/*
- * Note that the low bits of the passed in "open_flag"
- * are not the same as in the local variable "flag". See
- * open_to_namei_flags() for more details.
- */
-struct file *do_filp_open(int dfd, const char *pathname,
-               int open_flag, int mode, int acc_mode)
+static struct file *finish_open(struct nameidata *nd,
+                               int open_flag, int acc_mode)
 {
        struct file *filp;
-       struct nameidata nd;
-       int error;
-       struct path path;
-       struct dentry *dir;
-       int count = 0;
        int will_truncate;
-       int flag = open_to_namei_flags(open_flag);
-       int force_reval = 0;
+       int error;
 
+       will_truncate = open_will_truncate(open_flag, nd->path.dentry->d_inode);
+       if (will_truncate) {
+               error = mnt_want_write(nd->path.mnt);
+               if (error)
+                       goto exit;
+       }
+       error = may_open(&nd->path, acc_mode, open_flag);
+       if (error) {
+               if (will_truncate)
+                       mnt_drop_write(nd->path.mnt);
+               goto exit;
+       }
+       filp = nameidata_to_filp(nd);
+       if (!IS_ERR(filp)) {
+               error = ima_file_check(filp, acc_mode);
+               if (error) {
+                       fput(filp);
+                       filp = ERR_PTR(error);
+               }
+       }
+       if (!IS_ERR(filp)) {
+               if (acc_mode & MAY_WRITE)
+                       vfs_dq_init(nd->path.dentry->d_inode);
+
+               if (will_truncate) {
+                       error = handle_truncate(&nd->path);
+                       if (error) {
+                               fput(filp);
+                               filp = ERR_PTR(error);
+                       }
+               }
+       }
        /*
-        * O_SYNC is implemented as __O_SYNC|O_DSYNC.  As many places only
-        * check for O_DSYNC if the need any syncing at all we enforce it's
-        * always set instead of having to deal with possibly weird behaviour
-        * for malicious applications setting only __O_SYNC.
+        * It is now safe to drop the mnt write
+        * because the filp has had a write taken
+        * on its behalf.
         */
-       if (open_flag & __O_SYNC)
-               open_flag |= O_DSYNC;
-
-       if (!acc_mode)
-               acc_mode = MAY_OPEN | ACC_MODE(open_flag);
+       if (will_truncate)
+               mnt_drop_write(nd->path.mnt);
+       return filp;
 
-       /* O_TRUNC implies we need access checks for write permissions */
-       if (flag & O_TRUNC)
-               acc_mode |= MAY_WRITE;
+exit:
+       if (!IS_ERR(nd->intent.open.file))
+               release_open_intent(nd);
+       path_put(&nd->path);
+       return ERR_PTR(error);
+}
 
-       /* Allow the LSM permission hook to distinguish append 
-          access from general write access. */
-       if (flag & O_APPEND)
-               acc_mode |= MAY_APPEND;
+static struct file *do_last(struct nameidata *nd, struct path *path,
+                           int open_flag, int acc_mode,
+                           int mode, const char *pathname,
+                           int *want_dir)
+{
+       struct dentry *dir = nd->path.dentry;
+       struct file *filp;
+       int error = -EISDIR;
 
-       /*
-        * The simplest case - just a plain lookup.
-        */
-       if (!(flag & O_CREAT)) {
-               filp = get_empty_filp();
-
-               if (filp == NULL)
-                       return ERR_PTR(-ENFILE);
-               nd.intent.open.file = filp;
-               filp->f_flags = open_flag;
-               nd.intent.open.flags = flag;
-               nd.intent.open.create_mode = 0;
-               error = do_path_lookup(dfd, pathname,
-                                       lookup_flags(flag)|LOOKUP_OPEN, &nd);
-               if (IS_ERR(nd.intent.open.file)) {
-                       if (error == 0) {
-                               error = PTR_ERR(nd.intent.open.file);
-                               path_put(&nd.path);
+       switch (nd->last_type) {
+       case LAST_DOTDOT:
+               follow_dotdot(nd);
+               dir = nd->path.dentry;
+               if (nd->path.mnt->mnt_sb->s_type->fs_flags & FS_REVAL_DOT) {
+                       if (!dir->d_op->d_revalidate(dir, nd)) {
+                               error = -ESTALE;
+                               goto exit;
                        }
-               } else if (error)
-                       release_open_intent(&nd);
-               if (error)
-                       return ERR_PTR(error);
+               }
+               /* fallthrough */
+       case LAST_DOT:
+       case LAST_ROOT:
+               if (open_flag & O_CREAT)
+                       goto exit;
+               /* fallthrough */
+       case LAST_BIND:
+               audit_inode(pathname, dir);
                goto ok;
        }
 
-       /*
-        * Create - we need to know the parent.
-        */
-reval:
-       error = path_init(dfd, pathname, LOOKUP_PARENT, &nd);
-       if (error)
-               return ERR_PTR(error);
-       if (force_reval)
-               nd.flags |= LOOKUP_REVAL;
-       error = path_walk(pathname, &nd);
-