Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jmorris...
authorLinus Torvalds <torvalds@linux-foundation.org>
Wed, 18 Jan 2012 00:43:39 +0000 (16:43 -0800)
committerLinus Torvalds <torvalds@linux-foundation.org>
Wed, 18 Jan 2012 00:43:39 +0000 (16:43 -0800)
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jmorris/linux-security:
  integrity: digital signature config option name change
  lib: Removed MPILIB, MPILIB_EXTRA, and SIGNATURE prompts
  lib: MPILIB Kconfig description update
  lib: digital signature dependency fix
  lib: digital signature config option name change
  encrypted-keys: fix rcu and sparse messages
  keys: fix trusted/encrypted keys sparse rcu_assign_pointer messages
  KEYS: Add missing smp_rmb() primitives to the keyring search code
  TOMOYO: Accept \000 as a valid character.
  security: update MAINTAINERS file with new git repo

111 files changed:
arch/arm/include/asm/kprobes.h
arch/arm/include/asm/ptrace.h
arch/arm/include/asm/thread_info.h
arch/arm/kernel/entry-common.S
arch/arm/kernel/ptrace.c
arch/ia64/include/asm/ptrace.h
arch/ia64/kernel/ptrace.c
arch/microblaze/include/asm/ptrace.h
arch/microblaze/kernel/ptrace.c
arch/microblaze/kernel/setup.c
arch/mips/include/asm/ptrace.h
arch/mips/kernel/ptrace.c
arch/powerpc/include/asm/ptrace.h
arch/powerpc/kernel/ptrace.c
arch/s390/include/asm/ptrace.h
arch/s390/kernel/ptrace.c
arch/sh/include/asm/ptrace_32.h
arch/sh/include/asm/ptrace_64.h
arch/sh/kernel/ptrace_32.c
arch/sh/kernel/ptrace_64.c
arch/sparc/include/asm/ptrace.h
arch/sparc/kernel/ptrace_64.c
arch/um/kernel/ptrace.c
arch/x86/ia32/ia32entry.S
arch/x86/kernel/entry_32.S
arch/x86/kernel/entry_64.S
arch/x86/kernel/ptrace.c
arch/x86/kernel/vm86_32.c
arch/x86/um/shared/sysdep/ptrace.h
arch/xtensa/kernel/ptrace.c
block/cfq-iosched.c
drivers/usb/host/ehci-xilinx-of.c
drivers/xen/xen-balloon.c
fs/btrfs/Kconfig
fs/btrfs/Makefile
fs/btrfs/backref.c
fs/btrfs/backref.h
fs/btrfs/btrfs_inode.h
fs/btrfs/check-integrity.c [new file with mode: 0644]
fs/btrfs/check-integrity.h [new file with mode: 0644]
fs/btrfs/ctree.c
fs/btrfs/ctree.h
fs/btrfs/delayed-inode.c
fs/btrfs/delayed-ref.c
fs/btrfs/delayed-ref.h
fs/btrfs/disk-io.c
fs/btrfs/disk-io.h
fs/btrfs/export.c
fs/btrfs/extent-tree.c
fs/btrfs/extent_io.c
fs/btrfs/extent_io.h
fs/btrfs/file.c
fs/btrfs/free-space-cache.c
fs/btrfs/inode-map.c
fs/btrfs/inode.c
fs/btrfs/ioctl.c
fs/btrfs/ioctl.h
fs/btrfs/locking.c
fs/btrfs/relocation.c
fs/btrfs/scrub.c
fs/btrfs/super.c
fs/btrfs/transaction.c
fs/btrfs/tree-log.c
fs/btrfs/ulist.c [new file with mode: 0644]
fs/btrfs/ulist.h [new file with mode: 0644]
fs/btrfs/volumes.c
fs/btrfs/volumes.h
fs/btrfs/xattr.c
fs/namei.c
fs/proc/base.c
fs/xfs/xfs_aops.c
fs/xfs/xfs_attr.c
fs/xfs/xfs_attr_leaf.c
fs/xfs/xfs_bmap.c
fs/xfs/xfs_dfrag.c
fs/xfs/xfs_file.c
fs/xfs/xfs_fs_subr.c
fs/xfs/xfs_iget.c
fs/xfs/xfs_inode.c
fs/xfs/xfs_inode.h
fs/xfs/xfs_inode_item.c
fs/xfs/xfs_iomap.c
fs/xfs/xfs_iops.c
fs/xfs/xfs_qm_syscalls.c
fs/xfs/xfs_super.c
fs/xfs/xfs_sync.c
fs/xfs/xfs_trace.h
fs/xfs/xfs_vnodeops.c
include/linux/audit.h
include/linux/kref.h
include/linux/ptrace.h
include/linux/tty_driver.h
include/trace/events/btrfs.h
init/Kconfig
kernel/audit.c
kernel/audit.h
kernel/auditfilter.c
kernel/auditsc.c
kernel/capability.c
kernel/exit.c
kernel/fork.c
kernel/seccomp.c
security/integrity/ima/ima_audit.c
security/lsm_audit.c
sound/core/Kconfig
sound/pci/au88x0/au88x0.c
sound/pci/au88x0/au88x0.h
sound/pci/au88x0/au88x0_pcm.c
sound/pci/hda/hda_intel.c
sound/pci/hda/patch_sigmatel.c
sound/pci/oxygen/xonar_wm87x6.c

index feec867..f82ec22 100644 (file)
@@ -24,7 +24,6 @@
 #define MAX_INSN_SIZE                  2
 #define MAX_STACK_SIZE                 64      /* 32 would probably be OK */
 
-#define regs_return_value(regs)                ((regs)->ARM_r0)
 #define flush_insn_slot(p)             do { } while (0)
 #define kretprobe_blacklist_size       0
 
index 96187ff..451808b 100644 (file)
@@ -189,6 +189,11 @@ static inline int valid_user_regs(struct pt_regs *regs)
        return 0;
 }
 
+static inline long regs_return_value(struct pt_regs *regs)
+{
+       return regs->ARM_r0;
+}
+
 #define instruction_pointer(regs)      (regs)->ARM_pc
 
 #ifdef CONFIG_SMP
index 0f30c3a..d4c24d4 100644 (file)
@@ -129,6 +129,7 @@ extern void vfp_flush_hwstate(struct thread_info *);
 /*
  * thread information flags:
  *  TIF_SYSCALL_TRACE  - syscall trace active
+ *  TIF_SYSCALL_AUDIT  - syscall auditing active
  *  TIF_SIGPENDING     - signal pending
  *  TIF_NEED_RESCHED   - rescheduling necessary
  *  TIF_NOTIFY_RESUME  - callback before returning to user
@@ -139,6 +140,7 @@ extern void vfp_flush_hwstate(struct thread_info *);
 #define TIF_NEED_RESCHED       1
 #define TIF_NOTIFY_RESUME      2       /* callback before returning to user */
 #define TIF_SYSCALL_TRACE      8
+#define TIF_SYSCALL_AUDIT      9
 #define TIF_POLLING_NRFLAG     16
 #define TIF_USING_IWMMXT       17
 #define TIF_MEMDIE             18      /* is terminating due to OOM killer */
@@ -149,11 +151,15 @@ extern void vfp_flush_hwstate(struct thread_info *);
 #define _TIF_NEED_RESCHED      (1 << TIF_NEED_RESCHED)
 #define _TIF_NOTIFY_RESUME     (1 << TIF_NOTIFY_RESUME)
 #define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
+#define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
 #define _TIF_POLLING_NRFLAG    (1 << TIF_POLLING_NRFLAG)
 #define _TIF_USING_IWMMXT      (1 << TIF_USING_IWMMXT)
 #define _TIF_RESTORE_SIGMASK   (1 << TIF_RESTORE_SIGMASK)
 #define _TIF_SECCOMP           (1 << TIF_SECCOMP)
 
+/* Checks for any syscall work in entry-common.S */
+#define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT)
+
 /*
  * Change these and you break ASM code in entry-common.S
  */
index b2a27b6..520889c 100644 (file)
@@ -87,7 +87,7 @@ ENTRY(ret_from_fork)
        get_thread_info tsk
        ldr     r1, [tsk, #TI_FLAGS]            @ check for syscall tracing
        mov     why, #1
-       tst     r1, #_TIF_SYSCALL_TRACE         @ are we tracing syscalls?
+       tst     r1, #_TIF_SYSCALL_WORK          @ are we tracing syscalls?
        beq     ret_slow_syscall
        mov     r1, sp
        mov     r0, #1                          @ trace exit [IP = 1]
@@ -443,7 +443,7 @@ ENTRY(vector_swi)
 1:
 #endif
 
-       tst     r10, #_TIF_SYSCALL_TRACE                @ are we tracing syscalls?
+       tst     r10, #_TIF_SYSCALL_WORK         @ are we tracing syscalls?
        bne     __sys_trace
 
        cmp     scno, #NR_syscalls              @ check upper syscall limit
index 483727a..e1d5e19 100644 (file)
@@ -906,11 +906,6 @@ asmlinkage int syscall_trace(int why, struct pt_regs *regs, int scno)
 {
        unsigned long ip;
 
-       if (!test_thread_flag(TIF_SYSCALL_TRACE))
-               return scno;
-       if (!(current->ptrace & PT_PTRACED))
-               return scno;
-
        /*
         * Save IP.  IP is used to denote syscall entry/exit:
         *  IP = 0 -> entry, = 1 -> exit
@@ -918,6 +913,17 @@ asmlinkage int syscall_trace(int why, struct pt_regs *regs, int scno)
        ip = regs->ARM_ip;
        regs->ARM_ip = why;
 
+       if (!ip)
+               audit_syscall_exit(regs);
+       else
+               audit_syscall_entry(AUDIT_ARCH_ARMEB, scno, regs->ARM_r0,
+                                   regs->ARM_r1, regs->ARM_r2, regs->ARM_r3);
+
+       if (!test_thread_flag(TIF_SYSCALL_TRACE))
+               return scno;
+       if (!(current->ptrace & PT_PTRACED))
+               return scno;
+
        current_thread_info()->syscall = scno;
 
        /* the 0x80 provides a way for the tracing parent to distinguish
index f5cb276..68c98f5 100644 (file)
@@ -246,7 +246,18 @@ static inline unsigned long user_stack_pointer(struct pt_regs *regs)
        return regs->ar_bspstore;
 }
 
-#define regs_return_value(regs) ((regs)->r8)
+static inline int is_syscall_success(struct pt_regs *regs)
+{
+       return regs->r10 != -1;
+}
+
+static inline long regs_return_value(struct pt_regs *regs)
+{
+       if (is_syscall_success(regs))
+               return regs->r8;
+       else
+               return -regs->r8;
+}
 
 /* Conserve space in histogram by encoding slot bits in address
  * bits 2 and 3 rather than bits 0 and 1.
index 8848f43..dad9166 100644 (file)
@@ -1246,15 +1246,8 @@ syscall_trace_enter (long arg0, long arg1, long arg2, long arg3,
        if (test_thread_flag(TIF_RESTORE_RSE))
                ia64_sync_krbs();
 
-       if (unlikely(current->audit_context)) {
-               long syscall;
-               int arch;
 
-               syscall = regs.r15;
-               arch = AUDIT_ARCH_IA64;
-
-               audit_syscall_entry(arch, syscall, arg0, arg1, arg2, arg3);
-       }
+       audit_syscall_entry(AUDIT_ARCH_IA64, regs.r15, arg0, arg1, arg2, arg3);
 
        return 0;
 }
@@ -1268,14 +1261,7 @@ syscall_trace_leave (long arg0, long arg1, long arg2, long arg3,
 {
        int step;
 
-       if (unlikely(current->audit_context)) {
-               int success = AUDITSC_RESULT(regs.r10);
-               long result = regs.r8;
-
-               if (success != AUDITSC_SUCCESS)
-                       result = -result;
-               audit_syscall_exit(success, result);
-       }
+       audit_syscall_exit(&regs);
 
        step = test_thread_flag(TIF_SINGLESTEP);
        if (step || test_thread_flag(TIF_SYSCALL_TRACE))
index 816bee6..94e92c8 100644 (file)
@@ -61,6 +61,11 @@ struct pt_regs {
 #define instruction_pointer(regs)      ((regs)->pc)
 #define profile_pc(regs)               instruction_pointer(regs)
 
+static inline long regs_return_value(struct pt_regs *regs)
+{
+       return regs->r3;
+}
+
 #else /* __KERNEL__ */
 
 /* pt_regs offsets used by gdbserver etc in ptrace syscalls */
index 043cb58..6eb2aa9 100644 (file)
@@ -147,10 +147,8 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs)
                 */
                ret = -1L;
 
-       if (unlikely(current->audit_context))
-               audit_syscall_entry(EM_MICROBLAZE, regs->r12,
-                                   regs->r5, regs->r6,
-                                   regs->r7, regs->r8);
+       audit_syscall_entry(EM_MICROBLAZE, regs->r12, regs->r5, regs->r6,
+                           regs->r7, regs->r8);
 
        return ret ?: regs->r12;
 }
@@ -159,8 +157,7 @@ asmlinkage void do_syscall_trace_leave(struct pt_regs *regs)
 {
        int step;
 
-       if (unlikely(current->audit_context))
-               audit_syscall_exit(AUDITSC_RESULT(regs->r3), regs->r3);
+       audit_syscall_exit(regs);
 
        step = test_thread_flag(TIF_SINGLESTEP);
        if (step || test_thread_flag(TIF_SYSCALL_TRACE))
index 604cd9d..d4fc1a9 100644 (file)
@@ -26,6 +26,7 @@
 #include <linux/cache.h>
 #include <linux/of_platform.h>
 #include <linux/dma-mapping.h>
+#include <linux/cpu.h>
 #include <asm/cacheflush.h>
 #include <asm/entry.h>
 #include <asm/cpuinfo.h>
@@ -226,5 +227,23 @@ static int __init setup_bus_notifier(void)
 
        return 0;
 }
-
 arch_initcall(setup_bus_notifier);
+
+static DEFINE_PER_CPU(struct cpu, cpu_devices);
+
+static int __init topology_init(void)
+{
+       int i, ret;
+
+       for_each_present_cpu(i) {
+               struct cpu *c = &per_cpu(cpu_devices, i);
+
+               ret = register_cpu(c, i);
+               if (ret)
+                       printk(KERN_WARNING "topology_init: register_cpu %d "
+                                               "failed (%d)\n", i, ret);
+       }
+
+       return 0;
+}
+subsys_initcall(topology_init);
index 7b99c67..4b7f525 100644 (file)
@@ -137,7 +137,19 @@ extern int ptrace_set_watch_regs(struct task_struct *child,
  */
 #define user_mode(regs) (((regs)->cp0_status & KU_MASK) == KU_USER)
 
-#define regs_return_value(_regs) ((_regs)->regs[2])
+static inline int is_syscall_success(struct pt_regs *regs)
+{
+       return !regs->regs[7];
+}
+
+static inline long regs_return_value(struct pt_regs *regs)
+{
+       if (is_syscall_success(regs))
+               return regs->regs[2];
+       else
+               return -regs->regs[2];
+}
+
 #define instruction_pointer(regs) ((regs)->cp0_epc)
 #define profile_pc(regs) instruction_pointer(regs)
 
index 4e6ea1f..7786b60 100644 (file)
@@ -560,10 +560,9 @@ asmlinkage void syscall_trace_enter(struct pt_regs *regs)
        }
 
 out:
-       if (unlikely(current->audit_context))
-               audit_syscall_entry(audit_arch(), regs->regs[2],
-                                   regs->regs[4], regs->regs[5],
-                                   regs->regs[6], regs->regs[7]);
+       audit_syscall_entry(audit_arch(), regs->regs[2],
+                           regs->regs[4], regs->regs[5],
+                           regs->regs[6], regs->regs[7]);
 }
 
 /*
@@ -572,9 +571,7 @@ out:
  */
 asmlinkage void syscall_trace_leave(struct pt_regs *regs)
 {
-       if (unlikely(current->audit_context))
-               audit_syscall_exit(AUDITSC_RESULT(regs->regs[7]),
-                                  -regs->regs[2]);
+       audit_syscall_exit(regs);
 
        if (!(current->ptrace & PT_PTRACED))
                return;
index 48223f9..78a2051 100644 (file)
@@ -86,7 +86,18 @@ struct pt_regs {
 #define instruction_pointer(regs) ((regs)->nip)
 #define user_stack_pointer(regs) ((regs)->gpr[1])
 #define kernel_stack_pointer(regs) ((regs)->gpr[1])
-#define regs_return_value(regs) ((regs)->gpr[3])
+static inline int is_syscall_success(struct pt_regs *regs)
+{
+       return !(regs->ccr & 0x10000000);
+}
+
+static inline long regs_return_value(struct pt_regs *regs)
+{
+       if (is_syscall_success(regs))
+               return regs->gpr[3];
+       else
+               return -regs->gpr[3];
+}
 
 #ifdef CONFIG_SMP
 extern unsigned long profile_pc(struct pt_regs *regs);
index 5de73db..5b43325 100644 (file)
@@ -1724,22 +1724,20 @@ long do_syscall_trace_enter(struct pt_regs *regs)
        if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
                trace_sys_enter(regs, regs->gpr[0]);
 
-       if (unlikely(current->audit_context)) {
 #ifdef CONFIG_PPC64
-               if (!is_32bit_task())
-                       audit_syscall_entry(AUDIT_ARCH_PPC64,
-                                           regs->gpr[0],
-                                           regs->gpr[3], regs->gpr[4],
-                                           regs->gpr[5], regs->gpr[6]);
-               else
+       if (!is_32bit_task())
+               audit_syscall_entry(AUDIT_ARCH_PPC64,
+                                   regs->gpr[0],
+                                   regs->gpr[3], regs->gpr[4],
+                                   regs->gpr[5], regs->gpr[6]);
+       else
 #endif
-                       audit_syscall_entry(AUDIT_ARCH_PPC,
-                                           regs->gpr[0],
-                                           regs->gpr[3] & 0xffffffff,
-                                           regs->gpr[4] & 0xffffffff,
-                                           regs->gpr[5] & 0xffffffff,
-                                           regs->gpr[6] & 0xffffffff);
-       }
+               audit_syscall_entry(AUDIT_ARCH_PPC,
+                                   regs->gpr[0],
+                                   regs->gpr[3] & 0xffffffff,
+                                   regs->gpr[4] & 0xffffffff,
+                                   regs->gpr[5] & 0xffffffff,
+                                   regs->gpr[6] & 0xffffffff);
 
        return ret ?: regs->gpr[0];
 }
@@ -1748,9 +1746,7 @@ void do_syscall_trace_leave(struct pt_regs *regs)
 {
        int step;
 
-       if (unlikely(current->audit_context))
-               audit_syscall_exit((regs->ccr&0x10000000)?AUDITSC_FAILURE:AUDITSC_SUCCESS,
-                                  regs->result);
+       audit_syscall_exit(regs);
 
        if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
                trace_sys_exit(regs, regs->result);
index 56da355..aeb77f0 100644 (file)
@@ -541,9 +541,13 @@ struct user_regs_struct
 #define user_mode(regs) (((regs)->psw.mask & PSW_MASK_PSTATE) != 0)
 #define instruction_pointer(regs) ((regs)->psw.addr & PSW_ADDR_INSN)
 #define user_stack_pointer(regs)((regs)->gprs[15])
-#define regs_return_value(regs)((regs)->gprs[2])
 #define profile_pc(regs) instruction_pointer(regs)
 
+static inline long regs_return_value(struct pt_regs *regs)
+{
+       return regs->gprs[2];
+}
+
 int regs_query_register_offset(const char *name);
 const char *regs_query_register_name(unsigned int offset);
 unsigned long regs_get_register(struct pt_regs *regs, unsigned int offset);
index 573bc29..9d82ed4 100644 (file)
@@ -740,20 +740,17 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs)
        if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
                trace_sys_enter(regs, regs->gprs[2]);
 
-       if (unlikely(current->audit_context))
-               audit_syscall_entry(is_compat_task() ?
-                                       AUDIT_ARCH_S390 : AUDIT_ARCH_S390X,
-                                   regs->gprs[2], regs->orig_gpr2,
-                                   regs->gprs[3], regs->gprs[4],
-                                   regs->gprs[5]);
+       audit_syscall_entry(is_compat_task() ?
+                               AUDIT_ARCH_S390 : AUDIT_ARCH_S390X,
+                           regs->gprs[2], regs->orig_gpr2,
+                           regs->gprs[3], regs->gprs[4],
+                           regs->gprs[5]);
        return ret ?: regs->gprs[2];
 }
 
 asmlinkage void do_syscall_trace_exit(struct pt_regs *regs)
 {
-       if (unlikely(current->audit_context))
-               audit_syscall_exit(AUDITSC_RESULT(regs->gprs[2]),
-                                  regs->gprs[2]);
+       audit_syscall_exit(regs);
 
        if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
                trace_sys_exit(regs, regs->gprs[2]);
index 6c2239c..2d3e906 100644 (file)
@@ -76,7 +76,10 @@ struct pt_dspregs {
 #ifdef __KERNEL__
 
 #define MAX_REG_OFFSET         offsetof(struct pt_regs, tra)
-#define regs_return_value(_regs)       ((_regs)->regs[0])
+static inline long regs_return_value(struct pt_regs *regs)
+{
+       return regs->regs[0];
+}
 
 #endif /* __KERNEL__ */
 
index bf9be77..eb3fcce 100644 (file)
@@ -13,7 +13,10 @@ struct pt_regs {
 #ifdef __KERNEL__
 
 #define MAX_REG_OFFSET         offsetof(struct pt_regs, tregs[7])
-#define regs_return_value(_regs)       ((_regs)->regs[3])
+static inline long regs_return_value(struct pt_regs *regs)
+{
+       return regs->regs[3];
+}
 
 #endif /* __KERNEL__ */
 
index 92b3c27..a3e6515 100644 (file)
@@ -518,10 +518,9 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs)
        if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
                trace_sys_enter(regs, regs->regs[0]);
 
-       if (unlikely(current->audit_context))
-               audit_syscall_entry(audit_arch(), regs->regs[3],
-                                   regs->regs[4], regs->regs[5],
-                                   regs->regs[6], regs->regs[7]);
+       audit_syscall_entry(audit_arch(), regs->regs[3],
+                           regs->regs[4], regs->regs[5],
+                           regs->regs[6], regs->regs[7]);
 
        return ret ?: regs->regs[0];
 }
@@ -530,9 +529,7 @@ asmlinkage void do_syscall_trace_leave(struct pt_regs *regs)
 {
        int step;
 
-       if (unlikely(current->audit_context))
-               audit_syscall_exit(AUDITSC_RESULT(regs->regs[0]),
-                                  regs->regs[0]);
+       audit_syscall_exit(regs);
 
        if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
                trace_sys_exit(regs, regs->regs[0]);
index c8f9764..3d0080b 100644 (file)
@@ -536,10 +536,9 @@ asmlinkage long long do_syscall_trace_enter(struct pt_regs *regs)
        if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
                trace_sys_enter(regs, regs->regs[9]);
 
-       if (unlikely(current->audit_context))
-               audit_syscall_entry(audit_arch(), regs->regs[1],
-                                   regs->regs[2], regs->regs[3],
-                                   regs->regs[4], regs->regs[5]);
+       audit_syscall_entry(audit_arch(), regs->regs[1],
+                           regs->regs[2], regs->regs[3],
+                           regs->regs[4], regs->regs[5]);
 
        return ret ?: regs->regs[9];
 }
@@ -548,9 +547,7 @@ asmlinkage void do_syscall_trace_leave(struct pt_regs *regs)
 {
        int step;
 
-       if (unlikely(current->audit_context))
-               audit_syscall_exit(AUDITSC_RESULT(regs->regs[9]),
-                                  regs->regs[9]);
+       audit_syscall_exit(regs);
 
        if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
                trace_sys_exit(regs, regs->regs[9]);
index a0e1bcf..c00c3b5 100644 (file)
@@ -207,7 +207,15 @@ do {       current_thread_info()->syscall_noerror = 1; \
 #define instruction_pointer(regs) ((regs)->tpc)
 #define instruction_pointer_set(regs, val) ((regs)->tpc = (val))
 #define user_stack_pointer(regs) ((regs)->u_regs[UREG_FP])
-#define regs_return_value(regs) ((regs)->u_regs[UREG_I0])
+static inline int is_syscall_success(struct pt_regs *regs)
+{
+       return !(regs->tstate & (TSTATE_XCARRY | TSTATE_ICARRY));
+}
+
+static inline long regs_return_value(struct pt_regs *regs)
+{
+       return regs->u_regs[UREG_I0];
+}
 #ifdef CONFIG_SMP
 extern unsigned long profile_pc(struct pt_regs *);
 #else
index 96ee50a..9388844 100644 (file)
@@ -1071,32 +1071,22 @@ asmlinkage int syscall_trace_enter(struct pt_regs *regs)
        if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
                trace_sys_enter(regs, regs->u_regs[UREG_G1]);
 
-       if (unlikely(current->audit_context) && !ret)
-               audit_syscall_entry((test_thread_flag(TIF_32BIT) ?
-                                    AUDIT_ARCH_SPARC :
-                                    AUDIT_ARCH_SPARC64),
-                                   regs->u_regs[UREG_G1],
-                                   regs->u_regs[UREG_I0],
-                                   regs->u_regs[UREG_I1],
-                                   regs->u_regs[UREG_I2],
-                                   regs->u_regs[UREG_I3]);
+       audit_syscall_entry((test_thread_flag(TIF_32BIT) ?
+                            AUDIT_ARCH_SPARC :
+                            AUDIT_ARCH_SPARC64),
+                           regs->u_regs[UREG_G1],
+                           regs->u_regs[UREG_I0],
+                           regs->u_regs[UREG_I1],
+                           regs->u_regs[UREG_I2],
+                           regs->u_regs[UREG_I3]);
 
        return ret;
 }
 
 asmlinkage void syscall_trace_leave(struct pt_regs *regs)
 {
-#ifdef CONFIG_AUDITSYSCALL
-       if (unlikely(current->audit_context)) {
-               unsigned long tstate = regs->tstate;
-               int result = AUDITSC_SUCCESS;
+       audit_syscall_exit(regs);
 
-               if (unlikely(tstate & (TSTATE_XCARRY | TSTATE_ICARRY)))
-                       result = AUDITSC_FAILURE;
-
-               audit_syscall_exit(result, regs->u_regs[UREG_I0]);
-       }
-#endif
        if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
                trace_sys_exit(regs, regs->u_regs[UREG_G1]);
 
index c9da32b..06b1903 100644 (file)
@@ -167,17 +167,15 @@ void syscall_trace(struct uml_pt_regs *regs, int entryexit)
        int is_singlestep = (current->ptrace & PT_DTRACE) && entryexit;
        int tracesysgood;
 
-       if (unlikely(current->audit_context)) {
-               if (!entryexit)
-                       audit_syscall_entry(HOST_AUDIT_ARCH,
-                                           UPT_SYSCALL_NR(regs),
-                                           UPT_SYSCALL_ARG1(regs),
-                                           UPT_SYSCALL_ARG2(regs),
-                                           UPT_SYSCALL_ARG3(regs),
-                                           UPT_SYSCALL_ARG4(regs));
-               else audit_syscall_exit(AUDITSC_RESULT(UPT_SYSCALL_RET(regs)),
-                                       UPT_SYSCALL_RET(regs));
-       }
+       if (!entryexit)
+               audit_syscall_entry(HOST_AUDIT_ARCH,
+                                   UPT_SYSCALL_NR(regs),
+                                   UPT_SYSCALL_ARG1(regs),
+                                   UPT_SYSCALL_ARG2(regs),
+                                   UPT_SYSCALL_ARG3(regs),
+                                   UPT_SYSCALL_ARG4(regs));
+       else
+               audit_syscall_exit(regs);
 
        /* Fake a debug trap */
        if (is_singlestep)
index 1106261..e3e7340 100644 (file)
@@ -14,6 +14,7 @@
 #include <asm/segment.h>
 #include <asm/irqflags.h>
 #include <linux/linkage.h>
+#include <linux/err.h>
 
 /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
 #include <linux/elf-em.h>
@@ -189,7 +190,7 @@ sysexit_from_sys_call:
        movl %ebx,%edx                  /* 3rd arg: 1st syscall arg */
        movl %eax,%esi                  /* 2nd arg: syscall number */
        movl $AUDIT_ARCH_I386,%edi      /* 1st arg: audit arch */
-       call audit_syscall_entry
+       call __audit_syscall_entry
        movl RAX-ARGOFFSET(%rsp),%eax   /* reload syscall number */
        cmpq $(IA32_NR_syscalls-1),%rax
        ja ia32_badsys
@@ -206,12 +207,13 @@ sysexit_from_sys_call:
        TRACE_IRQS_ON
        sti
        movl %eax,%esi          /* second arg, syscall return value */
-       cmpl $0,%eax            /* is it < 0? */
-       setl %al                /* 1 if so, 0 if not */
+       cmpl $-MAX_ERRNO,%eax   /* is it an error ? */
+       jbe 1f
+       movslq %eax, %rsi       /* if error sign extend to 64 bits */
+1:     setbe %al               /* 1 if success, 0 if error */
        movzbl %al,%edi         /* zero-extend that into %edi */
-       inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
-       call audit_syscall_exit
-       movl RAX-ARGOFFSET(%rsp),%eax   /* reload syscall return value */
+       call __audit_syscall_exit
+       movq RAX-ARGOFFSET(%rsp),%rax   /* reload syscall return value */
        movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
        cli
        TRACE_IRQS_OFF
index 4af9fd2..79d97e6 100644 (file)
@@ -42,6 +42,7 @@
  */
 
 #include <linux/linkage.h>
+#include <linux/err.h>
 #include <asm/thread_info.h>
 #include <asm/irqflags.h>
 #include <asm/errno.h>
@@ -453,7 +454,7 @@ sysenter_audit:
        movl %ebx,%ecx                  /* 3rd arg: 1st syscall arg */
        movl %eax,%edx                  /* 2nd arg: syscall number */
        movl $AUDIT_ARCH_I386,%eax      /* 1st arg: audit arch */
-       call audit_syscall_entry
+       call __audit_syscall_entry
        pushl_cfi %ebx
        movl PT_EAX(%esp),%eax          /* reload syscall number */
        jmp sysenter_do_call
@@ -464,11 +465,10 @@ sysexit_audit:
        TRACE_IRQS_ON
        ENABLE_INTERRUPTS(CLBR_ANY)
        movl %eax,%edx          /* second arg, syscall return value */
-       cmpl $0,%eax            /* is it < 0? */
-       setl %al                /* 1 if so, 0 if not */
+       cmpl $-MAX_ERRNO,%eax   /* is it outside the errno range? */
+       setbe %al               /* 1 if so (success), 0 if error */
        movzbl %al,%eax         /* zero-extend that */
-       inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
-       call audit_syscall_exit
+       call __audit_syscall_exit
        DISABLE_INTERRUPTS(CLBR_ANY)
        TRACE_IRQS_OFF
        movl TI_flags(%ebp), %ecx
index 940ba71..3fe8239 100644 (file)
@@ -55,6 +55,7 @@
 #include <asm/paravirt.h>
 #include <asm/ftrace.h>
 #include <asm/percpu.h>
+#include <linux/err.h>
 
 /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
 #include <linux/elf-em.h>
@@ -548,7 +549,7 @@ badsys:
 #ifdef CONFIG_AUDITSYSCALL
        /*
         * Fast path for syscall audit without full syscall trace.
-        * We just call audit_syscall_entry() directly, and then
+        * We just call __audit_syscall_entry() directly, and then
         * jump back to the normal fast path.
         */
 auditsys:
@@ -558,22 +559,21 @@ auditsys:
        movq %rdi,%rdx                  /* 3rd arg: 1st syscall arg */
        movq %rax,%rsi                  /* 2nd arg: syscall number */
        movl $AUDIT_ARCH_X86_64,%edi    /* 1st arg: audit arch */
-       call audit_syscall_entry
+       call __audit_syscall_entry
        LOAD_ARGS 0             /* reload call-clobbered registers */
        jmp system_call_fastpath
 
        /*
-        * Return fast path for syscall audit.  Call audit_syscall_exit()
+        * Return fast path for syscall audit.  Call __audit_syscall_exit()
         * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
         * masked off.
         */
 sysret_audit:
        movq RAX-ARGOFFSET(%rsp),%rsi   /* second arg, syscall return value */
-       cmpq $0,%rsi            /* is it < 0? */
-       setl %al                /* 1 if so, 0 if not */
+       cmpq $-MAX_ERRNO,%rsi   /* is it <= -MAX_ERRNO (unsigned)? */
+       setbe %al               /* 1 if so, 0 if not */
        movzbl %al,%edi         /* zero-extend that into %edi */
-       inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
-       call audit_syscall_exit
+       call __audit_syscall_exit
        movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
        jmp sysret_check
 #endif /* CONFIG_AUDITSYSCALL */
index 89a04c7..5026738 100644 (file)
@@ -1392,20 +1392,18 @@ long syscall_trace_enter(struct pt_regs *regs)
        if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
                trace_sys_enter(regs, regs->orig_ax);
 
-       if (unlikely(current->audit_context)) {
-               if (IS_IA32)
-                       audit_syscall_entry(AUDIT_ARCH_I386,
-                                           regs->orig_ax,
-                                           regs->bx, regs->cx,
-                                           regs->dx, regs->si);
+       if (IS_IA32)
+               audit_syscall_entry(AUDIT_ARCH_I386,
+                                   regs->orig_ax,
+                                   regs->bx, regs->cx,
+                                   regs->dx, regs->si);
 #ifdef CONFIG_X86_64
-               else
-                       audit_syscall_entry(AUDIT_ARCH_X86_64,
-                                           regs->orig_ax,
-                                           regs->di, regs->si,
-                                           regs->dx, regs->r10);
+       else
+               audit_syscall_entry(AUDIT_ARCH_X86_64,
+                                   regs->orig_ax,
+                                   regs->di, regs->si,
+                                   regs->dx, regs->r10);
 #endif
-       }
 
        return ret ?: regs->orig_ax;
 }
@@ -1414,8 +1412,7 @@ void syscall_trace_leave(struct pt_regs *regs)
 {
        bool step;
 
-       if (unlikely(current->audit_context))
-               audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
+       audit_syscall_exit(regs);
 
        if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
                trace_sys_exit(regs, regs->ax);
index 863f875..af17e1c 100644 (file)
@@ -335,9 +335,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
        if (info->flags & VM86_SCREEN_BITMAP)
                mark_screen_rdonly(tsk->mm);
 
-       /*call audit_syscall_exit since we do not exit via the normal paths */
+       /* call __audit_syscall_exit since we do not exit via the normal paths */
        if (unlikely(current->audit_context))
-               audit_syscall_exit(AUDITSC_RESULT(0), 0);
+               __audit_syscall_exit(1, 0);
 
        __asm__ __volatile__(
                "movl %0,%%esp\n\t"
index 711b162..5ef9344 100644 (file)
@@ -3,3 +3,8 @@
 #else
 #include "ptrace_64.h"
 #endif
+
+static inline long regs_return_value(struct uml_pt_regs *regs)
+{
+       return UPT_SYSCALL_RET(regs);
+}
index a0d042a..2dff698 100644 (file)
@@ -334,8 +334,7 @@ void do_syscall_trace_enter(struct pt_regs *regs)
                do_syscall_trace();
 
 #if 0
-       if (unlikely(current->audit_context))
-               audit_syscall_entry(current, AUDIT_ARCH_XTENSA..);
+       audit_syscall_entry(current, AUDIT_ARCH_XTENSA..);
 #endif
 }
 
index 163263d..ee55019 100644 (file)
@@ -3117,18 +3117,17 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
  */
 static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
-       struct cfq_queue *old_cfqq = cfqd->active_queue;
-
        cfq_log_cfqq(cfqd, cfqq, "preempt");
-       cfq_slice_expired(cfqd, 1);
 
        /*
         * workload type is changed, don't save slice, otherwise preempt
         * doesn't happen
         */
-       if (cfqq_type(old_cfqq) != cfqq_type(cfqq))
+       if (cfqq_type(cfqd->active_queue) != cfqq_type(cfqq))
                cfqq->cfqg->saved_workload_slice = 0;
 
+       cfq_slice_expired(cfqd, 1);
+
        /*
         * Put the new queue at the front of the of the current list,
         * so we know that it will be selected next.
index 32793ce..9c2cc46 100644 (file)
@@ -183,7 +183,7 @@ static int __devinit ehci_hcd_xilinx_of_probe(struct platform_device *op)
        }
 
        irq = irq_of_parse_and_map(dn, 0);
-       if (irq == NO_IRQ) {
+       if (!irq) {
                printk(KERN_ERR "%s: irq_of_parse_and_map failed\n", __FILE__);
                rv = -EBUSY;
                goto err_irq;
index 3832e30..596e6a7 100644 (file)
@@ -221,7 +221,7 @@ static int register_balloon(struct device *dev)
 {
        int i, error;
 
-       error = bus_register(&balloon_subsys);
+       error = subsys_system_register(&balloon_subsys, NULL);
        if (error)
                return error;
 
index ecb9fd3..d33f01c 100644 (file)
@@ -31,3 +31,22 @@ config BTRFS_FS_POSIX_ACL
          Linux website <http://acl.bestbits.at/>.
 
          If you don't know what Access Control Lists are, say N
+
+config BTRFS_FS_CHECK_INTEGRITY
+       bool "Btrfs with integrity check tool compiled in (DANGEROUS)"
+       depends on BTRFS_FS
+       help
+         Adds code that examines all block write requests (including
+         writes of the super block). The goal is to verify that the
+         state of the filesystem on disk is always consistent, i.e.,
+         after a power-loss or kernel panic event the filesystem is
+         in a consistent state.
+
+         If the integrity check tool is included and activated in
+         the mount options, plenty of kernel memory is used, and
+         plenty of additional CPU cycles are spent. Enabling this
+         functionality is not intended for normal use.
+
+         In most cases, unless you are a btrfs developer who needs
+         to verify the integrity of (super)-block write requests
+         during the run of a regression test, say N
index c0ddfd2..0c4fa2b 100644 (file)
@@ -8,6 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
           extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
           export.o tree-log.o free-space-cache.o zlib.o lzo.o \
           compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
-          reada.o backref.o
+          reada.o backref.o ulist.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
+btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
index 22c64ff..b9a8432 100644 (file)
 #include "ctree.h"
 #include "disk-io.h"
 #include "backref.h"
+#include "ulist.h"
+#include "transaction.h"
+#include "delayed-ref.h"
 
-struct __data_ref {
+/*
+ * this structure records all encountered refs on the way up to the root
+ */
+struct __prelim_ref {
        struct list_head list;
-       u64 inum;
-       u64 root;
-       u64 extent_data_item_offset;
+       u64 root_id;
+       struct btrfs_key key;
+       int level;
+       int count;
+       u64 parent;
+       u64 wanted_disk_byte;
 };
 
-struct __shared_ref {
-       struct list_head list;
+/*
+ * allocate a new preliminary ref, fill it from the arguments and append it
+ * to the tail of head. a NULL key is recorded as an all-zero key.
+ *
+ * returns 0 on success, -ENOMEM if the allocation fails
+ */
+static int __add_prelim_ref(struct list_head *head, u64 root_id,
+                           struct btrfs_key *key, int level, u64 parent,
+                           u64 wanted_disk_byte, int count)
+{
+       struct __prelim_ref *pref;
+
+       /*
+        * GFP_ATOMIC because we may be called for delayed refs, in which
+        * case the refs spinlock is held
+        */
+       pref = kmalloc(sizeof(*pref), GFP_ATOMIC);
+       if (pref == NULL)
+               return -ENOMEM;
+
+       if (!key)
+               memset(&pref->key, 0, sizeof(pref->key));
+       else
+               pref->key = *key;
+
+       pref->root_id = root_id;
+       pref->parent = parent;
+       pref->wanted_disk_byte = wanted_disk_byte;
+       pref->level = level;
+       pref->count = count;
+
+       list_add_tail(&pref->list, head);
+       return 0;
+}
+
+/*
+ * add eb->start to the parents ulist. for level 0 the following leaves are
+ * walked as well: a leaf is added too if one of its items points at
+ * wanted_disk_byte. the walk stops at the first item that is not an
+ * EXTENT_DATA item of wanted_objectid, or when there is no next leaf.
+ *
+ * returns 0 on success, <0 if ulist_add or btrfs_next_leaf fails
+ */
+static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
+                               struct ulist *parents,
+                               struct extent_buffer *eb, int level,
+                               u64 wanted_objectid, u64 wanted_disk_byte)
+{
+       int ret;
+       int slot;
+       struct btrfs_file_extent_item *fi;
+       struct btrfs_key key;
        u64 disk_byte;
-};
+
+add_parent:
+       ret = ulist_add(parents, eb->start, 0, GFP_NOFS);
+       if (ret < 0)
+               return ret;
+
+       /* only leaves need the look-ahead below */
+       if (level != 0)
+               return 0;
+
+       /*
+        * if the current leaf is full with EXTENT_DATA items, we must
+        * check the next one if that holds a reference as well.
+        * ref->count cannot be used to skip this check.
+        * repeat this until we don't find any additional EXTENT_DATA items.
+        */
+       while (1) {
+               ret = btrfs_next_leaf(root, path);
+               if (ret < 0)
+                       return ret;
+               /* non-zero, non-negative: no further leaf to look at */
+               if (ret)
+                       return 0;
+
+               eb = path->nodes[0];
+               for (slot = 0; slot < btrfs_header_nritems(eb); ++slot) {
+                       btrfs_item_key_to_cpu(eb, &key, slot);
+                       if (key.objectid != wanted_objectid ||
+                           key.type != BTRFS_EXTENT_DATA_KEY)
+                               return 0;
+                       fi = btrfs_item_ptr(eb, slot,
+                                               struct btrfs_file_extent_item);
+                       disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
+                       /* this leaf references the extent, record it too */
+                       if (disk_byte == wanted_disk_byte)
+                               goto add_parent;
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * resolve an indirect backref in the form (root_id, key, level)
+ * to a logical address
+ *
+ * the root named by ref->root_id is searched for ref->key down to
+ * ref->level, and the block found there (plus any further leaves reported
+ * by add_all_parents) is added to the parents ulist.
+ *
+ * returns 0 on success (also when the ref level is one above the root
+ * level and nothing is added), <0 on error, and a positive value if no
+ * block is found at the requested level or btrfs_next_leaf reports that
+ * there is no next leaf.
+ */
+static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
+                                       struct __prelim_ref *ref,
+                                       struct ulist *parents)
+{
+       struct btrfs_path *path;
+       struct btrfs_root *root;
+       struct btrfs_key root_key;
+       struct btrfs_key key = {0};
+       struct extent_buffer *eb;
+       int ret = 0;
+       int root_level;
+       int level = ref->level;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       root_key.objectid = ref->root_id;
+       root_key.type = BTRFS_ROOT_ITEM_KEY;
+       root_key.offset = (u64)-1;
+       root = btrfs_read_fs_root_no_name(fs_info, &root_key);
+       if (IS_ERR(root)) {
+               ret = PTR_ERR(root);
+               goto out;
+       }
+
+       rcu_read_lock();
+       root_level = btrfs_header_level(root->node);
+       rcu_read_unlock();
+
+       /* ref sits one level above the root node: nothing to add, ret is 0 */
+       if (root_level + 1 == level)
+               goto out;
+
+       /* stop the search at the level the ref refers to */
+       path->lowest_level = level;
+       ret = btrfs_search_slot(NULL, root, &ref->key, path, 0, 0);
+       pr_debug("search slot in root %llu (level %d, ref count %d) returned "
+                "%d for key (%llu %u %llu)\n",
+                (unsigned long long)ref->root_id, level, ref->count, ret,
+                (unsigned long long)ref->key.objectid, ref->key.type,
+                (unsigned long long)ref->key.offset);
+       if (ret < 0)
+               goto out;
+
+       eb = path->nodes[level];
+       if (!eb) {
+               WARN_ON(1);
+               ret = 1;
+               goto out;
+       }
+
+       if (level == 0) {
+               /* no exact match and slot is past the last item: next leaf */
+               if (ret == 1 && path->slots[0] >= btrfs_header_nritems(eb)) {
+                       ret = btrfs_next_leaf(root, path);
+                       if (ret)
+                               goto out;
+                       eb = path->nodes[0];
+               }
+
+               btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
+       }
+
+       /* the last two parameters will only be used for level == 0 */
+       ret = add_all_parents(root, path, parents, eb, level, key.objectid,
+                               ref->wanted_disk_byte);
+out:
+       btrfs_free_path(path);
+       return ret;
+}
+
+/*
+ * resolve all indirect backrefs from the list
+ *
+ * refs that already have a parent or whose count is 0 are skipped. for every
+ * other ref the first resolved parent is stored in the ref itself; each
+ * additional parent gets a copy of the ref inserted right behind it.
+ *
+ * a ref that fails to resolve is left untouched and the walk continues.
+ *
+ * returns 0 on success, otherwise the first resolve error encountered, or
+ * -ENOMEM if a ref copy or the ulist cannot be allocated
+ */
+static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
+                                  struct list_head *head)
+{
+       int err;
+       int ret = 0;
+       struct __prelim_ref *ref;
+       struct __prelim_ref *ref_safe;
+       struct __prelim_ref *new_ref;
+       struct ulist *parents;
+       struct ulist_node *node;
+
+       parents = ulist_alloc(GFP_NOFS);
+       if (!parents)
+               return -ENOMEM;
+
+       /*
+        * _safe allows us to insert directly after the current item without
+        * iterating over the newly inserted items.
+        * we're also allowed to re-assign ref during iteration.
+        */
+       list_for_each_entry_safe(ref, ref_safe, head, list) {
+               if (ref->parent)        /* already direct */
+                       continue;
+               if (ref->count == 0)
+                       continue;
+               err = __resolve_indirect_ref(fs_info, ref, parents);
+               if (err) {
+                       if (ret == 0)
+                               ret = err;
+                       /*
+                        * the failed resolve may already have added some
+                        * parents. drop them, otherwise they would be
+                        * attributed to the next ref we resolve.
+                        */
+                       ulist_reinit(parents);
+                       continue;
+               }
+
+               /* we put the first parent into the ref at hand */
+               node = ulist_next(parents, NULL);
+               ref->parent = node ? node->val : 0;
+
+               /* additional parents require new refs being added here */
+               while ((node = ulist_next(parents, node))) {
+                       new_ref = kmalloc(sizeof(*new_ref), GFP_NOFS);
+                       if (!new_ref) {
+                               ret = -ENOMEM;
+                               break;
+                       }
+                       memcpy(new_ref, ref, sizeof(*ref));
+                       new_ref->parent = node->val;
+                       list_add(&new_ref->list, &ref->list);
+               }
+               /* start with an empty parents list for the next ref */
+               ulist_reinit(parents);
+       }
+
+       ulist_free(parents);
+       return ret;
+}
+
+/*
+ * fold duplicate backrefs on the list into one entry, summing up the counts.
+ * the first occurrence is kept, later duplicates are unlinked and freed.
+ *
+ * mode = 1: entries are duplicates if key, level and root_id are identical;
+ *           entries whose key type is 0 are never used as merge target
+ * mode = 2: entries are duplicates if their parents are identical
+ *
+ * always returns 0
+ */
+static int __merge_refs(struct list_head *head, int mode)
+{
+       struct list_head *outer;
+
+       list_for_each(outer, head) {
+               struct __prelim_ref *keep;
+               struct list_head *cur;
+
+               keep = list_entry(outer, struct __prelim_ref, list);
+               if (mode == 1 && keep->key.type == 0)
+                       continue;
+
+               cur = outer->next;
+               while (cur != head) {
+                       struct __prelim_ref *dup;
+                       int same;
+
+                       dup = list_entry(cur, struct __prelim_ref, list);
+                       /* advance first, dup may be freed below */
+                       cur = cur->next;
+
+                       if (mode == 1)
+                               same = !memcmp(&keep->key, &dup->key,
+                                              sizeof(keep->key)) &&
+                                      keep->level == dup->level &&
+                                      keep->root_id == dup->root_id;
+                       else
+                               same = keep->parent == dup->parent;
+
+                       if (!same)
+                               continue;
+
+                       keep->count += dup->count;
+                       list_del(&dup->list);
+                       kfree(dup);
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * add all currently queued delayed refs from this head whose seq nr is
+ * smaller or equal that seq to the list
+ *
+ * the rb tree is walked backwards from the head node for as long as the
+ * nodes carry the head's bytenr. each node becomes one preliminary ref whose
+ * count is ref_mod, negated for BTRFS_DROP_DELAYED_REF.
+ * if the head has an extent op that updates the key, info_key is set from it.
+ *
+ * a failure to add a preliminary ref triggers BUG_ON; returns 0
+ */
+static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
+                             struct btrfs_key *info_key,
+                             struct list_head *prefs)
+{
+       struct btrfs_delayed_extent_op *extent_op = head->extent_op;
+       struct rb_node *n = &head->node.rb_node;
+       int sgn;
+       /* stays 0 for unknown node types, which only warn below */
+       int ret = 0;
+
+       if (extent_op && extent_op->update_key)
+               btrfs_disk_key_to_cpu(info_key, &extent_op->key);
+
+       while ((n = rb_prev(n))) {
+               struct btrfs_delayed_ref_node *node;
+               node = rb_entry(n, struct btrfs_delayed_ref_node,
+                               rb_node);
+               if (node->bytenr != head->node.bytenr)
+                       break;
+               WARN_ON(node->is_head);
+
+               /* newer than the requested point in time */
+               if (node->seq > seq)
+                       continue;
+
+               switch (node->action) {
+               case BTRFS_ADD_DELAYED_EXTENT:
+               case BTRFS_UPDATE_DELAYED_HEAD:
+                       WARN_ON(1);
+                       continue;
+               case BTRFS_ADD_DELAYED_REF:
+                       sgn = 1;
+                       break;
+               case BTRFS_DROP_DELAYED_REF:
+                       sgn = -1;
+                       break;
+               default:
+                       BUG_ON(1);
+               }
+               switch (node->type) {
+               case BTRFS_TREE_BLOCK_REF_KEY: {
+                       struct btrfs_delayed_tree_ref *ref;
+
+                       ref = btrfs_delayed_node_to_tree_ref(node);
+                       ret = __add_prelim_ref(prefs, ref->root, info_key,
+                                              ref->level + 1, 0, node->bytenr,
+                                              node->ref_mod * sgn);
+                       break;
+               }
+               case BTRFS_SHARED_BLOCK_REF_KEY: {
+                       struct btrfs_delayed_tree_ref *ref;
+
+                       ref = btrfs_delayed_node_to_tree_ref(node);
+                       ret = __add_prelim_ref(prefs, ref->root, info_key,
+                                              ref->level + 1, ref->parent,
+                                              node->bytenr,
+                                              node->ref_mod * sgn);
+                       break;
+               }
+               case BTRFS_EXTENT_DATA_REF_KEY: {
+                       struct btrfs_delayed_data_ref *ref;
+                       struct btrfs_key key;
+
+                       ref = btrfs_delayed_node_to_data_ref(node);
+
+                       key.objectid = ref->objectid;
+                       key.type = BTRFS_EXTENT_DATA_KEY;
+                       key.offset = ref->offset;
+                       ret = __add_prelim_ref(prefs, ref->root, &key, 0, 0,
+                                              node->bytenr,
+                                              node->ref_mod * sgn);
+                       break;
+               }
+               case BTRFS_SHARED_DATA_REF_KEY: {
+                       struct btrfs_delayed_data_ref *ref;
+                       struct btrfs_key key;
+
+                       ref = btrfs_delayed_node_to_data_ref(node);
+
+                       key.objectid = ref->objectid;
+                       key.type = BTRFS_EXTENT_DATA_KEY;
+                       key.offset = ref->offset;
+                       ret = __add_prelim_ref(prefs, ref->root, &key, 0,
+                                              ref->parent, node->bytenr,
+                                              node->ref_mod * sgn);
+                       break;
+               }
+               default:
+                       WARN_ON(1);
+               }
+               BUG_ON(ret);
+       }
+
+       return 0;
+}
+
+/*
+ * add all inline backrefs for bytenr to the list
+ *
+ * the extent item is read from the slot right before path->slots[0]. for a
+ * tree block extent, *info_level and *info_key are filled from the tree
+ * block info that follows the extent item.
+ *
+ * a failure to add a preliminary ref triggers BUG_ON; returns 0
+ */
+static int __add_inline_refs(struct btrfs_fs_info *fs_info,
+                            struct btrfs_path *path, u64 bytenr,
+                            struct btrfs_key *info_key, int *info_level,
+                            struct list_head *prefs)
+{
+       /* stays 0 for unknown inline ref types, which only warn below */
+       int ret = 0;
+       int slot;
+       struct extent_buffer *leaf;
+       struct btrfs_key key;
+       unsigned long ptr;
+       unsigned long end;
+       struct btrfs_extent_item *ei;
+       u64 flags;
+       u64 item_size;
+
+       /*
+        * enumerate all inline refs
+        */
+       leaf = path->nodes[0];
+       slot = path->slots[0] - 1;
+
+       item_size = btrfs_item_size_nr(leaf, slot);
+       BUG_ON(item_size < sizeof(*ei));
+
+       ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
+       flags = btrfs_extent_flags(leaf, ei);
+
+       /* the inline refs live between the extent item and the item end */
+       ptr = (unsigned long)(ei + 1);
+       end = (unsigned long)ei + item_size;
+
+       if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+               struct btrfs_tree_block_info *info;
+               struct btrfs_disk_key disk_key;
+
+               info = (struct btrfs_tree_block_info *)ptr;
+               *info_level = btrfs_tree_block_level(leaf, info);
+               btrfs_tree_block_key(leaf, info, &disk_key);
+               btrfs_disk_key_to_cpu(info_key, &disk_key);
+               ptr += sizeof(struct btrfs_tree_block_info);
+               BUG_ON(ptr > end);
+       } else {
+               BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
+       }
+
+       while (ptr < end) {
+               struct btrfs_extent_inline_ref *iref;
+               u64 offset;
+               int type;
+
+               iref = (struct btrfs_extent_inline_ref *)ptr;
+               type = btrfs_extent_inline_ref_type(leaf, iref);
+               offset = btrfs_extent_inline_ref_offset(leaf, iref);
+
+               switch (type) {
+               case BTRFS_SHARED_BLOCK_REF_KEY:
+                       ret = __add_prelim_ref(prefs, 0, info_key,
+                                               *info_level + 1, offset,
+                                               bytenr, 1);
+                       break;
+               case BTRFS_SHARED_DATA_REF_KEY: {
+                       struct btrfs_shared_data_ref *sdref;
+                       int count;
+
+                       sdref = (struct btrfs_shared_data_ref *)(iref + 1);
+                       count = btrfs_shared_data_ref_count(leaf, sdref);
+                       ret = __add_prelim_ref(prefs, 0, NULL, 0, offset,
+                                              bytenr, count);
+                       break;
+               }
+               case BTRFS_TREE_BLOCK_REF_KEY:
+                       ret = __add_prelim_ref(prefs, offset, info_key,
+                                              *info_level + 1, 0, bytenr, 1);
+                       break;
+               case BTRFS_EXTENT_DATA_REF_KEY: {
+                       struct btrfs_extent_data_ref *dref;
+                       int count;
+                       u64 root;
+
+                       dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+                       count = btrfs_extent_data_ref_count(leaf, dref);
+                       key.objectid = btrfs_extent_data_ref_objectid(leaf,
+                                                                     dref);
+                       key.type = BTRFS_EXTENT_DATA_KEY;
+                       key.offset = btrfs_extent_data_ref_offset(leaf, dref);
+                       root = btrfs_extent_data_ref_root(leaf, dref);
+                       ret = __add_prelim_ref(prefs, root, &key, 0, 0, bytenr,
+                                               count);
+                       break;
+               }
+               default:
+                       WARN_ON(1);
+               }
+               BUG_ON(ret);
+               ptr += btrfs_extent_inline_ref_size(type);
+       }
+
+       return 0;
+}
+
+/*
+ * add all non-inline backrefs for bytenr to the list
+ *
+ * starting behind the current path position, the items of the extent root
+ * are walked for as long as their objectid equals bytenr. every backref
+ * item found becomes one preliminary ref.
+ *
+ * returns 0 when the walk ends normally, <0 if btrfs_next_item fails.
+ * a failure to add a preliminary ref triggers BUG_ON.
+ */
+static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
+                           struct btrfs_path *path, u64 bytenr,
+                           struct btrfs_key *info_key, int info_level,
+                           struct list_head *prefs)
+{
+       struct btrfs_root *extent_root = fs_info->extent_root;
+       int ret;
+       int slot;
+       struct extent_buffer *leaf;
+       struct btrfs_key key;
+
+       while (1) {
+               ret = btrfs_next_item(extent_root, path);
+               if (ret < 0)
+                       break;
+               /* positive return: no more items, not an error */
+               if (ret) {
+                       ret = 0;
+                       break;
+               }
+
+               slot = path->slots[0];
+               leaf = path->nodes[0];
+               btrfs_item_key_to_cpu(leaf, &key, slot);
+
+               if (key.objectid != bytenr)
+                       break;
+               /* skip item types sorted before the backref types */
+               if (key.type < BTRFS_TREE_BLOCK_REF_KEY)
+                       continue;
+               /* past the last backref type, nothing more to find */
+               if (key.type > BTRFS_SHARED_DATA_REF_KEY)
+                       break;
+
+               switch (key.type) {
+               case BTRFS_SHARED_BLOCK_REF_KEY:
+                       ret = __add_prelim_ref(prefs, 0, info_key,
+                                               info_level + 1, key.offset,
+                                               bytenr, 1);
+                       break;
+               case BTRFS_SHARED_DATA_REF_KEY: {
+                       struct btrfs_shared_data_ref *sdref;
+                       int count;
+
+                       sdref = btrfs_item_ptr(leaf, slot,
+                                             struct btrfs_shared_data_ref);
+                       count = btrfs_shared_data_ref_count(leaf, sdref);
+                       ret = __add_prelim_ref(prefs, 0, NULL, 0, key.offset,
+                                               bytenr, count);
+                       break;
+               }
+               case BTRFS_TREE_BLOCK_REF_KEY:
+                       ret = __add_prelim_ref(prefs, key.offset, info_key,
+                                               info_level + 1, 0, bytenr, 1);
+                       break;
+               case BTRFS_EXTENT_DATA_REF_KEY: {
+                       struct btrfs_extent_data_ref *dref;
+                       int count;
+                       u64 root;
+
+                       dref = btrfs_item_ptr(leaf, slot,
+                                             struct btrfs_extent_data_ref);
+                       count = btrfs_extent_data_ref_count(leaf, dref);
+                       /* key is reused to describe the referencing item */
+                       key.objectid = btrfs_extent_data_ref_objectid(leaf,
+                                                                     dref);
+                       key.type = BTRFS_EXTENT_DATA_KEY;
+                       key.offset = btrfs_extent_data_ref_offset(leaf, dref);
+                       root = btrfs_extent_data_ref_root(leaf, dref);
+                       ret = __add_prelim_ref(prefs, root, &key, 0, 0,
+                                               bytenr, count);
+                       break;
+               }
+               default:
+                       WARN_ON(1);
+               }
+               BUG_ON(ret);
+       }
+
+       return ret;
+}
+
+/*
+ * this adds all existing backrefs (inline backrefs, backrefs and delayed
+ * refs) for the given bytenr to the refs list, merges duplicates and resolves
+ * indirect refs to their parent bytenr.
+ * When roots are found, they're added to the roots list
+ *
+ * returns <0 on error. the value returned on success may be positive, so
+ * callers should only test for ret < 0.
+ *
+ * FIXME some caching might speed things up
+ */
+static int find_parent_nodes(struct btrfs_trans_handle *trans,
+                            struct btrfs_fs_info *fs_info, u64 bytenr,
+                            u64 seq, struct ulist *refs, struct ulist *roots)
+{
+       struct btrfs_key key;
+       struct btrfs_path *path;
+       struct btrfs_key info_key = { 0 };
+       struct btrfs_delayed_ref_root *delayed_refs = NULL;
+       struct btrfs_delayed_ref_head *head = NULL;
+       int info_level = 0;
+       int ret;
+       struct list_head prefs_delayed;
+       struct list_head prefs;
+       struct __prelim_ref *ref;
+
+       INIT_LIST_HEAD(&prefs);
+       INIT_LIST_HEAD(&prefs_delayed);
+
+       key.objectid = bytenr;
+       key.type = BTRFS_EXTENT_ITEM_KEY;
+       key.offset = (u64)-1;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       /*
+        * grab both a lock on the path and a lock on the delayed ref head.
+        * We need both to get a consistent picture of how the refs look
+        * at a specified point in time
+        */
+again:
+       /*
+        * on a retry, head still points at the head we just dropped our
+        * reference on and whose mutex we do not hold. forget it, so that
+        * the out path does not unlock it if the search below fails.
+        */
+       head = NULL;
+
+       ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
+       if (ret < 0)
+               goto out;
+       BUG_ON(ret == 0);
+
+       /*
+        * look if there are updates for this ref queued and lock the head
+        */
+       delayed_refs = &trans->transaction->delayed_refs;
+       spin_lock(&delayed_refs->lock);
+       head = btrfs_find_delayed_ref_head(trans, bytenr);
+       if (head) {
+               if (!mutex_trylock(&head->mutex)) {
+                       atomic_inc(&head->node.refs);
+                       spin_unlock(&delayed_refs->lock);
+
+                       btrfs_release_path(path);
+
+                       /*
+                        * Mutex was contended, block until it's
+                        * released and try again
+                        */
+                       mutex_lock(&head->mutex);
+                       mutex_unlock(&head->mutex);
+                       btrfs_put_delayed_ref(&head->node);
+                       goto again;
+               }
+               ret = __add_delayed_refs(head, seq, &info_key, &prefs_delayed);
+               if (ret) {
+                       /* don't leave with the delayed refs lock held */
+                       spin_unlock(&delayed_refs->lock);
+                       goto out;
+               }
+       }
+       spin_unlock(&delayed_refs->lock);
+
+       /* the extent item, if any, sits in the slot before the search slot */
+       if (path->slots[0]) {
+               struct extent_buffer *leaf;
+               int slot;
+
+               leaf = path->nodes[0];
+               slot = path->slots[0] - 1;
+               btrfs_item_key_to_cpu(leaf, &key, slot);
+               if (key.objectid == bytenr &&
+                   key.type == BTRFS_EXTENT_ITEM_KEY) {
+                       ret = __add_inline_refs(fs_info, path, bytenr,
+                                               &info_key, &info_level, &prefs);
+                       if (ret)
+                               goto out;
+                       ret = __add_keyed_refs(fs_info, path, bytenr, &info_key,
+                                              info_level, &prefs);
+                       if (ret)
+                               goto out;
+               }
+       }
+       btrfs_release_path(path);
+
+       /*
+        * when adding the delayed refs above, the info_key might not have
+        * been known yet. Go over the list and replace the missing keys
+        */
+       list_for_each_entry(ref, &prefs_delayed, list) {
+               if ((ref->key.offset | ref->key.type | ref->key.objectid) == 0)
+                       memcpy(&ref->key, &info_key, sizeof(ref->key));
+       }
+       list_splice_init(&prefs_delayed, &prefs);
+
+       ret = __merge_refs(&prefs, 1);
+       if (ret)
+               goto out;
+
+       ret = __resolve_indirect_refs(fs_info, &prefs);
+       if (ret)
+               goto out;
+
+       ret = __merge_refs(&prefs, 2);
+       if (ret)
+               goto out;
+
+       /* hand the results over to the caller's ulists and free the refs */
+       while (!list_empty(&prefs)) {
+               ref = list_first_entry(&prefs, struct __prelim_ref, list);
+               list_del(&ref->list);
+               if (ref->count < 0)
+                       WARN_ON(1);
+               if (ref->count && ref->root_id && ref->parent == 0) {
+                       /* no parent == root of tree */
+                       ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
+                       BUG_ON(ret < 0);
+               }
+               if (ref->count && ref->parent) {
+                       ret = ulist_add(refs, ref->parent, 0, GFP_NOFS);
+                       BUG_ON(ret < 0);
+               }
+               kfree(ref);
+       }
+
+out:
+       /* head is only non-NULL here while we hold its mutex */
+       if (head)
+               mutex_unlock(&head->mutex);
+       btrfs_free_path(path);
+       while (!list_empty(&prefs)) {
+               ref = list_first_entry(&prefs, struct __prelim_ref, list);
+               list_del(&ref->list);
+               kfree(ref);
+       }
+       while (!list_empty(&prefs_delayed)) {
+               ref = list_first_entry(&prefs_delayed, struct __prelim_ref,
+                                      list);
+               list_del(&ref->list);
+               kfree(ref);
+       }
+
+       return ret;
+}
+
+/*
+ * Collects the blocks that find_parent_nodes reports as referencing bytenr.
+ * They are stored in a newly allocated ulist returned through *leafs, which
+ * the caller must free with ulist_free on success. num_bytes is not used
+ * here.
+ *
+ * returns 0 on success (-ENOENT from find_parent_nodes counts as success),
+ * <0 on error, in which case *leafs has already been freed
+ */
+static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
+                               struct btrfs_fs_info *fs_info, u64 bytenr,
+                               u64 num_bytes, u64 seq, struct ulist **leafs)
+{
+       struct ulist *tmp;
+       int ret;
+
+       /* tmp only receives the roots, which are not of interest here */
+       tmp = ulist_alloc(GFP_NOFS);
+       if (!tmp)
+               return -ENOMEM;
+       *leafs = ulist_alloc(GFP_NOFS);
+       if (!*leafs) {
+               ulist_free(tmp);
+               return -ENOMEM;
+       }
+
+       ret = find_parent_nodes(trans, fs_info, bytenr, seq, *leafs, tmp);
+       ulist_free(tmp);
+
+       if (ret < 0 && ret != -ENOENT) {
+               ulist_free(*leafs);
+               return ret;
+       }
+
+       return 0;
+}
+
+/*
+ * walk all backrefs for a given extent to find all roots that reference this
+ * extent. Walking a backref means finding all extents that reference this
+ * extent and in turn walk the backrefs of those, too. Naturally this is a
+ * recursive process, but here it is implemented in an iterative fashion: We
+ * find all referencing extents for the extent in question and put them on a
+ * list. In turn, we find all referencing extents for those, further appending
+ * to the list. The way we iterate the list allows adding more elements after
+ * the current while iterating. The process stops when we reach the end of the
+ * list. Found roots are added to the roots list.
+ *
+ * *roots is newly allocated here; on success the caller owns it, on error
+ * it has already been freed. num_bytes is not used here.
+ *
+ * returns 0 on success, < 0 on error.
+ */
+int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
+                               struct btrfs_fs_info *fs_info, u64 bytenr,
+                               u64 num_bytes, u64 seq, struct ulist **roots)
+{
+       struct ulist *tmp;
+       struct ulist_node *node = NULL;
+       int ret;
+
+       /* tmp is the work list of blocks whose backrefs are still to walk */
+       tmp = ulist_alloc(GFP_NOFS);
+       if (!tmp)
+               return -ENOMEM;
+       *roots = ulist_alloc(GFP_NOFS);
+       if (!*roots) {
+               ulist_free(tmp);
+               return -ENOMEM;
+       }
+
+       while (1) {
+               ret = find_parent_nodes(trans, fs_info, bytenr, seq,
+                                       tmp, *roots);
+               /* -ENOENT is not treated as an error, the walk continues */
+               if (ret < 0 && ret != -ENOENT) {
+                       ulist_free(tmp);
+                       ulist_free(*roots);
+                       return ret;
+               }
+               /* continue with the next block on the work list, if any */
+               node = ulist_next(tmp, node);
+               if (!node)
+                       break;
+               bytenr = node->val;
+       }
+
+       ulist_free(tmp);
+       return 0;
+}
+
 
 static int __inode_info(u64 inum, u64 ioff, u8 key_type,
                        struct btrfs_root *fs_root, struct btrfs_path *path,
@@ -181,8 +952,11 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
        btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
        if (found_key->type != BTRFS_EXTENT_ITEM_KEY ||
            found_key->objectid > logical ||
-           found_key->objectid + found_key->offset <= logical)
+           found_key->objectid + found_key->offset <= logical) {
+               pr_debug("logical %llu is not within any extent\n",
+                        (unsigned long long)logical);
                return -ENOENT;
+       }
 
        eb = path->nodes[0];
        item_size = btrfs_item_size_nr(eb, path->slots[0]);
@@ -191,6 +965,13 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
        ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
        flags = btrfs_extent_flags(eb, ei);
 
+       pr_debug("logical %llu is at position %llu within the extent (%llu "
+                "EXTENT_ITEM %llu) flags %#llx size %u\n",
+                (unsigned long long)logical,
+                (unsigned long long)(logical - found_key->objectid),
+                (unsigned long long)found_key->objectid,
+                (unsigned long long)found_key->offset,
+                (unsigned long long)flags, item_size);
        if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
                return BTRFS_EXTENT_FLAG_TREE_BLOCK;
        if (flags & BTRFS_EXTENT_FLAG_DATA)
@@ -287,128 +1068,11 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
        return 0;
 }
 
-static int __data_list_add(struct list_head *head, u64 inum,
-                               u64 extent_data_item_offset, u64 root)
-{
-       struct __data_ref *ref;
-
-       ref = kmalloc(sizeof(*ref), GFP_NOFS);
-       if (!ref)
-               return -ENOMEM;
-
-       ref->inum = inum;
-       ref->extent_data_item_offset = extent_data_item_offset;
-       ref->root = root;
-       list_add_tail(&ref->list, head);
-
-       return 0;
-}
-
-static int __data_list_add_eb(struct list_head *head, struct extent_buffer *eb,
-                               struct btrfs_extent_data_ref *dref)
-{
-       return __data_list_add(head, btrfs_extent_data_ref_objectid(eb, dref),
-                               btrfs_extent_data_ref_offset(eb, dref),
-                               btrfs_extent_data_ref_root(eb, dref));
-}
-
-static int __shared_list_add(struct list_head *head, u64 disk_byte)
-{
-       struct __shared_ref *ref;
-
-       ref = kmalloc(sizeof(*ref), GFP_NOFS);
-       if (!ref)
-               return -ENOMEM;
-
-       ref->disk_byte = disk_byte;
-       list_add_tail(&ref->list, head);
-
-       return 0;
-}
-
-static int __iter_shared_inline_ref_inodes(struct btrfs_fs_info *fs_info,
-                                          u64 logical, u64 inum,
-                                          u64 extent_data_item_offset,
-                                          u64 extent_offset,
-                                          struct btrfs_path *path,
-                                          struct list_head *data_refs,
-                                          iterate_extent_inodes_t *iterate,
-                                          void *ctx)
-{
-       u64 ref_root;
-       u32 item_size;
-       struct btrfs_key key;
-       struct extent_buffer *eb;
-       struct btrfs_extent_item *ei;
-       struct btrfs_extent_inline_ref *eiref;
-       struct __data_ref *ref;
-       int ret;
-       int type;
-       int last;
-       unsigned long ptr = 0;
-
-       WARN_ON(!list_empty(data_refs));
-       ret = extent_from_logical(fs_info, logical, path, &key);
-       if (ret & BTRFS_EXTENT_FLAG_DATA)
-               ret = -EIO;
-       if (ret < 0)
-               goto out;
-
-       eb = path->nodes[0];
-       ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
-       item_size = btrfs_item_size_nr(eb, path->slots[0]);
-
-       ret = 0;
-       ref_root = 0;
-       /*
-        * as done in iterate_extent_inodes, we first build a list of refs to
-        * iterate, then free the path and then iterate them to avoid deadlocks.
-        */
-       do {
-               last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
-                                               &eiref, &type);
-               if (last < 0) {
-                       ret = last;
-                       goto out;
-               }
-               if (type == BTRFS_TREE_BLOCK_REF_KEY ||
-                   type == BTRFS_SHARED_BLOCK_REF_KEY) {
-                       ref_root = btrfs_extent_inline_ref_offset(eb, eiref);
-                       ret = __data_list_add(data_refs, inum,
-                                               extent_data_item_offset,
-                                               ref_root);
-               }
-       } while (!ret && !last);
-
-       btrfs_release_path(path);
-
-       if (ref_root == 0) {
-               printk(KERN_ERR "btrfs: failed to find tree block ref "
-                       "for shared data backref %llu\n", logical);
-               WARN_ON(1);
-               ret = -EIO;
-       }
-
-out:
-       while (!list_empty(data_refs)) {
-               ref = list_first_entry(data_refs, struct __data_ref, list);
-               list_del(&ref->list);
-               if (!ret)
-                       ret = iterate(ref->inum, extent_offset +
-                                       ref->extent_data_item_offset,
-                                       ref->root, ctx);
-               kfree(ref);
-       }
-
-       return ret;
-}
-
-static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
-                                   u64 logical, u64 orig_extent_item_objectid,
-                                   u64 extent_offset, struct btrfs_path *path,
-                                   struct list_head *data_refs,
-                                   iterate_extent_inodes_t *iterate,
-                                   void *ctx)
+static int iterate_leaf_refs(struct btrfs_fs_info *fs_info,
+                               struct btrfs_path *path, u64 logical,
+                               u64 orig_extent_item_objectid,
+                               u64 extent_item_pos, u64 root,
+                               iterate_extent_inodes_t *iterate, void *ctx)
 {
        u64 disk_byte;
        struct btrfs_key key;
@@ -416,8 +1080,10 @@ static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
        struct extent_buffer *eb;
        int slot;
        int nritems;
-       int ret;
-       int found = 0;
+       int ret = 0;
+       int extent_type;
+       u64 data_offset;
+       u64 data_len;
 
        eb = read_tree_block(fs_info->tree_root, logical,
                                fs_info->tree_root->leafsize, 0);
@@ -435,149 +1101,99 @@ static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
                if (key.type != BTRFS_EXTENT_DATA_KEY)
                        continue;
                fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
-               if (!fi) {
-                       free_extent_buffer(eb);
-                       return -EIO;
-               }
+               extent_type = btrfs_file_extent_type(eb, fi);
+               if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+                       continue;
+               /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */
                disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
-               if (disk_byte != orig_extent_item_objectid) {
-                       if (found)
-                               break;
-                       else
-                               continue;
-               }
-               ++found;
-               ret = __iter_shared_inline_ref_inodes(fs_info, logical,
-                                                       key.objectid,
-                                                       key.offset,
-                                                       extent_offset, path,
-                                                       data_refs,
-                                                       iterate, ctx);
-               if (ret)
-                       break;
-       }
+               if (disk_byte != orig_extent_item_objectid)
+                       continue;
 
-       if (!found) {
-               printk(KERN_ERR "btrfs: failed to follow shared data backref "
-                       "to parent %llu\n", logical);
-               WARN_ON(1);
-               ret = -EIO;
+               data_offset = btrfs_file_extent_offset(eb, fi);
+               data_len = btrfs_file_extent_num_bytes(eb, fi);
+
+               if (extent_item_pos < data_offset ||
+                   extent_item_pos >= data_offset + data_len)
+                       continue;
+
+               pr_debug("ref for %llu resolved, key (%llu EXTEND_DATA %llu), "
+                               "root %llu\n", orig_extent_item_objectid,
+                               key.objectid, key.offset, root);
+               ret = iterate(key.objectid,
+                               key.offset + (extent_item_pos - data_offset),
+                               root, ctx);
+               if (ret) {
+                       pr_debug("stopping iteration because ret=%d\n", ret);
+                       break;
+               }
        }
 
        free_extent_buffer(eb);
+
        return ret;
 }
 
 /*
  * calls iterate() for every inode that references the extent identified by
- * the given parameters. will use the path given as a parameter and return it
- * released.
+ * the given parameters.
  * when the iterator function returns a non-zero value, iteration stops.
+ * path is guaranteed to be in released state when iterate() is called.
  */
 int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
                                struct btrfs_path *path,
-                               u64 extent_item_objectid,
-                               u64 extent_offset,
+                               u64 extent_item_objectid, u64 extent_item_pos,
                                iterate_extent_inodes_t *iterate, void *ctx)
 {
-       unsigned long ptr = 0;
-       int last;
        int ret;
-       int type;
-       u64 logical;
-       u32 item_size;
-       struct btrfs_extent_inline_ref *eiref;
-       struct btrfs_extent_data_ref *dref;
-       struct extent_buffer *eb;
-       struct btrfs_extent_item *ei;
-       struct btrfs_key key;
        struct list_head data_refs = LIST_HEAD_INIT(data_refs);
        struct list_head shared_refs = LIST_HEAD_INIT(shared_refs);
-       struct __data_ref *ref_d;
-       struct __shared_ref *ref_s;
-
-       eb = path->nodes[0];
-       ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
-       item_size = btrfs_item_size_nr(eb, path->slots[0]);
-
-       /* first we iterate the inline refs, ... */
-       do {
-               last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
-                                               &eiref, &type);
-               if (last == -ENOENT) {
-                       ret = 0;
-                       break;
-               }
-               if (last < 0) {
-                       ret = last;
-                       break;
-               }
+       struct btrfs_trans_handle *trans;
+       struct ulist *refs;
+       struct ulist *roots;
+       struct ulist_node *ref_node = NULL;
+       struct ulist_node *root_node = NULL;
+       struct seq_list seq_elem;
+       struct btrfs_delayed_ref_root *delayed_refs;
+
+       trans = btrfs_join_transaction(fs_info->extent_root);
+       if (IS_ERR(trans))
+               return PTR_ERR(trans);
+
+       pr_debug("resolving all inodes for extent %llu\n",
+                       extent_item_objectid);
+
+       delayed_refs = &trans->transaction->delayed_refs;
+       spin_lock(&delayed_refs->lock);
+       btrfs_get_delayed_seq(delayed_refs, &seq_elem);
+       spin_unlock(&delayed_refs->lock);
+
+       ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
+                                  extent_item_pos, seq_elem.seq,
+                                  &refs);
 
-               if (type == BTRFS_EXTENT_DATA_REF_KEY) {
-                       dref = (struct btrfs_extent_data_ref *)(&eiref->offset);
-                       ret = __data_list_add_eb(&data_refs, eb, dref);
-               } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
-                       logical = btrfs_extent_inline_ref_offset(eb, eiref);
-                       ret = __shared_list_add(&shared_refs, logical);
-               }
-       } while (!ret && !last);
+       if (ret)
+               goto out;
 
-       /* ... then we proceed to in-tree references and ... */
-       while (!ret) {
-               ++path->slots[0];
-               if (path->slots[0] > btrfs_header_nritems(eb)) {
-                       ret = btrfs_next_leaf(fs_info->extent_root, path);
-                       if (ret) {
-                               if (ret == 1)
-                                       ret = 0; /* we're done */
-                               break;
-                       }
-                       eb = path->nodes[0];
-               }
-               btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
-               if (key.objectid != extent_item_objectid)
+       while (!ret && (ref_node = ulist_next(refs, ref_node))) {
+               ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, -1,
+                                               seq_elem.seq, &roots);
+               if (ret)
                        break;
-               if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
-                       dref = btrfs_item_ptr(eb, path->slots[0],
-                                               struct btrfs_extent_data_ref);
-                       ret = __data_list_add_eb(&data_refs, eb, dref);
-               } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
-                       ret = __shared_list_add(&shared_refs, key.offset);
+               while (!ret && (root_node = ulist_next(roots, root_node))) {
+                       pr_debug("root %llu references leaf %llu\n",
+                                       root_node->val, ref_node->val);
+                       ret = iterate_leaf_refs(fs_info, path, ref_node->val,
+                                               extent_item_objectid,
+                                               extent_item_pos, root_node->val,
+                                               iterate, ctx);
                }
        }
 
-       btrfs_release_path(path);
-
-       /*
-        * ... only at the very end we can process the refs we found. this is
-        * because the iterator function we call is allowed to make tree lookups
-        * and we have to avoid deadlocks. additionally, we need more tree
-        * lookups ourselves for shared data refs.
-        */
-       while (!list_empty(&data_refs)) {
-               ref_d = list_first_entry(&data_refs, struct __data_ref, list);
-               list_del(&ref_d->list);
-               if (!ret)
-                       ret = iterate(ref_d->inum, extent_offset +
-                                       ref_d->extent_data_item_offset,
-                                       ref_d->root, ctx);
-               kfree(ref_d);
-       }
-
-       while (!list_empty(&shared_refs)) {
-               ref_s = list_first_entry(&shared_refs, struct __shared_ref,
-                                       list);
-               list_del(&ref_s->list);
-               if (!ret)
-                       ret = __iter_shared_inline_ref(fs_info,
-                                                       ref_s->disk_byte,
-                                                       extent_item_objectid,
-                                                       extent_offset, path,
-                                                       &data_refs,
-                                                       iterate, ctx);
-               kfree(ref_s);
-       }
-
+       ulist_free(refs);
+       ulist_free(roots);
+out:
+       btrfs_put_delayed_seq(delayed_refs, &seq_elem);
+       btrfs_end_transaction(trans, fs_info->extent_root);
        return ret;
 }
 
@@ -586,19 +1202,20 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
                                iterate_extent_inodes_t *iterate, void *ctx)
 {
        int ret;
-       u64 offset;
+       u64 extent_item_pos;
        struct btrfs_key found_key;
 
        ret = extent_from_logical(fs_info, logical, path,
                                        &found_key);
+       btrfs_release_path(path);
        if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
                ret = -EINVAL;
        if (ret < 0)
                return ret;
 
-       offset = logical - found_key.objectid;
+       extent_item_pos = logical - found_key.objectid;
        ret = iterate_extent_inodes(fs_info, path, found_key.objectid,
-                                       offset, iterate, ctx);
+                                       extent_item_pos, iterate, ctx);
 
        return ret;
 }
@@ -643,6 +1260,10 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
                for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) {
                        name_len = btrfs_inode_ref_name_len(eb, iref);
                        /* path must be released before calling iterate()! */
+                       pr_debug("following ref at offset %u for inode %llu in "
+                                "tree %llu\n", cur,
+                                (unsigned long long)found_key.objectid,
+                                (unsigned long long)fs_root->objectid);
                        ret = iterate(parent, iref, eb, ctx);
                        if (ret) {
                                free_extent_buffer(eb);
@@ -683,10 +1304,14 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
                return PTR_ERR(fspath);
 
        if (fspath > fspath_min) {
+               pr_debug("path resolved: %s\n", fspath);
                ipath->fspath->val[i] = (u64)(unsigned long)fspath;
                ++ipath->fspath->elem_cnt;
                ipath->fspath->bytes_left = fspath - fspath_min;
        } else {
+               pr_debug("missed path, not enough space. missing bytes: %lu, "
+                        "constructed so far: %s\n",
+                        (unsigned long)(fspath_min - fspath), fspath_min);
                ++ipath->fspath->elem_missed;
                ipath->fspath->bytes_missing += fspath_min - fspath;
                ipath->fspath->bytes_left = 0;
index 9261883..d00dfa9 100644 (file)
@@ -20,6 +20,7 @@
 #define __BTRFS_BACKREF__
 
 #include "ioctl.h"
+#include "ulist.h"
 
 struct inode_fs_paths {
        struct btrfs_path               *btrfs_path;
@@ -54,6 +55,10 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
 
 int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
 
+int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
+                               struct btrfs_fs_info *fs_info, u64 bytenr,
+                               u64 num_bytes, u64 seq, struct ulist **roots);
+
 struct btrfs_data_container *init_data_container(u32 total_bytes);
 struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
                                        struct btrfs_path *path);
index 634608d..9b9b15f 100644 (file)
@@ -51,6 +51,9 @@ struct btrfs_inode {
        /* held while logging the inode in tree-log.c */
        struct mutex log_mutex;
 
+       /* held while doing delalloc reservations */
+       struct mutex delalloc_mutex;
+
        /* used to order data wrt metadata */
        struct btrfs_ordered_inode_tree ordered_tree;
 
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
new file mode 100644 (file)
index 0000000..ad0b3ba
--- /dev/null
@@ -0,0 +1,3068 @@
+/*
+ * Copyright (C) STRATO AG 2011.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+/*
+ * This module can be used to catch cases when the btrfs kernel
+ * code executes write requests to the disk that bring the file
+ * system into an inconsistent state. In such a state, a power-loss
+ * or kernel panic event would cause the data on disk to be
+ * lost or at least damaged.
+ *
+ * Code is added that examines all block write requests during
+ * runtime (including writes of the super block). Three rules
+ * are verified and an error is printed on violation of the
+ * rules:
+ * 1. It is not allowed to write a disk block which is
+ *    currently referenced by the super block (either directly
+ *    or indirectly).
+ * 2. When a super block is written, it is verified that all
+ *    referenced (directly or indirectly) blocks fulfill the
+ *    following requirements:
+ *    2a. All referenced blocks have either been present when
+ *        the file system was mounted, (i.e., they have been
+ *        referenced by the super block) or they have been
+ *        written since then and the write completion callback
+ *        was called and a FLUSH request to the device where
+ *        these blocks are located was received and completed.
+ *    2b. All referenced blocks need to have a generation
+ *        number which is equal to the parent's number.
+ *
+ * One issue that was found using this module was that the log
+ * tree on disk became temporarily corrupted because disk blocks
+ * that had been in use for the log tree had been freed and
+ * reused too early, while being referenced by the written super
+ * block.
+ *
+ * The search term in the kernel log that can be used to filter
+ * on the existence of detected integrity issues is
+ * "btrfs: attempt".
+ *
+ * The integrity check is enabled via mount options. These
+ * mount options are only supported if the integrity check
+ * tool is compiled by defining BTRFS_FS_CHECK_INTEGRITY.
+ *
+ * Example #1, apply integrity checks to all metadata:
+ * mount /dev/sdb1 /mnt -o check_int
+ *
+ * Example #2, apply integrity checks to all metadata and
+ * to data extents:
+ * mount /dev/sdb1 /mnt -o check_int_data
+ *
+ * Example #3, apply integrity checks to all metadata and dump
+ * the tree that the super block references to kernel messages
+ * each time after a super block was written:
+ * mount /dev/sdb1 /mnt -o check_int,check_int_print_mask=263
+ *
+ * If the integrity check tool is included and activated in
+ * the mount options, plenty of kernel memory is used, and
+ * plenty of additional CPU cycles are spent. Enabling this
+ * functionality is not intended for normal use. In most
+ * cases, unless you are a btrfs developer who needs to verify
+ * the integrity of (super)-block write requests, do not
+ * enable the config option BTRFS_FS_CHECK_INTEGRITY to
+ * include and compile the integrity check tool.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/buffer_head.h>
+#include <linux/mutex.h>
+#include <linux/crc32c.h>
+#include <linux/genhd.h>
+#include <linux/blkdev.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "extent_io.h"
+#include "disk-io.h"
+#include "volumes.h"
+#include "print-tree.h"
+#include "locking.h"
+#include "check-integrity.h"
+
+/* number of list heads in each of the three lookup hashtables */
+#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000
+#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000
+#define BTRFSIC_DEV2STATE_HASHTABLE_SIZE 0x100
+/* values for the magic/magic_num members, only used for debug purposes */
+#define BTRFSIC_BLOCK_MAGIC_NUMBER 0x14491051
+#define BTRFSIC_BLOCK_LINK_MAGIC_NUMBER 0x11070807
+#define BTRFSIC_DEV2STATE_MAGIC_NUMBER 0x20111530
+/* note: unlike the magic numbers above, this literal is decimal (no 0x) */
+#define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300
+#define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6)   /* in characters,
+                                                        * excluding " [...]" */
+#define BTRFSIC_BLOCK_SIZE PAGE_SIZE
+
+/* all bits set; NOTE(review): presumably marks a not yet known generation */
+#define BTRFSIC_GENERATION_UNKNOWN ((u64)-1)
+
+/*
+ * The definition of the bitmask fields for the print_mask.
+ * They are specified with the mount option check_int_print_mask.
+ * The value 263 used in the example at the top of this file is 0x107, i.e.
+ * SUPERBLOCK_WRITE | ROOT_CHUNK_LOG_TREE_LOCATION | TREE_AFTER_SB_WRITE |
+ * INITIAL_TREE.
+ */
+#define BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE                    0x00000001
+#define BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION                0x00000002
+#define BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE                 0x00000004
+#define BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE                        0x00000008
+#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH                       0x00000010
+#define BTRFSIC_PRINT_MASK_END_IO_BIO_BH                       0x00000020
+#define BTRFSIC_PRINT_MASK_VERBOSE                             0x00000040
+#define BTRFSIC_PRINT_MASK_VERY_VERBOSE                                0x00000080
+#define BTRFSIC_PRINT_MASK_INITIAL_TREE                                0x00000100
+#define BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES                   0x00000200
+#define BTRFSIC_PRINT_MASK_INITIAL_DATABASE                    0x00000400
+#define BTRFSIC_PRINT_MASK_NUM_COPIES                          0x00000800
+#define BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS               0x00001000
+
+struct btrfsic_dev_state;
+struct btrfsic_state;
+
+/*
+ * One disk block known to the integrity checker. Blocks are looked up by
+ * block device and dev_bytenr, see btrfsic_block_hashtable_lookup().
+ */
+struct btrfsic_block {
+       u32 magic_num;          /* only used for debug purposes */
+       unsigned int is_metadata:1;     /* if it is meta-data, not data-data */
+       unsigned int is_superblock:1;   /* if it is one of the superblocks */
+       unsigned int is_iodone:1;       /* if is done by lower subsystem */
+       unsigned int iodone_w_error:1;  /* error was indicated to endio */
+       unsigned int never_written:1;   /* block was added because it was
+                                        * referenced, not because it was
+                                        * written */
+       unsigned int mirror_num:2;      /* large enough to hold
+                                        * BTRFS_SUPER_MIRROR_MAX */
+       struct btrfsic_dev_state *dev_state;
+       u64 dev_bytenr;         /* key, physical byte num on disk */
+       u64 logical_bytenr;     /* logical byte num on disk */
+       /* NOTE(review): presumably BTRFSIC_GENERATION_UNKNOWN until known */
+       u64 generation;
+       struct btrfs_disk_key disk_key; /* extra info to print in case of
+                                        * issues, will not always be correct */
+       /* NOTE(review): presumably the hashtable bucket linkage - confirm */
+       struct list_head collision_resolving_node;      /* list node */
+       /* NOTE(review): presumably on btrfsic_state.all_blocks_list - confirm */
+       struct list_head all_blocks_node;       /* list node */
+
+       /* the following two lists contain block_link items */
+       struct list_head ref_to_list;   /* list */
+       struct list_head ref_from_list; /* list */
+       /* NOTE(review): presumably chains blocks of one bio - confirm */
+       struct btrfsic_block *next_in_same_bio;
+       /*
+        * NOTE(review): the next two members look like the submitter's
+        * original private pointer and completion callback (bio or
+        * buffer_head flavour), kept while the checker handles completion
+        * itself - confirm against the submit/end_io code.
+        */
+       void *orig_bio_bh_private;
+       union {
+               bio_end_io_t *bio;
+               bh_end_io_t *bh;
+       } orig_bio_bh_end_io;
+       /* NOTE(review): presumably the rw flags of the submission - confirm */
+       int submit_bio_bh_rw;
+       u64 flush_gen; /* only valid if !never_written */
+};
+
+/*
+ * Elements of this type are allocated dynamically and required because
+ * each block object can refer to, and can be referred to from, multiple
+ * blocks. The key to look them up in the hashtable is the dev_bytenr of
+ * the block referred to plus the one of the block referred from (see
+ * btrfsic_block_link_hashtable_lookup(), which also takes both block
+ * devices).
+ * The fact that they are searchable via a hashtable and that a
+ * ref_cnt is maintained is not required for the btrfs integrity
+ * check algorithm itself, it is only used to make the output more
+ * beautiful in case that an error is detected (an error is defined
+ * as a write operation to a block while that block is still referenced).
+ */
+struct btrfsic_block_link {
+       u32 magic_num;          /* only used for debug purposes */
+       u32 ref_cnt;
+       /*
+        * NOTE(review): the two nodes below are presumably members of the
+        * ref_to_list/ref_from_list of the linked blocks - confirm which
+        * node goes on which block's list.
+        */
+       struct list_head node_ref_to;   /* list node */
+       struct list_head node_ref_from; /* list node */
+       struct list_head collision_resolving_node;      /* list node */
+       struct btrfsic_block *block_ref_to;     /* the block referred to */
+       struct btrfsic_block *block_ref_from;   /* the referring block */
+       u64 parent_generation;
+};
+
+/*
+ * State kept per block device; looked up by bdev, see
+ * btrfsic_dev_state_hashtable_lookup().
+ */
+struct btrfsic_dev_state {
+       u32 magic_num;          /* only used for debug purposes */
+       struct block_device *bdev;
+       struct btrfsic_state *state;
+       struct list_head collision_resolving_node;      /* list node */
+       /* NOTE(review): presumably stands in for FLUSH requests - confirm */
+       struct btrfsic_block dummy_block_for_bio_bh_flush;
+       /* NOTE(review): presumably compared with btrfsic_block.flush_gen */
+       u64 last_flush_gen;
+       char name[BDEVNAME_SIZE];
+};
+
+/*
+ * BTRFSIC_BLOCK_HASHTABLE_SIZE list heads used to look up btrfsic_block
+ * items. NOTE(review): presumably each head is one bucket chained through
+ * collision_resolving_node - confirm.
+ */
+struct btrfsic_block_hashtable {
+       struct list_head table[BTRFSIC_BLOCK_HASHTABLE_SIZE];
+};
+
+/*
+ * BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE list heads used to look up
+ * btrfsic_block_link items. NOTE(review): presumably each head is one
+ * bucket chained through collision_resolving_node - confirm.
+ */
+struct btrfsic_block_link_hashtable {
+       struct list_head table[BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE];
+};
+
+/*
+ * BTRFSIC_DEV2STATE_HASHTABLE_SIZE list heads used to look up
+ * btrfsic_dev_state items. NOTE(review): presumably each head is one
+ * bucket chained through collision_resolving_node - confirm.
+ */
+struct btrfsic_dev_state_hashtable {
+       struct list_head table[BTRFSIC_DEV2STATE_HASHTABLE_SIZE];
+};
+
+/*
+ * Describes one block's location (virtual and physical) together with
+ * pointers to its data and to the owning device state.
+ */
+struct btrfsic_block_data_ctx {
+       u64 start;              /* virtual bytenr */
+       u64 dev_bytenr;         /* physical bytenr on device */
+       u32 len;                /* NOTE(review): presumably in bytes */
+       struct btrfsic_dev_state *dev;
+       char *data;
+       struct buffer_head *bh; /* do not use if set to NULL */
+};
+
+/* This structure is used to implement recursion without occupying
+ * any stack space, refer to btrfsic_process_metablock().
+ * Frames are obtained from btrfsic_stack_frame_alloc() and released with
+ * btrfsic_stack_frame_free(). */
+struct btrfsic_stack_frame {
+       /* NOTE(review): presumably BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER */
+       u32 magic;
+       u32 nr;
+       int error;
+       int i;
+       int limit_nesting;
+       int num_copies;
+       int mirror_num;
+       struct btrfsic_block *block;
+       struct btrfsic_block_data_ctx *block_ctx;
+       struct btrfsic_block *next_block;
+       struct btrfsic_block_data_ctx next_block_ctx;
+       struct btrfs_header *hdr;
+       /* NOTE(review): presumably the frame of the emulated caller */
+       struct btrfsic_stack_frame *prev;
+};
+
+/* Some state per mounted filesystem */
+struct btrfsic_state {
+       u32 print_mask;         /* bitmask of BTRFSIC_PRINT_MASK_* values */
+       /* NOTE(review): presumably set for the check_int_data mount option */
+       int include_extent_data;
+       int csum_size;
+       struct list_head all_blocks_list;
+       struct btrfsic_block_hashtable block_hashtable;
+       struct btrfsic_block_link_hashtable block_link_hashtable;
+       struct btrfs_root *root;
+       u64 max_superblock_generation;
+       struct btrfsic_block *latest_superblock;
+};
+
+/*
+ * Forward declarations of the file-local (static) helpers, so that the
+ * definitions may reference each other regardless of their order.
+ */
+
+/* init/alloc/free of the block, block link and device state objects */
+static void btrfsic_block_init(struct btrfsic_block *b);
+static struct btrfsic_block *btrfsic_block_alloc(void);
+static void btrfsic_block_free(struct btrfsic_block *b);
+static void btrfsic_block_link_init(struct btrfsic_block_link *n);
+static struct btrfsic_block_link *btrfsic_block_link_alloc(void);
+static void btrfsic_block_link_free(struct btrfsic_block_link *n);
+static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds);
+static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void);
+static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds);
+/* hash tables for blocks, block links and device states */
+static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h);
+static void btrfsic_block_hashtable_add(struct btrfsic_block *b,
+                                       struct btrfsic_block_hashtable *h);
+static void btrfsic_block_hashtable_remove(struct btrfsic_block *b);
+static struct btrfsic_block *btrfsic_block_hashtable_lookup(
+               struct block_device *bdev,
+               u64 dev_bytenr,
+               struct btrfsic_block_hashtable *h);
+static void btrfsic_block_link_hashtable_init(
+               struct btrfsic_block_link_hashtable *h);
+static void btrfsic_block_link_hashtable_add(
+               struct btrfsic_block_link *l,
+               struct btrfsic_block_link_hashtable *h);
+static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l);
+static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
+               struct block_device *bdev_ref_to,
+               u64 dev_bytenr_ref_to,
+               struct block_device *bdev_ref_from,
+               u64 dev_bytenr_ref_from,
+               struct btrfsic_block_link_hashtable *h);
+static void btrfsic_dev_state_hashtable_init(
+               struct btrfsic_dev_state_hashtable *h);
+static void btrfsic_dev_state_hashtable_add(
+               struct btrfsic_dev_state *ds,
+               struct btrfsic_dev_state_hashtable *h);
+static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds);
+static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(
+               struct block_device *bdev,
+               struct btrfsic_dev_state_hashtable *h);
+/* stack frames used by btrfsic_process_metablock() */
+static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void);
+static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf);
+/* processing of superblocks and tree blocks */
+static int btrfsic_process_superblock(struct btrfsic_state *state,
+                                     struct btrfs_fs_devices *fs_devices);
+static int btrfsic_process_metablock(struct btrfsic_state *state,
+                                    struct btrfsic_block *block,
+                                    struct btrfsic_block_data_ctx *block_ctx,
+                                    struct btrfs_header *hdr,
+                                    int limit_nesting, int force_iodone_flag);
+static int btrfsic_create_link_to_next_block(
+               struct btrfsic_state *state,
+               struct btrfsic_block *block,
+               struct btrfsic_block_data_ctx
+               *block_ctx, u64 next_bytenr,
+               int limit_nesting,
+               struct btrfsic_block_data_ctx *next_block_ctx,
+               struct btrfsic_block **next_blockp,
+               int force_iodone_flag,
+               int *num_copiesp, int *mirror_nump,
+               struct btrfs_disk_key *disk_key,
+               u64 parent_generation);
+static int btrfsic_handle_extent_data(struct btrfsic_state *state,
+                                     struct btrfsic_block *block,
+                                     struct btrfsic_block_data_ctx *block_ctx,
+                                     u32 item_offset, int force_iodone_flag);
+static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
+                            struct btrfsic_block_data_ctx *block_ctx_out,
+                            int mirror_num);
+static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
+                                 u32 len, struct block_device *bdev,
+                                 struct btrfsic_block_data_ctx *block_ctx_out);
+static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
+static int btrfsic_read_block(struct btrfsic_state *state,
+                             struct btrfsic_block_data_ctx *block_ctx);
+static void btrfsic_dump_database(struct btrfsic_state *state);
+static int btrfsic_test_for_metadata(struct btrfsic_state *state,
+                                    const u8 *data, unsigned int size);
+static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
+                                         u64 dev_bytenr, u8 *mapped_data,
+                                         unsigned int len, struct bio *bio,
+                                         int *bio_is_patched,
+                                         struct buffer_head *bh,
+                                         int submit_bio_bh_rw);
+static int btrfsic_process_written_superblock(
+               struct btrfsic_state *state,
+               struct btrfsic_block *const block,
+               struct btrfs_super_block *const super_hdr);
+static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status);
+static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate);
+static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state,
+                                             const struct btrfsic_block *block,
+                                             int recursion_level);
+static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
+                                       struct btrfsic_block *const block,
+                                       int recursion_level);
+static void btrfsic_print_add_link(const struct btrfsic_state *state,
+                                  const struct btrfsic_block_link *l);
+static void btrfsic_print_rem_link(const struct btrfsic_state *state,
+                                  const struct btrfsic_block_link *l);
+static char btrfsic_get_block_type(const struct btrfsic_state *state,
+                                  const struct btrfsic_block *block);
+static void btrfsic_dump_tree(const struct btrfsic_state *state);
+static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
+                                 const struct btrfsic_block *block,
+                                 int indent_level);
+static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add(
+               struct btrfsic_state *state,
+               struct btrfsic_block_data_ctx *next_block_ctx,
+               struct btrfsic_block *next_block,
+               struct btrfsic_block *from_block,
+               u64 parent_generation);
+static struct btrfsic_block *btrfsic_block_lookup_or_add(
+               struct btrfsic_state *state,
+               struct btrfsic_block_data_ctx *block_ctx,
+               const char *additional_string,
+               int is_metadata,
+               int is_iodone,
+               int never_written,
+               int mirror_num,
+               int *was_created);
+static int btrfsic_process_superblock_dev_mirror(
+               struct btrfsic_state *state,
+               struct btrfsic_dev_state *dev_state,
+               struct btrfs_device *device,
+               int superblock_mirror_num,
+               struct btrfsic_dev_state **selected_dev_state,
+               struct btrfs_super_block *selected_super);
+static struct btrfsic_dev_state *btrfsic_dev_state_lookup(
+               struct block_device *bdev);
+static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
+                                          u64 bytenr,
+                                          struct btrfsic_dev_state *dev_state,
+                                          u64 dev_bytenr, char *data);
+
+/*
+ * File-scope state shared by all filesystems using the integrity checker.
+ * NOTE(review): presumably btrfsic_mutex serializes access to the data
+ * below and btrfsic_is_initialized records the one-time setup (including
+ * the mutex itself) - neither is visible in this part of the file, confirm.
+ */
+static struct mutex btrfsic_mutex;
+static int btrfsic_is_initialized;
+/* device states, hashed by their block_device pointer */
+static struct btrfsic_dev_state_hashtable btrfsic_dev_state_hashtable;
+
+
+/*
+ * Bring a block descriptor into its pristine state: magic number set,
+ * generation unknown, every flag and counter cleared, all pointers NULL
+ * and all list heads empty.
+ */
+static void btrfsic_block_init(struct btrfsic_block *b)
+{
+       /* list membership */
+       INIT_LIST_HEAD(&b->collision_resolving_node);
+       INIT_LIST_HEAD(&b->all_blocks_node);
+       INIT_LIST_HEAD(&b->ref_to_list);
+       INIT_LIST_HEAD(&b->ref_from_list);
+
+       /* identity and location */
+       b->magic_num = BTRFSIC_BLOCK_MAGIC_NUMBER;
+       b->dev_state = NULL;
+       b->dev_bytenr = 0;
+       b->logical_bytenr = 0;
+       b->mirror_num = 0;
+       b->generation = BTRFSIC_GENERATION_UNKNOWN;
+       b->disk_key.objectid = 0;
+       b->disk_key.type = 0;
+       b->disk_key.offset = 0;
+
+       /* classification and I/O status flags */
+       b->is_metadata = 0;
+       b->is_superblock = 0;
+       b->is_iodone = 0;
+       b->iodone_w_error = 0;
+       b->never_written = 0;
+
+       /* bookkeeping for the bio/bh the block is submitted with */
+       b->next_in_same_bio = NULL;
+       b->orig_bio_bh_private = NULL;
+       b->orig_bio_bh_end_io.bio = NULL;
+       b->submit_bio_bh_rw = 0;
+       b->flush_gen = 0;
+}
+
+/*
+ * Allocate (GFP_NOFS, zeroed) and initialize a block descriptor.
+ * Returns NULL if the allocation fails.
+ */
+static struct btrfsic_block *btrfsic_block_alloc(void)
+{
+       struct btrfsic_block *block = kzalloc(sizeof(*block), GFP_NOFS);
+
+       if (!block)
+               return NULL;
+
+       btrfsic_block_init(block);
+       return block;
+}
+
+/*
+ * Free a block descriptor. NULL is accepted; a non-NULL pointer whose
+ * magic number is wrong triggers BUG().
+ */
+static void btrfsic_block_free(struct btrfsic_block *b)
+{
+       BUG_ON(b && b->magic_num != BTRFSIC_BLOCK_MAGIC_NUMBER);
+       kfree(b);
+}
+
+/*
+ * Initialize a block link: magic number set, reference count 1, not on
+ * any list and not yet attached to a source or a target block.
+ */
+static void btrfsic_block_link_init(struct btrfsic_block_link *l)
+{
+       /* no end points yet */
+       l->block_ref_from = NULL;
+       l->block_ref_to = NULL;
+
+       INIT_LIST_HEAD(&l->collision_resolving_node);
+       INIT_LIST_HEAD(&l->node_ref_from);
+       INIT_LIST_HEAD(&l->node_ref_to);
+
+       l->ref_cnt = 1;
+       l->magic_num = BTRFSIC_BLOCK_LINK_MAGIC_NUMBER;
+}
+
+/*
+ * Allocate (GFP_NOFS, zeroed) and initialize a block link.
+ * Returns NULL if the allocation fails.
+ */
+static struct btrfsic_block_link *btrfsic_block_link_alloc(void)
+{
+       struct btrfsic_block_link *link = kzalloc(sizeof(*link), GFP_NOFS);
+
+       if (!link)
+               return NULL;
+
+       btrfsic_block_link_init(link);
+       return link;
+}
+
+/*
+ * Free a block link. NULL is accepted; a non-NULL pointer whose magic
+ * number is wrong triggers BUG().
+ */
+static void btrfsic_block_link_free(struct btrfsic_block_link *l)
+{
+       BUG_ON(l && l->magic_num != BTRFSIC_BLOCK_LINK_MAGIC_NUMBER);
+       kfree(l);
+}
+
+/*
+ * Initialize a device state: no block device and no filesystem state
+ * attached, empty name, flush generation 0. The embedded dummy block is
+ * initialized as well, marked I/O-done and pointed back at this device
+ * state.
+ */
+static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds)
+{
+       struct btrfsic_block *const dummy = &ds->dummy_block_for_bio_bh_flush;
+
+       ds->magic_num = BTRFSIC_DEV2STATE_MAGIC_NUMBER;
+       ds->bdev = NULL;
+       ds->state = NULL;
+       ds->name[0] = '\0';
+       INIT_LIST_HEAD(&ds->collision_resolving_node);
+       ds->last_flush_gen = 0;
+
+       btrfsic_block_init(dummy);
+       dummy->is_iodone = 1;
+       dummy->dev_state = ds;
+}
+
+/*
+ * Allocate (GFP_NOFS, zeroed) and initialize a device state.
+ * Returns NULL if the allocation fails.
+ */
+static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void)
+{
+       struct btrfsic_dev_state *dev_state;
+
+       dev_state = kzalloc(sizeof(*dev_state), GFP_NOFS);
+       if (!dev_state)
+               return NULL;
+
+       btrfsic_dev_state_init(dev_state);
+       return dev_state;
+}
+
+/*
+ * Free a device state. NULL is accepted; a non-NULL pointer whose magic
+ * number is wrong triggers BUG().
+ */
+static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds)
+{
+       BUG_ON(ds && ds->magic_num != BTRFSIC_DEV2STATE_MAGIC_NUMBER);
+       kfree(ds);
+}
+
+/* Make every bucket of the block hash table an empty list. */
+static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h)
+{
+       int bucket = 0;
+
+       while (bucket < BTRFSIC_BLOCK_HASHTABLE_SIZE) {
+               INIT_LIST_HEAD(&h->table[bucket]);
+               bucket++;
+       }
+}
+
+/*
+ * Insert a block at the head of its bucket. The bucket is selected by
+ * dev_bytenr >> 16 XORed with the block_device pointer, masked to the
+ * table size. b->dev_state must already be set.
+ */
+static void btrfsic_block_hashtable_add(struct btrfsic_block *b,
+                                       struct btrfsic_block_hashtable *h)
+{
+       unsigned int bucket;
+
+       bucket = (unsigned int)(b->dev_bytenr >> 16);
+       bucket ^= (unsigned int)((uintptr_t)b->dev_state->bdev);
+       bucket &= BTRFSIC_BLOCK_HASHTABLE_SIZE - 1;
+
+       list_add(&b->collision_resolving_node, &h->table[bucket]);
+}
+
+/* Unlink a block from the hash bucket it is chained into. */
+static void btrfsic_block_hashtable_remove(struct btrfsic_block *b)
+{
+       struct list_head *const node = &b->collision_resolving_node;
+
+       list_del(node);
+}
+
+/*
+ * Find the block registered for (bdev, dev_bytenr).
+ * Returns the matching block, or NULL if there is none.
+ */
+static struct btrfsic_block *btrfsic_block_hashtable_lookup(
+               struct block_device *bdev,
+               u64 dev_bytenr,
+               struct btrfsic_block_hashtable *h)
+{
+       struct btrfsic_block *candidate;
+       unsigned int bucket;
+
+       /* same hash as used by btrfsic_block_hashtable_add() */
+       bucket = (unsigned int)(dev_bytenr >> 16);
+       bucket ^= (unsigned int)((uintptr_t)bdev);
+       bucket &= BTRFSIC_BLOCK_HASHTABLE_SIZE - 1;
+
+       list_for_each_entry(candidate, &h->table[bucket],
+                           collision_resolving_node) {
+               if (candidate->dev_state->bdev != bdev)
+                       continue;
+               if (candidate->dev_bytenr == dev_bytenr)
+                       return candidate;
+       }
+
+       return NULL;
+}
+
+/* Make every bucket of the block link hash table an empty list. */
+static void btrfsic_block_link_hashtable_init(
+               struct btrfsic_block_link_hashtable *h)
+{
+       int bucket = 0;
+
+       while (bucket < BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE) {
+               INIT_LIST_HEAD(&h->table[bucket]);
+               bucket++;
+       }
+}
+
+/*
+ * Insert a link at the head of its bucket. The bucket is selected by
+ * XORing dev_bytenr >> 16 and the block_device pointer of both the
+ * referenced and the referencing block, masked to the table size.
+ *
+ * Both l->block_ref_to and l->block_ref_from must be set (and have a
+ * dev_state) before the link is added; a NULL end point triggers BUG().
+ */
+static void btrfsic_block_link_hashtable_add(
+               struct btrfsic_block_link *l,
+               struct btrfsic_block_link_hashtable *h)
+{
+       unsigned int hashval;
+
+       /*
+        * These checks have to come before the hash computation, which
+        * dereferences both pointers. Checking afterwards can never fire
+        * and lets the compiler drop the checks altogether.
+        */
+       BUG_ON(NULL == l->block_ref_to);
+       BUG_ON(NULL == l->block_ref_from);
+
+       hashval =
+           (((unsigned int)(l->block_ref_to->dev_bytenr >> 16)) ^
+            ((unsigned int)(l->block_ref_from->dev_bytenr >> 16)) ^
+            ((unsigned int)((uintptr_t)l->block_ref_to->dev_state->bdev)) ^
+            ((unsigned int)((uintptr_t)l->block_ref_from->dev_state->bdev)))
+            & (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);
+
+       list_add(&l->collision_resolving_node, h->table + hashval);
+}
+
+/* Unlink a block link from the hash bucket it is chained into. */
+static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l)
+{
+       struct list_head *const node = &l->collision_resolving_node;
+
+       list_del(node);
+}
+
+/*
+ * Find the link that leads from the block at (bdev_ref_from,
+ * dev_bytenr_ref_from) to the block at (bdev_ref_to, dev_bytenr_ref_to).
+ * Returns the matching link, or NULL if there is none. Every link that is
+ * visited in the bucket must have both end points set, otherwise BUG().
+ */
+static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
+               struct block_device *bdev_ref_to,
+               u64 dev_bytenr_ref_to,
+               struct block_device *bdev_ref_from,
+               u64 dev_bytenr_ref_from,
+               struct btrfsic_block_link_hashtable *h)
+{
+       struct btrfsic_block_link *candidate;
+       unsigned int bucket;
+
+       /* same hash as used by btrfsic_block_link_hashtable_add() */
+       bucket = (unsigned int)(dev_bytenr_ref_to >> 16);
+       bucket ^= (unsigned int)(dev_bytenr_ref_from >> 16);
+       bucket ^= (unsigned int)((uintptr_t)bdev_ref_to);
+       bucket ^= (unsigned int)((uintptr_t)bdev_ref_from);
+       bucket &= BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1;
+
+       list_for_each_entry(candidate, &h->table[bucket],
+                           collision_resolving_node) {
+               const struct btrfsic_block *to;
+               const struct btrfsic_block *from;
+
+               BUG_ON(NULL == candidate->block_ref_to);
+               BUG_ON(NULL == candidate->block_ref_from);
+               to = candidate->block_ref_to;
+               from = candidate->block_ref_from;
+
+               if (to->dev_state->bdev != bdev_ref_to)
+                       continue;
+               if (to->dev_bytenr != dev_bytenr_ref_to)
+                       continue;
+               if (from->dev_state->bdev != bdev_ref_from)
+                       continue;
+               if (from->dev_bytenr != dev_bytenr_ref_from)
+                       continue;
+
+               return candidate;
+       }
+
+       return NULL;
+}
+
+/* Make every bucket of the device state hash table an empty list. */
+static void btrfsic_dev_state_hashtable_init(
+               struct btrfsic_dev_state_hashtable *h)
+{
+       int bucket = 0;
+
+       while (bucket < BTRFSIC_DEV2STATE_HASHTABLE_SIZE) {
+               INIT_LIST_HEAD(&h->table[bucket]);
+               bucket++;
+       }
+}
+
+/*
+ * Insert a device state at the head of its bucket. The bucket is selected
+ * by the block_device pointer, masked to the table size.
+ */
+static void btrfsic_dev_state_hashtable_add(
+               struct btrfsic_dev_state *ds,
+               struct btrfsic_dev_state_hashtable *h)
+{
+       unsigned int bucket;
+
+       bucket = (unsigned int)((uintptr_t)ds->bdev);
+       bucket &= BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1;
+
+       list_add(&ds->collision_resolving_node, &h->table[bucket]);
+}
+
+/* Unlink a device state from the hash bucket it is chained into. */
+static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds)
+{
+       struct list_head *const node = &ds->collision_resolving_node;
+
+       list_del(node);
+}
+
+/*
+ * Find the device state registered for a block device.
+ * Returns the matching device state, or NULL if there is none.
+ */
+static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(
+               struct block_device *bdev,
+               struct btrfsic_dev_state_hashtable *h)
+{
+       struct btrfsic_dev_state *candidate;
+       unsigned int bucket;
+
+       /* same hash as used by btrfsic_dev_state_hashtable_add() */
+       bucket = (unsigned int)((uintptr_t)bdev);
+       bucket &= BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1;
+
+       list_for_each_entry(candidate, &h->table[bucket],
+                           collision_resolving_node) {
+               if (candidate->bdev == bdev)
+                       return candidate;
+       }
+
+       return NULL;
+}
+
+/*
+ * Build the initial block database, starting from the superblocks.
+ *
+ * Step 1: for every device of @fs_devices that has a bdev and a name, all
+ * BTRFS_SUPER_MIRROR_MAX superblock copies are handed to
+ * btrfsic_process_superblock_dev_mirror(), which registers them and copies
+ * the one with the highest generation into a temporary buffer. A failure
+ * for the primary copy (index 0) aborts; failures for the other copies
+ * are tolerated.
+ *
+ * Step 2: the root tree, the chunk tree and (if its bytenr is not 0) the
+ * log tree referenced by the selected superblock are read, once per
+ * mirror, and handed to btrfsic_process_metablock().
+ *
+ * Returns -1 if memory cannot be allocated, no superblock was found, or a
+ * tree root cannot be mapped or read; the error of the primary superblock
+ * copy if that one fails; otherwise the result of the last
+ * btrfsic_process_metablock() call (0 if there was none).
+ */
+static int btrfsic_process_superblock(struct btrfsic_state *state,
+                                     struct btrfs_fs_devices *fs_devices)
+{
+       int ret = 0;
+       struct btrfs_super_block *selected_super;
+       struct list_head *dev_head = &fs_devices->devices;
+       struct btrfs_device *device;
+       struct btrfsic_dev_state *selected_dev_state = NULL;
+       int pass;
+
+       BUG_ON(NULL == state);
+       selected_super = kmalloc(sizeof(*selected_super), GFP_NOFS);
+       if (NULL == selected_super) {
+               printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+               return -1;
+       }
+
+       list_for_each_entry(device, dev_head, dev_list) {
+               int i;
+               struct btrfsic_dev_state *dev_state;
+
+               if (!device->bdev || !device->name)
+                       continue;
+
+               dev_state = btrfsic_dev_state_lookup(device->bdev);
+               BUG_ON(NULL == dev_state);
+               for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+                       ret = btrfsic_process_superblock_dev_mirror(
+                                       state, dev_state, device, i,
+                                       &selected_dev_state, selected_super);
+                       /* only the primary copy is mandatory */
+                       if (0 != ret && 0 == i)
+                               goto out;
+               }
+       }
+
+       if (NULL == state->latest_superblock) {
+               printk(KERN_INFO "btrfsic: no superblock found!\n");
+               ret = -1;
+               goto out;
+       }
+
+       /*
+        * A tolerated failure of a backup superblock copy must not leak
+        * into the return value in case no tree root is processed below.
+        */
+       ret = 0;
+
+       state->csum_size = btrfs_super_csum_size(selected_super);
+
+       for (pass = 0; pass < 3; pass++) {
+               int num_copies;
+               int mirror_num;
+               u64 next_bytenr;
+
+               switch (pass) {
+               case 0:
+                       next_bytenr = btrfs_super_root(selected_super);
+                       if (state->print_mask &
+                           BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+                               printk(KERN_INFO "root@%llu\n",
+                                      (unsigned long long)next_bytenr);
+                       break;
+               case 1:
+                       next_bytenr = btrfs_super_chunk_root(selected_super);
+                       if (state->print_mask &
+                           BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+                               printk(KERN_INFO "chunk@%llu\n",
+                                      (unsigned long long)next_bytenr);
+                       break;
+               case 2:
+                       next_bytenr = btrfs_super_log_root(selected_super);
+                       /* a log root of 0 means: there is no log tree */
+                       if (0 == next_bytenr)
+                               continue;
+                       if (state->print_mask &
+                           BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+                               printk(KERN_INFO "log@%llu\n",
+                                      (unsigned long long)next_bytenr);
+                       break;
+               }
+
+               num_copies =
+                   btrfs_num_copies(&state->root->fs_info->mapping_tree,
+                                    next_bytenr, PAGE_SIZE);
+               if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+                       printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+                              (unsigned long long)next_bytenr, num_copies);
+
+               for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+                       struct btrfsic_block *next_block;
+                       struct btrfsic_block_data_ctx tmp_next_block_ctx;
+                       struct btrfsic_block_link *l;
+                       struct btrfs_header *hdr;
+
+                       ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
+                                               &tmp_next_block_ctx,
+                                               mirror_num);
+                       if (ret) {
+                               printk(KERN_INFO "btrfsic:"
+                                      " btrfsic_map_block(root @%llu,"
+                                      " mirror %d) failed!\n",
+                                      (unsigned long long)next_bytenr,
+                                      mirror_num);
+                               ret = -1;
+                               goto out;
+                       }
+
+                       /*
+                        * Block and link were registered in step 1 by
+                        * btrfsic_process_superblock_dev_mirror().
+                        */
+                       next_block = btrfsic_block_hashtable_lookup(
+                                       tmp_next_block_ctx.dev->bdev,
+                                       tmp_next_block_ctx.dev_bytenr,
+                                       &state->block_hashtable);
+                       BUG_ON(NULL == next_block);
+
+                       l = btrfsic_block_link_hashtable_lookup(
+                                       tmp_next_block_ctx.dev->bdev,
+                                       tmp_next_block_ctx.dev_bytenr,
+                                       state->latest_superblock->dev_state->
+                                       bdev,
+                                       state->latest_superblock->dev_bytenr,
+                                       &state->block_link_hashtable);
+                       BUG_ON(NULL == l);
+
+                       ret = btrfsic_read_block(state, &tmp_next_block_ctx);
+                       if (ret < (int)BTRFSIC_BLOCK_SIZE) {
+                               printk(KERN_INFO
+                                      "btrfsic: read @logical %llu failed!\n",
+                                      (unsigned long long)
+                                      tmp_next_block_ctx.start);
+                               btrfsic_release_block_ctx(&tmp_next_block_ctx);
+                               ret = -1;
+                               goto out;
+                       }
+
+                       /*
+                        * NOTE(review): the result is overwritten by the
+                        * next mirror/pass, only the last one is returned.
+                        */
+                       hdr = (struct btrfs_header *)tmp_next_block_ctx.data;
+                       ret = btrfsic_process_metablock(state,
+                                                       next_block,
+                                                       &tmp_next_block_ctx,
+                                                       hdr,
+                                                       BTRFS_MAX_LEVEL + 3, 1);
+                       btrfsic_release_block_ctx(&tmp_next_block_ctx);
+               }
+       }
+
+out:
+       kfree(selected_super);
+       return ret;
+}
+
+/*
+ * Register one superblock copy of one device in the block database.
+ *
+ * The 4096 byte block holding superblock copy @superblock_mirror_num is
+ * read from device->bdev. If its bytenr, magic or device uuid does not
+ * match, the copy is silently ignored (return 0). Otherwise a block
+ * descriptor is looked up or created for it. If its generation is higher
+ * than state->max_superblock_generation (or that value is still 0), the
+ * superblock is copied to @selected_super, @selected_dev_state is set to
+ * @dev_state and state->latest_superblock/max_superblock_generation are
+ * updated. Finally, for the root tree, the chunk tree and (if its bytenr
+ * is not 0) the log tree referenced by this copy, a block and a link from
+ * the superblock to that block are looked up or added for every mirror.
+ *
+ * Returns 0 on success (including the "ignored" case) and -1 if the read,
+ * an allocation, the mapping of a tree root or the creation of a block or
+ * link fails.
+ */
+static int btrfsic_process_superblock_dev_mirror(
+               struct btrfsic_state *state,
+               struct btrfsic_dev_state *dev_state,
+               struct btrfs_device *device,
+               int superblock_mirror_num,
+               struct btrfsic_dev_state **selected_dev_state,
+               struct btrfs_super_block *selected_super)
+{
+       struct btrfs_super_block *super_tmp;
+       u64 dev_bytenr;
+       struct buffer_head *bh;
+       struct btrfsic_block *superblock_tmp;
+       int pass;
+       struct block_device *const superblock_bdev = device->bdev;
+
+       /* super block bytenr is always the unmapped device bytenr */
+       dev_bytenr = btrfs_sb_offset(superblock_mirror_num);
+       bh = __bread(superblock_bdev, dev_bytenr / 4096, 4096);
+       if (NULL == bh)
+               return -1;
+       /* super_tmp points into bh's data, it is valid until brelse(bh) */
+       super_tmp = (struct btrfs_super_block *)
+           (bh->b_data + (dev_bytenr & 4095));
+
+       /* not a superblock of this device: ignore this copy */
+       if (btrfs_super_bytenr(super_tmp) != dev_bytenr ||
+           strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC,
+                   sizeof(super_tmp->magic)) ||
+           memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE)) {
+               brelse(bh);
+               return 0;
+       }
+
+       superblock_tmp =
+           btrfsic_block_hashtable_lookup(superblock_bdev,
+                                          dev_bytenr,
+                                          &state->block_hashtable);
+       if (NULL == superblock_tmp) {
+               superblock_tmp = btrfsic_block_alloc();
+               if (NULL == superblock_tmp) {
+                       printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+                       brelse(bh);
+                       return -1;
+               }
+               /* for superblock, only the dev_bytenr makes sense */
+               superblock_tmp->dev_bytenr = dev_bytenr;
+               superblock_tmp->dev_state = dev_state;
+               superblock_tmp->logical_bytenr = dev_bytenr;
+               superblock_tmp->generation = btrfs_super_generation(super_tmp);
+               superblock_tmp->is_metadata = 1;
+               superblock_tmp->is_superblock = 1;
+               superblock_tmp->is_iodone = 1;
+               superblock_tmp->never_written = 0;
+               /* mirror_num is 1-based, superblock_mirror_num is 0-based */
+               superblock_tmp->mirror_num = 1 + superblock_mirror_num;
+               if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
+                       printk(KERN_INFO "New initial S-block (bdev %p, %s)"
+                              " @%llu (%s/%llu/%d)\n",
+                              superblock_bdev, device->name,
+                              (unsigned long long)dev_bytenr,
+                              dev_state->name,
+                              (unsigned long long)dev_bytenr,
+                              superblock_mirror_num);
+               list_add(&superblock_tmp->all_blocks_node,
+                        &state->all_blocks_list);
+               btrfsic_block_hashtable_add(superblock_tmp,
+                                           &state->block_hashtable);
+       }
+
+       /* select the one with the highest generation field */
+       if (btrfs_super_generation(super_tmp) >
+           state->max_superblock_generation ||
+           0 == state->max_superblock_generation) {
+               memcpy(selected_super, super_tmp, sizeof(*selected_super));
+               *selected_dev_state = dev_state;
+               state->max_superblock_generation =
+                   btrfs_super_generation(super_tmp);
+               state->latest_superblock = superblock_tmp;
+       }
+
+       /* pass 0: root tree, pass 1: chunk tree, pass 2: log tree */
+       for (pass = 0; pass < 3; pass++) {
+               u64 next_bytenr;
+               int num_copies;
+               int mirror_num;
+               const char *additional_string = NULL;
+               struct btrfs_disk_key tmp_disk_key;
+
+               tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY;
+               tmp_disk_key.offset = 0;
+               switch (pass) {
+               case 0:
+                       tmp_disk_key.objectid =
+                           cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID);
+                       additional_string = "initial root ";
+                       next_bytenr = btrfs_super_root(super_tmp);
+                       break;
+               case 1:
+                       tmp_disk_key.objectid =
+                           cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID);
+                       additional_string = "initial chunk ";
+                       next_bytenr = btrfs_super_chunk_root(super_tmp);
+                       break;
+               case 2:
+                       tmp_disk_key.objectid =
+                           cpu_to_le64(BTRFS_TREE_LOG_OBJECTID);
+                       additional_string = "initial log ";
+                       next_bytenr = btrfs_super_log_root(super_tmp);
+                       /* a log root of 0 is skipped */
+                       if (0 == next_bytenr)
+                               continue;
+                       break;
+               }
+
+               num_copies =
+                   btrfs_num_copies(&state->root->fs_info->mapping_tree,
+                                    next_bytenr, PAGE_SIZE);
+               if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+                       printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+                              (unsigned long long)next_bytenr, num_copies);
+               for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+                       struct btrfsic_block *next_block;
+                       struct btrfsic_block_data_ctx tmp_next_block_ctx;
+                       struct btrfsic_block_link *l;
+
+                       if (btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
+                                             &tmp_next_block_ctx,
+                                             mirror_num)) {
+                               printk(KERN_INFO "btrfsic: btrfsic_map_block("
+                                      "bytenr @%llu, mirror %d) failed!\n",
+                                      (unsigned long long)next_bytenr,
+                                      mirror_num);
+                               brelse(bh);
+                               return -1;
+                       }
+
+                       next_block = btrfsic_block_lookup_or_add(
+                                       state, &tmp_next_block_ctx,
+                                       additional_string, 1, 1, 0,
+                                       mirror_num, NULL);
+                       if (NULL == next_block) {
+                               btrfsic_release_block_ctx(&tmp_next_block_ctx);
+                               brelse(bh);
+                               return -1;
+                       }
+
+                       next_block->disk_key = tmp_disk_key;
+                       next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
+                       /* link: superblock copy -> tree root block */
+                       l = btrfsic_block_link_lookup_or_add(
+                                       state, &tmp_next_block_ctx,
+                                       next_block, superblock_tmp,
+                                       BTRFSIC_GENERATION_UNKNOWN);
+                       btrfsic_release_block_ctx(&tmp_next_block_ctx);
+                       if (NULL == l) {
+                               brelse(bh);
+                               return -1;
+                       }
+               }
+       }
+       if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES)
+               btrfsic_dump_tree_sub(state, superblock_tmp, 0);
+
+       brelse(bh);
+       return 0;
+}
+
+/*
+ * Allocate a zeroed stack frame (GFP_NOFS) and stamp it with its magic
+ * number. Logs a message and returns NULL if the allocation fails.
+ */
+static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void)
+{
+       struct btrfsic_stack_frame *frame = kzalloc(sizeof(*frame), GFP_NOFS);
+
+       if (frame) {
+               frame->magic = BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER;
+               return frame;
+       }
+
+       printk(KERN_INFO "btrfsic: alloc memory failed!\n");
+       return NULL;
+}
+
+/*
+ * Free a stack frame. NULL is accepted; a non-NULL pointer whose magic
+ * number is wrong triggers BUG().
+ */
+static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf)
+{
+       BUG_ON(sf && sf->magic != BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER);
+       kfree(sf);
+}
+
+static int btrfsic_process_metablock(
+               struct btrfsic_state *state,
+               struct btrfsic_block *const first_block,
+               struct btrfsic_block_data_ctx *const first_block_ctx,
+               struct btrfs_header *const first_hdr,
+               int first_limit_nesting, int force_iodone_flag)
+{
+       struct btrfsic_stack_frame initial_stack_frame = { 0 };
+       struct btrfsic_stack_frame *sf;
+       struct btrfsic_stack_frame *next_stack;
+
+       sf = &initial_stack_frame;
+       sf->error = 0;
+       sf->i = -1;
+       sf->limit_nesting = first_limit_nesting;
+       sf->block = first_block;
+       sf->block_ctx = first_block_ctx;
+       sf->next_block = NULL;
+       sf->hdr = first_hdr;
+       sf->prev = NULL;
+
+continue_with_new_stack_frame:
+       sf->block->generation = le64_to_cpu(sf->hdr->generation);
+       if (0 == sf->hdr->level) {
+               struct btrfs_leaf *const leafhdr =
+                   (struct btrfs_leaf *)sf->hdr;
+
+               if (-1 == sf->i) {
+                       sf->nr = le32_to_cpu(leafhdr->header.nritems);
+
+                       if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                               printk(KERN_INFO
+                                      "leaf %llu items %d generation %llu"
+                                      " owner %llu\n",
+                                      (unsigned long long)
+                                      sf->block_ctx->start,
+                                      sf->nr,
+                                      (unsigned long long)
+                                      le64_to_cpu(leafhdr->header.generation),
+                                      (unsigned long long)
+                                      le64_to_cpu(leafhdr->header.owner));
+               }
+
+continue_with_current_leaf_stack_frame:
+               if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) {
+                       sf->i++;
+                       sf->num_copies = 0;
+               }
+
+               if (sf->i < sf->nr) {
+                       struct btrfs_item *disk_item = leafhdr->items + sf->i;
+                       struct btrfs_disk_key *disk_key = &disk_item->key;
+                       u8 type;
+                       const u32 item_offset = le32_to_cpu(disk_item->offset);
+
+                       type = disk_key->type;
+
+                       if (BTRFS_ROOT_ITEM_KEY == type) {
+                               const struct btrfs_root_item *const root_item =
+                                   (struct btrfs_root_item *)
+                                   (sf->block_ctx->data +
+                                    offsetof(struct btrfs_leaf, items) +
+                                    item_offset);
+                               const u64 next_bytenr =
+                                   le64_to_cpu(root_item->bytenr);
+
+                               sf->error =
+                                   btrfsic_create_link_to_next_block(
+                                               state,
+                                               sf->block,
+                                               sf->block_ctx,
+                                               next_bytenr,
+                                               sf->limit_nesting,
+                                               &sf->next_block_ctx,
+                                               &sf->next_block,
+                                               force_iodone_flag,
+                                               &sf->num_copies,
+                                               &sf->mirror_num,
+                                               disk_key,
+                                               le64_to_cpu(root_item->
+                                               generation));
+                               if (sf->error)
+                                       goto one_stack_frame_backwards;
+
+                               if (NULL != sf->next_block) {
+                                       struct btrfs_header *const next_hdr =
+                                           (struct btrfs_header *)
+                                           sf->next_block_ctx.data;
+
+                                       next_stack =
+                                           btrfsic_stack_frame_alloc();
+                                       if (NULL == next_stack) {
+                                               btrfsic_release_block_ctx(
+                                                               &sf->
+                                                               next_block_ctx);
+                                               goto one_stack_frame_backwards;
+                                       }
+
+                                       next_stack->i = -1;
+                                       next_stack->block = sf->next_block;
+                                       next_stack->block_ctx =
+                                           &sf->next_block_ctx;
+                                       next_stack->next_block = NULL;
+                                       next_stack->hdr = next_hdr;
+                                       next_stack->limit_nesting =
+                                           sf->limit_nesting - 1;
+                                       next_stack->prev = sf;
+                                       sf = next_stack;
+                                       goto continue_with_new_stack_frame;
+                               }
+                       } else if (BTRFS_EXTENT_DATA_KEY == type &&
+                                  state->include_extent_data) {
+                               sf->error = btrfsic_handle_extent_data(
+                                               state,
+                                               sf->block,
+                                               sf->block_ctx,
+                                               item_offset,
+                                               force_iodone_flag);
+                               if (sf->error)
+                                       goto one_stack_frame_backwards;
+                       }
+
+                       goto continue_with_current_leaf_stack_frame;
+               }
+       } else {
+               struct btrfs_node *const nodehdr = (struct btrfs_node *)sf->hdr;
+
+               if (-1 == sf->i) {
+                       sf->nr = le32_to_cpu(nodehdr->header.nritems);
+
+                       if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                               printk(KERN_INFO "node %llu level %d items %d"
+                                      " generation %llu owner %llu\n",
+                                      (unsigned long long)
+                                      sf->block_ctx->start,
+                                      nodehdr->header.level, sf->nr,
+                                      (unsigned long long)
+                                      le64_to_cpu(nodehdr->header.generation),
+                                      (unsigned long long)
+                                      le64_to_cpu(nodehdr->header.owner));
+               }
+
+continue_with_current_node_stack_frame:
+               if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) {
+                       sf->i++;
+                       sf->num_copies = 0;
+               }
+
+               if (sf->i < sf->nr) {
+                       struct btrfs_key_ptr *disk_key_ptr =
+                           nodehdr->ptrs + sf->i;
+                       const u64 next_bytenr =
+                           le64_to_cpu(disk_key_ptr->blockptr);
+
+                       sf->error = btrfsic_create_link_to_next_block(
+                                       state,
+                                       sf->block,
+                                       sf->block_ctx,
+                                       next_bytenr,
+                                       sf->limit_nesting,
+                                       &sf->next_block_ctx,
+                                       &sf->next_block,
+                                       force_iodone_flag,
+                                       &sf->num_copies,
+                                       &sf->mirror_num,
+                                       &disk_key_ptr->key,
+                                       le64_to_cpu(disk_key_ptr->generation));
+                       if (sf->error)
+                               goto one_stack_frame_backwards;
+
+                       if (NULL != sf->next_block) {
+                               struct btrfs_header *const next_hdr =
+                                   (struct btrfs_header *)
+                                   sf->next_block_ctx.data;
+
+                               next_stack = btrfsic_stack_frame_alloc();
+                               if (NULL == next_stack)
+                                       goto one_stack_frame_backwards;
+
+                               next_stack->i = -1;
+                               next_stack->block = sf->next_block;
+                               next_stack->block_ctx = &sf->next_block_ctx;
+                               next_stack->next_block = NULL;
+                               next_stack->hdr = next_hdr;
+                               next_stack->limit_nesting =
+                                   sf->limit_nesting - 1;
+                               next_stack->prev = sf;
+                               sf = next_stack;
+                               goto continue_with_new_stack_frame;
+                       }
+
+                       goto continue_with_current_node_stack_frame;
+               }
+       }
+
+one_stack_frame_backwards:
+       if (NULL != sf->prev) {
+               struct btrfsic_stack_frame *const prev = sf->prev;
+
+               /* the one for the initial block is freed in the caller */
+               btrfsic_release_block_ctx(sf->block_ctx);
+
+               if (sf->error) {
+                       prev->error = sf->error;
+                       btrfsic_stack_frame_free(sf);
+                       sf = prev;
+                       goto one_stack_frame_backwards;
+               }
+
+               btrfsic_stack_frame_free(sf);
+               sf = prev;
+               goto continue_with_new_stack_frame;
+       } else {
+               BUG_ON(&initial_stack_frame != sf);
+       }
+
+       return sf->error;
+}
+
+/*
+ * Record that @block references the tree block at logical address
+ * @next_bytenr and, if nesting is still allowed, read that block so the
+ * caller can descend into it.
+ *
+ * The function is called repeatedly for one reference.  The first call
+ * (*num_copiesp == 0) asks btrfs_num_copies() for the number of mirrors and
+ * sets *mirror_nump to 1.  Each successful call handles the mirror in
+ * *mirror_nump and then increments it.  Once *mirror_nump is larger than
+ * *num_copiesp the call does nothing and returns 0.
+ *
+ * *next_blockp is set to the referenced block only when @limit_nesting > 0
+ * and the link had to be newly allocated, in which case the block's data has
+ * been read into @next_block_ctx.  In every other case it is NULL.
+ *
+ * Returns 0 on success and -1 on failure; on failure @next_block_ctx has
+ * been passed to btrfsic_release_block_ctx().
+ */
+static int btrfsic_create_link_to_next_block(
+               struct btrfsic_state *state,
+               struct btrfsic_block *block,
+               struct btrfsic_block_data_ctx *block_ctx,
+               u64 next_bytenr,
+               int limit_nesting,
+               struct btrfsic_block_data_ctx *next_block_ctx,
+               struct btrfsic_block **next_blockp,
+               int force_iodone_flag,
+               int *num_copiesp, int *mirror_nump,
+               struct btrfs_disk_key *disk_key,
+               u64 parent_generation)
+{
+       struct btrfsic_block_link *link = NULL;
+       struct btrfsic_block *target;
+       int target_is_new;
+       int link_is_new;
+       int err;
+
+       *next_blockp = NULL;
+
+       /* First call for this reference: find out how many mirrors exist. */
+       if (!*num_copiesp) {
+               *num_copiesp =
+                   btrfs_num_copies(&state->root->fs_info->mapping_tree,
+                                    next_bytenr, PAGE_SIZE);
+               if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+                       printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+                              (unsigned long long)next_bytenr, *num_copiesp);
+               *mirror_nump = 1;
+       }
+
+       /* All mirrors of this reference have been handled already. */
+       if (*mirror_nump > *num_copiesp)
+               return 0;
+
+       if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+               printk(KERN_INFO
+                      "btrfsic_create_link_to_next_block(mirror_num=%d)\n",
+                      *mirror_nump);
+
+       err = btrfsic_map_block(state, next_bytenr, BTRFSIC_BLOCK_SIZE,
+                               next_block_ctx, *mirror_nump);
+       if (err) {
+               printk(KERN_INFO
+                      "btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n",
+                      (unsigned long long)next_bytenr, *mirror_nump);
+               goto fail;
+       }
+
+       target = btrfsic_block_lookup_or_add(state, next_block_ctx,
+                                            "referenced ",
+                                            1, force_iodone_flag,
+                                            !force_iodone_flag,
+                                            *mirror_nump,
+                                            &target_is_new);
+       if (!target)
+               goto fail;
+
+       if (target_is_new) {
+               /* A block that was just created cannot have a link yet. */
+               target->generation = BTRFSIC_GENERATION_UNKNOWN;
+       } else {
+               /*
+                * A non-metadata block whose stored logical address is 0 is
+                * exempt from the mismatch report.
+                */
+               if (target->logical_bytenr != next_bytenr &&
+                   (target->is_metadata || 0 != target->logical_bytenr)) {
+                       printk(KERN_INFO
+                              "Referenced block @%llu (%s/%llu/%d)"
+                              " found in hash table, %c,"
+                              " bytenr mismatch (!= stored %llu).\n",
+                              (unsigned long long)next_bytenr,
+                              next_block_ctx->dev->name,
+                              (unsigned long long)next_block_ctx->dev_bytenr,
+                              *mirror_nump,
+                              btrfsic_get_block_type(state, target),
+                              (unsigned long long)target->logical_bytenr);
+               } else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) {
+                       printk(KERN_INFO
+                              "Referenced block @%llu (%s/%llu/%d)"
+                              " found in hash table, %c.\n",
+                              (unsigned long long)next_bytenr,
+                              next_block_ctx->dev->name,
+                              (unsigned long long)next_block_ctx->dev_bytenr,
+                              *mirror_nump,
+                              btrfsic_get_block_type(state, target));
+               }
+
+               target->logical_bytenr = next_bytenr;
+               target->mirror_num = *mirror_nump;
+
+               link = btrfsic_block_link_hashtable_lookup(
+                               next_block_ctx->dev->bdev,
+                               next_block_ctx->dev_bytenr,
+                               block_ctx->dev->bdev,
+                               block_ctx->dev_bytenr,
+                               &state->block_link_hashtable);
+       }
+
+       target->disk_key = *disk_key;
+
+       link_is_new = !link;
+       if (link_is_new) {
+               link = btrfsic_block_link_alloc();
+               if (!link) {
+                       printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+                       goto fail;
+               }
+
+               link->block_ref_to = target;
+               link->block_ref_from = block;
+               link->ref_cnt = 1;
+               link->parent_generation = parent_generation;
+
+               if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                       btrfsic_print_add_link(state, link);
+
+               list_add(&link->node_ref_to, &block->ref_to_list);
+               list_add(&link->node_ref_from, &target->ref_from_list);
+
+               btrfsic_block_link_hashtable_add(link,
+                                                &state->block_link_hashtable);
+       } else if (0 == limit_nesting) {
+               /* Existing link: only counted again when nesting is 0. */
+               link->ref_cnt++;
+               link->parent_generation = parent_generation;
+               if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                       btrfsic_print_add_link(state, link);
+       }
+
+       /* Only a newly linked block is read and handed back for descent. */
+       if (limit_nesting > 0 && link_is_new) {
+               err = btrfsic_read_block(state, next_block_ctx);
+               if (err < (int)BTRFSIC_BLOCK_SIZE) {
+                       printk(KERN_INFO
+                              "btrfsic: read block @logical %llu failed!\n",
+                              (unsigned long long)next_bytenr);
+                       goto fail;
+               }
+
+               *next_blockp = target;
+       }
+
+       (*mirror_nump)++;
+
+       return 0;
+
+fail:
+       btrfsic_release_block_ctx(next_block_ctx);
+       *next_blockp = NULL;
+       return -1;
+}
+
+/*
+ * Create block and link records for the data extent described by the file
+ * extent item located at offsetof(struct btrfs_leaf, items) + @item_offset
+ * inside @block_ctx->data.
+ *
+ * Nothing is done unless the item type is BTRFS_FILE_EXTENT_REG and its
+ * disk_bytenr is non-zero.  Otherwise the extent is walked in pieces of at
+ * most BTRFSIC_BLOCK_SIZE bytes, and for each piece every mirror reported by
+ * btrfs_num_copies() is mapped, looked up (or added) as a block and linked
+ * to @block.  No extent data is read here.
+ *
+ * Returns 0 on success and -1 on the first failure.
+ */
+static int btrfsic_handle_extent_data(
+               struct btrfsic_state *state,
+               struct btrfsic_block *block,
+               struct btrfsic_block_data_ctx *block_ctx,
+               u32 item_offset, int force_iodone_flag)
+{
+       struct btrfs_file_extent_item *fi =
+           (struct btrfs_file_extent_item *)
+           (block_ctx->data + offsetof(struct btrfs_leaf, items) +
+            item_offset);
+       const u64 disk_bytenr = le64_to_cpu(fi->disk_bytenr);
+       const u64 extent_offset = le64_to_cpu(fi->offset);
+       const u64 generation = le64_to_cpu(fi->generation);
+       u64 remaining = le64_to_cpu(fi->num_bytes);
+       u64 next_bytenr = disk_bytenr + extent_offset;
+       struct btrfsic_block_link *link;
+       int err;
+
+       if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
+               printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu,"
+                      " offset = %llu, num_bytes = %llu\n",
+                      fi->type,
+                      (unsigned long long)disk_bytenr,
+                      (unsigned long long)extent_offset,
+                      (unsigned long long)remaining);
+
+       if (fi->type != BTRFS_FILE_EXTENT_REG || !disk_bytenr)
+               return 0;
+
+       while (remaining > 0) {
+               const u32 chunk_len = (remaining > BTRFSIC_BLOCK_SIZE) ?
+                                     BTRFSIC_BLOCK_SIZE : remaining;
+               int num_copies;
+               int mirror_num;
+
+               num_copies =
+                   btrfs_num_copies(&state->root->fs_info->mapping_tree,
+                                    next_bytenr, PAGE_SIZE);
+               if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+                       printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+                              (unsigned long long)next_bytenr, num_copies);
+
+               for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+                       struct btrfsic_block_data_ctx mirror_ctx;
+                       struct btrfsic_block *target;
+                       int target_is_new;
+
+                       if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                               printk(KERN_INFO "btrfsic_handle_extent_data("
+                                      "mirror_num=%d)\n", mirror_num);
+                       if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
+                               printk(KERN_INFO
+                                      "\tdisk_bytenr = %llu, num_bytes %u\n",
+                                      (unsigned long long)next_bytenr,
+                                      chunk_len);
+
+                       err = btrfsic_map_block(state, next_bytenr,
+                                               chunk_len, &mirror_ctx,
+                                               mirror_num);
+                       if (err) {
+                               printk(KERN_INFO
+                                      "btrfsic: btrfsic_map_block(@%llu,"
+                                      " mirror=%d) failed!\n",
+                                      (unsigned long long)next_bytenr,
+                                      mirror_num);
+                               return -1;
+                       }
+
+                       target = btrfsic_block_lookup_or_add(
+                                       state,
+                                       &mirror_ctx,
+                                       "referenced ",
+                                       0,
+                                       force_iodone_flag,
+                                       !force_iodone_flag,
+                                       mirror_num,
+                                       &target_is_new);
+                       if (!target) {
+                               printk(KERN_INFO
+                                      "btrfsic: error, kmalloc failed!\n");
+                               btrfsic_release_block_ctx(&mirror_ctx);
+                               return -1;
+                       }
+
+                       if (!target_is_new) {
+                               /*
+                                * A non-metadata block whose stored logical
+                                * address is 0 is exempt from the report.
+                                */
+                               if (target->logical_bytenr != next_bytenr &&
+                                   (target->is_metadata ||
+                                    0 != target->logical_bytenr)) {
+                                       printk(KERN_INFO
+                                              "Referenced block"
+                                              " @%llu (%s/%llu/%d)"
+                                              " found in hash table, D,"
+                                              " bytenr mismatch"
+                                              " (!= stored %llu).\n",
+                                              (unsigned long long)next_bytenr,
+                                              mirror_ctx.dev->name,
+                                              (unsigned long long)
+                                              mirror_ctx.dev_bytenr,
+                                              mirror_num,
+                                              (unsigned long long)
+                                              target->logical_bytenr);
+                               }
+                               target->logical_bytenr = next_bytenr;
+                               target->mirror_num = mirror_num;
+                       }
+
+                       link = btrfsic_block_link_lookup_or_add(state,
+                                                               &mirror_ctx,
+                                                               target, block,
+                                                               generation);
+                       btrfsic_release_block_ctx(&mirror_ctx);
+                       if (!link)
+                               return -1;
+               }
+
+               next_bytenr += chunk_len;
+               remaining -= chunk_len;
+       }
+
+       return 0;
+}
+
+/*
+ * Translate the logical range (@bytenr, @len) of mirror @mirror_num into a
+ * device and a physical offset and initialize @block_ctx_out with them.
+ * Only the first stripe of the mapping is used.  No data is read: the data
+ * and bh members are set to NULL.
+ *
+ * Returns 0 on success, the error reported by btrfs_map_block() if the
+ * mapping fails, or -ENXIO if btrfsic_dev_state_lookup() does not know the
+ * mapped device.  If the mapping itself fails, @block_ctx_out is reset to
+ * zero/NULL values so that callers may still pass it to
+ * btrfsic_release_block_ctx().
+ */
+static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
+                            struct btrfsic_block_data_ctx *block_ctx_out,
+                            int mirror_num)
+{
+       int ret;
+       u64 length;
+       struct btrfs_bio *multi = NULL;
+       struct btrfs_device *device;
+
+       length = len;
+       ret = btrfs_map_block(&state->root->fs_info->mapping_tree, READ,
+                             bytenr, &length, &multi, mirror_num);
+       if (ret) {
+               /*
+                * The mapping failed, so "multi" must not be dereferenced
+                * (it may still be NULL).  Hand back an empty context.
+                */
+               block_ctx_out->dev = NULL;
+               block_ctx_out->dev_bytenr = 0;
+               block_ctx_out->start = 0;
+               block_ctx_out->len = 0;
+               block_ctx_out->data = NULL;
+               block_ctx_out->bh = NULL;
+
+               return ret;
+       }
+
+       device = multi->stripes[0].dev;
+       block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev);
+       block_ctx_out->dev_bytenr = multi->stripes[0].physical;
+       block_ctx_out->start = bytenr;
+       block_ctx_out->len = len;
+       block_ctx_out->data = NULL;
+       block_ctx_out->bh = NULL;
+
+       /* The stripe information has been copied; the mapping can go. */
+       kfree(multi);
+
+       if (NULL == block_ctx_out->dev) {
+               ret = -ENXIO;
+               printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n");
+       }
+
+       return ret;
+}
+
+/*
+ * Initialize @block_ctx_out for a super block located at @bytenr on @bdev.
+ * @bytenr is stored both as the start and as the device offset, and no
+ * data is read (the data and bh members are set to NULL).
+ *
+ * Returns 0 on success or -ENXIO if btrfsic_dev_state_lookup() does not
+ * know @bdev.
+ */
+static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
+                                 u32 len, struct block_device *bdev,
+                                 struct btrfsic_block_data_ctx *block_ctx_out)
+{
+       block_ctx_out->dev = btrfsic_dev_state_lookup(bdev);
+       block_ctx_out->start = bytenr;
+       block_ctx_out->dev_bytenr = bytenr;
+       block_ctx_out->len = len;
+       block_ctx_out->bh = NULL;
+       block_ctx_out->data = NULL;
+
+       if (!block_ctx_out->dev) {
+               printk(KERN_INFO "btrfsic: error, cannot lookup dev (#2)!\n");
+               return -ENXIO;
+       }
+
+       return 0;
+}
+
+/*
+ * Drop the buffer head held by @block_ctx, if there is one, and clear the
+ * pointer so that a second call does nothing.
+ */
+static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
+{
+       if (!block_ctx->bh)
+               return;
+
+       brelse(block_ctx->bh);
+       block_ctx->bh = NULL;
+}
+
+/*
+ * Read the block described by @block_ctx from its device with __bread() and
+ * make block_ctx->data point at the buffer contents.
+ *
+ * The device offset must be a multiple of 4096 and the length must not
+ * exceed 4096 bytes.
+ *
+ * Returns block_ctx->len on success and -1 on failure.  On failure
+ * block_ctx->bh is NULL.
+ */
+static int btrfsic_read_block(struct btrfsic_state *state,
+                             struct btrfsic_block_data_ctx *block_ctx)
+{
+       struct buffer_head *bh;
+
+       block_ctx->bh = NULL;
+
+       if (block_ctx->dev_bytenr & 4095) {
+               printk(KERN_INFO
+                      "btrfsic: read_block() with unaligned bytenr %llu\n",
+                      (unsigned long long)block_ctx->dev_bytenr);
+               return -1;
+       }
+
+       if (block_ctx->len > 4096) {
+               printk(KERN_INFO
+                      "btrfsic: read_block() with too huge size %d\n",
+                      block_ctx->len);
+               return -1;
+       }
+
+       /* dev_bytenr >> 12 converts the byte offset into a 4096 byte block. */
+       bh = __bread(block_ctx->dev->bdev, block_ctx->dev_bytenr >> 12, 4096);
+       block_ctx->bh = bh;
+       if (!bh)
+               return -1;
+
+       block_ctx->data = bh->b_data;
+
+       return block_ctx->len;
+}
+
+/*
+ * Print every block on state->all_blocks_list to the kernel log at
+ * KERN_INFO level, each followed by its outgoing links ("refers ... to")
+ * and its incoming links ("is ref ... from").  @state must not be NULL.
+ */
+static void btrfsic_dump_database(struct btrfsic_state *state)
+{
+       const struct btrfsic_block *blk;
+
+       BUG_ON(!state);
+
+       printk(KERN_INFO "all_blocks_list:\n");
+       list_for_each_entry(blk, &state->all_blocks_list, all_blocks_node) {
+               const struct btrfsic_block_link *link;
+
+               printk(KERN_INFO "%c-block @%llu (%s/%llu/%d)\n",
+                      btrfsic_get_block_type(state, blk),
+                      (unsigned long long)blk->logical_bytenr,
+                      blk->dev_state->name,
+                      (unsigned long long)blk->dev_bytenr,
+                      blk->mirror_num);
+
+               /* Links for which this block is the referencing side. */
+               list_for_each_entry(link, &blk->ref_to_list, node_ref_to) {
+                       const struct btrfsic_block *const to =
+                           link->block_ref_to;
+
+                       printk(KERN_INFO " %c @%llu (%s/%llu/%d)"
+                              " refers %u* to"
+                              " %c @%llu (%s/%llu/%d)\n",
+                              btrfsic_get_block_type(state, blk),
+                              (unsigned long long)blk->logical_bytenr,
+                              blk->dev_state->name,
+                              (unsigned long long)blk->dev_bytenr,
+                              blk->mirror_num,
+                              link->ref_cnt,
+                              btrfsic_get_block_type(state, to),
+                              (unsigned long long)to->logical_bytenr,
+                              to->dev_state->name,
+                              (unsigned long long)to->dev_bytenr,
+                              to->mirror_num);
+               }
+
+               /* Links for which this block is the referenced side. */
+               list_for_each_entry(link, &blk->ref_from_list, node_ref_from) {
+                       const struct btrfsic_block *const from =
+                           link->block_ref_from;
+
+                       printk(KERN_INFO " %c @%llu (%s/%llu/%d)"
+                              " is ref %u* from"
+                              " %c @%llu (%s/%llu/%d)\n",
+                              btrfsic_get_block_type(state, blk),
+                              (unsigned long long)blk->logical_bytenr,
+                              blk->dev_state->name,
+                              (unsigned long long)blk->dev_bytenr,
+                              blk->mirror_num,
+                              link->ref_cnt,
+                              btrfsic_get_block_type(state, from),
+                              (unsigned long long)from->logical_bytenr,
+                              from->dev_state->name,
+                              (unsigned long long)from->dev_bytenr,
+                              from->mirror_num);
+               }
+
+               printk(KERN_INFO "\n");
+       }
+}
+
+/*
+ * Test whether the disk block contains a tree block (leaf or node).
+ * (Note that this test fails for the super block.)
+ *
+ * The block is accepted when the fsid in its header equals the fsid of the
+ * file system and the crc32c over everything behind the checksum field, up
+ * to PAGE_SIZE, equals the checksum stored in the header.
+ *
+ * Returns 0 if both checks pass and 1 otherwise.  @size is not used; the
+ * checksum always covers PAGE_SIZE bytes of @data.
+ */
+static int btrfsic_test_for_metadata(struct btrfsic_state *state,
+                                    const u8 *data, unsigned int size)
+{
+       const struct btrfs_header *hdr = (const struct btrfs_header *)data;
+       u8 computed[BTRFS_CSUM_SIZE];
+       u32 crc;
+       int fsid_mismatch;
+       int csum_mismatch;
+
+       fsid_mismatch = memcmp(hdr->fsid, state->root->fs_info->fsid,
+                              BTRFS_UUID_SIZE) != 0;
+
+       crc = crc32c(~(u32)0, data + BTRFS_CSUM_SIZE,
+                    PAGE_SIZE - BTRFS_CSUM_SIZE);
+       btrfs_csum_final(crc, computed);
+       csum_mismatch = memcmp(computed, hdr->csum, state->csum_size) != 0;
+
+       return fsid_mismatch || csum_mismatch;
+}
+
+static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
+                                         u64 dev_bytenr,
+                                         u8 *mapped_data, unsigned int len,
+                                         struct bio *bio,
+                                         int *bio_is_patched,
+                                         struct buffer_head *bh,
+                                         int submit_bio_bh_rw)
+{
+       int is_metadata;
+       struct btrfsic_block *block;
+       struct btrfsic_block_data_ctx block_ctx;
+       int ret;
+       struct btrfsic_state *state = dev_state->state;
+       struct block_device *bdev = dev_state->bdev;
+
+       WARN_ON(len > PAGE_SIZE);
+       is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_data, len));
+       if (NULL != bio_is_patched)
+               *bio_is_patched = 0;
+
+       block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr,
+                                              &state->block_hashtable);
+       if (NULL != block) {
+               u64 bytenr;
+               struct list_head *elem_ref_to;
+               struct list_head *tmp_ref_to;
+
+               if (block->is_superblock) {
+                       bytenr = le64_to_cpu(((struct btrfs_super_block *)
+                                             mapped_data)->bytenr);
+                       is_metadata = 1;
+                       if (state->print_mask &
+                           BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) {
+                               printk(KERN_INFO
+                                      "[before new superblock is written]:\n");
+                               btrfsic_dump_tree_sub(state, block, 0);
+                       }
+               }
+               if (is_metadata) {
+                       if (!block->is_superblock) {
+                               bytenr = le64_to_cpu(((struct btrfs_header *)
+                                                     mapped_data)->bytenr);
+                               btrfsic_cmp_log_and_dev_bytenr(state, bytenr,
+                                                              dev_state,
+                                                              dev_bytenr,
+                                                              mapped_data);
+                       }
+                       if (block->logical_bytenr != bytenr) {
+                               printk(KERN_INFO
+                                      "Written block @%llu (%s/%llu/%d)"
+                                      " found in hash table, %c,"
+                                      " bytenr mismatch"
+                                      " (!= stored %llu).\n",
+                                      (unsigned long long)bytenr,
+                                      dev_state->name,
+                                      (unsigned long long)dev_bytenr,
+                                      block->mirror_num,
+                                      btrfsic_get_block_type(state, block),
+                                      (unsigned long long)
+                                      block->logical_bytenr);
+                               block->logical_bytenr = bytenr;
+                       } else if (state->print_mask &
+                                  BTRFSIC_PRINT_MASK_VERBOSE)
+                               printk(KERN_INFO
+                                      "Written block @%llu (%s/%llu/%d)"
+                                      " found in hash table, %c.\n",
+                                      (unsigned long long)bytenr,
+                                      dev_state->name,
+                                      (unsigned long long)dev_bytenr,
+                                      block->mirror_num,
+                                      btrfsic_get_block_type(state, block));
+               } else {
+                       bytenr = block->logical_bytenr;
+                       if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                               printk(KERN_INFO
+                                      "Written block @%llu (%s/%llu/%d)"
+                                      " found in hash table, %c.\n",
+                                      (unsigned long long)bytenr,
+                                      dev_state->name,
+                                      (unsigned long long)dev_bytenr,
+                                      block->mirror_num,
+                                      btrfsic_get_block_type(state, block));
+               }
+
+               if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                       printk(KERN_INFO
+                              "ref_to_list: %cE, ref_from_list: %cE\n",
+                              list_empty(&block->ref_to_list) ? ' ' : '!',
+                              list_empty(&block->ref_from_list) ? ' ' : '!');
+               if (btrfsic_is_block_ref_by_superblock(state, block, 0)) {
+                       printk(KERN_INFO "btrfs: attempt to overwrite %c-block"
+                              " @%llu (%s/%llu/%d), old(gen=%llu,"
+                              " objectid=%llu, type=%d, offset=%llu),"
+                              " new(gen=%llu),"
+                              " which is referenced by most recent superblock"
+                              " (superblockgen=%llu)!\n",
+                              btrfsic_get_block_type(state, block),
+                              (unsigned long long)bytenr,
+                              dev_state->name,
+                              (unsigned long long)dev_bytenr,
+                              block->mirror_num,
+                              (unsigned long long)block->generation,
+                              (unsigned long long)
+                              le64_to_cpu(block->disk_key.objectid),
+                              block->disk_key.type,
+                              (unsigned long long)
+                              le64_to_cpu(block->disk_key.offset),
+                              (unsigned long long)
+                              le64_to_cpu(((struct btrfs_header *)
+                                           mapped_data)->generation),
+                              (unsigned long long)
+                              state->max_superblock_generation);
+                       btrfsic_dump_tree(state);
+               }
+
+               if (!block->is_iodone && !block->never_written) {
+                       printk(KERN_INFO "btrfs: attempt to overwrite %c-block"
+                              " @%llu (%s/%llu/%d), oldgen=%llu, newgen=%llu,"
+                              " which is not yet iodone!\n",
+                              btrfsic_get_block_type(state, block),
+                              (unsigned long long)bytenr,
+                              dev_state->name,
+                              (unsigned long long)dev_bytenr,
+                              block->mirror_num,
+                              (unsigned long long)block->generation,
+                              (unsigned long long)
+                              le64_to_cpu(((struct btrfs_header *)
+                                           mapped_data)->generation));
+                       /* it would not be safe to go on */
+                       btrfsic_dump_tree(state);
+                       return;
+               }
+
+               /*
+                * Clear all references of this block. Do not free
+                * the block itself even if it is not referenced anymore
+                * because it still carries valuable information
+                * like whether it was ever written and IO completed.
+                */
+               list_for_each_safe(elem_ref_to, tmp_ref_to,
+                                  &block->ref_to_list) {
+                       struct btrfsic_block_link *const l =
+                           list_entry(elem_ref_to,
+                                      struct btrfsic_block_link,
+                                      node_ref_to);
+
+                       if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                               btrfsic_print_rem_link(state, l);
+                       l->ref_cnt--;
+                       if (0 == l->ref_cnt) {
+                               list_del(&l->node_ref_to);
+                               list_del(&l->node_ref_from);
+                               btrfsic_block_link_hashtable_remove(l);
+                               btrfsic_block_link_free(l);
+                       }
+               }
+
+               if (block->is_superblock)
+                       ret = btrfsic_map_superblock(state, bytenr, len,
+                                                    bdev, &block_ctx);
+               else
+                       ret = btrfsic_map_block(state, bytenr, len,
+                                               &block_ctx, 0);
+               if (ret) {
+                       printk(KERN_INFO
+                              "btrfsic: btrfsic_map_block(root @%llu)"
+                              " failed!\n", (unsigned long long)bytenr);
+                       return;
+               }
+               block_ctx.data = mapped_data;
+               /* the following is required in case of writes to mirrors,
+                * use the same that was used for the lookup */
+               block_ctx.dev = dev_state;
+               block_ctx.dev_bytenr = dev_bytenr;
+
+               if (is_metadata || state->include_extent_data) {
+                       block->never_written = 0;
+                       block->iodone_w_error = 0;
+                       if (NULL != bio) {
+                               block->is_iodone = 0;
+                               BUG_ON(NULL == bio_is_patched);
+                               if (!*bio_is_patched) {
+                                       block->orig_bio_bh_private =
+                                           bio->bi_private;
+                                       block->orig_bio_bh_end_io.bio =
+                                           bio->bi_end_io;
+                                       block->next_in_same_bio = NULL;
+                                       bio->bi_private = block;
+                                       bio->bi_end_io = btrfsic_bio_end_io;
+                                       *bio_is_patched = 1;
+                               } else {
+                                       struct btrfsic_block *chained_block =
+                                           (struct btrfsic_block *)
+                                           bio->bi_private;
+
+                                       BUG_ON(NULL == chained_block);
+                                       block->orig_bio_bh_private =
+                                           chained_block->orig_bio_bh_private;
+                                       block->orig_bio_bh_end_io.bio =
+                                           chained_block->orig_bio_bh_end_io.
+                                           bio;
+                                       block->next_in_same_bio = chained_block;
+                                       bio->bi_private = block;
+                               }
+                       } else if (NULL != bh) {
+                               block->is_iodone = 0;
+                               block->orig_bio_bh_private = bh->b_private;
+                               block->orig_bio_bh_end_io.bh = bh->b_end_io;
+                               block->next_in_same_bio = NULL;
+                               bh->b_private = block;
+                               bh->b_end_io = btrfsic_bh_end_io;
+                       } else {
+                               block->is_iodone = 1;
+                               block->orig_bio_bh_private = NULL;
+                               block->orig_bio_bh_end_io.bio = NULL;
+                               block->next_in_same_bio = NULL;
+                       }
+               }
+
+               block->flush_gen = dev_state->last_flush_gen + 1;
+               block->submit_bio_bh_rw = submit_bio_bh_rw;
+               if (is_metadata) {
+                       block->logical_bytenr = bytenr;
+                       block->is_metadata = 1;
+                       if (block->is_superblock) {
+                               ret = btrfsic_process_written_superblock(
+                                               state,
+                                               block,
+                                               (struct btrfs_super_block *)
+                                               mapped_data);
+                               if (state->print_mask &
+                                   BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) {
+                                       printk(KERN_INFO
+                                       "[after new superblock is written]:\n");
+                                       btrfsic_dump_tree_sub(state, block, 0);
+                               }
+                       } else {
+                               block->mirror_num = 0;  /* unknown */
+                               ret = btrfsic_process_metablock(
+                                               state,
+                                               block,
+                                               &block_ctx,
+                                               (struct btrfs_header *)
+                                               block_ctx.data,
+                                               0, 0);
+                       }
+                       if (ret)
+                               printk(KERN_INFO
+                                      "btrfsic: btrfsic_process_metablock"
+                                      "(root @%llu) failed!\n",
+                                      (unsigned long long)dev_bytenr);
+               } else {
+                       block->is_metadata = 0;
+                       block->mirror_num = 0;  /* unknown */
+                       block->generation = BTRFSIC_GENERATION_UNKNOWN;
+                       if (!state->include_extent_data
+                           && list_empty(&block->ref_from_list)) {
+                               /*
+                                * disk block is overwritten with extent
+                                * data (not meta data) and we are configured
+                                * to not include extent data: take the
+                                * chance and free the block's memory
+                                */
+                               btrfsic_block_hashtable_remove(block);
+                               list_del(&block->all_blocks_node);
+                               btrfsic_block_free(block);
+                       }
+               }
+               btrfsic_release_block_ctx(&block_ctx);
+       } else {
+               /* block has not been found in hash table */
+               u64 bytenr;
+
+               if (!is_metadata) {
+                       if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                               printk(KERN_INFO "Written block (%s/%llu/?)"
+                                      " !found in hash table, D.\n",
+                                      dev_state->name,
+                                      (unsigned long long)dev_bytenr);
+                       if (!state->include_extent_data)
+                               return; /* ignore that written D block */
+
+                       /* this is getting ugly for the
+                        * include_extent_data case... */
+                       bytenr = 0;     /* unknown */
+                       block_ctx.start = bytenr;
+                       block_ctx.len = len;
+                       block_ctx.bh = NULL;
+               } else {
+                       bytenr = le64_to_cpu(((struct btrfs_header *)
+                                             mapped_data)->bytenr);
+                       btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state,
+                                                      dev_bytenr,
+                                                      mapped_data);
+                       if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                               printk(KERN_INFO
+                                      "Written block @%llu (%s/%llu/?)"
+                                      " !found in hash table, M.\n",
+                                      (unsigned long long)bytenr,
+                                      dev_state->name,
+                                      (unsigned long long)dev_bytenr);
+
+                       ret = btrfsic_map_block(state, bytenr, len, &block_ctx,
+                                               0);
+                       if (ret) {
+                               printk(KERN_INFO
+                                      "btrfsic: btrfsic_map_block(root @%llu)"
+                                      " failed!\n",
+                                      (unsigned long long)dev_bytenr);
+                               return;
+                       }
+               }
+               block_ctx.data = mapped_data;
+               /* the following is required in case of writes to mirrors,
+                * use the same that was used for the lookup */
+               block_ctx.dev = dev_state;
+               block_ctx.dev_bytenr = dev_bytenr;
+
+               block = btrfsic_block_alloc();
+               if (NULL == block) {
+                       printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+                       btrfsic_release_block_ctx(&block_ctx);
+                       return;
+               }
+               block->dev_state = dev_state;
+               block->dev_bytenr = dev_bytenr;
+               block->logical_bytenr = bytenr;
+               block->is_metadata = is_metadata;
+               block->never_written = 0;
+               block->iodone_w_error = 0;
+               block->mirror_num = 0;  /* unknown */
+               block->flush_gen = dev_state->last_flush_gen + 1;
+               block->submit_bio_bh_rw = submit_bio_bh_rw;
+               if (NULL != bio) {
+                       block->is_iodone = 0;
+                       BUG_ON(NULL == bio_is_patched);
+                       if (!*bio_is_patched) {
+                               block->orig_bio_bh_private = bio->bi_private;
+                               block->orig_bio_bh_end_io.bio = bio->bi_end_io;
+                               block->next_in_same_bio = NULL;
+                               bio->bi_private = block;
+                               bio->bi_end_io = btrfsic_bio_end_io;
+                               *bio_is_patched = 1;
+                       } else {
+                               struct btrfsic_block *chained_block =
+                                   (struct btrfsic_block *)
+                                   bio->bi_private;
+
+                               BUG_ON(NULL == chained_block);
+                               block->orig_bio_bh_private =
+                                   chained_block->orig_bio_bh_private;
+                               block->orig_bio_bh_end_io.bio =
+                                   chained_block->orig_bio_bh_end_io.bio;
+                               block->next_in_same_bio = chained_block;
+                               bio->bi_private = block;
+                       }
+               } else if (NULL != bh) {
+                       block->is_iodone = 0;
+                       block->orig_bio_bh_private = bh->b_private;
+                       block->orig_bio_bh_end_io.bh = bh->b_end_io;
+                       block->next_in_same_bio = NULL;
+                       bh->b_private = block;
+                       bh->b_end_io = btrfsic_bh_end_io;
+               } else {
+                       block->is_iodone = 1;
+                       block->orig_bio_bh_private = NULL;
+                       block->orig_bio_bh_end_io.bio = NULL;
+                       block->next_in_same_bio = NULL;
+               }
+               if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                       printk(KERN_INFO
+                              "New written %c-block @%llu (%s/%llu/%d)\n",
+                              is_metadata ? 'M' : 'D',
+                              (unsigned long long)block->logical_bytenr,
+                              block->dev_state->name,
+                              (unsigned long long)block->dev_bytenr,
+                              block->mirror_num);
+               list_add(&block->all_blocks_node, &state->all_blocks_list);
+               btrfsic_block_hashtable_add(block, &state->block_hashtable);
+
+               if (is_metadata) {
+                       ret = btrfsic_process_metablock(state, block,
+                                                       &block_ctx,
+                                                       (struct btrfs_header *)
+                                                       block_ctx.data, 0, 0);
+                       if (ret)
+                               printk(KERN_INFO
+                                      "btrfsic: process_metablock(root @%llu)"
+                                      " failed!\n",
+                                      (unsigned long long)dev_bytenr);
+               }
+               btrfsic_release_block_ctx(&block_ctx);
+       }
+}
+
+/*
+ * Completion callback that the integrity checker installs on write bios.
+ *
+ * bp->bi_private holds the head of a chain of btrfsic_block entries
+ * (linked through next_in_same_bio).  The bio's original bi_private and
+ * bi_end_io are restored from the head block, every block in the chain is
+ * marked as completed, and finally the original completion handler is
+ * called with the unchanged error status.
+ */
+static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status)
+{
+       struct btrfsic_block *cur = (struct btrfsic_block *)bp->bi_private;
+       const int failed = bio_error_status ? 1 : 0;
+
+       /*
+        * The mutex is not held here! This is not safe if IO has not yet
+        * completed on umount.
+        */
+       BUG_ON(NULL == cur);
+
+       /* undo the patching of the bio before walking the block chain */
+       bp->bi_private = cur->orig_bio_bh_private;
+       bp->bi_end_io = cur->orig_bio_bh_end_io.bio;
+
+       do {
+               struct btrfsic_dev_state *const ds = cur->dev_state;
+               struct btrfsic_block *successor;
+
+               if (ds->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH) {
+                       printk(KERN_INFO
+                              "bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n",
+                              bio_error_status,
+                              btrfsic_get_block_type(ds->state, cur),
+                              (unsigned long long)cur->logical_bytenr,
+                              ds->name,
+                              (unsigned long long)cur->dev_bytenr,
+                              cur->mirror_num);
+               }
+
+               /* pick up the successor before is_iodone is set below */
+               successor = cur->next_in_same_bio;
+               cur->iodone_w_error = failed;
+
+               if (cur->submit_bio_bh_rw & REQ_FLUSH) {
+                       ds->last_flush_gen += 1;
+                       if (ds->state->print_mask &
+                           BTRFSIC_PRINT_MASK_END_IO_BIO_BH) {
+                               printk(KERN_INFO
+                                      "bio_end_io() new %s flush_gen=%llu\n",
+                                      ds->name,
+                                      (unsigned long long)
+                                      ds->last_flush_gen);
+                       }
+               }
+
+               if (cur->submit_bio_bh_rw & REQ_FUA) {
+                       /* FUA completed means block is on disk */
+                       cur->flush_gen = 0;
+               }
+
+               /* for FLUSH, this releases the block */
+               cur->is_iodone = 1;
+               cur = successor;
+       } while (NULL != cur);
+
+       bp->bi_end_io(bp, bio_error_status);
+}
+
+/*
+ * Completion callback that the integrity checker installs on written
+ * buffer heads.
+ *
+ * bh->b_private holds the btrfsic_block that was attached when the write
+ * was submitted.  The block's completion state is updated, the buffer
+ * head's original b_private and b_end_io are restored, and the original
+ * handler is called with the unchanged uptodate value.
+ */
+static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate)
+{
+       struct btrfsic_block *blk = (struct btrfsic_block *)bh->b_private;
+       struct btrfsic_dev_state *ds;
+       int failed = !uptodate;
+
+       BUG_ON(NULL == blk);
+       ds = blk->dev_state;
+
+       if (ds->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH) {
+               printk(KERN_INFO
+                      "bh_end_io(error=%d) for %c @%llu (%s/%llu/%d)\n",
+                      failed,
+                      btrfsic_get_block_type(ds->state, blk),
+                      (unsigned long long)blk->logical_bytenr,
+                      blk->dev_state->name,
+                      (unsigned long long)blk->dev_bytenr,
+                      blk->mirror_num);
+       }
+
+       blk->iodone_w_error = failed;
+
+       if (blk->submit_bio_bh_rw & REQ_FLUSH) {
+               ds->last_flush_gen += 1;
+               if (ds->state->print_mask &
+                   BTRFSIC_PRINT_MASK_END_IO_BIO_BH) {
+                       printk(KERN_INFO
+                              "bh_end_io() new %s flush_gen=%llu\n",
+                              ds->name,
+                              (unsigned long long)ds->last_flush_gen);
+               }
+       }
+
+       if (blk->submit_bio_bh_rw & REQ_FUA) {
+               /* FUA completed means block is on disk */
+               blk->flush_gen = 0;
+       }
+
+       /* put back what was replaced when the write was intercepted */
+       bh->b_private = blk->orig_bio_bh_private;
+       bh->b_end_io = blk->orig_bio_bh_end_io.bh;
+
+       /* for FLUSH, this releases the block */
+       blk->is_iodone = 1;
+
+       bh->b_end_io(bh, uptodate);
+}
+
+/*
+ * Process the contents of a superblock write.
+ *
+ * The generation found in super_hdr is stored in the superblock's block
+ * record.  If that generation is larger than the largest one seen so far
+ * (or none has been seen yet), the block becomes state->latest_superblock.
+ * Then, for the root tree, the chunk tree and - if its bytenr is non-zero -
+ * the log tree, a link from the superblock to every mirror of the tree
+ * root block is looked up or added.  Finally the blocks referenced from
+ * the superblock are checked with btrfsic_check_all_ref_blocks().
+ *
+ * Returns 0 on success and -1 if a block could not be mapped or a block
+ * or link record could not be obtained.  A failed reference check only
+ * triggers a warning and a tree dump; 0 is still returned in that case.
+ */
+static int btrfsic_process_written_superblock(
+               struct btrfsic_state *state,
+               struct btrfsic_block *const superblock,
+               struct btrfs_super_block *const super_hdr)
+{
+       int pass;
+       int is_newest;
+
+       superblock->generation = btrfs_super_generation(super_hdr);
+       is_newest = (0 == state->max_superblock_generation) ||
+                   superblock->generation > state->max_superblock_generation;
+
+       if (is_newest) {
+               if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) {
+                       printk(KERN_INFO
+                              "btrfsic: got new superblock @%llu (%s/%llu/%d)"
+                              " with new gen %llu > %llu\n",
+                              (unsigned long long)superblock->logical_bytenr,
+                              superblock->dev_state->name,
+                              (unsigned long long)superblock->dev_bytenr,
+                              superblock->mirror_num,
+                              (unsigned long long)
+                              btrfs_super_generation(super_hdr),
+                              (unsigned long long)
+                              state->max_superblock_generation);
+               }
+
+               /* remember this one as the most recent superblock */
+               state->max_superblock_generation =
+                   btrfs_super_generation(super_hdr);
+               state->latest_superblock = superblock;
+       } else {
+               if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) {
+                       printk(KERN_INFO
+                              "btrfsic: superblock @%llu (%s/%llu/%d)"
+                              " with old gen %llu <= %llu\n",
+                              (unsigned long long)superblock->logical_bytenr,
+                              superblock->dev_state->name,
+                              (unsigned long long)superblock->dev_bytenr,
+                              superblock->mirror_num,
+                              (unsigned long long)
+                              btrfs_super_generation(super_hdr),
+                              (unsigned long long)
+                              state->max_superblock_generation);
+               }
+       }
+
+       /* pass 0: root tree, pass 1: chunk tree, pass 2: log tree */
+       for (pass = 0; pass < 3; pass++) {
+               struct btrfs_disk_key root_key;
+               struct btrfsic_block_data_ctx ctx;
+               const char *tree_name = NULL;
+               u64 tree_bytenr;
+               int num_copies;
+               int mirror_num;
+
+               root_key.type = BTRFS_ROOT_ITEM_KEY;
+               root_key.offset = 0;
+
+               if (0 == pass) {
+                       root_key.objectid =
+                           cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID);
+                       tree_name = "root ";
+                       tree_bytenr = btrfs_super_root(super_hdr);
+                       if (state->print_mask &
+                           BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) {
+                               printk(KERN_INFO "root@%llu\n",
+                                      (unsigned long long)tree_bytenr);
+                       }
+               } else if (1 == pass) {
+                       root_key.objectid =
+                           cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID);
+                       tree_name = "chunk ";
+                       tree_bytenr = btrfs_super_chunk_root(super_hdr);
+                       if (state->print_mask &
+                           BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) {
+                               printk(KERN_INFO "chunk@%llu\n",
+                                      (unsigned long long)tree_bytenr);
+                       }
+               } else {
+                       root_key.objectid =
+                           cpu_to_le64(BTRFS_TREE_LOG_OBJECTID);
+                       tree_name = "log ";
+                       tree_bytenr = btrfs_super_log_root(super_hdr);
+                       /* a zero bytenr means there is no log tree to link */
+                       if (0 == tree_bytenr)
+                               continue;
+                       if (state->print_mask &
+                           BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) {
+                               printk(KERN_INFO "log@%llu\n",
+                                      (unsigned long long)tree_bytenr);
+                       }
+               }
+
+               num_copies =
+                   btrfs_num_copies(&state->root->fs_info->mapping_tree,
+                                    tree_bytenr, PAGE_SIZE);
+               if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) {
+                       printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+                              (unsigned long long)tree_bytenr, num_copies);
+               }
+
+               /* link the superblock to each mirror of this tree root */
+               for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+                       struct btrfsic_block *tree_block;
+                       struct btrfsic_block_link *link;
+                       int was_created;
+                       int ret;
+
+                       if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) {
+                               printk(KERN_INFO
+                                      "btrfsic_process_written_superblock("
+                                      "mirror_num=%d)\n", mirror_num);
+                       }
+
+                       ret = btrfsic_map_block(state, tree_bytenr, PAGE_SIZE,
+                                               &ctx, mirror_num);
+                       if (ret) {
+                               printk(KERN_INFO
+                                      "btrfsic: btrfsic_map_block(@%llu,"
+                                      " mirror=%d) failed!\n",
+                                      (unsigned long long)tree_bytenr,
+                                      mirror_num);
+                               return -1;
+                       }
+
+                       tree_block = btrfsic_block_lookup_or_add(
+                                       state, &ctx, tree_name,
+                                       1, 0, 1,
+                                       mirror_num, &was_created);
+                       if (NULL == tree_block) {
+                               printk(KERN_INFO
+                                      "btrfsic: error, kmalloc failed!\n");
+                               btrfsic_release_block_ctx(&ctx);
+                               return -1;
+                       }
+
+                       tree_block->disk_key = root_key;
+                       if (was_created) {
+                               tree_block->generation =
+                                   BTRFSIC_GENERATION_UNKNOWN;
+                       }
+
+                       link = btrfsic_block_link_lookup_or_add(
+                                       state, &ctx, tree_block, superblock,
+                                       BTRFSIC_GENERATION_UNKNOWN);
+                       /* the mapping is released whether or not a link exists */
+                       btrfsic_release_block_ctx(&ctx);
+                       if (NULL == link)
+                               return -1;
+               }
+       }
+
+       if (-1 == btrfsic_check_all_ref_blocks(state, superblock, 0)) {
+               WARN_ON(1);
+               btrfsic_dump_tree(state);
+       }
+
+       return 0;
+}
+
+static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
+                                       struct btrfsic_block *const block,
+                                       int recursion_level)
+{
+       struct list_head *elem_ref_to;
+       int ret = 0;
+
+       if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
+               /*
+                * Note that this situation can happen and does not
+                * indicate an error in regular cases. It happens
+                * when disk blocks are freed and later reused.
+                * The check-integrity module is not aware of any
+                * block free operations, it just recognizes block
+                * write operations. Therefore it keeps the linkage
+                * information for a block until a block is
+                * rewritten. This can temporarily cause incorrect
+                * and even circular linkage information. This
+                * causes no harm unless such blocks are referenced
+                * by the most recent super block.
+                */
+               if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                       printk(KERN_INFO
+                              "btrfsic: abort cyclic linkage (case 1).\n");
+
+               return ret;
+       }
+
+       /*
+        * This algorithm is recursive because the amount of used stack
+        * space is very small and the max recursion depth is limited.
+        */
+       list_for_each(elem_ref_to, &block->ref_to_list) {
+               const struct btrfsic_block_link *const l =
+                   list_entry(elem_ref_to, struct btrfsic_block_link,
+                              node_ref_to);
+
+               if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                       printk(KERN_INFO
+                              "rl=%d, %c @%llu (%s/%llu/%d)"
+                              " %u* refers to %c @%llu (%s/%llu/%d)\n",
+                              recursion_level,
+                              btrfsic_get_block_type(state, block),
+                              (unsigned long long)block->logical_bytenr,
+                              block->dev_state->name,
+                              (unsigned long long)block->dev_bytenr,
+                              block->mirror_num,
+                              l->ref_cnt,
+                              btrfsic_get_block_type(state, l->block_ref_to),
+                              (unsigned long long)
+                              l->block_ref_to->logical_bytenr,
+                              l->block_ref_to->dev_state->name,
+                              (unsigned long long)l->block_ref_to->dev_bytenr,
+                              l->block_ref_to->mirror_num);
+               if (l->block_ref_to->never_written) {
+                       printk(KERN_INFO "btrfs: attempt to write superblock"
+                              " which references block %c @%llu (%s/%llu/%d)"
+                              " which is never written!\n",
+                              btrfsic_get_block_type(state, l->block_ref_to),
+                              (unsigned long long)
+                              l->block_ref_to->logical_bytenr,
+                              l->block_ref_to->dev_state->name,
+                              (unsigned long long)l->block_ref_to->dev_bytenr,
+                              l->block_ref_to->mirror_num);
+                       ret = -1;
+               } else if (!l->block_ref_to->is_iodone) {
+                       printk(KERN_INFO "btrfs: attempt to write superblock"
+                              " which references block %c @%llu (%s/%llu/%d)"
+                              " which is not yet iodone!\n",
+                              btrfsic_get_block_type(state, l->block_ref_to),
+                              (unsigned long long)
+                              l->block_ref_to->logical_bytenr,
+                              l->block_ref_to->dev_state->name,
+                              (unsigned long long)l->block_ref_to->dev_bytenr,
+                              l->block_ref_to->mirror_num);
+                       ret = -1;
+               } else if (l->parent_generation !=
+                          l->block_ref_to->generation &&
+                          BTRFSIC_GENERATION_UNKNOWN !=
+                          l->parent_generation &&
+                          BTRFSIC_GENERATION_UNKNOWN !=
+                          l->block_ref_to->generation) {
+                       printk(KERN_INFO "btrfs: attempt to write superblock"
+                              " which references block %c @%llu (%s/%llu/%d)"
+                              " with generation %llu !="
+                              " parent generation %llu!\n",
+                              btrfsic_get_block_type(state, l->block_ref_to),
+                              (unsigned long long)
+                              l->block_ref_to->logical_bytenr,
+                              l->block_ref_to->dev_state->name,
+                              (unsigned long long)l->block_ref_to->dev_bytenr,
+                              l->block_ref_to->mirror_num,
+                              (unsigned long long)l->block_ref_to->generation,
+                              (unsigned long long)l->parent_generation);
+                       ret = -1;
+               } else if (l->block_ref_to->flush_gen >
+                          l->block_ref_to->dev_state->last_flush_gen) {
+                       printk(KERN_INFO "btrfs: attempt to write superblock"
+                              " which references block %c @%llu (%s/%llu/%d)"
+                              " which is not flushed out of disk's write cache"
+                              " (block flush_gen=%llu,"
+                              " dev->flush_gen=%llu)!\n",
+                              btrfsic_get_block_type(state, l->block_ref_to),
+                              (unsigned long long)
+                              l->block_ref_to->logical_bytenr,
+                              l->block_ref_to->dev_state->name,
+                              (unsigned long long)l->block_ref_to->dev_bytenr,
+                              l->block_ref_to->mirror_num,
+                              (unsigned long long)block->flush_gen,
+                              (unsigned long long)
+                              l->block_ref_to->dev_state->last_flush_gen);
+                       ret = -1;
+               } else if (-1 == btrfsic_check_all_ref_blocks(state,
+                                                             l->block_ref_to,
+                                                             recursion_level +
+                                                             1)) {
+                       ret = -1;
+               }
+       }
+
+       return ret;
+}
+
+/*
+ * Follow the ref_from links of @block upwards and report whether one of
+ * the (direct or indirect) referrers is the latest super block, i.e. a
+ * super block with the same dev_bytenr and bdev as
+ * state->latest_superblock.
+ *
+ * Returns 1 if such a referrer is found, 0 otherwise. The walk is cut off
+ * (returning 0) once recursion_level reaches 3 + BTRFS_MAX_LEVEL.
+ */
+static int btrfsic_is_block_ref_by_superblock(
+               const struct btrfsic_state *state,
+               const struct btrfsic_block *block,
+               int recursion_level)
+{
+       const struct btrfsic_block_link *link;
+
+       if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
+               /* refer to comment at "abort cyclic linkage (case 1)" */
+               if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                       printk(KERN_INFO
+                              "btrfsic: abort cyclic linkage (case 2).\n");
+
+               return 0;
+       }
+
+       /*
+        * Recursion is acceptable here: each frame is small and the depth
+        * is bounded by the check above.
+        */
+       list_for_each_entry(link, &block->ref_from_list, node_ref_from) {
+               const struct btrfsic_block *const referrer =
+                   link->block_ref_from;
+
+               if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                       printk(KERN_INFO
+                              "rl=%d, %c @%llu (%s/%llu/%d)"
+                              " is ref %u* from %c @%llu (%s/%llu/%d)\n",
+                              recursion_level,
+                              btrfsic_get_block_type(state, block),
+                              (unsigned long long)block->logical_bytenr,
+                              block->dev_state->name,
+                              (unsigned long long)block->dev_bytenr,
+                              block->mirror_num,
+                              link->ref_cnt,
+                              btrfsic_get_block_type(state, referrer),
+                              (unsigned long long)referrer->logical_bytenr,
+                              referrer->dev_state->name,
+                              (unsigned long long)referrer->dev_bytenr,
+                              referrer->mirror_num);
+
+               /* direct hit: the referrer is the latest super block */
+               if (referrer->is_superblock &&
+                   state->latest_superblock->dev_bytenr ==
+                   referrer->dev_bytenr &&
+                   state->latest_superblock->dev_state->bdev ==
+                   referrer->dev_state->bdev)
+                       return 1;
+
+               /* indirect hit: some ancestor of the referrer is */
+               if (btrfsic_is_block_ref_by_superblock(state, referrer,
+                                                      recursion_level + 1))
+                       return 1;
+       }
+
+       return 0;
+}
+
+/*
+ * Log one line describing link @l being added: its reference count and
+ * both end points (type, logical bytenr, device name, device bytenr and
+ * mirror number).
+ */
+static void btrfsic_print_add_link(const struct btrfsic_state *state,
+                                  const struct btrfsic_block_link *l)
+{
+       const struct btrfsic_block *const src = l->block_ref_from;
+       const struct btrfsic_block *const dst = l->block_ref_to;
+
+       printk(KERN_INFO
+              "Add %u* link from %c @%llu (%s/%llu/%d)"
+              " to %c @%llu (%s/%llu/%d).\n",
+              l->ref_cnt,
+              btrfsic_get_block_type(state, src),
+              (unsigned long long)src->logical_bytenr,
+              src->dev_state->name,
+              (unsigned long long)src->dev_bytenr,
+              src->mirror_num,
+              btrfsic_get_block_type(state, dst),
+              (unsigned long long)dst->logical_bytenr,
+              dst->dev_state->name,
+              (unsigned long long)dst->dev_bytenr,
+              dst->mirror_num);
+}
+
+/*
+ * Log one line describing link @l being removed: its reference count and
+ * both end points (type, logical bytenr, device name, device bytenr and
+ * mirror number).
+ */
+static void btrfsic_print_rem_link(const struct btrfsic_state *state,
+                                  const struct btrfsic_block_link *l)
+{
+       const struct btrfsic_block *const src = l->block_ref_from;
+       const struct btrfsic_block *const dst = l->block_ref_to;
+
+       printk(KERN_INFO
+              "Rem %u* link from %c @%llu (%s/%llu/%d)"
+              " to %c @%llu (%s/%llu/%d).\n",
+              l->ref_cnt,
+              btrfsic_get_block_type(state, src),
+              (unsigned long long)src->logical_bytenr,
+              src->dev_state->name,
+              (unsigned long long)src->dev_bytenr,
+              src->mirror_num,
+              btrfsic_get_block_type(state, dst),
+              (unsigned long long)dst->logical_bytenr,
+              dst->dev_state->name,
+              (unsigned long long)dst->dev_bytenr,
+              dst->mirror_num);
+}
+
+/*
+ * Return the one-character tag used for @block in log output:
+ *   'S' - super block matching state->latest_superblock (same dev_bytenr
+ *         and same bdev)
+ *   's' - any other super block
+ *   'M' - metadata block
+ *   'D' - everything else
+ */
+static char btrfsic_get_block_type(const struct btrfsic_state *state,
+                                  const struct btrfsic_block *block)
+{
+       if (block->is_superblock) {
+               if (state->latest_superblock->dev_bytenr ==
+                   block->dev_bytenr &&
+                   state->latest_superblock->dev_state->bdev ==
+                   block->dev_state->bdev)
+                       return 'S';
+
+               return 's';
+       }
+
+       return block->is_metadata ? 'M' : 'D';
+}
+
+/* Print the block tree, starting at the latest super block, indent 0. */
+static void btrfsic_dump_tree(const struct btrfsic_state *state)
+{
+       const struct btrfsic_block *const top = state->latest_superblock;
+
+       btrfsic_dump_tree_sub(state, top, 0);
+}
+
+/*
+ * Print @block and, recursively, every block it refers to as an indented
+ * tree. @indent_level is the column at which @block starts.
+ *
+ * Output is cut short with "[...]" when a line would grow beyond
+ * BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL columns, and with " [...]" for
+ * blocks with mirror_num > 1 unless
+ * BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS is set in the print mask.
+ */
+static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
+                                 const struct btrfsic_block *block,
+                                 int indent_level)
+{
+       struct list_head *elem_ref_to;
+       int indent_add;
+       static char buf[80];
+       int cursor_position;
+
+       /*
+        * Should better fill an on-stack buffer with a complete line and
+        * dump it at once when it is time to print a newline character.
+        */
+
+       /*
+        * This algorithm is recursive because the amount of used stack space
+        * is very small and the max recursion depth is limited.
+        */
+
+       /*
+        * Bounded formatting: the widest possible expansion of this format
+        * does not fit into buf, so an unbounded sprintf() could overrun it.
+        * scnprintf() returns the number of characters actually stored,
+        * which is exactly what gets printed below.
+        */
+       indent_add = scnprintf(buf, sizeof(buf), "%c-%llu(%s/%llu/%d)",
+                              btrfsic_get_block_type(state, block),
+                              (unsigned long long)block->logical_bytenr,
+                              block->dev_state->name,
+                              (unsigned long long)block->dev_bytenr,
+                              block->mirror_num);
+       if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
+               printk("[...]\n");
+               return;
+       }
+       /* buf contains the device name: never use it as the format string */
+       printk("%s", buf);
+       indent_level += indent_add;
+       if (list_empty(&block->ref_to_list)) {
+               printk("\n");
+               return;
+       }
+       if (block->mirror_num > 1 &&
+           !(state->print_mask & BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS)) {
+               printk(" [...]\n");
+               return;
+       }
+
+       /* the first child continues on the current line, no padding needed */
+       cursor_position = indent_level;
+       list_for_each(elem_ref_to, &block->ref_to_list) {
+               const struct btrfsic_block_link *const l =
+                   list_entry(elem_ref_to, struct btrfsic_block_link,
+                              node_ref_to);
+
+               /* pad follow-up children to the column below the parent */
+               while (cursor_position < indent_level) {
+                       printk(" ");
+                       cursor_position++;
+               }
+               if (l->ref_cnt > 1)
+                       indent_add = scnprintf(buf, sizeof(buf), " %u*--> ",
+                                              l->ref_cnt);
+               else
+                       indent_add = scnprintf(buf, sizeof(buf), " --> ");
+               if (indent_level + indent_add >
+                   BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
+                       printk("[...]\n");
+                       cursor_position = 0;
+                       continue;
+               }
+
+               printk("%s", buf);
+
+               btrfsic_dump_tree_sub(state, l->block_ref_to,
+                                     indent_level + indent_add);
+               /* the recursive call ended the line */
+               cursor_position = 0;
+       }
+}
+
+/*
+ * Return the link from @from_block to @next_block. An existing link
+ * (looked up by the bdev/dev_bytenr of both ends) gets its ref_cnt
+ * incremented; otherwise a new link with ref_cnt 1 is allocated, put on
+ * the ref lists of both blocks and added to the link hashtable.
+ * In both cases parent_generation is set to @parent_generation.
+ *
+ * Returns NULL if a new link was needed and its allocation failed.
+ */
+static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add(
+               struct btrfsic_state *state,
+               struct btrfsic_block_data_ctx *next_block_ctx,
+               struct btrfsic_block *next_block,
+               struct btrfsic_block *from_block,
+               u64 parent_generation)
+{
+       struct btrfsic_block_link *link;
+
+       link = btrfsic_block_link_hashtable_lookup(
+                       next_block_ctx->dev->bdev,
+                       next_block_ctx->dev_bytenr,
+                       from_block->dev_state->bdev,
+                       from_block->dev_bytenr,
+                       &state->block_link_hashtable);
+       if (NULL != link) {
+               /* known link: one more reference, refresh the generation */
+               link->ref_cnt++;
+               link->parent_generation = parent_generation;
+               if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                       btrfsic_print_add_link(state, link);
+
+               return link;
+       }
+
+       link = btrfsic_block_link_alloc();
+       if (NULL == link) {
+               printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+               return NULL;
+       }
+
+       link->block_ref_to = next_block;
+       link->block_ref_from = from_block;
+       link->ref_cnt = 1;
+       link->parent_generation = parent_generation;
+
+       if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+               btrfsic_print_add_link(state, link);
+
+       list_add(&link->node_ref_to, &from_block->ref_to_list);
+       list_add(&link->node_ref_from, &next_block->ref_from_list);
+
+       btrfsic_block_link_hashtable_add(link, &state->block_link_hashtable);
+
+       return link;
+}
+
+/*
+ * Return the block located at block_ctx's bdev/dev_bytenr, creating and
+ * registering it (all_blocks_list and block hashtable) if it is not
+ * known yet. A newly created block is initialized from the is_metadata,
+ * is_iodone, never_written and mirror_num arguments.
+ *
+ * @additional_string is only used in the verbose "New ...-block" message.
+ * @was_created, if not NULL, is set to 1 when the block was created and
+ * to 0 when it already existed; it is left untouched on failure.
+ *
+ * Returns NULL if the block allocation or the dev_state lookup fails.
+ */
+static struct btrfsic_block *btrfsic_block_lookup_or_add(
+               struct btrfsic_state *state,
+               struct btrfsic_block_data_ctx *block_ctx,
+               const char *additional_string,
+               int is_metadata,
+               int is_iodone,
+               int never_written,
+               int mirror_num,
+               int *was_created)
+{
+       struct btrfsic_dev_state *dev_state;
+       struct btrfsic_block *blk;
+
+       blk = btrfsic_block_hashtable_lookup(block_ctx->dev->bdev,
+                                            block_ctx->dev_bytenr,
+                                            &state->block_hashtable);
+       if (NULL != blk) {
+               if (NULL != was_created)
+                       *was_created = 0;
+
+               return blk;
+       }
+
+       blk = btrfsic_block_alloc();
+       if (NULL == blk) {
+               printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+               return NULL;
+       }
+
+       dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev);
+       if (NULL == dev_state) {
+               printk(KERN_INFO
+                      "btrfsic: error, lookup dev_state failed!\n");
+               btrfsic_block_free(blk);
+               return NULL;
+       }
+
+       blk->dev_state = dev_state;
+       blk->dev_bytenr = block_ctx->dev_bytenr;
+       blk->logical_bytenr = block_ctx->start;
+       blk->is_metadata = is_metadata;
+       blk->is_iodone = is_iodone;
+       blk->never_written = never_written;
+       blk->mirror_num = mirror_num;
+
+       if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+               printk(KERN_INFO
+                      "New %s%c-block @%llu (%s/%llu/%d)\n",
+                      additional_string,
+                      btrfsic_get_block_type(state, blk),
+                      (unsigned long long)blk->logical_bytenr,
+                      dev_state->name,
+                      (unsigned long long)blk->dev_bytenr,
+                      mirror_num);
+
+       list_add(&blk->all_blocks_node, &state->all_blocks_list);
+       btrfsic_block_hashtable_add(blk, &state->block_hashtable);
+       if (NULL != was_created)
+               *was_created = 1;
+
+       return blk;
+}
+
+/*
+ * Verify that logical address @bytenr maps, for at least one mirror, to
+ * the device/physical location (@dev_state, @dev_bytenr) that is about to
+ * be written.
+ *
+ * If no mirror matches, the mismatch is logged together with every
+ * location the logical address does map to, followed by WARN_ON(1).
+ * @data is not used by this function.
+ */
+static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
+                                          u64 bytenr,
+                                          struct btrfsic_dev_state *dev_state,
+                                          u64 dev_bytenr, char *data)
+{
+       int num_copies;
+       int mirror_num;
+       int ret;
+       struct btrfsic_block_data_ctx block_ctx;
+       int match = 0;
+
+       num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree,
+                                     bytenr, PAGE_SIZE);
+
+       for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+               ret = btrfsic_map_block(state, bytenr, PAGE_SIZE,
+                                       &block_ctx, mirror_num);
+               if (ret) {
+                       printk(KERN_INFO "btrfsic:"
+                              " btrfsic_map_block(logical @%llu,"
+                              " mirror %d) failed!\n",
+                              (unsigned long long)bytenr, mirror_num);
+                       continue;
+               }
+
+               if (dev_state->bdev == block_ctx.dev->bdev &&
+                   dev_bytenr == block_ctx.dev_bytenr) {
+                       match++;
+                       btrfsic_release_block_ctx(&block_ctx);
+                       break;
+               }
+               btrfsic_release_block_ctx(&block_ctx);
+       }
+
+       if (match)
+               return;
+
+       printk(KERN_INFO "btrfs: attempt to write M-block which contains"
+              " logical bytenr that doesn't map to dev+physical"
+              " bytenr of submit_bio,"
+              " buffer->log_bytenr=%llu, submit_bio(bdev=%s,"
+              " phys_bytenr=%llu)!\n",
+              (unsigned long long)bytenr, dev_state->name,
+              (unsigned long long)dev_bytenr);
+       for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+               ret = btrfsic_map_block(state, bytenr, PAGE_SIZE,
+                                       &block_ctx, mirror_num);
+               if (ret)
+                       continue;
+
+               printk(KERN_INFO "Read logical bytenr @%llu maps to"
+                      " (%s/%llu/%d)\n",
+                      (unsigned long long)bytenr,
+                      block_ctx.dev->name,
+                      (unsigned long long)block_ctx.dev_bytenr,
+                      mirror_num);
+               /*
+                * Pair every successful map with a release, exactly as
+                * the comparison loop above does.
+                */
+               btrfsic_release_block_ctx(&block_ctx);
+       }
+       WARN_ON(1);
+}
+
+/*
+ * Look up the dev_state registered for @bdev in the global dev_state
+ * hashtable and return the lookup result unchanged.
+ */
+static struct btrfsic_dev_state *btrfsic_dev_state_lookup(
+               struct block_device *bdev)
+{
+       return btrfsic_dev_state_hashtable_lookup(
+                       bdev, &btrfsic_dev_state_hashtable);
+}
+
+/*
+ * Integrity-checking wrapper around submit_bh().
+ *
+ * For a write to a device known to the checker, the buffer is handed to
+ * btrfsic_process_written_block() first. For a FLUSH request without
+ * write payload, the per-device dummy block is armed and the buffer's
+ * b_private/b_end_io are redirected to it (btrfsic_bh_end_io); the
+ * original values are saved in the dummy block. In every case the
+ * request is finally passed to submit_bh() and its result is returned.
+ */
+int btrfsic_submit_bh(int rw, struct buffer_head *bh)
+{
+       struct btrfsic_dev_state *dev_state;
+
+       if (!btrfsic_is_initialized)
+               return submit_bh(rw, bh);
+
+       mutex_lock(&btrfsic_mutex);
+       /* since btrfsic_submit_bh() might also be called before
+        * btrfsic_mount(), this might return NULL */
+       dev_state = btrfsic_dev_state_lookup(bh->b_bdev);
+
+       /* Only called to write the superblock (incl. FLUSH/FUA) */
+       if (NULL != dev_state &&
+           (rw & WRITE) && bh->b_size > 0) {
+               u64 dev_bytenr;
+
+               /*
+                * Widen before multiplying: b_blocknr is a sector_t, which
+                * may be only 32 bits wide, and the product would then wrap
+                * before being stored in the u64.
+                */
+               dev_bytenr = 4096 * (u64)bh->b_blocknr;
+               if (dev_state->state->print_mask &
+                   BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+                       printk(KERN_INFO
+                              "submit_bh(rw=0x%x, blocknr=%llu (bytenr %llu),"
+                              " size=%zu, data=%p, bdev=%p)\n",
+                              rw, (unsigned long long)bh->b_blocknr,
+                              (unsigned long long)dev_bytenr, bh->b_size,
+                              bh->b_data, bh->b_bdev);
+               btrfsic_process_written_block(dev_state, dev_bytenr,
+                                             bh->b_data, bh->b_size, NULL,
+                                             NULL, bh, rw);
+       } else if (NULL != dev_state && (rw & REQ_FLUSH)) {
+               if (dev_state->state->print_mask &
+                   BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+                       printk(KERN_INFO
+                              "submit_bh(rw=0x%x) FLUSH, bdev=%p)\n",
+                              rw, bh->b_bdev);
+               if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
+                       /* a previous flush on this device is still pending */
+                       if ((dev_state->state->print_mask &
+                            (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
+                             BTRFSIC_PRINT_MASK_VERBOSE)))
+                               printk(KERN_INFO
+                                      "btrfsic_submit_bh(%s) with FLUSH"
+                                      " but dummy block already in use"
+                                      " (ignored)!\n",
+                                      dev_state->name);
+               } else {
+                       struct btrfsic_block *const block =
+                               &dev_state->dummy_block_for_bio_bh_flush;
+
+                       block->is_iodone = 0;
+                       block->never_written = 0;
+                       block->iodone_w_error = 0;
+                       block->flush_gen = dev_state->last_flush_gen + 1;
+                       block->submit_bio_bh_rw = rw;
+                       /* keep the original completion to restore it later */
+                       block->orig_bio_bh_private = bh->b_private;
+                       block->orig_bio_bh_end_io.bh = bh->b_end_io;
+                       block->next_in_same_bio = NULL;
+                       bh->b_private = block;
+                       bh->b_end_io = btrfsic_bh_end_io;
+               }
+       }
+       mutex_unlock(&btrfsic_mutex);
+       return submit_bh(rw, bh);
+}
+
+/*
+ * Integrity-checking wrapper around submit_bio().
+ *
+ * For a write to a device known to the checker, every segment of the bio
+ * is mapped and handed to btrfsic_process_written_block(). For a FLUSH
+ * request without write payload, the per-device dummy block is armed and
+ * the bio's bi_private/bi_end_io are redirected to it
+ * (btrfsic_bio_end_io); the original values are saved in the dummy block.
+ * In every case the bio is finally passed to submit_bio().
+ */
+void btrfsic_submit_bio(int rw, struct bio *bio)
+{
+       struct btrfsic_dev_state *dev_state;
+
+       if (!btrfsic_is_initialized) {
+               submit_bio(rw, bio);
+               return;
+       }
+
+       mutex_lock(&btrfsic_mutex);
+       /* since btrfsic_submit_bio() is also called before
+        * btrfsic_mount(), this might return NULL */
+       dev_state = btrfsic_dev_state_lookup(bio->bi_bdev);
+       if (NULL != dev_state &&
+           (rw & WRITE) && NULL != bio->bi_io_vec) {
+               unsigned int i;
+               u64 dev_bytenr;
+               int bio_is_patched;
+
+               /*
+                * Widen before multiplying: bi_sector is a sector_t, which
+                * may be only 32 bits wide, and the product would then wrap
+                * before being stored in the u64.
+                */
+               dev_bytenr = 512 * (u64)bio->bi_sector;
+               bio_is_patched = 0;
+               if (dev_state->state->print_mask &
+                   BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+                       printk(KERN_INFO
+                              "submit_bio(rw=0x%x, bi_vcnt=%u,"
+                              " bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n",
+                              rw, bio->bi_vcnt,
+                              (unsigned long long)bio->bi_sector,
+                              (unsigned long long)dev_bytenr,
+                              bio->bi_bdev);
+
+               for (i = 0; i < bio->bi_vcnt; i++) {
+                       u8 *mapped_data;
+
+                       mapped_data = kmap(bio->bi_io_vec[i].bv_page);
+                       /* per-segment details only if both bits are set */
+                       if ((BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
+                            BTRFSIC_PRINT_MASK_VERBOSE) ==
+                           (dev_state->state->print_mask &
+                            (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
+                             BTRFSIC_PRINT_MASK_VERBOSE)))
+                               printk(KERN_INFO
+                                      "#%u: page=%p, mapped=%p, len=%u,"
+                                      " offset=%u\n",
+                                      i, bio->bi_io_vec[i].bv_page,
+                                      mapped_data,
+                                      bio->bi_io_vec[i].bv_len,
+                                      bio->bi_io_vec[i].bv_offset);
+                       btrfsic_process_written_block(dev_state, dev_bytenr,
+                                                     mapped_data,
+                                                     bio->bi_io_vec[i].bv_len,
+                                                     bio, &bio_is_patched,
+                                                     NULL, rw);
+                       kunmap(bio->bi_io_vec[i].bv_page);
+                       /* the next segment follows directly on the device */
+                       dev_bytenr += bio->bi_io_vec[i].bv_len;
+               }
+       } else if (NULL != dev_state && (rw & REQ_FLUSH)) {
+               if (dev_state->state->print_mask &
+                   BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+                       printk(KERN_INFO
+                              "submit_bio(rw=0x%x) FLUSH, bdev=%p)\n",
+                              rw, bio->bi_bdev);
+               if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
+                       /* a previous flush on this device is still pending */
+                       if ((dev_state->state->print_mask &
+                            (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
+                             BTRFSIC_PRINT_MASK_VERBOSE)))
+                               printk(KERN_INFO
+                                      "btrfsic_submit_bio(%s) with FLUSH"
+                                      " but dummy block already in use"
+                                      " (ignored)!\n",
+                                      dev_state->name);
+               } else {
+                       struct btrfsic_block *const block =
+                               &dev_state->dummy_block_for_bio_bh_flush;
+
+                       block->is_iodone = 0;
+                       block->never_written = 0;
+                       block->iodone_w_error = 0;
+                       block->flush_gen = dev_state->last_flush_gen + 1;
+                       block->submit_bio_bh_rw = rw;
+                       /* keep the original completion to restore it later */
+                       block->orig_bio_bh_private = bio->bi_private;
+                       block->orig_bio_bh_end_io.bio = bio->bi_end_io;
+                       block->next_in_same_bio = NULL;
+                       bio->bi_private = block;
+                       bio->bi_end_io = btrfsic_bio_end_io;
+               }
+       }
+       mutex_unlock(&btrfsic_mutex);
+
+       submit_bio(rw, bio);
+}
+
+/*
+ * Set up integrity checking for a filesystem that is being mounted.
+ *
+ * Allocates the per-filesystem state, registers one dev_state per device
+ * of @fs_devices that has both a bdev and a name, and then processes the
+ * super block via btrfsic_process_superblock(). Depending on @print_mask
+ * the initial database and/or tree is dumped afterwards.
+ *
+ * Returns 0 on success, -1 if an allocation fails, or the non-zero
+ * result of btrfsic_process_superblock().
+ */
+int btrfsic_mount(struct btrfs_root *root,
+                 struct btrfs_fs_devices *fs_devices,
+                 int including_extent_data, u32 print_mask)
+{
+       int ret;
+       struct btrfsic_state *state;
+       struct list_head *dev_head = &fs_devices->devices;
+       struct btrfs_device *device;
+
+       state = kzalloc(sizeof(*state), GFP_NOFS);
+       if (NULL == state) {
+               printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n");
+               return -1;
+       }
+
+       /*
+        * NOTE(review): this one-time initialization is not serialized
+        * here; presumably callers guarantee that - confirm.
+        */
+       if (!btrfsic_is_initialized) {
+               mutex_init(&btrfsic_mutex);
+               btrfsic_dev_state_hashtable_init(&btrfsic_dev_state_hashtable);
+               btrfsic_is_initialized = 1;
+       }
+       mutex_lock(&btrfsic_mutex);
+       state->root = root;
+       state->print_mask = print_mask;
+       state->include_extent_data = including_extent_data;
+       state->csum_size = 0;
+       INIT_LIST_HEAD(&state->all_blocks_list);
+       btrfsic_block_hashtable_init(&state->block_hashtable);
+       btrfsic_block_link_hashtable_init(&state->block_link_hashtable);
+       state->max_superblock_generation = 0;
+       state->latest_superblock = NULL;
+
+       list_for_each_entry(device, dev_head, dev_list) {
+               struct btrfsic_dev_state *ds;
+               char *p;
+
+               if (!device->bdev || !device->name)
+                       continue;
+
+               ds = btrfsic_dev_state_alloc();
+               if (NULL == ds) {
+                       printk(KERN_INFO
+                              "btrfs check-integrity: kmalloc() failed!\n");
+                       goto fail_undo_dev_states;
+               }
+               ds->bdev = device->bdev;
+               ds->state = state;
+               bdevname(ds->bdev, ds->name);
+               ds->name[BDEVNAME_SIZE - 1] = '\0';
+               /*
+                * Keep only what follows the last '/'. Source and
+                * destination overlap, hence memmove().
+                */
+               p = strrchr(ds->name, '/');
+               if (NULL != p)
+                       memmove(ds->name, p + 1, strlen(p + 1) + 1);
+               btrfsic_dev_state_hashtable_add(ds,
+                                               &btrfsic_dev_state_hashtable);
+       }
+
+       ret = btrfsic_process_superblock(state, fs_devices);
+       if (0 != ret) {
+               mutex_unlock(&btrfsic_mutex);
+               btrfsic_unmount(root, fs_devices);
+               return ret;
+       }
+
+       if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_DATABASE)
+               btrfsic_dump_database(state);
+       if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_TREE)
+               btrfsic_dump_tree(state);
+
+       mutex_unlock(&btrfsic_mutex);
+       return 0;
+
+fail_undo_dev_states:
+       /*
+        * Drop the dev_states registered so far for this mount, so that
+        * none of them keeps pointing at the state freed below.
+        */
+       list_for_each_entry(device, dev_head, dev_list) {
+               struct btrfsic_dev_state *ds;
+
+               if (!device->bdev || !device->name)
+                       continue;
+
+               ds = btrfsic_dev_state_hashtable_lookup(
+                               device->bdev,
+                               &btrfsic_dev_state_hashtable);
+               if (NULL != ds && ds->state == state) {
+                       btrfsic_dev_state_hashtable_remove(ds);
+                       btrfsic_dev_state_free(ds);
+               }
+       }
+       mutex_unlock(&btrfsic_mutex);
+       kfree(state);
+       return -1;
+}
+
+/*
+ * Tear down integrity checking for a filesystem that is being unmounted.
+ *
+ * Removes and frees the dev_state of every device of @fs_devices, then
+ * frees all block links and all blocks recorded in the state, and finally
+ * the state itself. Blocks that are not yet iodone are reported and not
+ * freed. Does nothing if the checker was never initialized; logs an error
+ * if no dev_state (and therefore no state) is found for the devices.
+ * @root is not used by this function.
+ */
+void btrfsic_unmount(struct btrfs_root *root,
+                    struct btrfs_fs_devices *fs_devices)
+{
+       struct list_head *elem_all;
+       struct list_head *tmp_all;
+       struct btrfsic_state *state;
+       struct list_head *dev_head = &fs_devices->devices;
+       struct btrfs_device *device;
+
+       if (!btrfsic_is_initialized)
+               return;
+
+       mutex_lock(&btrfsic_mutex);
+
+       /* the state is only reachable through the dev_states */
+       state = NULL;
+       list_for_each_entry(device, dev_head, dev_list) {
+               struct btrfsic_dev_state *ds;
+
+               if (!device->bdev || !device->name)
+                       continue;
+
+               ds = btrfsic_dev_state_hashtable_lookup(
+                               device->bdev,
+                               &btrfsic_dev_state_hashtable);
+               if (NULL != ds) {
+                       state = ds->state;
+                       btrfsic_dev_state_hashtable_remove(ds);
+                       btrfsic_dev_state_free(ds);
+               }
+       }
+
+       if (NULL == state) {
+               printk(KERN_INFO
+                      "btrfsic: error, cannot find state information"
+                      " on umount!\n");
+               mutex_unlock(&btrfsic_mutex);
+               return;
+       }
+
+       /*
+        * Don't care about keeping the lists' state up to date,
+        * just free all memory that was allocated dynamically.
+        * Free the blocks and the block_links.
+        */
+       list_for_each_safe(elem_all, tmp_all, &state->all_blocks_list) {
+               struct btrfsic_block *const b_all =
+                   list_entry(elem_all, struct btrfsic_block,
+                              all_blocks_node);
+               struct list_head *elem_ref_to;
+               struct list_head *tmp_ref_to;
+
+               list_for_each_safe(elem_ref_to, tmp_ref_to,
+                                  &b_all->ref_to_list) {
+                       struct btrfsic_block_link *const l =
+                           list_entry(elem_ref_to,
+                                      struct btrfsic_block_link,
+                                      node_ref_to);
+
+                       if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                               btrfsic_print_rem_link(state, l);
+
+                       /*
+                        * Each link is on exactly one ref_to_list and is
+                        * visited once here, so free it whatever its
+                        * ref_cnt is; dropping just one reference would
+                        * leak every link with ref_cnt > 1.
+                        */
+                       btrfsic_block_link_free(l);
+               }
+
+               if (b_all->is_iodone)
+                       btrfsic_block_free(b_all);
+               else
+                       printk(KERN_INFO "btrfs: attempt to free %c-block"
+                              " @%llu (%s/%llu/%d) on umount which is"
+                              " not yet iodone!\n",
+                              btrfsic_get_block_type(state, b_all),
+                              (unsigned long long)b_all->logical_bytenr,
+                              b_all->dev_state->name,
+                              (unsigned long long)b_all->dev_bytenr,
+                              b_all->mirror_num);
+       }
+
+       mutex_unlock(&btrfsic_mutex);
+
+       kfree(state);
+}
diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h
new file mode 100644 (file)
index 0000000..8b59175
--- /dev/null
@@ -0,0 +1,36 @@
+/*
+ * Copyright (C) STRATO AG 2011.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#if !defined(__BTRFS_CHECK_INTEGRITY__)
+#define __BTRFS_CHECK_INTEGRITY__
+
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+int btrfsic_submit_bh(int rw, struct buffer_head *bh);
+void btrfsic_submit_bio(int rw, struct bio *bio);
+#else /* checker not configured: the wrappers are plain submit_bh/submit_bio */
+#define btrfsic_submit_bh submit_bh
+#define btrfsic_submit_bio submit_bio
+#endif /* CONFIG_BTRFS_FS_CHECK_INTEGRITY */
+
+int btrfsic_mount(struct btrfs_root *root,
+                 struct btrfs_fs_devices *fs_devices,
+                 int including_extent_data, u32 print_mask);
+void btrfsic_unmount(struct btrfs_root *root,
+                    struct btrfs_fs_devices *fs_devices);
+
+#endif /* __BTRFS_CHECK_INTEGRITY__ */
index dede441..0639a55 100644 (file)
@@ -240,7 +240,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 
        cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
                                     new_root_objectid, &disk_key, level,
-                                    buf->start, 0);
+                                    buf->start, 0, 1);
        if (IS_ERR(cow))
                return PTR_ERR(cow);
 
@@ -261,9 +261,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 
        WARN_ON(btrfs_header_generation(buf) > trans->transid);
        if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
-               ret = btrfs_inc_ref(trans, root, cow, 1);
+               ret = btrfs_inc_ref(trans, root, cow, 1, 1);
        else
-               ret = btrfs_inc_ref(trans, root, cow, 0);
+               ret = btrfs_inc_ref(trans, root, cow, 0, 1);
 
        if (ret)
                return ret;
@@ -350,14 +350,14 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
                if ((owner == root->root_key.objectid ||
                     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
                    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
-                       ret = btrfs_inc_ref(trans, root, buf, 1);
+                       ret = btrfs_inc_ref(trans, root, buf, 1, 1);
                        BUG_ON(ret);
 
                        if (root->root_key.objectid ==
                            BTRFS_TREE_RELOC_OBJECTID) {
-                               ret = btrfs_dec_ref(trans, root, buf, 0);
+                               ret = btrfs_dec_ref(trans, root, buf, 0, 1);
                                BUG_ON(ret);
-                               ret = btrfs_inc_ref(trans, root, cow, 1);
+                               ret = btrfs_inc_ref(trans, root, cow, 1, 1);
                                BUG_ON(ret);
                        }
                        new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
@@ -365,9 +365,9 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
 
                        if (root->root_key.objectid ==
                            BTRFS_TREE_RELOC_OBJECTID)
-                               ret = btrfs_inc_ref(trans, root, cow, 1);
+                               ret = btrfs_inc_ref(trans, root, cow, 1, 1);
                        else
-                               ret = btrfs_inc_ref(trans, root, cow, 0);
+                               ret = btrfs_inc_ref(trans, root, cow, 0, 1);
                        BUG_ON(ret);
                }
                if (new_flags != 0) {
@@ -381,11 +381,11 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
                if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
                        if (root->root_key.objectid ==
                            BTRFS_TREE_RELOC_OBJECTID)
-                               ret = btrfs_inc_ref(trans, root, cow, 1);
+                               ret = btrfs_inc_ref(trans, root, cow, 1, 1);
                        else
-                               ret = btrfs_inc_ref(trans, root, cow, 0);
+                               ret = btrfs_inc_ref(trans, root, cow, 0, 1);
                        BUG_ON(ret);
-                       ret = btrfs_dec_ref(trans, root, buf, 1);
+                       ret = btrfs_dec_ref(trans, root, buf, 1, 1);
                        BUG_ON(ret);
                }
                clean_tree_block(trans, root, buf);
@@ -446,7 +446,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 
        cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
                                     root->root_key.objectid, &disk_key,
-                                    level, search_start, empty_size);
+                                    level, search_start, empty_size, 1);
        if (IS_ERR(cow))
                return PTR_ERR(cow);
 
@@ -484,7 +484,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
                rcu_assign_pointer(root->node, cow);
 
                btrfs_free_tree_block(trans, root, buf, parent_start,
-                                     last_ref);
+                                     last_ref, 1);
                free_extent_buffer(buf);
                add_root_to_dirty_list(root);
        } else {
@@ -500,7 +500,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
                                              trans->transid);
                btrfs_mark_buffer_dirty(parent);
                btrfs_free_tree_block(trans, root, buf, parent_start,
-                                     last_ref);
+                                     last_ref, 1);
        }
        if (unlock_orig)
                btrfs_tree_unlock(buf);
@@ -957,7 +957,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                free_extent_buffer(mid);
 
                root_sub_used(root, mid->len);
-               btrfs_free_tree_block(trans, root, mid, 0, 1);
+               btrfs_free_tree_block(trans, root, mid, 0, 1, 0);
                /* once for the root ptr */
                free_extent_buffer(mid);
                return 0;
@@ -1015,7 +1015,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                        if (wret)
                                ret = wret;
                        root_sub_used(root, right->len);
-                       btrfs_free_tree_block(trans, root, right, 0, 1);
+                       btrfs_free_tree_block(trans, root, right, 0, 1, 0);
                        free_extent_buffer(right);
                        right = NULL;
                } else {
@@ -1055,7 +1055,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                if (wret)
                        ret = wret;
                root_sub_used(root, mid->len);
-               btrfs_free_tree_block(trans, root, mid, 0, 1);
+               btrfs_free_tree_block(trans, root, mid, 0, 1, 0);
                free_extent_buffer(mid);
                mid = NULL;
        } else {
@@ -2089,7 +2089,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
 
        c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
                                   root->root_key.objectid, &lower_key,
-                                  level, root->node->start, 0);
+                                  level, root->node->start, 0, 0);
        if (IS_ERR(c))
                return PTR_ERR(c);
 
@@ -2216,7 +2216,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
 
        split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
                                        root->root_key.objectid,
-                                       &disk_key, level, c->start, 0);
+                                       &disk_key, level, c->start, 0, 0);
        if (IS_ERR(split))
                return PTR_ERR(split);
 
@@ -2970,7 +2970,7 @@ again:
 
        right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
                                        root->root_key.objectid,
-                                       &disk_key, 0, l->start, 0);
+                                       &disk_key, 0, l->start, 0, 0);
        if (IS_ERR(right))
                return PTR_ERR(right);
 
@@ -3781,7 +3781,7 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
 
        root_sub_used(root, leaf->len);
 
-       btrfs_free_tree_block(trans, root, leaf, 0, 1);
+       btrfs_free_tree_block(trans, root, leaf, 0, 1, 0);
        return 0;
 }
 /*
index 6738503..27ebe61 100644 (file)
@@ -86,6 +86,9 @@ struct btrfs_ordered_sum;
 /* holds checksums of all the data extents */
 #define BTRFS_CSUM_TREE_OBJECTID 7ULL
 
+/* for storing balance parameters in the root tree */
+#define BTRFS_BALANCE_OBJECTID -4ULL
+
 /* orhpan objectid for tracking unlinked/truncated files */
 #define BTRFS_ORPHAN_OBJECTID -5ULL
 
@@ -692,6 +695,54 @@ struct btrfs_root_ref {
        __le16 name_len;
 } __attribute__ ((__packed__));
 
+struct btrfs_disk_balance_args {
+       /*
+        * profiles to operate on, single is denoted by
+        * BTRFS_AVAIL_ALLOC_BIT_SINGLE
+        */
+       __le64 profiles;
+
+       /* usage filter */
+       __le64 usage;
+
+       /* devid filter */
+       __le64 devid;
+
+       /* devid subset filter [pstart..pend) */
+       __le64 pstart;
+       __le64 pend;
+
+       /* btrfs virtual address space subset filter [vstart..vend) */
+       __le64 vstart;
+       __le64 vend;
+
+       /*
+        * profile to convert to, single is denoted by
+        * BTRFS_AVAIL_ALLOC_BIT_SINGLE
+        */
+       __le64 target;
+
+       /* BTRFS_BALANCE_ARGS_* */
+       __le64 flags;
+
+       __le64 unused[8];       /* zeroed by btrfs_cpu_balance_args_to_disk() */
+} __attribute__ ((__packed__));
+
+/*
+ * store balance parameters to disk so that balance can be properly
+ * resumed after crash or unmount
+ */
+struct btrfs_balance_item {
+       /* BTRFS_BALANCE_* */
+       __le64 flags;
+
+       struct btrfs_disk_balance_args data;
+       struct btrfs_disk_balance_args meta;
+       struct btrfs_disk_balance_args sys;
+
+       __le64 unused[4];
+} __attribute__ ((__packed__));
+
 #define BTRFS_FILE_EXTENT_INLINE 0
 #define BTRFS_FILE_EXTENT_REG 1
 #define BTRFS_FILE_EXTENT_PREALLOC 2
@@ -751,14 +802,32 @@ struct btrfs_csum_item {
 } __attribute__ ((__packed__));
 
 /* different types of block groups (and chunks) */
-#define BTRFS_BLOCK_GROUP_DATA     (1 << 0)
-#define BTRFS_BLOCK_GROUP_SYSTEM   (1 << 1)
-#define BTRFS_BLOCK_GROUP_METADATA (1 << 2)
-#define BTRFS_BLOCK_GROUP_RAID0    (1 << 3)
-#define BTRFS_BLOCK_GROUP_RAID1    (1 << 4)
-#define BTRFS_BLOCK_GROUP_DUP     (1 << 5)
-#define BTRFS_BLOCK_GROUP_RAID10   (1 << 6)
-#define BTRFS_NR_RAID_TYPES       5
+#define BTRFS_BLOCK_GROUP_DATA         (1ULL << 0)
+#define BTRFS_BLOCK_GROUP_SYSTEM       (1ULL << 1)
+#define BTRFS_BLOCK_GROUP_METADATA     (1ULL << 2)
+#define BTRFS_BLOCK_GROUP_RAID0                (1ULL << 3)
+#define BTRFS_BLOCK_GROUP_RAID1                (1ULL << 4)
+#define BTRFS_BLOCK_GROUP_DUP          (1ULL << 5)
+#define BTRFS_BLOCK_GROUP_RAID10       (1ULL << 6)
+#define BTRFS_BLOCK_GROUP_RESERVED     BTRFS_AVAIL_ALLOC_BIT_SINGLE
+#define BTRFS_NR_RAID_TYPES            5
+
+#define BTRFS_BLOCK_GROUP_TYPE_MASK    (BTRFS_BLOCK_GROUP_DATA |    \
+                                        BTRFS_BLOCK_GROUP_SYSTEM |  \
+                                        BTRFS_BLOCK_GROUP_METADATA)
+
+#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 |   \
+                                        BTRFS_BLOCK_GROUP_RAID1 |   \
+                                        BTRFS_BLOCK_GROUP_DUP |     \
+                                        BTRFS_BLOCK_GROUP_RAID10)
+/*
+ * We need a bit for restriper to be able to tell when chunks of type
+ * SINGLE are available.  This "extended" profile format is used in
+ * fs_info->avail_*_alloc_bits (in-memory) and balance item fields
+ * (on-disk).  The corresponding on-disk bit in chunk.type is reserved
+ * to avoid remappings between two formats in future.
+ */
+#define BTRFS_AVAIL_ALLOC_BIT_SINGLE   (1ULL << 48)
 
 struct btrfs_block_group_item {
        __le64 used;
@@ -916,6 +985,7 @@ struct btrfs_block_group_cache {
 struct reloc_control;
 struct btrfs_device;
 struct btrfs_fs_devices;
+struct btrfs_balance_control;
 struct btrfs_delayed_root;
 struct btrfs_fs_info {
        u8 fsid[BTRFS_FSID_SIZE];
@@ -971,7 +1041,7 @@ struct btrfs_fs_info {
         * is required instead of the faster short fsync log commits
         */
        u64 last_trans_log_full_commit;
-       unsigned long mount_opt:20;
+       unsigned long mount_opt:22;     /* holds BTRFS_MOUNT_* bits 0..21 */
        unsigned long compress_type:4;
        u64 max_inline;
        u64 alloc_start;
@@ -1132,12 +1202,23 @@ struct btrfs_fs_info {
        spinlock_t ref_cache_lock;
        u64 total_ref_cache_size;
 
+       /*
+        * these three are in extended format (availability of single
+        * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
+        * types are denoted by corresponding BTRFS_BLOCK_GROUP_* bits)
+        */
        u64 avail_data_alloc_bits;
        u64 avail_metadata_alloc_bits;
        u64 avail_system_alloc_bits;
-       u64 data_alloc_profile;
-       u64 metadata_alloc_profile;
-       u64 system_alloc_profile;
+
+       /* restriper state */
+       spinlock_t balance_lock;
+       struct mutex balance_mutex;
+       atomic_t balance_running;
+       atomic_t balance_pause_req;
+       atomic_t balance_cancel_req;
+       struct btrfs_balance_control *balance_ctl;
+       wait_queue_head_t balance_wait_q;
 
        unsigned data_chunk_allocations;
        unsigned metadata_ratio;
@@ -1155,6 +1236,10 @@ struct btrfs_fs_info {
        int scrub_workers_refcnt;
        struct btrfs_workers scrub_workers;
 
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+       u32 check_integrity_print_mask;
+#endif
+
        /* filesystem state */
        u64 fs_state;
 
@@ -1383,6 +1468,8 @@ struct btrfs_ioctl_defrag_range_args {
 #define BTRFS_DEV_ITEM_KEY     216
 #define BTRFS_CHUNK_ITEM_KEY   228
 
+#define BTRFS_BALANCE_ITEM_KEY 248
+
 /*
  * string items are for debugging.  They just store a short string of
  * data in the FS
@@ -1413,6 +1500,9 @@ struct btrfs_ioctl_defrag_range_args {
 #define BTRFS_MOUNT_AUTO_DEFRAG                (1 << 16)
 #define BTRFS_MOUNT_INODE_MAP_CACHE    (1 << 17)
 #define BTRFS_MOUNT_RECOVERY           (1 << 18)
+#define BTRFS_MOUNT_SKIP_BALANCE       (1 << 19)
+#define BTRFS_MOUNT_CHECK_INTEGRITY    (1 << 20)
+#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
 
 #define btrfs_clear_opt(o, opt)                ((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)          ((o) |= BTRFS_MOUNT_##opt)
@@ -2077,8 +2167,86 @@ BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
 BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
                   num_devices, 64);
 
-/* struct btrfs_super_block */
+/* struct btrfs_balance_item */
+BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64);
 
+static inline void btrfs_balance_data(struct extent_buffer *eb,
+                                     struct btrfs_balance_item *bi,
+                                     struct btrfs_disk_balance_args *ba)
+{
+       read_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
+}
+
+static inline void btrfs_set_balance_data(struct extent_buffer *eb,
+                                         struct btrfs_balance_item *bi,
+                                         struct btrfs_disk_balance_args *ba)
+{
+       write_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
+}
+
+static inline void btrfs_balance_meta(struct extent_buffer *eb,
+                                     struct btrfs_balance_item *bi,
+                                     struct btrfs_disk_balance_args *ba)
+{
+       read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
+}
+
+static inline void btrfs_set_balance_meta(struct extent_buffer *eb,
+                                         struct btrfs_balance_item *bi,
+                                         struct btrfs_disk_balance_args *ba)
+{
+       write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
+}
+
+static inline void btrfs_balance_sys(struct extent_buffer *eb,
+                                    struct btrfs_balance_item *bi,
+                                    struct btrfs_disk_balance_args *ba)
+{
+       read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
+}
+
+static inline void btrfs_set_balance_sys(struct extent_buffer *eb,
+                                        struct btrfs_balance_item *bi,
+                                        struct btrfs_disk_balance_args *ba)
+{
+       write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
+}
+
+static inline void
+btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
+                              struct btrfs_disk_balance_args *disk)
+{
+       memset(cpu, 0, sizeof(*cpu));
+       /* convert each little-endian on-disk field to CPU byte order */
+       cpu->profiles = le64_to_cpu(disk->profiles);
+       cpu->usage = le64_to_cpu(disk->usage);
+       cpu->devid = le64_to_cpu(disk->devid);
+       cpu->pstart = le64_to_cpu(disk->pstart);
+       cpu->pend = le64_to_cpu(disk->pend);
+       cpu->vstart = le64_to_cpu(disk->vstart);
+       cpu->vend = le64_to_cpu(disk->vend);
+       cpu->target = le64_to_cpu(disk->target);
+       cpu->flags = le64_to_cpu(disk->flags);
+}
+
+static inline void
+btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk,
+                              struct btrfs_balance_args *cpu)
+{
+       memset(disk, 0, sizeof(*disk));
+       /* disk->unused[] is not assigned below, so it keeps the zero fill */
+       disk->profiles = cpu_to_le64(cpu->profiles);
+       disk->usage = cpu_to_le64(cpu->usage);
+       disk->devid = cpu_to_le64(cpu->devid);
+       disk->pstart = cpu_to_le64(cpu->pstart);
+       disk->pend = cpu_to_le64(cpu->pend);
+       disk->vstart = cpu_to_le64(cpu->vstart);
+       disk->vend = cpu_to_le64(cpu->vend);
+       disk->target = cpu_to_le64(cpu->target);
+       disk->flags = cpu_to_le64(cpu->flags);
+}
+
+/* struct btrfs_super_block */
 BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
 BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
 BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
@@ -2196,7 +2364,7 @@ static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
        return btrfs_item_size(eb, e) - offset;
 }
 
-static inline struct btrfs_root *btrfs_sb(struct super_block *sb)
+static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
 {
        return sb->s_fs_info;
 }
@@ -2277,11 +2445,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
                                        struct btrfs_root *root, u32 blocksize,
                                        u64 parent, u64 root_objectid,
                                        struct btrfs_disk_key *key, int level,
-                                       u64 hint, u64 empty_size);
+                                       u64 hint, u64 empty_size, int for_cow);
 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root,
                           struct extent_buffer *buf,
-                          u64 parent, int last_ref);
+                          u64 parent, int last_ref, int for_cow);
 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
                                            struct btrfs_root *root,
                                            u64 bytenr, u32 blocksize,
@@ -2301,17 +2469,17 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
                                  u64 search_end, struct btrfs_key *ins,
                                  u64 data);
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-                 struct extent_buffer *buf, int full_backref);
+                 struct extent_buffer *buf, int full_backref, int for_cow);
 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-                 struct extent_buffer *buf, int full_backref);
+                 struct extent_buffer *buf, int full_backref, int for_cow);
 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
                                u64 bytenr, u64 num_bytes, u64 flags,
                                int is_data);
 int btrfs_free_extent(struct btrfs_trans_handle *trans,
                      struct btrfs_root *root,
-                     u64 bytenr, u64 num_bytes, u64 parent,
-                     u64 root_objectid, u64 owner, u64 offset);
+                     u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
+                     u64 owner, u64 offset, int for_cow);
 
 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
 int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
@@ -2323,7 +2491,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
                         struct btrfs_root *root,
                         u64 bytenr, u64 num_bytes, u64 parent,
-                        u64 root_objectid, u64 owner, u64 offset);
+                        u64 root_objectid, u64 owner, u64 offset, int for_cow);
 
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                                    struct btrfs_root *root);
@@ -2482,10 +2650,18 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
 }
 
 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
+static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p)
+{
+       /* bump the slot; move to the next leaf when past the last item */
+       if (++p->slots[0] < btrfs_header_nritems(p->nodes[0]))
+               return 0;
+       return btrfs_next_leaf(root, p);
+}
 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
 int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
 void btrfs_drop_snapshot(struct btrfs_root *root,
-                        struct btrfs_block_rsv *block_rsv, int update_ref);
+                        struct btrfs_block_rsv *block_rsv, int update_ref,
+                        int for_reloc);
 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
                        struct btrfs_root *root,
                        struct extent_buffer *node,
@@ -2500,6 +2676,7 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
 }
 static inline void free_fs_info(struct btrfs_fs_info *fs_info)
 {
+       kfree(fs_info->balance_ctl);
        kfree(fs_info->delayed_root);
        kfree(fs_info->extent_root);
        kfree(fs_info->tree_root);
@@ -2510,6 +2687,24 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info)
        kfree(fs_info->super_for_commit);
        kfree(fs_info);
 }
+/**
+ * profile_is_valid - check that @flags names at most one known profile
+ * @flags: profile to validate; DATA/SYSTEM/METADATA type bits are ignored
+ * @extended: if true, BTRFS_AVAIL_ALLOC_BIT_SINGLE is accepted as well
+ */
+static inline int profile_is_valid(u64 flags, int extended)
+{
+       u64 allowed = BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+       if (extended)
+               allowed |= BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+       flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
+
+       if (flags & ~allowed)
+               return 0;
+       /* flags & -flags is the lowest set bit: equal iff at most one is set */
+       return (flags & (~flags + 1)) == flags;
+}
 
 /* root-item.c */
 int btrfs_find_root_ref(struct btrfs_root *tree_root,
index 9c1eccc..fe4cd0f 100644 (file)
@@ -595,8 +595,12 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
 
        num_bytes = btrfs_calc_trans_metadata_size(root, 1);
        ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
-       if (!ret)
+       if (!ret) {
+               trace_btrfs_space_reservation(root->fs_info, "delayed_item",
+                                             item->key.objectid,
+                                             num_bytes, 1);
                item->bytes_reserved = num_bytes;
+       }
 
        return ret;
 }
@@ -610,6 +614,9 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
                return;
 
        rsv = &root->fs_info->delayed_block_rsv;
+       trace_btrfs_space_reservation(root->fs_info, "delayed_item",
+                                     item->key.objectid, item->bytes_reserved,
+                                     0);
        btrfs_block_rsv_release(root, rsv,
                                item->bytes_reserved);
 }
@@ -624,7 +631,7 @@ static int btrfs_delayed_inode_reserve_metadata(
        struct btrfs_block_rsv *dst_rsv;
        u64 num_bytes;
        int ret;
-       int release = false;
+       bool release = false;
 
        src_rsv = trans->block_rsv;
        dst_rsv = &root->fs_info->delayed_block_rsv;
@@ -651,8 +658,13 @@ static int btrfs_delayed_inode_reserve_metadata(
                 */
                if (ret == -EAGAIN)
                        ret = -ENOSPC;
-               if (!ret)
+               if (!ret) {
                        node->bytes_reserved = num_bytes;
+                       trace_btrfs_space_reservation(root->fs_info,
+                                                     "delayed_inode",
+                                                     btrfs_ino(inode),
+                                                     num_bytes, 1);
+               }
                return ret;
        } else if (src_rsv == &root->fs_info->delalloc_block_rsv) {
                spin_lock(&BTRFS_I(inode)->lock);
@@ -707,11 +719,17 @@ out:
         * reservation here.  I think it may be time for a documentation page on
         * how block rsvs. work.
         */
-       if (!ret)
+       if (!ret) {
+               trace_btrfs_space_reservation(root->fs_info, "delayed_inode",
+                                             btrfs_ino(inode), num_bytes, 1);
                node->bytes_reserved = num_bytes;
+       }
 
-       if (release)
+       if (release) {
+               trace_btrfs_space_reservation(root->fs_info, "delalloc",
+                                             btrfs_ino(inode), num_bytes, 0);
                btrfs_block_rsv_release(root, src_rsv, num_bytes);
+       }
 
        return ret;
 }
@@ -725,6 +743,8 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root,
                return;
 
        rsv = &root->fs_info->delayed_block_rsv;
+       trace_btrfs_space_reservation(root->fs_info, "delayed_inode",
+                                     node->inode_id, node->bytes_reserved, 0);
        btrfs_block_rsv_release(root, rsv,
                                node->bytes_reserved);
        node->bytes_reserved = 0;
@@ -1372,13 +1392,6 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
                goto release_node;
        }
 
-       ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item);
-       /*
-        * we have reserved enough space when we start a new transaction,
-        * so reserving metadata failure is impossible
-        */
-       BUG_ON(ret);
-
        delayed_item->key.objectid = btrfs_ino(dir);
        btrfs_set_key_type(&delayed_item->key, BTRFS_DIR_INDEX_KEY);
        delayed_item->key.offset = index;
@@ -1391,6 +1404,14 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
        dir_item->type = type;
        memcpy((char *)(dir_item + 1), name, name_len);
 
+       ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item);
+       /*
+        * we have reserved enough space when we start a new transaction,
+        * so reserving metadata failure is impossible
+        */
+       BUG_ON(ret);
+
+
        mutex_lock(&delayed_node->mutex);
        ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item);
        if (unlikely(ret)) {
index 125cf76..66e4f29 100644 (file)
@@ -101,6 +101,11 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref2,
                return -1;
        if (ref1->type > ref2->type)
                return 1;
+       /* merging of sequenced refs is not allowed */
+       if (ref1->seq < ref2->seq)
+               return -1;
+       if (ref1->seq > ref2->seq)
+               return 1;
        if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
            ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) {
                return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2),
@@ -150,16 +155,22 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
 
 /*
  * find an head entry based on bytenr. This returns the delayed ref
- * head if it was able to find one, or NULL if nothing was in that spot
+ * head if it was able to find one, or NULL if nothing was in that spot.
+ * If return_bigger is given, the next bigger entry is returned if no exact
+ * match is found.
  */
 static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root,
                                  u64 bytenr,
-                                 struct btrfs_delayed_ref_node **last)
+                                 struct btrfs_delayed_ref_node **last,
+                                 int return_bigger)
 {
-       struct rb_node *n = root->rb_node;
+       struct rb_node *n;
        struct btrfs_delayed_ref_node *entry;
-       int cmp;
+       int cmp = 0;
 
+again:
+       n = root->rb_node;
+       entry = NULL;
        while (n) {
                entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
                WARN_ON(!entry->in_tree);
@@ -182,6 +193,19 @@ static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root,
                else
                        return entry;
        }
+       if (entry && return_bigger) {
+               if (cmp > 0) {
+                       n = rb_next(&entry->rb_node);
+                       if (!n)
+                               n = rb_first(root);
+                       entry = rb_entry(n, struct btrfs_delayed_ref_node,
+                                        rb_node);
+                       bytenr = entry->bytenr;
+                       return_bigger = 0;
+                       goto again;
+               }
+               return entry;
+       }
        return NULL;
 }
 
@@ -209,6 +233,24 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
        return 0;
 }
 
+int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
+                           u64 seq)
+{
+       struct seq_list *low;
+
+       assert_spin_locked(&delayed_refs->lock);
+       /* empty seq_head list: there is no sequence number to compare with */
+       if (list_empty(&delayed_refs->seq_head))
+               return 0;
+       low = list_first_entry(&delayed_refs->seq_head, struct seq_list, list);
+       if (seq < low->seq)
+               return 0;
+
+       pr_debug("holding back delayed_ref %llu, lowest is %llu (%p)\n",
+                seq, low->seq, delayed_refs);
+       return 1;
+}
+
 int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
                           struct list_head *cluster, u64 start)
 {
@@ -223,20 +265,8 @@ int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
                node = rb_first(&delayed_refs->root);
        } else {
                ref = NULL;
-               find_ref_head(&delayed_refs->root, start, &ref);
+               find_ref_head(&delayed_refs->root, start + 1, &ref, 1);
                if (ref) {
-                       struct btrfs_delayed_ref_node *tmp;
-
-                       node = rb_prev(&ref->rb_node);
-                       while (node) {
-                               tmp = rb_entry(node,
-                                              struct btrfs_delayed_ref_node,
-                                              rb_node);
-                               if (tmp->bytenr < start)
-                                       break;
-                               ref = tmp;
-                               node = rb_prev(&ref->rb_node);
-                       }
                        node = &ref->rb_node;
                } else
                        node = rb_first(&delayed_refs->root);
@@ -390,7 +420,8 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
  * this does all the dirty work in terms of maintaining the correct
  * overall modification count.
  */
-static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
+static noinline int add_delayed_ref_head(struct btrfs_fs_info *fs_info,
+                                       struct btrfs_trans_handle *trans,
                                        struct btrfs_delayed_ref_node *ref,
                                        u64 bytenr, u64 num_bytes,
                                        int action, int is_data)
@@ -437,6 +468,7 @@ static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
        ref->action  = 0;
        ref->is_head = 1;
        ref->in_tree = 1;
+       ref->seq = 0;
 
        head_ref = btrfs_delayed_node_to_head(ref);
        head_ref->must_insert_reserved = must_insert_reserved;
@@ -468,14 +500,17 @@ static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
 /*
  * helper to insert a delayed tree ref into the rbtree.
  */
-static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
+static noinline int add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
+                                        struct btrfs_trans_handle *trans,
                                         struct btrfs_delayed_ref_node *ref,
                                         u64 bytenr, u64 num_bytes, u64 parent,
-                                        u64 ref_root, int level, int action)
+                                        u64 ref_root, int level, int action,
+                                        int for_cow)
 {
        struct btrfs_delayed_ref_node *existing;
        struct btrfs_delayed_tree_ref *full_ref;
        struct btrfs_delayed_ref_root *delayed_refs;
+       u64 seq = 0;
 
        if (action == BTRFS_ADD_DELAYED_EXTENT)
                action = BTRFS_ADD_DELAYED_REF;
@@ -491,14 +526,17 @@ static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
        ref->is_head = 0;
        ref->in_tree = 1;
 
+       if (need_ref_seq(for_cow, ref_root))
+               seq = inc_delayed_seq(delayed_refs);
+       ref->seq = seq;
+
        full_ref = btrfs_delayed_node_to_tree_ref(ref);
-       if (parent) {
-               full_ref->parent = parent;
+       full_ref->parent = parent;
+       full_ref->root = ref_root;
+       if (parent)
                ref->type = BTRFS_SHARED_BLOCK_REF_KEY;
-       } else {
-               full_ref->root = ref_root;
+       else
                ref->type = BTRFS_TREE_BLOCK_REF_KEY;
-       }
        full_ref->level = level;
 
        trace_btrfs_delayed_tree_ref(ref, full_ref, action);
@@ -522,15 +560,17 @@ static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
 /*
  * helper to insert a delayed data ref into the rbtree.
  */
-static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
+static noinline int add_delayed_data_ref(struct btrfs_fs_info *fs_info,
+                                        struct btrfs_trans_handle *trans,
                                         struct btrfs_delayed_ref_node *ref,
                                         u64 bytenr, u64 num_bytes, u64 parent,
                                         u64 ref_root, u64 owner, u64 offset,
-                                        int action)
+                                        int action, int for_cow)
 {
        struct btrfs_delayed_ref_node *existing;
        struct btrfs_delayed_data_ref *full_ref;
        struct btrfs_delayed_ref_root *delayed_refs;
+       u64 seq = 0;
 
        if (action == BTRFS_ADD_DELAYED_EXTENT)
                action = BTRFS_ADD_DELAYED_REF;
@@ -546,14 +586,18 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
        ref->is_head = 0;
        ref->in_tree = 1;
 
+       if (need_ref_seq(for_cow, ref_root))
+               seq = inc_delayed_seq(delayed_refs);
+       ref->seq = seq;
+
        full_ref = btrfs_delayed_node_to_data_ref(ref);
-       if (parent) {
-               full_ref->parent = parent;
+       full_ref->parent = parent;
+       full_ref->root = ref_root;
+       if (parent)
                ref->type = BTRFS_SHARED_DATA_REF_KEY;
-       } else {
-               full_ref->root = ref_root;
+       else
                ref->type = BTRFS_EXTENT_DATA_REF_KEY;
-       }
+
        full_ref->objectid = owner;
        full_ref->offset = offset;
 
@@ -580,10 +624,12 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
  * to make sure the delayed ref is eventually processed before this
  * transaction commits.
  */
-int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
+int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
+                              struct btrfs_trans_handle *trans,
                               u64 bytenr, u64 num_bytes, u64 parent,
                               u64 ref_root,  int level, int action,
-                              struct btrfs_delayed_extent_op *extent_op)
+                              struct btrfs_delayed_extent_op *extent_op,
+                              int for_cow)
 {
        struct btrfs_delayed_tree_ref *ref;
        struct btrfs_delayed_ref_head *head_ref;
@@ -610,13 +656,17 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
         * insert both the head node and the new ref without dropping
         * the spin lock
         */
-       ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes,
-                                  action, 0);
+       ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
+                                  num_bytes, action, 0);
        BUG_ON(ret);
 
-       ret = add_delayed_tree_ref(trans, &ref->node, bytenr, num_bytes,
-                                  parent, ref_root, level, action);
+       ret = add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr,
+                                  num_bytes, parent, ref_root, level, action,
+                                  for_cow);
        BUG_ON(ret);
+       if (!need_ref_seq(for_cow, ref_root) &&
+           waitqueue_active(&delayed_refs->seq_wait))
+               wake_up(&delayed_refs->seq_wait);
        spin_unlock(&delayed_refs->lock);
        return 0;
 }
@@ -624,11 +674,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
 /*
  * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref.
  */
-int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
+int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
+                              struct btrfs_trans_handle *trans,
                               u64 bytenr, u64 num_bytes,
                               u64 parent, u64 ref_root,
                               u64 owner, u64 offset, int action,
-                              struct btrfs_delayed_extent_op *extent_op)
+                              struct btrfs_delayed_extent_op *extent_op,
+                              int for_cow)
 {
        struct btrfs_delayed_data_ref *ref;
        struct btrfs_delayed_ref_head *head_ref;
@@ -655,18 +707,23 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
         * insert both the head node and the new ref without dropping
         * the spin lock
         */
-       ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes,
-                                  action, 1);
+       ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
+                                  num_bytes, action, 1);
        BUG_ON(ret);
 
-       ret = add_delayed_data_ref(trans, &ref->node, bytenr, num_bytes,
-                                  parent, ref_root, owner, offset, action);
+       ret = add_delayed_data_ref(fs_info, trans, &ref->node, bytenr,
+                                  num_bytes, parent, ref_root, owner, offset,
+                                  action, for_cow);
        BUG_ON(ret);
+       if (!need_ref_seq(for_cow, ref_root) &&
+           waitqueue_active(&delayed_refs->seq_wait))
+               wake_up(&delayed_refs->seq_wait);
        spin_unlock(&delayed_refs->lock);
        return 0;
 }
 
-int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
+int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
+                               struct btrfs_trans_handle *trans,
                                u64 bytenr, u64 num_bytes,
                                struct btrfs_delayed_extent_op *extent_op)
 {
@@ -683,11 +740,13 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
        delayed_refs = &trans->transaction->delayed_refs;
        spin_lock(&delayed_refs->lock);
 
-       ret = add_delayed_ref_head(trans, &head_ref->node, bytenr,
+       ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
                                   num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
                                   extent_op->is_data);
        BUG_ON(ret);
 
+       if (waitqueue_active(&delayed_refs->seq_wait))
+               wake_up(&delayed_refs->seq_wait);
        spin_unlock(&delayed_refs->lock);
        return 0;
 }
@@ -704,7 +763,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
        struct btrfs_delayed_ref_root *delayed_refs;
 
        delayed_refs = &trans->transaction->delayed_refs;
-       ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
+       ref = find_ref_head(&delayed_refs->root, bytenr, NULL, 0);
        if (ref)
                return btrfs_delayed_node_to_head(ref);
        return NULL;
index e287e3b..d8f244d 100644 (file)
@@ -33,6 +33,9 @@ struct btrfs_delayed_ref_node {
        /* the size of the extent */
        u64 num_bytes;
 
+       /* seq number to keep track of insertion order */
+       u64 seq;
+
        /* ref count on this data structure */
        atomic_t refs;
 
@@ -98,19 +101,15 @@ struct btrfs_delayed_ref_head {
 
 struct btrfs_delayed_tree_ref {
        struct btrfs_delayed_ref_node node;
-       union {
-               u64 root;
-               u64 parent;
-       };
+       u64 root;
+       u64 parent;
        int level;
 };
 
 struct btrfs_delayed_data_ref {
        struct btrfs_delayed_ref_node node;
-       union {
-               u64 root;
-               u64 parent;
-       };
+       u64 root;
+       u64 parent;
        u64 objectid;
        u64 offset;
 };
@@ -140,6 +139,26 @@ struct btrfs_delayed_ref_root {
        int flushing;
 
        u64 run_delayed_start;
+
+       /*
+        * seq number of delayed refs. We need to know if a backref was being
+        * added before the currently processed ref or afterwards.
+        */
+       u64 seq;
+
+       /*
+        * seq_head is a list of struct seq_list elements, each recording the
+        * value of seq at the time it was added (btrfs_get_delayed_seq).
+        * While walking backrefs (btrfs_find_all_roots, qgroups), which might
+        * take some time, no newer ref must be processed, as it might
+        * influence the outcome of the walk.
+        */
+       struct list_head seq_head;
+
+       /*
+        * when the only refs we have in the list must not be processed, we want
+        * to wait for more refs to show up or for the end of backref walking.
+        */
+       wait_queue_head_t seq_wait;
 };
 
 static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
@@ -151,16 +170,21 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
        }
 }
 
-int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
+int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
+                              struct btrfs_trans_handle *trans,
                               u64 bytenr, u64 num_bytes, u64 parent,
                               u64 ref_root, int level, int action,
-                              struct btrfs_delayed_extent_op *extent_op);
-int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
+                              struct btrfs_delayed_extent_op *extent_op,
+                              int for_cow);
+int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
+                              struct btrfs_trans_handle *trans,
                               u64 bytenr, u64 num_bytes,
                               u64 parent, u64 ref_root,
                               u64 owner, u64 offset, int action,
-                              struct btrfs_delayed_extent_op *extent_op);
-int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
+                              struct btrfs_delayed_extent_op *extent_op,
+                              int for_cow);
+int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
+                               struct btrfs_trans_handle *trans,
                                u64 bytenr, u64 num_bytes,
                                struct btrfs_delayed_extent_op *extent_op);
 
@@ -170,6 +194,60 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
                           struct btrfs_delayed_ref_head *head);
 int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
                           struct list_head *cluster, u64 search_start);
+
+/*
+ * One element of the list anchored at btrfs_delayed_ref_root::seq_head.
+ */
+struct seq_list {
+       struct list_head list;  /* linkage on delayed_refs->seq_head */
+       u64 seq;                /* set from delayed_refs->seq when added */
+};
+
+/*
+ * Advance the delayed-ref sequence counter by one and hand back the new
+ * value. The caller must hold delayed_refs->lock.
+ */
+static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs)
+{
+       assert_spin_locked(&delayed_refs->lock);
+       return ++delayed_refs->seq;
+}
+
+/*
+ * Record the current value of delayed_refs->seq in @elem and append @elem
+ * to the tail of delayed_refs->seq_head. The caller must already hold
+ * delayed_refs->lock (asserted below).
+ */
+static inline void
+btrfs_get_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
+                     struct seq_list *elem)
+{
+       assert_spin_locked(&delayed_refs->lock);
+       elem->seq = delayed_refs->seq;
+       list_add_tail(&elem->list, &delayed_refs->seq_head);
+}
+
+/*
+ * Unlink @elem from the list it is on and wake up anyone sleeping on
+ * delayed_refs->seq_wait. This helper takes delayed_refs->lock itself, so
+ * the caller must not already hold it.
+ */
+static inline void
+btrfs_put_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
+                     struct seq_list *elem)
+{
+       spin_lock(&delayed_refs->lock);
+       list_del(&elem->list);
+       /* the wakeup is issued while the lock is still held */
+       wake_up(&delayed_refs->seq_wait);
+       spin_unlock(&delayed_refs->lock);
+}
+
+int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
+                           u64 seq);
+
+/*
+ * Decide whether a new delayed ref should be given a sequence number.
+ *
+ * Refs with a non-zero seq are the ones that must be held back during
+ * backref walking. That only concerns items in one of the fs-trees; refs
+ * created with for_cow set never need to be held back and so get no number.
+ *
+ * Returns 1 when a sequence number is needed, 0 otherwise.
+ */
+static inline int need_ref_seq(int for_cow, u64 rootid)
+{
+       if (!for_cow &&
+           (rootid == BTRFS_FS_TREE_OBJECTID ||
+            (s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID))
+               return 1;
+
+       return 0;
+}
+
 /*
  * a node might live in a head or a regular ref, this lets you
  * test for the proper type to use.
index d852566..7aa9cd3 100644 (file)
@@ -43,6 +43,7 @@
 #include "tree-log.h"
 #include "free-space-cache.h"
 #include "inode-map.h"
+#include "check-integrity.h"
 
 static struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
@@ -1143,7 +1144,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
        root->orphan_item_inserted = 0;
        root->orphan_cleanup_state = 0;
 
-       root->fs_info = fs_info;
        root->objectid = objectid;
        root->last_trans = 0;
        root->highest_objectid = 0;
@@ -1217,6 +1217,14 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
        return 0;
 }
 
+/*
+ * Allocate a zero-filled btrfs_root with GFP_NOFS and point it at @fs_info.
+ * Returns the new root, or NULL if the allocation failed.
+ */
+static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info)
+{
+       struct btrfs_root *root;
+
+       root = kzalloc(sizeof(*root), GFP_NOFS);
+       if (!root)
+               return NULL;
+
+       root->fs_info = fs_info;
+       return root;
+}
+
 static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
                                         struct btrfs_fs_info *fs_info)
 {
@@ -1224,7 +1232,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
        struct btrfs_root *tree_root = fs_info->tree_root;
        struct extent_buffer *leaf;
 
-       root = kzalloc(sizeof(*root), GFP_NOFS);
+       root = btrfs_alloc_root(fs_info);
        if (!root)
                return ERR_PTR(-ENOMEM);
 
@@ -1244,7 +1252,8 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
        root->ref_cows = 0;
 
        leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
-                                     BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0);
+                                     BTRFS_TREE_LOG_OBJECTID, NULL,
+                                     0, 0, 0, 0);
        if (IS_ERR(leaf)) {
                kfree(root);
                return ERR_CAST(leaf);
@@ -1318,7 +1327,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
        u32 blocksize;
        int ret = 0;
 
-       root = kzalloc(sizeof(*root), GFP_NOFS);
+       root = btrfs_alloc_root(fs_info);
        if (!root)
                return ERR_PTR(-ENOMEM);
        if (location->offset == (u64)-1) {
@@ -1874,9 +1883,9 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
 }
 
 
-struct btrfs_root *open_ctree(struct super_block *sb,
-                             struct btrfs_fs_devices *fs_devices,
-                             char *options)
+int open_ctree(struct super_block *sb,
+              struct btrfs_fs_devices *fs_devices,
+              char *options)
 {
        u32 sectorsize;
        u32 nodesize;
@@ -1888,8 +1897,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        struct btrfs_key location;
        struct buffer_head *bh;
        struct btrfs_super_block *disk_super;
-       struct btrfs_root *tree_root = btrfs_sb(sb);
-       struct btrfs_fs_info *fs_info = tree_root->fs_info;
+       struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+       struct btrfs_root *tree_root;
        struct btrfs_root *extent_root;
        struct btrfs_root *csum_root;
        struct btrfs_root *chunk_root;
@@ -1900,16 +1909,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        int num_backups_tried = 0;
        int backup_index = 0;
 
-       extent_root = fs_info->extent_root =
-               kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
-       csum_root = fs_info->csum_root =
-               kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
-       chunk_root = fs_info->chunk_root =
-               kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
-       dev_root = fs_info->dev_root =
-               kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+       tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info);
+       extent_root = fs_info->extent_root = btrfs_alloc_root(fs_info);
+       csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info);
+       chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info);
+       dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info);
 
-       if (!extent_root || !csum_root || !chunk_root || !dev_root) {
+       if (!tree_root || !extent_root || !csum_root ||
+           !chunk_root || !dev_root) {
                err = -ENOMEM;
                goto fail;
        }
@@ -1998,6 +2005,17 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        init_waitqueue_head(&fs_info->scrub_pause_wait);
        init_rwsem(&fs_info->scrub_super_lock);
        fs_info->scrub_workers_refcnt = 0;
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+       fs_info->check_integrity_print_mask = 0;
+#endif
+
+       spin_lock_init(&fs_info->balance_lock);
+       mutex_init(&fs_info->balance_mutex);
+       atomic_set(&fs_info->balance_running, 0);
+       atomic_set(&fs_info->balance_pause_req, 0);
+       atomic_set(&fs_info->balance_cancel_req, 0);
+       fs_info->balance_ctl = NULL;
+       init_waitqueue_head(&fs_info->balance_wait_q);
 
        sb->s_blocksize = 4096;
        sb->s_blocksize_bits = blksize_bits(4096);
@@ -2267,9 +2285,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
           (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
           BTRFS_UUID_SIZE);
 
-       mutex_lock(&fs_info->chunk_mutex);
        ret = btrfs_read_chunk_tree(chunk_root);
-       mutex_unlock(&fs_info->chunk_mutex);
        if (ret) {
                printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
                       sb->s_id);
@@ -2318,9 +2334,6 @@ retry_root_backup:
 
        fs_info->generation = generation;
        fs_info->last_trans_committed = generation;
-       fs_info->data_alloc_profile = (u64)-1;
-       fs_info->metadata_alloc_profile = (u64)-1;
-       fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
 
        ret = btrfs_init_space_info(fs_info);
        if (ret) {
@@ -2353,6 +2366,19 @@ retry_root_backup:
                btrfs_set_opt(fs_info->mount_opt, SSD);
        }
 
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+       if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) {
+               ret = btrfsic_mount(tree_root, fs_devices,
+                                   btrfs_test_opt(tree_root,
+                                       CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ?
+                                   1 : 0,
+                                   fs_info->check_integrity_print_mask);
+               if (ret)
+                       printk(KERN_WARNING "btrfs: failed to initialize"
+                              " integrity check module %s\n", sb->s_id);
+       }
+#endif
+
        /* do not make disk changes in broken FS */
        if (btrfs_super_log_root(disk_super) != 0 &&
            !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) {
@@ -2368,7 +2394,7 @@ retry_root_backup:
                     btrfs_level_size(tree_root,
                                      btrfs_super_log_root_level(disk_super));
 
-               log_tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+               log_tree_root = btrfs_alloc_root(fs_info);
                if (!log_tree_root) {
                        err = -ENOMEM;
                        goto fail_trans_kthread;
@@ -2423,13 +2449,17 @@ retry_root_backup:
                if (!err)
                        err = btrfs_orphan_cleanup(fs_info->tree_root);
                up_read(&fs_info->cleanup_work_sem);
+
+               if (!err)
+                       err = btrfs_recover_balance(fs_info->tree_root);
+
                if (err) {
                        close_ctree(tree_root);
-                       return ERR_PTR(err);
+                       return err;
                }
        }
 
-       return tree_root;
+       return 0;
 
 fail_trans_kthread:
        kthread_stop(fs_info->transaction_kthread);
@@ -2475,8 +2505,7 @@ fail_srcu:
        cleanup_srcu_struct(&fs_info->subvol_srcu);
 fail:
        btrfs_close_devices(fs_info->fs_devices);
-       free_fs_info(fs_info);
-       return ERR_PTR(err);
+       return err;
 
 recovery_tree_root:
        if (!btrfs_test_opt(tree_root, RECOVERY))
@@ -2631,7 +2660,7 @@ static int write_dev_supers(struct btrfs_device *device,
                 * we fua the first super.  The others we allow
                 * to go down lazy.
                 */
-               ret = submit_bh(WRITE_FUA, bh);
+               ret = btrfsic_submit_bh(WRITE_FUA, bh);
                if (ret)
                        errors++;
        }
@@ -2708,7 +2737,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
        device->flush_bio = bio;
 
        bio_get(bio);
-       submit_bio(WRITE_FLUSH, bio);
+       btrfsic_submit_bio(WRITE_FLUSH, bio);
 
        return 0;
 }
@@ -2972,6 +3001,9 @@ int close_ctree(struct btrfs_root *root)
        fs_info->closing = 1;
        smp_mb();
 
+       /* pause restriper - we want to resume on mount */
+       btrfs_pause_balance(root->fs_info);
+
        btrfs_scrub_cancel(root);
 
        /* wait for any defraggers to finish */
@@ -2979,7 +3011,7 @@ int close_ctree(struct btrfs_root *root)
                   (atomic_read(&fs_info->defrag_running) == 0));
 
        /* clear out the rbtree of defraggable inodes */
-       btrfs_run_defrag_inodes(root->fs_info);
+       btrfs_run_defrag_inodes(fs_info);
 
        /*
         * Here come 2 situations when btrfs is broken to flip readonly:
@@ -3008,8 +3040,8 @@ int close_ctree(struct btrfs_root *root)
 
        btrfs_put_block_group_cache(fs_info);
 
-       kthread_stop(root->fs_info->transaction_kthread);
-       kthread_stop(root->fs_info->cleaner_kthread);
+       kthread_stop(fs_info->transaction_kthread);
+       kthread_stop(fs_info->cleaner_kthread);
 
        fs_info->closing = 2;
        smp_mb();
@@ -3027,14 +3059,14 @@ int close_ctree(struct btrfs_root *root)
        free_extent_buffer(fs_info->extent_root->commit_root);
        free_extent_buffer(fs_info->tree_root->node);
        free_extent_buffer(fs_info->tree_root->commit_root);
-       free_extent_buffer(root->fs_info->chunk_root->node);
-       free_extent_buffer(root->fs_info->chunk_root->commit_root);
-       free_extent_buffer(root->fs_info->dev_root->node);
-       free_extent_buffer(root->fs_info->dev_root->commit_root);
-       free_extent_buffer(root->fs_info->csum_root->node);
-       free_extent_buffer(root->fs_info->csum_root->commit_root);
+       free_extent_buffer(fs_info->chunk_root->node);
+       free_extent_buffer(fs_info->chunk_root->commit_root);
+       free_extent_buffer(fs_info->dev_root->node);
+       free_extent_buffer(fs_info->dev_root->commit_root);
+       free_extent_buffer(fs_info->csum_root->node);
+       free_extent_buffer(fs_info->csum_root->commit_root);
 
-       btrfs_free_block_groups(root->fs_info);
+       btrfs_free_block_groups(fs_info);
 
        del_fs_roots(fs_info);
 
@@ -3054,14 +3086,17 @@ int close_ctree(struct btrfs_root *root)
        btrfs_stop_workers(&fs_info->caching_workers);
        btrfs_stop_workers(&fs_info->readahead_workers);
 
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+       if (btrfs_test_opt(root, CHECK_INTEGRITY))
+               btrfsic_unmount(root, fs_info->fs_devices);
+#endif
+
        btrfs_close_devices(fs_info->fs_devices);
        btrfs_mapping_tree_free(&fs_info->mapping_tree);
 
        bdi_destroy(&fs_info->bdi);
        cleanup_srcu_struct(&fs_info->subvol_srcu);
 
-       free_fs_info(fs_info);
-
        return 0;
 }
 
index c99d0a8..e4bc474 100644 (file)
@@ -46,9 +46,9 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
                                                   u64 bytenr, u32 blocksize);
 int clean_tree_block(struct btrfs_trans_handle *trans,
                     struct btrfs_root *root, struct extent_buffer *buf);
-struct btrfs_root *open_ctree(struct super_block *sb,
-                             struct btrfs_fs_devices *fs_devices,
-                             char *options);
+int open_ctree(struct super_block *sb,
+              struct btrfs_fs_devices *fs_devices,
+              char *options);
 int close_ctree(struct btrfs_root *root);
 int write_ctree_super(struct btrfs_trans_handle *trans,
                      struct btrfs_root *root, int max_mirrors);
index 1b8dc33..5f77166 100644 (file)
@@ -67,7 +67,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
                                       u64 root_objectid, u32 generation,
                                       int check_generation)
 {
-       struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info;
+       struct btrfs_fs_info *fs_info = btrfs_sb(sb);
        struct btrfs_root *root;
        struct inode *inode;
        struct btrfs_key key;
index f5fbe57..700879e 100644 (file)
@@ -618,8 +618,7 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
        struct list_head *head = &info->space_info;
        struct btrfs_space_info *found;
 
-       flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM |
-                BTRFS_BLOCK_GROUP_METADATA;
+       flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
 
        rcu_read_lock();
        list_for_each_entry_rcu(found, head, list) {
@@ -1872,20 +1871,24 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
                         struct btrfs_root *root,
                         u64 bytenr, u64 num_bytes, u64 parent,
-                        u64 root_objectid, u64 owner, u64 offset)
+                        u64 root_objectid, u64 owner, u64 offset, int for_cow)
 {
        int ret;
+       struct btrfs_fs_info *fs_info = root->fs_info;
+
        BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
               root_objectid == BTRFS_TREE_LOG_OBJECTID);
 
        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
-               ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
+               ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
+                                       num_bytes,
                                        parent, root_objectid, (int)owner,
-                                       BTRFS_ADD_DELAYED_REF, NULL);
+                                       BTRFS_ADD_DELAYED_REF, NULL, for_cow);
        } else {
-               ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
+               ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
+                                       num_bytes,
                                        parent, root_objectid, owner, offset,
-                                       BTRFS_ADD_DELAYED_REF, NULL);
+                                       BTRFS_ADD_DELAYED_REF, NULL, for_cow);
        }
        return ret;
 }
@@ -2232,6 +2235,28 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
                        }
                }
 
+               /*
+                * locked_ref is the head node, so we have to go one
+                * node back for any delayed ref updates
+                */
+               ref = select_delayed_ref(locked_ref);
+
+               if (ref && ref->seq &&
+                   btrfs_check_delayed_seq(delayed_refs, ref->seq)) {
+                       /*
+                        * there are still refs with lower seq numbers in the
+                        * process of being added. Don't run this ref yet.
+                        */
+                       list_del_init(&locked_ref->cluster);
+                       mutex_unlock(&locked_ref->mutex);
+                       locked_ref = NULL;
+                       delayed_refs->num_heads_ready++;
+                       spin_unlock(&delayed_refs->lock);
+                       cond_resched();
+                       spin_lock(&delayed_refs->lock);
+                       continue;
+               }
+
                /*
                 * record the must insert reserved flag before we
                 * drop the spin lock.
@@ -2242,11 +2267,6 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
                extent_op = locked_ref->extent_op;
                locked_ref->extent_op = NULL;
 
-               /*
-                * locked_ref is the head node, so we have to go one
-                * node back for any delayed ref updates
-                */
-               ref = select_delayed_ref(locked_ref);
                if (!ref) {
                        /* All delayed refs have been processed, Go ahead
                         * and send the head node to run_one_delayed_ref,
@@ -2267,9 +2287,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
                                BUG_ON(ret);
                                kfree(extent_op);
 
-                               cond_resched();
-                               spin_lock(&delayed_refs->lock);
-                               continue;
+                               goto next;
                        }
 
                        list_del_init(&locked_ref->cluster);
@@ -2279,7 +2297,12 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
                ref->in_tree = 0;
                rb_erase(&ref->rb_node, &delayed_refs->root);
                delayed_refs->num_entries--;
-
+               /*
+                * we modified num_entries, but as we're currently running
+                * delayed refs, skip
+                *     wake_up(&delayed_refs->seq_wait);
+                * here.
+                */
                spin_unlock(&delayed_refs->lock);
 
                ret = run_one_delayed_ref(trans, root, ref, extent_op,
@@ -2289,13 +2312,34 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
                btrfs_put_delayed_ref(ref);
                kfree(extent_op);
                count++;
-
+next:
+               do_chunk_alloc(trans, root->fs_info->extent_root,
+                              2 * 1024 * 1024,
+                              btrfs_get_alloc_profile(root, 0),
+                              CHUNK_ALLOC_NO_FORCE);
                cond_resched();
                spin_lock(&delayed_refs->lock);
        }
        return count;
 }
 
+/* drop delayed_refs->lock, sleep until refs or seq list change, relock */
+static void wait_for_more_refs(struct btrfs_delayed_ref_root *delayed_refs,
+                       unsigned long num_refs)
+{
+       struct list_head *first_seq = delayed_refs->seq_head.next;
+
+       spin_unlock(&delayed_refs->lock);
+       pr_debug("waiting for more refs (num %lu, first %p)\n",
+                num_refs, first_seq);
+       wait_event(delayed_refs->seq_wait,
+                  num_refs != delayed_refs->num_entries ||
+                  delayed_refs->seq_head.next != first_seq);
+       pr_debug("done waiting for more refs (num %lu, first %p)\n",
+                delayed_refs->num_entries, delayed_refs->seq_head.next);
+       spin_lock(&delayed_refs->lock);
+}
+
 /*
  * this starts processing the delayed reference count updates and
  * extent insertions we have queued up so far.  count can be
@@ -2311,15 +2355,23 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
        struct btrfs_delayed_ref_node *ref;
        struct list_head cluster;
        int ret;
+       u64 delayed_start;
        int run_all = count == (unsigned long)-1;
        int run_most = 0;
+       unsigned long num_refs = 0;
+       int consider_waiting;
 
        if (root == root->fs_info->extent_root)
                root = root->fs_info->tree_root;
 
+       do_chunk_alloc(trans, root->fs_info->extent_root,
+                      2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0),
+                      CHUNK_ALLOC_NO_FORCE);
+
        delayed_refs = &trans->transaction->delayed_refs;
        INIT_LIST_HEAD(&cluster);
 again:
+       consider_waiting = 0;
        spin_lock(&delayed_refs->lock);
        if (count == 0) {
                count = delayed_refs->num_entries * 2;
@@ -2336,11 +2388,35 @@ again:
                 * of refs to process starting at the first one we are able to
                 * lock
                 */
+               delayed_start = delayed_refs->run_delayed_start;
                ret = btrfs_find_ref_cluster(trans, &cluster,
                                             delayed_refs->run_delayed_start);
                if (ret)
                        break;
 
+               if (delayed_start >= delayed_refs->run_delayed_start) {
+                       if (consider_waiting == 0) {
+                               /*
+                                * btrfs_find_ref_cluster looped. let's do one
+                                * more cycle. if we don't run any delayed ref
+                                * during that cycle (because we can't because
+                                * all of them are blocked) and if the number of
+                                * refs doesn't change, we avoid busy waiting.
+                                */
+                               consider_waiting = 1;
+                               num_refs = delayed_refs->num_entries;
+                       } else {
+                               wait_for_more_refs(delayed_refs, num_refs);
+                               /*
+                                * after waiting, things have changed. we
+                                * dropped the lock and someone else might have
+                                * run some refs, built new clusters and so on.
+                                * therefore, we restart staleness detection.
+                                */
+                               consider_waiting = 0;
+                       }
+               }
+
                ret = run_clustered_refs(trans, root, &cluster);
                BUG_ON(ret < 0);
 
@@ -2348,6 +2424,11 @@ again:
 
                if (count == 0)
                        break;
+
+               if (ret || delayed_refs->run_delayed_start == 0) {
+                       /* refs were run, let's reset staleness detection */
+                       consider_waiting = 0;
+               }
        }
 
        if (run_all) {
@@ -2405,7 +2486,8 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
        extent_op->update_key = 0;
        extent_op->is_data = is_data ? 1 : 0;
 
-       ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
+       ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
+                                         num_bytes, extent_op);
        if (ret)
                kfree(extent_op);
        return ret;
@@ -2590,7 +2672,7 @@ out:
 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root,
                           struct extent_buffer *buf,
-                          int full_backref, int inc)
+                          int full_backref, int inc, int for_cow)
 {
        u64 bytenr;
        u64 num_bytes;
@@ -2603,7 +2685,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
        int level;
        int ret = 0;
        int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
-                           u64, u64, u64, u64, u64, u64);
+                           u64, u64, u64, u64, u64, u64, int);
 
        ref_root = btrfs_header_owner(buf);
        nritems = btrfs_header_nritems(buf);
@@ -2640,14 +2722,15 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
                        key.offset -= btrfs_file_extent_offset(buf, fi);
                        ret = process_func(trans, root, bytenr, num_bytes,
                                           parent, ref_root, key.objectid,
-                                          key.offset);
+                                          key.offset, for_cow);
                        if (ret)
                                goto fail;
                } else {
                        bytenr = btrfs_node_blockptr(buf, i);
                        num_bytes = btrfs_level_size(root, level - 1);
                        ret = process_func(trans, root, bytenr, num_bytes,
-                                          parent, ref_root, level - 1, 0);
+                                          parent, ref_root, level - 1, 0,
+                                          for_cow);
                        if (ret)
                                goto fail;
                }
@@ -2659,15 +2742,15 @@ fail:
 }
 
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-                 struct extent_buffer *buf, int full_backref)
+                 struct extent_buffer *buf, int full_backref, int for_cow)
 {
-       return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
+       return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow);
 }
 
 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-                 struct extent_buffer *buf, int full_backref)
+                 struct extent_buffer *buf, int full_backref, int for_cow)
 {
-       return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
+       return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow);
 }
 
 static int write_one_cache_group(struct btrfs_trans_handle *trans,
@@ -2993,9 +3076,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
                INIT_LIST_HEAD(&found->block_groups[i]);
        init_rwsem(&found->groups_sem);
        spin_lock_init(&found->lock);
-       found->flags = flags & (BTRFS_BLOCK_GROUP_DATA |
-                               BTRFS_BLOCK_GROUP_SYSTEM |
-                               BTRFS_BLOCK_GROUP_METADATA);
+       found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
        found->total_bytes = total_bytes;
        found->disk_total = total_bytes * factor;
        found->bytes_used = bytes_used;
@@ -3016,20 +3097,27 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 
 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 {
-       u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
-                                  BTRFS_BLOCK_GROUP_RAID1 |
-                                  BTRFS_BLOCK_GROUP_RAID10 |
-                                  BTRFS_BLOCK_GROUP_DUP);
-       if (extra_flags) {
-               if (flags & BTRFS_BLOCK_GROUP_DATA)
-                       fs_info->avail_data_alloc_bits |= extra_flags;
-               if (flags & BTRFS_BLOCK_GROUP_METADATA)
-                       fs_info->avail_metadata_alloc_bits |= extra_flags;
-               if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
-                       fs_info->avail_system_alloc_bits |= extra_flags;
-       }
+       u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+       /* chunk -> extended profile */
+       if (extra_flags == 0)
+               extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+
+       if (flags & BTRFS_BLOCK_GROUP_DATA)
+               fs_info->avail_data_alloc_bits |= extra_flags;
+       if (flags & BTRFS_BLOCK_GROUP_METADATA)
+               fs_info->avail_metadata_alloc_bits |= extra_flags;
+       if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+               fs_info->avail_system_alloc_bits |= extra_flags;
 }
 
+/*
+ * @flags: available profiles in extended format (see ctree.h)
+ *
+ * Returns reduced profile in chunk format.  If profile changing is in
+ * progress (either running or paused) picks the target profile (if it's
+ * already available), otherwise falls back to plain reducing.
+ */
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 {
        /*
@@ -3040,6 +3128,34 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
        u64 num_devices = root->fs_info->fs_devices->rw_devices +
                root->fs_info->fs_devices->missing_devices;
 
+       /* pick restriper's target profile if it's available */
+       spin_lock(&root->fs_info->balance_lock);
+       if (root->fs_info->balance_ctl) {
+               struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
+               u64 tgt = 0;
+
+               if ((flags & BTRFS_BLOCK_GROUP_DATA) &&
+                   (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+                   (flags & bctl->data.target)) {
+                       tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
+               } else if ((flags & BTRFS_BLOCK_GROUP_SYSTEM) &&
+                          (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+                          (flags & bctl->sys.target)) {
+                       tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
+               } else if ((flags & BTRFS_BLOCK_GROUP_METADATA) &&
+                          (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+                          (flags & bctl->meta.target)) {
+                       tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
+               }
+
+               if (tgt) {
+                       spin_unlock(&root->fs_info->balance_lock);
+                       flags = tgt;
+                       goto out;
+               }
+       }
+       spin_unlock(&root->fs_info->balance_lock);
+
        if (num_devices == 1)
                flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
        if (num_devices < 4)
@@ -3059,22 +3175,25 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
        if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
            ((flags & BTRFS_BLOCK_GROUP_RAID1) |
             (flags & BTRFS_BLOCK_GROUP_RAID10) |
-            (flags & BTRFS_BLOCK_GROUP_DUP)))
+            (flags & BTRFS_BLOCK_GROUP_DUP))) {
                flags &= ~BTRFS_BLOCK_GROUP_RAID0;
+       }
+
+out:
+       /* extended -> chunk profile */
+       flags &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
        return flags;
 }
 
 static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
 {
        if (flags & BTRFS_BLOCK_GROUP_DATA)
-               flags |= root->fs_info->avail_data_alloc_bits &
-                        root->fs_info->data_alloc_profile;
+               flags |= root->fs_info->avail_data_alloc_bits;
        else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
-               flags |= root->fs_info->avail_system_alloc_bits &
-                        root->fs_info->system_alloc_profile;
+               flags |= root->fs_info->avail_system_alloc_bits;
        else if (flags & BTRFS_BLOCK_GROUP_METADATA)
-               flags |= root->fs_info->avail_metadata_alloc_bits &
-                        root->fs_info->metadata_alloc_profile;
+               flags |= root->fs_info->avail_metadata_alloc_bits;
+
        return btrfs_reduce_alloc_profile(root, flags);
 }
 
@@ -3191,6 +3310,8 @@ commit_trans:
                return -ENOSPC;
        }
        data_sinfo->bytes_may_use += bytes;
+       trace_btrfs_space_reservation(root->fs_info, "space_info",
+                                     (u64)data_sinfo, bytes, 1);
        spin_unlock(&data_sinfo->lock);
 
        return 0;
@@ -3210,6 +3331,8 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
        data_sinfo = BTRFS_I(inode)->space_info;
        spin_lock(&data_sinfo->lock);
        data_sinfo->bytes_may_use -= bytes;
+       trace_btrfs_space_reservation(root->fs_info, "space_info",
+                                     (u64)data_sinfo, bytes, 0);
        spin_unlock(&data_sinfo->lock);
 }
 
@@ -3257,27 +3380,15 @@ static int should_alloc_chunk(struct btrfs_root *root,
                if (num_bytes - num_allocated < thresh)
                        return 1;
        }
-
-       /*
-        * we have two similar checks here, one based on percentage
-        * and once based on a hard number of 256MB.  The idea
-        * is that if we have a good amount of free
-        * room, don't allocate a chunk.  A good mount is
-        * less than 80% utilized of the chunks we have allocated,
-        * or more than 256MB free
-        */
-       if (num_allocated + alloc_bytes + 256 * 1024 * 1024 < num_bytes)
-               return 0;
-
-       if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
-               return 0;
-
        thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
 
-       /* 256MB or 5% of the FS */
-       thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
+       /* 256MB or 2% of the FS */
+       thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 2));
+       /* system chunks need a much smaller threshold */
+       if (sinfo->flags & BTRFS_BLOCK_GROUP_SYSTEM)
+               thresh = 32 * 1024 * 1024;
 
-       if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3))
+       if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 8))
                return 0;
        return 1;
 }
@@ -3291,7 +3402,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
        int wait_for_alloc = 0;
        int ret = 0;
 
-       flags = btrfs_reduce_alloc_profile(extent_root, flags);
+       BUG_ON(!profile_is_valid(flags, 0));
 
        space_info = __find_space_info(extent_root->fs_info, flags);
        if (!space_info) {
@@ -3582,6 +3693,10 @@ again:
        if (used <= space_info->total_bytes) {
                if (used + orig_bytes <= space_info->total_bytes) {
                        space_info->bytes_may_use += orig_bytes;
+                       trace_btrfs_space_reservation(root->fs_info,
+                                                     "space_info",
+                                                     (u64)space_info,
+                                                     orig_bytes, 1);
                        ret = 0;
                } else {
                        /*
@@ -3649,6 +3764,10 @@ again:
 
                if (used + num_bytes < space_info->total_bytes + avail) {
                        space_info->bytes_may_use += orig_bytes;
+                       trace_btrfs_space_reservation(root->fs_info,
+                                                     "space_info",
+                                                     (u64)space_info,
+                                                     orig_bytes, 1);
                        ret = 0;
                } else {
                        wait_ordered = true;
@@ -3755,7 +3874,8 @@ static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
        spin_unlock(&block_rsv->lock);
 }
 
-static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
+static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
+                                   struct btrfs_block_rsv *block_rsv,
                                    struct btrfs_block_rsv *dest, u64 num_bytes)
 {
        struct btrfs_space_info *space_info = block_rsv->space_info;
@@ -3791,6 +3911,9 @@ static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
                if (num_bytes) {
                        spin_lock(&space_info->lock);
                        space_info->bytes_may_use -= num_bytes;
+                       trace_btrfs_space_reservation(fs_info, "space_info",
+                                                     (u64)space_info,
+                                                     num_bytes, 0);
                        space_info->reservation_progress++;
                        spin_unlock(&space_info->lock);
                }
@@ -3947,7 +4070,8 @@ void btrfs_block_rsv_release(struct btrfs_root *root,
        if (global_rsv->full || global_rsv == block_rsv ||
            block_rsv->space_info != global_rsv->space_info)
                global_rsv = NULL;
-       block_rsv_release_bytes(block_rsv, global_rsv, num_bytes);
+       block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv,
+                               num_bytes);
 }
 
 /*
@@ -4006,11 +4130,15 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
                num_bytes = sinfo->total_bytes - num_bytes;
                block_rsv->reserved += num_bytes;
                sinfo->bytes_may_use += num_bytes;
+               trace_btrfs_space_reservation(fs_info, "space_info",
+                                             (u64)sinfo, num_bytes, 1);
        }
 
        if (block_rsv->reserved >= block_rsv->size) {
                num_bytes = block_rsv->reserved - block_rsv->size;
                sinfo->bytes_may_use -= num_bytes;
+               trace_btrfs_space_reservation(fs_info, "space_info",
+                                             (u64)sinfo, num_bytes, 0);
                sinfo->reservation_progress++;
                block_rsv->reserved = block_rsv->size;
                block_rsv->full = 1;
@@ -4045,7 +4173,8 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
 
 static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
 {
-       block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1);
+       block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
+                               (u64)-1);
        WARN_ON(fs_info->delalloc_block_rsv.size > 0);
        WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
        WARN_ON(fs_info->trans_block_rsv.size > 0);
@@ -4062,6 +4191,8 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
        if (!trans->bytes_reserved)
                return;
 
+       trace_btrfs_space_reservation(root->fs_info, "transaction", (u64)trans,
+                                     trans->bytes_reserved, 0);
        btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
        trans->bytes_reserved = 0;
 }
@@ -4079,6 +4210,8 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
         * when we are truly done with the orphan item.
         */
        u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+       trace_btrfs_space_reservation(root->fs_info, "orphan",
+                                     btrfs_ino(inode), num_bytes, 1);
        return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
 }
 
@@ -4086,6 +4219,8 @@ void btrfs_orphan_release_metadata(struct inode *inode)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+       trace_btrfs_space_reservation(root->fs_info, "orphan",
+                                     btrfs_ino(inode), num_bytes, 0);
        btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
 }
 
@@ -4213,12 +4348,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
        /* Need to be holding the i_mutex here if we aren't free space cache */
        if (btrfs_is_free_space_inode(root, inode))
                flush = 0;
-       else
-               WARN_ON(!mutex_is_locked(&inode->i_mutex));
 
        if (flush && btrfs_transaction_in_commit(root->fs_info))
                schedule_timeout(1);
 
+       mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
        num_bytes = ALIGN(num_bytes, root->sectorsize);
 
        spin_lock(&BTRFS_I(inode)->lock);
@@ -4266,8 +4400,14 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
                if (dropped)
                        to_free += btrfs_calc_trans_metadata_size(root, dropped);
 
-               if (to_free)
+               if (to_free) {
                        btrfs_block_rsv_release(root, block_rsv, to_free);
+                       trace_btrfs_space_reservation(root->fs_info,
+                                                     "delalloc",
+                                                     btrfs_ino(inode),
+                                                     to_free, 0);
+               }
+               mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
                return ret;
        }
 
@@ -4278,7 +4418,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
        }
        BTRFS_I(inode)->reserved_extents += nr_extents;
        spin_unlock(&BTRFS_I(inode)->lock);
+       mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
 
+       if (to_reserve)
+               trace_btrfs_space_reservation(root->fs_info,"delalloc",
+                                             btrfs_ino(inode), to_reserve, 1);
        block_rsv_add_bytes(block_rsv, to_reserve, 1);
 
        return 0;
@@ -4308,6 +4452,8 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
        if (dropped > 0)
                to_free += btrfs_calc_trans_metadata_size(root, dropped);
 
+       trace_btrfs_space_reservation(root->fs_info, "delalloc",
+                                     btrfs_ino(inode), to_free, 0);
        btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
                                to_free);
 }
@@ -4562,7 +4708,10 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
                        cache->reserved += num_bytes;
                        space_info->bytes_reserved += num_bytes;
                        if (reserve == RESERVE_ALLOC) {
-                               BUG_ON(space_info->bytes_may_use < num_bytes);
+                               trace_btrfs_space_reservation(cache->fs_info,
+                                                             "space_info",
+                                                             (u64)space_info,
+                                                             num_bytes, 0);
                                space_info->bytes_may_use -= num_bytes;
                        }
                }
@@ -4928,6 +5077,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
        rb_erase(&head->node.rb_node, &delayed_refs->root);
 
        delayed_refs->num_entries--;
+       if (waitqueue_active(&delayed_refs->seq_wait))
+               wake_up(&delayed_refs->seq_wait);
 
        /*
         * we don't take a ref on the node because we're removing it from the
@@ -4955,16 +5106,17 @@ out:
 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root,
                           struct extent_buffer *buf,
-                          u64 parent, int last_ref)
+                          u64 parent, int last_ref, int for_cow)
 {
        struct btrfs_block_group_cache *cache = NULL;
        int ret;
 
        if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
-               ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len,
-                                               parent, root->root_key.objectid,
-                                               btrfs_header_level(buf),
-                                               BTRFS_DROP_DELAYED_REF, NULL);
+               ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
+                                       buf->start, buf->len,
+                                       parent, root->root_key.objectid,
+                                       btrfs_header_level(buf),
+                                       BTRFS_DROP_DELAYED_REF, NULL, for_cow);
                BUG_ON(ret);
        }
 
@@ -4999,12 +5151,12 @@ out:
        btrfs_put_block_group(cache);
 }
 
-int btrfs_free_extent(struct btrfs_trans_handle *trans,
-                     struct btrfs_root *root,
-                     u64 bytenr, u64 num_bytes, u64 parent,
-                     u64 root_objectid, u64 owner, u64 offset)
+int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+                     u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
+                     u64 owner, u64 offset, int for_cow)
 {
        int ret;
+       struct btrfs_fs_info *fs_info = root->fs_info;
 
        /*
         * tree log blocks never actually go into the extent allocation
@@ -5016,14 +5168,17 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
                btrfs_pin_extent(root, bytenr, num_bytes, 1);
                ret = 0;
        } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
-               ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
+               ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
+                                       num_bytes,
                                        parent, root_objectid, (int)owner,
-                                       BTRFS_DROP_DELAYED_REF, NULL);
+                                       BTRFS_DROP_DELAYED_REF, NULL, for_cow);
                BUG_ON(ret);
        } else {
-               ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
-                                       parent, root_objectid, owner,
-                                       offset, BTRFS_DROP_DELAYED_REF, NULL);
+               ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
+                                               num_bytes,
+                                               parent, root_objectid, owner,
+                                               offset, BTRFS_DROP_DELAYED_REF,
+                                               NULL, for_cow);
                BUG_ON(ret);
        }
        return ret;
@@ -5146,6 +5301,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
        ins->objectid = 0;
        ins->offset = 0;
 
+       trace_find_free_extent(orig_root, num_bytes, empty_size, data);
+
        space_info = __find_space_info(root->fs_info, data);
        if (!space_info) {
                printk(KERN_ERR "No space info for %llu\n", data);
@@ -5295,15 +5452,6 @@ alloc:
                if (unlikely(block_group->ro))
                        goto loop;
 
-               spin_lock(&block_group->free_space_ctl->tree_lock);
-               if (cached &&
-                   block_group->free_space_ctl->free_space <
-                   num_bytes + empty_cluster + empty_size) {
-                       spin_unlock(&block_group->free_space_ctl->tree_lock);
-                       goto loop;
-               }
-               spin_unlock(&block_group->free_space_ctl->tree_lock);
-
                /*
                 * Ok we want to try and use the cluster allocator, so
                 * lets look there
@@ -5331,6 +5479,8 @@ alloc:
                        if (offset) {
                                /* we have a block, we're done */
                                spin_unlock(&last_ptr->refill_lock);
+                               trace_btrfs_reserve_extent_cluster(root,
+                                       block_group, search_start, num_bytes);
                                goto checks;
                        }
 
@@ -5349,8 +5499,15 @@ refill_cluster:
                         * plenty of times and not have found
                         * anything, so we are likely way too
                         * fragmented for the clustering stuff to find
-                        * anything.  */
-                       if (loop >= LOOP_NO_EMPTY_SIZE) {
+                        * anything.
+                        *
+                        * However, if the cluster is taken from the
+                        * current block group, release the cluster
+                        * first, so that we stand a better chance of
+                        * succeeding in the unclustered
+                        * allocation.  */
+                       if (loop >= LOOP_NO_EMPTY_SIZE &&
+                           last_ptr->block_group != block_group) {
                                spin_unlock(&last_ptr->refill_lock);
                                goto unclustered_alloc;
                        }
@@ -5361,6 +5518,11 @@ refill_cluster:
                         */
                        btrfs_return_cluster_to_free_space(NULL, last_ptr);
 
+                       if (loop >= LOOP_NO_EMPTY_SIZE) {
+                               spin_unlock(&last_ptr->refill_lock);
+                               goto unclustered_alloc;
+                       }
+
                        /* allocate a cluster in this block group */
                        ret = btrfs_find_space_cluster(trans, root,
                                               block_group, last_ptr,
@@ -5377,6 +5539,9 @@ refill_cluster:
                                if (offset) {
                                        /* we found one, proceed */
                                        spin_unlock(&last_ptr->refill_lock);
+                                       trace_btrfs_reserve_extent_cluster(root,
+                                               block_group, search_start,
+                                               num_bytes);
                                        goto checks;
                                }
                        } else if (!cached && loop > LOOP_CACHING_NOWAIT
@@ -5401,6 +5566,15 @@ refill_cluster:
                }
 
 unclustered_alloc:
+               spin_lock(&block_group->free_space_ctl->tree_lock);
+               if (cached &&
+                   block_group->free_space_ctl->free_space <
+                   num_bytes + empty_cluster + empty_size) {
+                       spin_unlock(&block_group->free_space_ctl->tree_lock);
+                       goto loop;
+               }
+               spin_unlock(&block_group->free_space_ctl->tree_lock);
+
                offset = btrfs_find_space_for_alloc(block_group, search_start,
                                                    num_bytes, empty_size);
                /*
@@ -5438,9 +5612,6 @@ checks:
                        goto loop;
                }
 
-               ins->objectid = search_start;
-               ins->offset = num_bytes;
-
                if (offset < search_start)
                        btrfs_add_free_space(used_block_group, offset,
                                             search_start - offset);
@@ -5457,6 +5628,8 @@ checks:
                ins->objectid = search_start;
                ins->offset = num_bytes;
 
+               trace_btrfs_reserve_extent(orig_root, block_group,
+                                          search_start, num_bytes);
                if (offset < search_start)
                        btrfs_add_free_space(used_block_group, offset,
                                             search_start - offset);
@@ -5842,9 +6015,10 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
 
        BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
 
-       ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset,
-                                        0, root_objectid, owner, offset,
-                                        BTRFS_ADD_DELAYED_EXTENT, NULL);
+       ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
+                                        ins->offset, 0,
+                                        root_objectid, owner, offset,
+                                        BTRFS_ADD_DELAYED_EXTENT, NULL, 0);
        return ret;
 }
 
@@ -5997,10 +6171,11 @@ use_block_rsv(struct btrfs_trans_handle *trans,
        return ERR_PTR(-ENOSPC);
 }
 
-static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize)
+static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
+                           struct btrfs_block_rsv *block_rsv, u32 blocksize)
 {
        block_rsv_add_bytes(block_rsv, blocksize, 0);
-       block_rsv_release_bytes(block_rsv, NULL, 0);
+       block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
 }
 
 /*
@@ -6014,7 +6189,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
                                        struct btrfs_root *root, u32 blocksize,
                                        u64 parent, u64 root_objectid,
                                        struct btrfs_disk_key *key, int level,
-                                       u64 hint, u64 empty_size)
+                                       u64 hint, u64 empty_size, int for_cow)
 {
        struct btrfs_key ins;
        struct btrfs_block_rsv *block_rsv;
@@ -6030,7 +6205,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
        ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
                                   empty_size, hint, (u64)-1, &ins, 0);
        if (ret) {
-               unuse_block_rsv(block_rsv, blocksize);
+               unuse_block_rsv(root->fs_info, block_rsv, blocksize);
                return ERR_PTR(ret);
        }
 
@@ -6058,10 +6233,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
                extent_op->update_flags = 1;
                extent_op->is_data = 0;
 
-               ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
+               ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
+                                       ins.objectid,
                                        ins.offset, parent, root_objectid,
                                        level, BTRFS_ADD_DELAYED_EXTENT,
-                                       extent_op);
+                                       extent_op, for_cow);
                BUG_ON(ret);
        }
        return buf;
@@ -6078,6 +6254,7 @@ struct walk_control {
        int keep_locks;
        int reada_slot;
        int reada_count;
+       int for_reloc;
 };
 
 #define DROP_REFERENCE 1
@@ -6216,9 +6393,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
        /* wc->stage == UPDATE_BACKREF */
        if (!(wc->flags[level] & flag)) {
                BUG_ON(!path->locks[level]);
-               ret = btrfs_inc_ref(trans, root, eb, 1);
+               ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc);
                BUG_ON(ret);
-               ret = btrfs_dec_ref(trans, root, eb, 0);
+               ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc);
                BUG_ON(ret);
                ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
                                                  eb->len, flag, 0);
@@ -6362,7 +6539,7 @@ skip:
                }
 
                ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
-                                       root->root_key.objectid, level - 1, 0);
+                               root->root_key.objectid, level - 1, 0, 0);
                BUG_ON(ret);
        }
        btrfs_tree_unlock(next);
@@ -6436,9 +6613,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
        if (wc->refs[level] == 1) {
                if (level == 0) {
                        if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
-                               ret = btrfs_dec_ref(trans, root, eb, 1);
+                               ret = btrfs_dec_ref(trans, root, eb, 1,
+                                                   wc->for_reloc);
                        else
-                               ret = btrfs_dec_ref(trans, root, eb, 0);
+                               ret = btrfs_dec_ref(trans, root, eb, 0,
+                                                   wc->for_reloc);
                        BUG_ON(ret);
                }
                /* make block locked assertion in clean_tree_block happy */
@@ -6465,7 +6644,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
                               btrfs_header_owner(path->nodes[level + 1]));
        }
 
-       btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
+       btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1, 0);
 out:
        wc->refs[level] = 0;
        wc->flags[level] = 0;
@@ -6549,7 +6728,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
  * blocks are properly updated.
  */
 void btrfs_drop_snapshot(struct btrfs_root *root,
-                        struct btrfs_block_rsv *block_rsv, int update_ref)
+                        struct btrfs_block_rsv *block_rsv, int update_ref,
+                        int for_reloc)
 {
        struct btrfs_path *path;
        struct btrfs_trans_handle *trans;
@@ -6637,6 +6817,7 @@ void btrfs_drop_snapshot(struct btrfs_root *root,
        wc->stage = DROP_REFERENCE;
        wc->update_ref = update_ref;
        wc->keep_locks = 0;
+       wc->for_reloc = for_reloc;
        wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
 
        while (1) {
@@ -6721,6 +6902,7 @@ out:
  * drop subtree rooted at tree block 'node'.
  *
  * NOTE: this function will unlock and release tree block 'node'
+ * only used by relocation code
  */
 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
                        struct btrfs_root *root,
@@ -6765,6 +6947,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
        wc->stage = DROP_REFERENCE;
        wc->update_ref = 0;
        wc->keep_locks = 1;
+       wc->for_reloc = 1;
        wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
 
        while (1) {
@@ -6792,6 +6975,29 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
        u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
                BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
 
+       if (root->fs_info->balance_ctl) {
+               struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
+               u64 tgt = 0;
+
+               /* pick restriper's target profile and return */
+               if (flags & BTRFS_BLOCK_GROUP_DATA &&
+                   bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+                       tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
+               } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
+                          bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+                       tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
+               } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
+                          bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+                       tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
+               }
+
+               if (tgt) {
+                       /* extended -> chunk profile */
+                       tgt &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+                       return tgt;
+               }
+       }
+
        /*
         * we add in the count of missing devices because we want
         * to make sure that any RAID levels on a degraded FS
@@ -7085,7 +7291,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
                 * space to fit our block group in.
                 */
                if (device->total_bytes > device->bytes_used + min_free) {
-                       ret = find_free_dev_extent(NULL, device, min_free,
+                       ret = find_free_dev_extent(device, min_free,
                                                   &dev_offset, NULL);
                        if (!ret)
                                dev_nr++;
@@ -7447,6 +7653,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
        ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
                                &cache->space_info);
        BUG_ON(ret);
+       update_global_block_rsv(root->fs_info);
 
        spin_lock(&cache->space_info->lock);
        cache->space_info->bytes_readonly += cache->bytes_super;
@@ -7466,6 +7673,22 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
        return 0;
 }
 
+static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
+{
+       u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+       /* chunk -> extended profile */
+       if (extra_flags == 0)
+               extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+
+       if (flags & BTRFS_BLOCK_GROUP_DATA)
+               fs_info->avail_data_alloc_bits &= ~extra_flags;
+       if (flags & BTRFS_BLOCK_GROUP_METADATA)
+               fs_info->avail_metadata_alloc_bits &= ~extra_flags;
+       if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+               fs_info->avail_system_alloc_bits &= ~extra_flags;
+}
+
 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root, u64 group_start)
 {
@@ -7476,6 +7699,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
        struct btrfs_key key;
        struct inode *inode;
        int ret;
+       int index;
        int factor;
 
        root = root->fs_info->extent_root;
@@ -7491,6 +7715,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
        free_excluded_extents(root, block_group);
 
        memcpy(&key, &block_group->key, sizeof(key));
+       index = get_block_group_index(block_group);
        if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
                                  BTRFS_BLOCK_GROUP_RAID1 |
                                  BTRFS_BLOCK_GROUP_RAID10))
@@ -7565,6 +7790,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
         * are still on the list after taking the semaphore
         */
        list_del_init(&block_group->list);
+       if (list_empty(&block_group->space_info->block_groups[index]))
+               clear_avail_alloc_bits(root->fs_info, block_group->flags);
        up_write(&block_group->space_info->groups_sem);
 
        if (block_group->cached == BTRFS_CACHE_STARTED)
index 49f3c9d..9d09a4f 100644 (file)
@@ -18,6 +18,7 @@
 #include "ctree.h"
 #include "btrfs_inode.h"
 #include "volumes.h"
+#include "check-integrity.h"
 
 static struct kmem_cache *extent_state_cache;
 static struct kmem_cache *extent_buffer_cache;
@@ -1895,7 +1896,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
        }
        bio->bi_bdev = dev->bdev;
        bio_add_page(bio, page, length, start-page_offset(page));
-       submit_bio(WRITE_SYNC, bio);
+       btrfsic_submit_bio(WRITE_SYNC, bio);
        wait_for_completion(&compl);
 
        if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
@@ -2393,7 +2394,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
                ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
                                           mirror_num, bio_flags, start);
        else
-               submit_bio(rw, bio);
+               btrfsic_submit_bio(rw, bio);
 
        if (bio_flagged(bio, BIO_EOPNOTSUPP))
                ret = -EOPNOTSUPP;
@@ -3579,6 +3580,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
        atomic_set(&eb->blocking_writers, 0);
        atomic_set(&eb->spinning_readers, 0);
        atomic_set(&eb->spinning_writers, 0);
+       eb->lock_nested = 0;
        init_waitqueue_head(&eb->write_lock_wq);
        init_waitqueue_head(&eb->read_lock_wq);
 
index 7604c30..bc6a042 100644 (file)
@@ -129,6 +129,7 @@ struct extent_buffer {
        struct list_head leak_list;
        struct rcu_head rcu_head;
        atomic_t refs;
+       pid_t lock_owner;
 
        /* count of read lock holders on the extent buffer */
        atomic_t write_locks;
@@ -137,6 +138,7 @@ struct extent_buffer {
        atomic_t blocking_readers;
        atomic_t spinning_readers;
        atomic_t spinning_writers;
+       int lock_nested;
 
        /* protects write locks */
        rwlock_t lock;
index 034d985..859ba2d 100644 (file)
@@ -678,7 +678,7 @@ next_slot:
                                                disk_bytenr, num_bytes, 0,
                                                root->root_key.objectid,
                                                new_key.objectid,
-                                               start - extent_offset);
+                                               start - extent_offset, 0);
                                BUG_ON(ret);
                                *hint_byte = disk_bytenr;
                        }
@@ -753,7 +753,7 @@ next_slot:
                                                disk_bytenr, num_bytes, 0,
                                                root->root_key.objectid,
                                                key.objectid, key.offset -
-                                               extent_offset);
+                                               extent_offset, 0);
                                BUG_ON(ret);
                                inode_sub_bytes(inode,
                                                extent_end - key.offset);
@@ -962,7 +962,7 @@ again:
 
                ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
                                           root->root_key.objectid,
-                                          ino, orig_offset);
+                                          ino, orig_offset, 0);
                BUG_ON(ret);
 
                if (split == start) {
@@ -989,7 +989,7 @@ again:
                del_nr++;
                ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
                                        0, root->root_key.objectid,
-                                       ino, orig_offset);
+                                       ino, orig_offset, 0);
                BUG_ON(ret);
        }
        other_start = 0;
@@ -1006,7 +1006,7 @@ again:
                del_nr++;
                ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
                                        0, root->root_key.objectid,
-                                       ino, orig_offset);
+                                       ino, orig_offset, 0);
                BUG_ON(ret);
        }
        if (del_nr == 0) {
@@ -1274,7 +1274,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
                                                   dirty_pages);
                if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
                        btrfs_btree_balance_dirty(root, 1);
-               btrfs_throttle(root);
 
                pos += copied;
                num_written += copied;
index 9a897bf..d20ff87 100644 (file)
@@ -319,9 +319,11 @@ static void io_ctl_drop_pages(struct io_ctl *io_ctl)
        io_ctl_unmap_page(io_ctl);
 
        for (i = 0; i < io_ctl->num_pages; i++) {
-               ClearPageChecked(io_ctl->pages[i]);
-               unlock_page(io_ctl->pages[i]);
-               page_cache_release(io_ctl->pages[i]);
+               if (io_ctl->pages[i]) {
+                       ClearPageChecked(io_ctl->pages[i]);
+                       unlock_page(io_ctl->pages[i]);
+                       page_cache_release(io_ctl->pages[i]);
+               }
        }
 }
 
@@ -635,7 +637,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
        if (!num_entries)
                return 0;
 
-       io_ctl_init(&io_ctl, inode, root);
+       ret = io_ctl_init(&io_ctl, inode, root);
+       if (ret)
+               return ret;
+
        ret = readahead_cache(inode);
        if (ret)
                goto out;
@@ -838,7 +843,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
        struct io_ctl io_ctl;
        struct list_head bitmap_list;
        struct btrfs_key key;
-       u64 start, end, len;
+       u64 start, extent_start, extent_end, len;
        int entries = 0;
        int bitmaps = 0;
        int ret;
@@ -849,7 +854,9 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
        if (!i_size_read(inode))
                return -1;
 
-       io_ctl_init(&io_ctl, inode, root);
+       ret = io_ctl_init(&io_ctl, inode, root);
+       if (ret)
+               return -1;
 
        /* Get the cluster for this block_group if it exists */
        if (block_group && !list_empty(&block_group->cluster_list))
@@ -857,25 +864,12 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
                                     struct btrfs_free_cluster,
                                     block_group_list);
 
-       /*
-        * We shouldn't have switched the pinned extents yet so this is the
-        * right one
-        */
-       unpin = root->fs_info->pinned_extents;
-
        /* Lock all pages first so we can lock the extent safely. */
        io_ctl_prepare_pages(&io_ctl, inode, 0);
 
        lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
                         0, &cached_state, GFP_NOFS);
 
-       /*
-        * When searching for pinned extents, we need to start at our start
-        * offset.
-        */
-       if (block_group)
-               start = block_group->key.objectid;
-
        node = rb_first(&ctl->free_space_offset);
        if (!node && cluster) {
                node = rb_first(&cluster->root);
@@ -918,9 +912,20 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
         * We want to add any pinned extents to our free space cache
         * so we don't leak the space
         */
+
+       /*
+        * We shouldn't have switched the pinned extents yet so this is the
+        * right one
+        */
+       unpin = root->fs_info->pinned_extents;
+
+       if (block_group)
+               start = block_group->key.objectid;
+
        while (block_group && (start < block_group->key.objectid +
                               block_group->key.offset)) {
-               ret = find_first_extent_bit(unpin, start, &start, &end,
+               ret = find_first_extent_bit(unpin, start,
+                                           &extent_start, &extent_end,
                                            EXTENT_DIRTY);
                if (ret) {
                        ret = 0;
@@ -928,20 +933,21 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
                }
 
                /* This pinned extent is out of our range */
-               if (start >= block_group->key.objectid +
+               if (extent_start >= block_group->key.objectid +
                    block_group->key.offset)
                        break;
 
-               len = block_group->key.objectid +
-                       block_group->key.offset - start;
-               len = min(len, end + 1 - start);
+               extent_start = max(extent_start, start);
+               extent_end = min(block_group->key.objectid +
+                                block_group->key.offset, extent_end + 1);
+               len = extent_end - extent_start;
 
                entries++;
-               ret = io_ctl_add_entry(&io_ctl, start, len, NULL);
+               ret = io_ctl_add_entry(&io_ctl, extent_start, len, NULL);
                if (ret)
                        goto out_nospc;
 
-               start = end + 1;
+               start = extent_end;
        }
 
        /* Write out the bitmaps */
@@ -2283,23 +2289,23 @@ out:
 static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
                                struct btrfs_free_space *entry,
                                struct btrfs_free_cluster *cluster,
-                               u64 offset, u64 bytes, u64 min_bytes)
+                               u64 offset, u64 bytes,
+                               u64 cont1_bytes, u64 min_bytes)
 {
        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        unsigned long next_zero;
        unsigned long i;
-       unsigned long search_bits;
-       unsigned long total_bits;
+       unsigned long want_bits;
+       unsigned long min_bits;
        unsigned long found_bits;
        unsigned long start = 0;
        unsigned long total_found = 0;
        int ret;
-       bool found = false;
 
        i = offset_to_bit(entry->offset, block_group->sectorsize,
                          max_t(u64, offset, entry->offset));
-       search_bits = bytes_to_bits(bytes, block_group->sectorsize);
-       total_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
+       want_bits = bytes_to_bits(bytes, block_group->sectorsize);
+       min_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
 
 again:
        found_bits = 0;
@@ -2308,7 +2314,7 @@ again:
             i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
                next_zero = find_next_zero_bit(entry->bitmap,
                                               BITS_PER_BITMAP, i);
-               if (next_zero - i >= search_bits) {
+               if (next_zero - i >= min_bits) {
                        found_bits = next_zero - i;
                        break;
                }
@@ -2318,10 +2324,9 @@ again:
        if (!found_bits)
                return -ENOSPC;
 
-       if (!found) {
+       if (!total_found) {
                start = i;
                cluster->max_size = 0;
-               found = true;
        }
 
        total_found += found_bits;
@@ -2329,13 +2334,8 @@ again:
        if (cluster->max_size < found_bits * block_group->sectorsize)
                cluster->max_size = found_bits * block_group->sectorsize;
 
-       if (total_found < total_bits) {
-               i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero);
-               if (i - start > total_bits * 2) {
-                       total_found = 0;
-                       cluster->max_size = 0;
-                       found = false;
-               }
+       if (total_found < want_bits || cluster->max_size < cont1_bytes) {
+               i = next_zero + 1;
                goto again;
        }
 
@@ -2346,28 +2346,31 @@ again:
                                 &entry->offset_index, 1);
        BUG_ON(ret);
 
+       trace_btrfs_setup_cluster(block_group, cluster,
+                                 total_found * block_group->sectorsize, 1);
        return 0;
 }
 
 /*
  * This searches the block group for just extents to fill the cluster with.
+ * Try to find a cluster with at least bytes total bytes, at least one
+ * extent of cont1_bytes, and other clusters of at least min_bytes.
  */
 static noinline int
 setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
                        struct btrfs_free_cluster *cluster,
                        struct list_head *bitmaps, u64 offset, u64 bytes,
-                       u64 min_bytes)
+                       u64 cont1_bytes, u64 min_bytes)
 {
        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        struct btrfs_free_space *first = NULL;
        struct btrfs_free_space *entry = NULL;
-       struct btrfs_free_space *prev = NULL;
        struct btrfs_free_space *last;
        struct rb_node *node;
        u64 window_start;
        u64 window_free;
        u64 max_extent;
-       u64 max_gap = 128 * 1024;
+       u64 total_size = 0;
 
        entry = tree_search_offset(ctl, offset, 0, 1);
        if (!entry)
@@ -2377,8 +2380,8 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
         * We don't want bitmaps, so just move along until we find a normal
         * extent entry.
         */
-       while (entry->bitmap) {
-               if (list_empty(&entry->list))
+       while (entry->bitmap || entry->bytes < min_bytes) {
+               if (entry->bitmap && list_empty(&entry->list))
                        list_add_tail(&entry->list, bitmaps);
                node = rb_next(&entry->offset_index);
                if (!node)
@@ -2391,12 +2394,9 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
        max_extent = entry->bytes;
        first = entry;
        last = entry;
-       prev = entry;
 
-       while (window_free <= min_bytes) {
-               node = rb_next(&entry->offset_index);
-               if (!node)
-                       return -ENOSPC;
+       for (node = rb_next(&entry->offset_index); node;
+            node = rb_next(&entry->offset_index)) {
                entry = rb_entry(node, struct btrfs_free_space, offset_index);
 
                if (entry->bitmap) {
@@ -2405,26 +2405,18 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
                        continue;
                }
 
-               /*
-                * we haven't filled the empty size and the window is
-                * very large.  reset and try again
-                */
-               if (entry->offset - (prev->offset + prev->bytes) > max_gap ||
-                   entry->offset - window_start > (min_bytes * 2)) {
-                       first = entry;
-                       window_start = entry->offset;
-                       window_free = entry->bytes;
-                       last = entry;
+               if (entry->bytes < min_bytes)
+                       continue;
+
+               last = entry;
+               window_free += entry->bytes;
+               if (entry->bytes > max_extent)
                        max_extent = entry->bytes;
-               } else {
-                       last = entry;
-                       window_free += entry->bytes;
-                       if (entry->bytes > max_extent)
-                               max_extent = entry->bytes;
-               }
-               prev = entry;
        }
 
+       if (window_free < bytes || max_extent < cont1_bytes)
+               return -ENOSPC;
+
        cluster->window_start = first->offset;
 
        node = &first->offset_index;
@@ -2438,17 +2430,18 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
 
                entry = rb_entry(node, struct btrfs_free_space, offset_index);
                node = rb_next(&entry->offset_index);
-               if (entry->bitmap)
+               if (entry->bitmap || entry->bytes < min_bytes)
                        continue;
 
                rb_erase(&entry->offset_index, &ctl->free_space_offset);
                ret = tree_insert_offset(&cluster->root, entry->offset,
                                         &entry->offset_index, 0);
+               total_size += entry->bytes;
                BUG_ON(ret);
        } while (node && entry != last);
 
        cluster->max_size = max_extent;
-
+       trace_btrfs_setup_cluster(block_group, cluster, total_size, 0);
        return 0;
 }
 
@@ -2460,7 +2453,7 @@ static noinline int
 setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
                     struct btrfs_free_cluster *cluster,
                     struct list_head *bitmaps, u64 offset, u64 bytes,
-                    u64 min_bytes)
+                    u64 cont1_bytes, u64 min_bytes)
 {
        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        struct btrfs_free_space *entry;
@@ -2485,7 +2478,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
                if (entry->bytes < min_bytes)
                        continue;
                ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
-                                          bytes, min_bytes);
+                                          bytes, cont1_bytes, min_bytes);
                if (!ret)
                        return 0;
        }
@@ -2499,7 +2492,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
 
 /*
  * here we try to find a cluster of blocks in a block group.  The goal
- * is to find at least bytes free and up to empty_size + bytes free.
+ * is to find at least bytes+empty_size.
  * We might not find them all in one contiguous area.
  *
  * returns zero and sets up cluster if things worked out, otherwise
@@ -2515,23 +2508,24 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
        struct btrfs_free_space *entry, *tmp;
        LIST_HEAD(bitmaps);
        u64 min_bytes;
+       u64 cont1_bytes;
        int ret;
 
-       /* for metadata, allow allocates with more holes */
+       /*
+        * Choose the minimum extent size we'll require for this
+        * cluster.  For SSD_SPREAD, don't allow any fragmentation.
+        * For metadata, allow allocates with smaller extents.  For
+        * data, keep it dense.
+        */
        if (btrfs_test_opt(root, SSD_SPREAD)) {
-               min_bytes = bytes + empty_size;
+               cont1_bytes = min_bytes = bytes + empty_size;
        } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
-               /*
-                * we want to do larger allocations when we are
-                * flushing out the delayed refs, it helps prevent
-                * making more work as we go along.
-                */
-               if (trans->transaction->delayed_refs.flushing)
-                       min_bytes = max(bytes, (bytes + empty_size) >> 1);
-               else
-                       min_bytes = max(bytes, (bytes + empty_size) >> 4);
-       } else
-               min_bytes = max(bytes, (bytes + empty_size) >> 2);
+               cont1_bytes = bytes;
+               min_bytes = block_group->sectorsize;
+       } else {
+               cont1_bytes = max(bytes, (bytes + empty_size) >> 2);
+               min_bytes = block_group->sectorsize;
+       }
 
        spin_lock(&ctl->tree_lock);
 
@@ -2539,7 +2533,7 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
         * If we know we don't have enough space to make a cluster don't even
         * bother doing all the work to try and find one.
         */
-       if (ctl->free_space < min_bytes) {
+       if (ctl->free_space < bytes) {
                spin_unlock(&ctl->tree_lock);
                return -ENOSPC;
        }
@@ -2552,11 +2546,17 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
                goto out;
        }
 
+       trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
+                                min_bytes);
+
+       INIT_LIST_HEAD(&bitmaps);
        ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
-                                     bytes, min_bytes);
+                                     bytes + empty_size,
+                                     cont1_bytes, min_bytes);
        if (ret)
                ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
-                                          offset, bytes, min_bytes);
+                                          offset, bytes + empty_size,
+                                          cont1_bytes, min_bytes);
 
        /* Clear our temporary list */
        list_for_each_entry_safe(entry, tmp, &bitmaps, list)
@@ -2567,6 +2567,8 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
                list_add_tail(&cluster->block_group_list,
                              &block_group->cluster_list);
                cluster->block_group = block_group;
+       } else {
+               trace_btrfs_failed_cluster_setup(block_group);
        }
 out:
        spin_unlock(&cluster->lock);
@@ -2588,17 +2590,57 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
        cluster->block_group = NULL;
 }
 
-int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
-                          u64 *trimmed, u64 start, u64 end, u64 minlen)
+/*
+ * Discard the byte range [start, start + bytes) and afterwards give the range
+ * [reserved_start, reserved_start + reserved_bytes) back to the free space
+ * cache with btrfs_add_free_space().
+ *
+ * Unless the block group is read-only, reserved_bytes is added to the
+ * reserved counters of the block group and of its space_info for the
+ * duration of the discard and subtracted again afterwards.
+ *
+ * On a successful discard the amount reported by
+ * btrfs_error_discard_extent() is added to *total_trimmed.
+ *
+ * Returns the return value of btrfs_error_discard_extent().
+ */
+static int do_trimming(struct btrfs_block_group_cache *block_group,
+                      u64 *total_trimmed, u64 start, u64 bytes,
+                      u64 reserved_start, u64 reserved_bytes)
 {
-       struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
-       struct btrfs_free_space *entry = NULL;
+       struct btrfs_space_info *space_info = block_group->space_info;
        struct btrfs_fs_info *fs_info = block_group->fs_info;
-       u64 bytes = 0;
-       u64 actually_trimmed;
-       int ret = 0;
+       int ret;
+       int update = 0;
+       u64 trimmed = 0;
 
-       *trimmed = 0;
+       /* Lock order: space_info->lock first, then block_group->lock. */
+       spin_lock(&space_info->lock);
+       spin_lock(&block_group->lock);
+       if (!block_group->ro) {
+               block_group->reserved += reserved_bytes;
+               space_info->bytes_reserved += reserved_bytes;
+               update = 1;
+       }
+       spin_unlock(&block_group->lock);
+       spin_unlock(&space_info->lock);
+
+       ret = btrfs_error_discard_extent(fs_info->extent_root,
+                                        start, bytes, &trimmed);
+       if (!ret)
+               *total_trimmed += trimmed;
+
+       /* The range goes back to the cache whether or not the discard worked. */
+       btrfs_add_free_space(block_group, reserved_start, reserved_bytes);
+
+       if (update) {
+               spin_lock(&space_info->lock);
+               spin_lock(&block_group->lock);
+               /*
+                * The group was writable when the bytes were reserved above;
+                * if it is read-only by now, count them as readonly as well.
+                */
+               if (block_group->ro)
+                       space_info->bytes_readonly += reserved_bytes;
+               block_group->reserved -= reserved_bytes;
+               space_info->bytes_reserved -= reserved_bytes;
+               /* Release in reverse acquisition order, as done above. */
+               spin_unlock(&block_group->lock);
+               spin_unlock(&space_info->lock);
+       }
+
+       return ret;
+}
+
+static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
+                         u64 *total_trimmed, u64 start, u64 end, u64 minlen)
+{
+       struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+       struct btrfs_free_space *entry;
+       struct rb_node *node;
+       int ret = 0;
+       u64 extent_start;
+       u64 extent_bytes;
+       u64 bytes;
 
        while (start < end) {
                spin_lock(&ctl->tree_lock);
@@ -2609,81 +2651,118 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
                }
 
                entry = tree_search_offset(ctl, start, 0, 1);
-               if (!entry)
-                       entry = tree_search_offset(ctl,
-                                                  offset_to_bitmap(ctl, start),
-                                                  1, 1);
-
-               if (!entry || entry->offset >= end) {
+               if (!entry) {
                        spin_unlock(&ctl->tree_lock);
                        break;
                }
 
-               if (entry->bitmap) {
-                       ret = search_bitmap(ctl, entry, &start, &bytes);
-                       if (!ret) {
-                               if (start >= end) {
-                                       spin_unlock(&ctl->tree_lock);
-                                       break;
-                               }
-                               bytes = min(bytes, end - start);
-                               bitmap_clear_bits(ctl, entry, start, bytes);
-                               if (entry->bytes == 0)
-                                       free_bitmap(ctl, entry);
-                       } else {
-                               start = entry->offset + BITS_PER_BITMAP *
-                                       block_group->sectorsize;
+               /* skip bitmaps */
+               while (entry->bitmap) {
+                       node = rb_next(&entry->offset_index);
+                       if (!node) {
                                spin_unlock(&ctl->tree_lock);
-                               ret = 0;
-                               continue;
+                               goto out;
                        }
-               } else {
-                       start = entry->offset;
-                       bytes = min(entry->bytes, end - start);
-                       unlink_free_space(ctl, entry);
-                       kmem_cache_free(btrfs_free_space_cachep, entry);
+                       entry = rb_entry(node, struct btrfs_free_space,
+                                        offset_index);
                }
 
+               if (entry->offset >= end) {
+                       spin_unlock(&ctl->tree_lock);
+                       break;
+               }
+
+               extent_start = entry->offset;
+               extent_bytes = entry->bytes;
+               start = max(start, extent_start);
+               bytes = min(extent_start + extent_bytes, end) - start;
+               if (bytes < minlen) {
+                       spin_unlock(&ctl->tree_lock);
+                       goto next;
+               }
+
+               unlink_free_space(ctl, entry);
+               kmem_cache_free(btrfs_free_space_cachep, entry);
+
                spin_unlock(&ctl->tree_lock);
 
-               if (bytes >= minlen) {
-                       struct btrfs_space_info *space_info;
-                       int update = 0;
-
-                       space_info = block_group->space_info;
-                       spin_lock(&space_info->lock);
-                       spin_lock(&block_group->lock);
-                       if (!block_group->ro) {
-                               block_group->reserved += bytes;
-                               space_info->bytes_reserved += bytes;
-                               update = 1;
-                       }
-                       spin_unlock(&block_group->lock);
-                       spin_unlock(&space_info->lock);
-
-                       ret = btrfs_error_discard_extent(fs_info->extent_root,
-                                                        start,
-                                                        bytes,
-                                                        &actually_trimmed);
-
-                       btrfs_add_free_space(block_group, start, bytes);
-                       if (update) {
-                               spin_lock(&space_info->lock);
-                               spin_lock(&block_group->lock);
-                               if (block_group->ro)
-                                       space_info->bytes_readonly += bytes;
-                               block_group->reserved -= bytes;
-                               space_info->bytes_reserved -= bytes;
-                               spin_unlock(&space_info->lock);
-                               spin_unlock(&block_group->lock);
-                       }
+               ret = do_trimming(block_group, total_trimmed, start, bytes,
+                                 extent_start, extent_bytes);
+               if (ret)
+                       break;
+next:
+               start += bytes;
 
-                       if (ret)
-                               break;
-                       *trimmed += actually_trimmed;
+               if (fatal_signal_pending(current)) {
+                       ret = -ERESTARTSYS;
+                       break;
+               }
+
+               cond_resched();
+       }
+out:
+       return ret;
+}
+
+static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
+                       u64 *total_trimmed, u64 start, u64 end, u64 minlen)
+{
+       struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+       struct btrfs_free_space *entry;
+       int ret = 0;
+       int ret2;
+       u64 bytes;
+       u64 offset = offset_to_bitmap(ctl, start);
+
+       while (offset < end) {
+               bool next_bitmap = false;
+
+               spin_lock(&ctl->tree_lock);
+
+               if (ctl->free_space < minlen) {
+                       spin_unlock(&ctl->tree_lock);
+                       break;
+               }
+
+               entry = tree_search_offset(ctl, offset, 1, 0);
+               if (!entry) {
+                       spin_unlock(&ctl->tree_lock);
+                       next_bitmap = true;
+                       goto next;
+               }
+
+               bytes = minlen;
+               ret2 = search_bitmap(ctl, entry, &start, &bytes);
+               if (ret2 || start >= end) {
+                       spin_unlock(&ctl->tree_lock);
+                       next_bitmap = true;
+                       goto next;
+               }
+
+               bytes = min(bytes, end - start);
+               if (bytes < minlen) {
+                       spin_unlock(&ctl->tree_lock);
+                       goto next;
+               }
+
+               bitmap_clear_bits(ctl, entry, start, bytes);
+               if (entry->bytes == 0)
+                       free_bitmap(ctl, entry);
+
+               spin_unlock(&ctl->tree_lock);
+
+               ret = do_trimming(block_group, total_trimmed, start, bytes,
+                                 start, bytes);
+               if (ret)
+                       break;
+next:
+               if (next_bitmap) {
+                       offset += BITS_PER_BITMAP * ctl->unit;
+               } else {
+                       start += bytes;
+                       if (start >= offset + BITS_PER_BITMAP * ctl->unit)
+                               offset += BITS_PER_BITMAP * ctl->unit;
                }
-               start += bytes;
-               bytes = 0;
 
                if (fatal_signal_pending(current)) {
                        ret = -ERESTARTSYS;
@@ -2696,6 +2775,22 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
        return ret;
 }
 
+/*
+ * Trim the free space of a block group in two passes: trim_no_bitmap()
+ * followed by trim_bitmaps(), both called with the same start, end and
+ * minlen.
+ *
+ * *trimmed is reset to zero here and handed to both passes as their running
+ * total.  If the first pass returns non-zero, that value is returned and
+ * the second pass is not run; otherwise the result of the second pass is
+ * returned.
+ */
+int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
+                          u64 *trimmed, u64 start, u64 end, u64 minlen)
+{
+       int err;
+
+       *trimmed = 0;
+
+       err = trim_no_bitmap(block_group, trimmed, start, end, minlen);
+       if (!err)
+               err = trim_bitmaps(block_group, trimmed, start, end, minlen);
+
+       return err;
+}
+
 /*
  * Find the left-most item in the cache tree, and then return the
  * smallest inode number in the item.
index f8962a9..213ffa8 100644 (file)
@@ -438,6 +438,8 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
                                          trans->bytes_reserved);
        if (ret)
                goto out;
+       trace_btrfs_space_reservation(root->fs_info, "ino_cache", (u64)trans,
+                                     trans->bytes_reserved, 1);
 again:
        inode = lookup_free_ino_inode(root, path);
        if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
@@ -498,6 +500,8 @@ again:
 out_put:
        iput(inode);
 out_release:
+       trace_btrfs_space_reservation(root->fs_info, "ino_cache", (u64)trans,
+                                     trans->bytes_reserved, 0);
        btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
 out:
        trans->block_rsv = rsv;
index 81b235a..0da19a0 100644 (file)
@@ -1951,12 +1951,28 @@ enum btrfs_orphan_cleanup_state {
 void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root)
 {
+       struct btrfs_block_rsv *block_rsv;
        int ret;
 
        if (!list_empty(&root->orphan_list) ||
            root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
                return;
 
+       spin_lock(&root->orphan_lock);
+       if (!list_empty(&root->orphan_list)) {
+               spin_unlock(&root->orphan_lock);
+               return;
+       }
+
+       if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) {
+               spin_unlock(&root->orphan_lock);
+               return;
+       }
+
+       block_rsv = root->orphan_block_rsv;
+       root->orphan_block_rsv = NULL;
+       spin_unlock(&root->orphan_lock);
+
        if (root->orphan_item_inserted &&
            btrfs_root_refs(&root->root_item) > 0) {
                ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
@@ -1965,10 +1981,9 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
                root->orphan_item_inserted = 0;
        }
 
-       if (root->orphan_block_rsv) {
-               WARN_ON(root->orphan_block_rsv->size > 0);
-               btrfs_free_block_rsv(root, root->orphan_block_rsv);
-               root->orphan_block_rsv = NULL;
+       if (block_rsv) {
+               WARN_ON(block_rsv->size > 0);
+               btrfs_free_block_rsv(root, block_rsv);
        }
 }
 
@@ -2224,14 +2239,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
                                continue;
                        }
                        nr_truncate++;
-                       /*
-                        * Need to hold the imutex for reservation purposes, not
-                        * a huge deal here but I have a WARN_ON in
-                        * btrfs_delalloc_reserve_space to catch offenders.
-                        */
-                       mutex_lock(&inode->i_mutex);
                        ret = btrfs_truncate(inode);
-                       mutex_unlock(&inode->i_mutex);
                } else {
                        nr_unlink++;
                }
@@ -2845,7 +2853,7 @@ static void __unlink_end_trans(struct btrfs_trans_handle *trans,
                BUG_ON(!root->fs_info->enospc_unlink);
                root->fs_info->enospc_unlink = 0;
        }
-       btrfs_end_transaction_throttle(trans, root);
+       btrfs_end_transaction(trans, root);
 }
 
 static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
@@ -3009,7 +3017,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
        int pending_del_nr = 0;
        int pending_del_slot = 0;
        int extent_type = -1;
-       int encoding;
        int ret;
        int err = 0;
        u64 ino = btrfs_ino(inode);
@@ -3059,7 +3066,6 @@ search_again:
                leaf = path->nodes[0];
                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
                found_type = btrfs_key_type(&found_key);
-               encoding = 0;
 
                if (found_key.objectid != ino)
                        break;
@@ -3072,10 +3078,6 @@ search_again:
                        fi = btrfs_item_ptr(leaf, path->slots[0],
                                            struct btrfs_file_extent_item);
                        extent_type = btrfs_file_extent_type(leaf, fi);
-                       encoding = btrfs_file_extent_compression(leaf, fi);
-                       encoding |= btrfs_file_extent_encryption(leaf, fi);
-                       encoding |= btrfs_file_extent_other_encoding(leaf, fi);
-
                        if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
                                item_end +=
                                    btrfs_file_extent_num_bytes(leaf, fi);
@@ -3103,7 +3105,7 @@ search_again:
                if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
                        u64 num_dec;
                        extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
-                       if (!del_item && !encoding) {
+                       if (!del_item) {
                                u64 orig_num_bytes =
                                        btrfs_file_extent_num_bytes(leaf, fi);
                                extent_num_bytes = new_size -
@@ -3179,7 +3181,7 @@ delete:
                        ret = btrfs_free_extent(trans, root, extent_start,
                                                extent_num_bytes, 0,
                                                btrfs_header_owner(leaf),
-                                               ino, extent_offset);
+                                               ino, extent_offset, 0);
                        BUG_ON(ret);
                }
 
@@ -3434,7 +3436,7 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize)
                i_size_write(inode, newsize);
                btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
                ret = btrfs_update_inode(trans, root, inode);
-               btrfs_end_transaction_throttle(trans, root);
+               btrfs_end_transaction(trans, root);
        } else {
 
                /*
@@ -4655,7 +4657,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
        }
 out_unlock:
        nr = trans->blocks_used;
-       btrfs_end_transaction_throttle(trans, root);
+       btrfs_end_transaction(trans, root);
        btrfs_btree_balance_dirty(root, nr);
        if (drop_inode) {
                inode_dec_link_count(inode);
@@ -4723,7 +4725,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
        }
 out_unlock:
        nr = trans->blocks_used;
-       btrfs_end_transaction_throttle(trans, root);
+       btrfs_end_transaction(trans, root);
        if (drop_inode) {
                inode_dec_link_count(inode);
                iput(inode);
@@ -4782,7 +4784,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
        }
 
        nr = trans->blocks_used;
-       btrfs_end_transaction_throttle(trans, root);
+       btrfs_end_transaction(trans, root);
 fail:
        if (drop_inode) {
                inode_dec_link_count(inode);
@@ -4848,7 +4850,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 
 out_fail:
        nr = trans->blocks_used;
-       btrfs_end_transaction_throttle(trans, root);
+       btrfs_end_transaction(trans, root);
        if (drop_on_err)
                iput(inode);
        btrfs_btree_balance_dirty(root, nr);
@@ -5121,7 +5123,7 @@ again:
                        }
                        flush_dcache_page(page);
                } else if (create && PageUptodate(page)) {
-                       WARN_ON(1);
+                       BUG();
                        if (!trans) {
                                kunmap(page);
                                free_extent_map(em);
@@ -6402,10 +6404,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
        u64 page_start;
        u64 page_end;
 
-       /* Need this to keep space reservations serialized */
-       mutex_lock(&inode->i_mutex);
        ret  = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
-       mutex_unlock(&inode->i_mutex);
        if (!ret)
                ret = btrfs_update_time(vma->vm_file);
        if (ret) {
@@ -6494,8 +6493,8 @@ out_unlock:
        if (!ret)
                return VM_FAULT_LOCKED;
        unlock_page(page);
-       btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
 out:
+       btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
        return ret;
 }
 
@@ -6668,7 +6667,7 @@ end_trans:
                        err = ret;
 
                nr = trans->blocks_used;
-               ret = btrfs_end_transaction_throttle(trans, root);
+               ret = btrfs_end_transaction(trans, root);
                btrfs_btree_balance_dirty(root, nr);
        }
 
@@ -6749,6 +6748,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
        extent_io_tree_init(&ei->io_tree, &inode->i_data);
        extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
        mutex_init(&ei->log_mutex);
+       mutex_init(&ei->delalloc_mutex);
        btrfs_ordered_inode_tree_init(&ei->ordered_tree);
        INIT_LIST_HEAD(&ei->i_orphan);
        INIT_LIST_HEAD(&ei->delalloc_inodes);
@@ -7074,7 +7074,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                btrfs_end_log_trans(root);
        }
 out_fail:
-       btrfs_end_transaction_throttle(trans, root);
+       btrfs_end_transaction(trans, root);
 out_notrans:
        if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
                up_read(&root->fs_info->subvol_sem);
@@ -7246,7 +7246,7 @@ out_unlock:
        if (!err)
                d_instantiate(dentry, inode);
        nr = trans->blocks_used;
-       btrfs_end_transaction_throttle(trans, root);
+       btrfs_end_transaction(trans, root);
        if (drop_inode) {
                inode_dec_link_count(inode);
                iput(inode);
index 5441ff1..ab62001 100644 (file)
@@ -176,6 +176,8 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
        struct btrfs_trans_handle *trans;
        unsigned int flags, oldflags;
        int ret;
+       u64 ip_oldflags;
+       unsigned int i_oldflags;
 
        if (btrfs_root_readonly(root))
                return -EROFS;
@@ -192,6 +194,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
 
        mutex_lock(&inode->i_mutex);
 
+       ip_oldflags = ip->flags;
+       i_oldflags = inode->i_flags;
+
        flags = btrfs_mask_flags(inode->i_mode, flags);
        oldflags = btrfs_flags_to_ioctl(ip->flags);
        if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
@@ -249,19 +254,24 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
                ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
        }
 
-       trans = btrfs_join_transaction(root);
-       BUG_ON(IS_ERR(trans));
+       trans = btrfs_start_transaction(root, 1);
+       if (IS_ERR(trans)) {
+               ret = PTR_ERR(trans);
+               goto out_drop;
+       }
 
        btrfs_update_iflags(inode);
        inode->i_ctime = CURRENT_TIME;
        ret = btrfs_update_inode(trans, root, inode);
-       BUG_ON(ret);
 
        btrfs_end_transaction(trans, root);
+ out_drop:
+       if (ret) {
+               ip->flags = ip_oldflags;
+               inode->i_flags = i_oldflags;
+       }
 
        mnt_drop_write_file(file);
-
-       ret = 0;
  out_unlock:
        mutex_unlock(&inode->i_mutex);
        return ret;
@@ -276,14 +286,13 @@ static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
 
 static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
 {
-       struct btrfs_root *root = fdentry(file)->d_sb->s_fs_info;
-       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct btrfs_fs_info *fs_info = btrfs_sb(fdentry(file)->d_sb);
        struct btrfs_device *device;
        struct request_queue *q;
        struct fstrim_range range;
        u64 minlen = ULLONG_MAX;
        u64 num_devices = 0;
-       u64 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
+       u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
        int ret;
 
        if (!capable(CAP_SYS_ADMIN))
@@ -312,7 +321,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
 
        range.len = min(range.len, total_bytes - range.start);
        range.minlen = max(range.minlen, minlen);
-       ret = btrfs_trim_fs(root, &range);
+       ret = btrfs_trim_fs(fs_info->tree_root, &range);
        if (ret < 0)
                return ret;
 
@@ -358,7 +367,7 @@ static noinline int create_subvol(struct btrfs_root *root,
                return PTR_ERR(trans);
 
        leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
-                                     0, objectid, NULL, 0, 0, 0);
+                                     0, objectid, NULL, 0, 0, 0, 0);
        if (IS_ERR(leaf)) {
                ret = PTR_ERR(leaf);
                goto fail;
@@ -858,10 +867,8 @@ static int cluster_pages_for_defrag(struct inode *inode,
                return 0;
        file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
 
-       mutex_lock(&inode->i_mutex);
        ret = btrfs_delalloc_reserve_space(inode,
                                           num_pages << PAGE_CACHE_SHIFT);
-       mutex_unlock(&inode->i_mutex);
        if (ret)
                return ret;
 again:
@@ -1203,13 +1210,21 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
 
+       mutex_lock(&root->fs_info->volume_mutex);
+       if (root->fs_info->balance_ctl) {
+               printk(KERN_INFO "btrfs: balance in progress\n");
+               ret = -EINVAL;
+               goto out;
+       }
+
        vol_args = memdup_user(arg, sizeof(*vol_args));
-       if (IS_ERR(vol_args))
-               return PTR_ERR(vol_args);
+       if (IS_ERR(vol_args)) {
+               ret = PTR_ERR(vol_args);
+               goto out;
+       }
 
        vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
 
-       mutex_lock(&root->fs_info->volume_mutex);
        sizestr = vol_args->name;
        devstr = strchr(sizestr, ':');
        if (devstr) {
@@ -1226,7 +1241,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
                printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
                       (unsigned long long)devid);
                ret = -EINVAL;
-               goto out_unlock;
+               goto out_free;
        }
        if (!strcmp(sizestr, "max"))
                new_size = device->bdev->bd_inode->i_size;
@@ -1241,7 +1256,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
                new_size = memparse(sizestr, NULL);
                if (new_size == 0) {
                        ret = -EINVAL;
-                       goto out_unlock;
+                       goto out_free;
                }
        }
 
@@ -1250,7 +1265,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
        if (mod < 0) {
                if (new_size > old_size) {
                        ret = -EINVAL;
-                       goto out_unlock;
+                       goto out_free;
                }
                new_size = old_size - new_size;
        } else if (mod > 0) {
@@ -1259,11 +1274,11 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
 
        if (new_size < 256 * 1024 * 1024) {
                ret = -EINVAL;
-               goto out_unlock;
+               goto out_free;
        }
        if (new_size > device->bdev->bd_inode->i_size) {
                ret = -EFBIG;
-               goto out_unlock;
+               goto out_free;
        }
 
        do_div(new_size, root->sectorsize);
@@ -1276,7 +1291,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
                trans = btrfs_start_transaction(root, 0);
                if (IS_ERR(trans)) {
                        ret = PTR_ERR(trans);
-                       goto out_unlock;
+                       goto out_free;
                }
                ret = btrfs_grow_device(trans, device, new_size);
                btrfs_commit_transaction(trans, root);
@@ -1284,9 +1299,10 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
                ret = btrfs_shrink_device(device, new_size);
        }
 
-out_unlock:
-       mutex_unlock(&root->fs_info->volume_mutex);
+out_free:
        kfree(vol_args);
+out:
+       mutex_unlock(&root->fs_info->volume_mutex);
        return ret;
 }
 
@@ -2052,14 +2068,25 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
 
+       mutex_lock(&root->fs_info->volume_mutex);
+       if (root->fs_info->balance_ctl) {
+               printk(KERN_INFO "btrfs: balance in progress\n");
+               ret = -EINVAL;
+               goto out;
+       }
+
        vol_args = memdup_user(arg, sizeof(*vol_args));
-       if (IS_ERR(vol_args))
-               return PTR_ERR(vol_args);
+       if (IS_ERR(vol_args)) {
+               ret = PTR_ERR(vol_args);
+               goto out;
+       }
 
        vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
        ret = btrfs_init_new_device(root, vol_args->name);
 
        kfree(vol_args);
+out:
+       mutex_unlock(&root->fs_info->volume_mutex);
        return ret;
 }
 
@@ -2074,14 +2101,25 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
        if (root->fs_info->sb->s_flags & MS_RDONLY)
                return -EROFS;
 
+       mutex_lock(&root->fs_info->volume_mutex);
+       if (root->fs_info->balance_ctl) {
+               printk(KERN_INFO "btrfs: balance in progress\n");
+               ret = -EINVAL;
+               goto out;
+       }
+
        vol_args = memdup_user(arg, sizeof(*vol_args));
-       if (IS_ERR(vol_args))
-               return PTR_ERR(vol_args);
+       if (IS_ERR(vol_args)) {
+               ret = PTR_ERR(vol_args);
+               goto out;
+       }
 
        vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
        ret = btrfs_rm_device(root, vol_args->name);
 
        kfree(vol_args);
+out:
+       mutex_unlock(&root->fs_info->volume_mutex);
        return ret;
 }
 
@@ -2427,7 +2465,8 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
                                                        disko, diskl, 0,
                                                        root->root_key.objectid,
                                                        btrfs_ino(inode),
-                                                       new_key.offset - datao);
+                                                       new_key.offset - datao,
+                                                       0);
                                        BUG_ON(ret);
                                }
                        } else if (type == BTRFS_FILE_EXTENT_INLINE) {
@@ -2977,7 +3016,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
 {
        int ret = 0;
        int size;
-       u64 extent_offset;
+       u64 extent_item_pos;
        struct btrfs_ioctl_logical_ino_args *loi;
        struct btrfs_data_container *inodes = NULL;
        struct btrfs_path *path = NULL;
@@ -3008,15 +3047,17 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
        }
 
        ret = extent_from_logical(root->fs_info, loi->logical, path, &key);
+       btrfs_release_path(path);
 
        if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
                ret = -ENOENT;
        if (ret < 0)
                goto out;
 
-       extent_offset = loi->logical - key.objectid;
+       extent_item_pos = loi->logical - key.objectid;
        ret = iterate_extent_inodes(root->fs_info, path, key.objectid,
-                                       extent_offset, build_ino_list, inodes);
+                                       extent_item_pos, build_ino_list,
+                                       inodes);
 
        if (ret < 0)
                goto out;
@@ -3034,6 +3075,163 @@ out:
        return ret;
 }
 
+/*
+ * Fill @bargs with a snapshot of the balance described by
+ * fs_info->balance_ctl, which is dereferenced unconditionally.
+ *
+ * The flags, the data/meta/sys filter sets and the progress counters are
+ * copied out, and the RUNNING / PAUSE_REQ / CANCEL_REQ bits are OR-ed into
+ * bargs->state (state is never cleared here).  A non-zero @lock makes the
+ * copy of the progress counters happen under balance_lock.
+ */
+void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
+                              struct btrfs_ioctl_balance_args *bargs)
+{
+       struct btrfs_balance_control *ctl = fs_info->balance_ctl;
+
+       bargs->flags = ctl->flags;
+
+       if (atomic_read(&fs_info->balance_running))
+               bargs->state |= BTRFS_BALANCE_STATE_RUNNING;
+       if (atomic_read(&fs_info->balance_pause_req))
+               bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ;
+       if (atomic_read(&fs_info->balance_cancel_req))
+               bargs->state |= BTRFS_BALANCE_STATE_CANCEL_REQ;
+
+       memcpy(&bargs->data, &ctl->data, sizeof(bargs->data));
+       memcpy(&bargs->meta, &ctl->meta, sizeof(bargs->meta));
+       memcpy(&bargs->sys, &ctl->sys, sizeof(bargs->sys));
+
+       /* only the progress counters are copied under balance_lock */
+       if (lock)
+               spin_lock(&fs_info->balance_lock);
+       memcpy(&bargs->stat, &ctl->stat, sizeof(bargs->stat));
+       if (lock)
+               spin_unlock(&fs_info->balance_lock);
+}
+
+/*
+ * Start a new balance, or resume an existing one, on behalf of userspace.
+ *
+ * @arg is a user pointer to struct btrfs_ioctl_balance_args, or NULL to
+ * balance everything with no filters.  With BTRFS_BALANCE_RESUME set in the
+ * copied-in flags the existing fs_info->balance_ctl is reused; otherwise a
+ * fresh control structure is allocated and filled from the arguments.
+ * When @arg is set, bargs is copied back to it after btrfs_balance() returns.
+ *
+ * Requires CAP_SYS_ADMIN and a read-write filesystem.  volume_mutex and
+ * balance_mutex are taken here, in that order, and dropped on exit.
+ *
+ * Returns the result of btrfs_balance(), or -EPERM, -EROFS, -ENOTCONN
+ * (resume requested but there is no balance_ctl), -EINPROGRESS (a
+ * balance_ctl already exists), -ENOMEM, -EFAULT, or the memdup_user() error.
+ */
+static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg)
+{
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct btrfs_ioctl_balance_args *bargs = NULL;
+       struct btrfs_balance_control *bctl;
+       int ret;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+       if (fs_info->sb->s_flags & MS_RDONLY)
+               return -EROFS;
+
+       mutex_lock(&fs_info->volume_mutex);
+       mutex_lock(&fs_info->balance_mutex);
+
+       if (arg) {
+               bargs = memdup_user(arg, sizeof(*bargs));
+               if (IS_ERR(bargs)) {
+                       ret = PTR_ERR(bargs);
+                       /* bargs is an error pointer: skip the kfree() */
+                       goto out;
+               }
+       }
+
+       if (bargs && (bargs->flags & BTRFS_BALANCE_RESUME)) {
+               bctl = fs_info->balance_ctl;
+               if (!bctl) {
+                       ret = -ENOTCONN;
+                       goto out_bargs;
+               }
+
+               spin_lock(&fs_info->balance_lock);
+               bctl->flags |= BTRFS_BALANCE_RESUME;
+               spin_unlock(&fs_info->balance_lock);
+
+               goto do_balance;
+       }
+
+       if (fs_info->balance_ctl) {
+               ret = -EINPROGRESS;
+               goto out_bargs;
+       }
+
+       bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
+       if (!bctl) {
+               ret = -ENOMEM;
+               goto out_bargs;
+       }
+
+       bctl->fs_info = fs_info;
+       if (bargs) {
+               bctl->flags = bargs->flags;
+
+               memcpy(&bctl->data, &bargs->data, sizeof(bctl->data));
+               memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta));
+               memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys));
+       } else {
+               /* balance everything - no filters */
+               bctl->flags |= BTRFS_BALANCE_TYPE_MASK;
+       }
+
+do_balance:
+       ret = btrfs_balance(bctl, bargs);
+       /*
+        * bctl is freed in __cancel_balance or in free_fs_info if
+        * restriper was paused all the way until unmount
+        */
+       if (arg && copy_to_user(arg, bargs, sizeof(*bargs)))
+               ret = -EFAULT;
+
+out_bargs:
+       kfree(bargs);
+out:
+       mutex_unlock(&fs_info->balance_mutex);
+       mutex_unlock(&fs_info->volume_mutex);
+       return ret;
+}
+
+/*
+ * Pause or cancel a balance, selected by @cmd.
+ *
+ * Requires CAP_SYS_ADMIN (-EPERM otherwise).  Returns whatever
+ * btrfs_pause_balance() / btrfs_cancel_balance() return, or -EINVAL for
+ * any other @cmd.
+ */
+static long btrfs_ioctl_balance_ctl(struct btrfs_root *root, int cmd)
+{
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       if (cmd == BTRFS_BALANCE_CTL_PAUSE)
+               return btrfs_pause_balance(root->fs_info);
+       if (cmd == BTRFS_BALANCE_CTL_CANCEL)
+               return btrfs_cancel_balance(root->fs_info);
+
+       return -EINVAL;
+}
+
+/*
+ * Report the state of the current balance to userspace.
+ *
+ * Under balance_mutex, a zeroed struct btrfs_ioctl_balance_args is filled
+ * by update_ioctl_balance_args() (with its lock argument set) and copied
+ * to @arg.
+ *
+ * Returns 0 on success, -EPERM without CAP_SYS_ADMIN, -ENOTCONN when there
+ * is no balance_ctl, -ENOMEM, or -EFAULT if the copy to @arg fails.
+ */
+static long btrfs_ioctl_balance_progress(struct btrfs_root *root,
+                                        void __user *arg)
+{
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct btrfs_ioctl_balance_args *bargs;
+       int ret;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       mutex_lock(&fs_info->balance_mutex);
+
+       if (!fs_info->balance_ctl) {
+               ret = -ENOTCONN;
+               goto unlock;
+       }
+
+       bargs = kzalloc(sizeof(*bargs), GFP_NOFS);
+       if (!bargs) {
+               ret = -ENOMEM;
+               goto unlock;
+       }
+
+       update_ioctl_balance_args(fs_info, 1, bargs);
+       ret = copy_to_user(arg, bargs, sizeof(*bargs)) ? -EFAULT : 0;
+       kfree(bargs);
+
+unlock:
+       mutex_unlock(&fs_info->balance_mutex);
+       return ret;
+}
+
 long btrfs_ioctl(struct file *file, unsigned int
                cmd, unsigned long arg)
 {
@@ -3078,7 +3276,7 @@ long btrfs_ioctl(struct file *file, unsigned int
        case BTRFS_IOC_DEV_INFO:
                return btrfs_ioctl_dev_info(root, argp);
        case BTRFS_IOC_BALANCE:
-               return btrfs_balance(root->fs_info->dev_root);
+               return btrfs_ioctl_balance(root, NULL);
        case BTRFS_IOC_CLONE:
                return btrfs_ioctl_clone(file, arg, 0, 0, 0);
        case BTRFS_IOC_CLONE_RANGE:
@@ -3110,6 +3308,12 @@ long btrfs_ioctl(struct file *file, unsigned int
                return btrfs_ioctl_scrub_cancel(root, argp);
        case BTRFS_IOC_SCRUB_PROGRESS:
                return btrfs_ioctl_scrub_progress(root, argp);
+       case BTRFS_IOC_BALANCE_V2:
+               return btrfs_ioctl_balance(root, argp);
+       case BTRFS_IOC_BALANCE_CTL:
+               return btrfs_ioctl_balance_ctl(root, arg);
+       case BTRFS_IOC_BALANCE_PROGRESS:
+               return btrfs_ioctl_balance_progress(root, argp);
        }
 
        return -ENOTTY;
index 252ae99..4f69028 100644 (file)
@@ -109,6 +109,55 @@ struct btrfs_ioctl_fs_info_args {
        __u64 reserved[124];                    /* pad to 1k */
 };
 
+/* balance control ioctl modes */
+#define BTRFS_BALANCE_CTL_PAUSE                1
+#define BTRFS_BALANCE_CTL_CANCEL       2
+
+/*
+ * this is packed, because it should be exactly the same as its disk
+ * byte order counterpart (struct btrfs_disk_balance_args)
+ */
+struct btrfs_balance_args {    /* 17 __u64 slots in total: 136 bytes */
+       __u64 profiles;
+       __u64 usage;
+       __u64 devid;
+       __u64 pstart;
+       __u64 pend;
+       __u64 vstart;
+       __u64 vend;
+
+       __u64 target;
+
+       __u64 flags;
+
+       __u64 unused[8];        /* spare slots, currently unused */
+} __attribute__ ((__packed__));
+
+/* report balance progress to userspace */
+struct btrfs_balance_progress {        /* the "stat" member of btrfs_ioctl_balance_args */
+       __u64 expected;         /* estimated # of chunks that will be
+                                * relocated to fulfill the request */
+       __u64 considered;       /* # of chunks we have considered so far */
+       __u64 completed;        /* # of chunks relocated so far */
+};
+
+#define BTRFS_BALANCE_STATE_RUNNING    (1ULL << 0)
+#define BTRFS_BALANCE_STATE_PAUSE_REQ  (1ULL << 1)
+#define BTRFS_BALANCE_STATE_CANCEL_REQ (1ULL << 2)
+
+struct btrfs_ioctl_balance_args {      /* 128 __u64 slots in total: 1024 bytes */
+       __u64 flags;                            /* in/out */
+       __u64 state;                            /* out */
+
+       struct btrfs_balance_args data;         /* in/out */
+       struct btrfs_balance_args meta;         /* in/out */
+       struct btrfs_balance_args sys;          /* in/out */
+
+       struct btrfs_balance_progress stat;     /* out */
+
+       __u64 unused[72];                       /* pad to 1k */
+};
+
 #define BTRFS_INO_LOOKUP_PATH_MAX 4080
 struct btrfs_ioctl_ino_lookup_args {
        __u64 treeid;
@@ -272,6 +321,11 @@ struct btrfs_ioctl_logical_ino_args {
                                 struct btrfs_ioctl_dev_info_args)
 #define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \
                               struct btrfs_ioctl_fs_info_args)
+#define BTRFS_IOC_BALANCE_V2 _IOWR(BTRFS_IOCTL_MAGIC, 32, \
+                                  struct btrfs_ioctl_balance_args)
+#define BTRFS_IOC_BALANCE_CTL _IOW(BTRFS_IOCTL_MAGIC, 33, int)
+#define BTRFS_IOC_BALANCE_PROGRESS _IOR(BTRFS_IOCTL_MAGIC, 34, \
+                                       struct btrfs_ioctl_balance_args)
 #define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \
                                        struct btrfs_ioctl_ino_path_args)
 #define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
index d77b67c..5e178d8 100644 (file)
@@ -33,6 +33,14 @@ void btrfs_assert_tree_read_locked(struct extent_buffer *eb);
  */
 void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
 {
+       if (eb->lock_nested) {
+               read_lock(&eb->lock);
+               if (eb->lock_nested && current->pid == eb->lock_owner) {
+                       read_unlock(&eb->lock);
+                       return;
+               }
+               read_unlock(&eb->lock);
+       }
        if (rw == BTRFS_WRITE_LOCK) {
                if (atomic_read(&eb->blocking_writers) == 0) {
                        WARN_ON(atomic_read(&eb->spinning_writers) != 1);
@@ -57,6 +65,14 @@ void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
  */
 void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
 {
+       /*
+        * A nested read lock (taken by the thread that owns the write lock)
+        * only set lock_nested, so there is no blocking state to clear for
+        * it.  The flag is peeked at without the lock first and then
+        * rechecked, together with the owner, under eb->lock.
+        */
+       if (eb->lock_nested) {
+               read_lock(&eb->lock);
+               if (eb->lock_nested && current->pid == eb->lock_owner) {
+                       read_unlock(&eb->lock);
+                       return;
+               }
+               read_unlock(&eb->lock);
+       }
        if (rw == BTRFS_WRITE_LOCK_BLOCKING) {
                BUG_ON(atomic_read(&eb->blocking_writers) != 1);
                write_lock(&eb->lock);
@@ -81,12 +97,25 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
 void btrfs_tree_read_lock(struct extent_buffer *eb)
 {
 again:
+       read_lock(&eb->lock);
+       if (atomic_read(&eb->blocking_writers) &&
+           current->pid == eb->lock_owner) {
+               /*
+                * This extent is already write-locked by our thread. We allow
+                * an additional read lock to be added because it's for the same
+                * thread. btrfs_find_all_roots() depends on this as it may be
+                * called on a partly (write-)locked tree.
+                */
+               BUG_ON(eb->lock_nested);
+               eb->lock_nested = 1;
+               read_unlock(&eb->lock);
+               return;
+       }
+       read_unlock(&eb->lock);
        wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
        read_lock(&eb->lock);
        if (atomic_read(&eb->blocking_writers)) {
                read_unlock(&eb->lock);
-               wait_event(eb->write_lock_wq,
-                          atomic_read(&eb->blocking_writers) == 0);
                goto again;
        }
        atomic_inc(&eb->read_locks);
@@ -129,6 +158,7 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb)
        }
        atomic_inc(&eb->write_locks);
        atomic_inc(&eb->spinning_writers);
+       eb->lock_owner = current->pid;
        return 1;
 }
 
@@ -137,6 +167,15 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb)
  */
 void btrfs_tree_read_unlock(struct extent_buffer *eb)
 {
+       if (eb->lock_nested) {
+               read_lock(&eb->lock);
+               if (eb->lock_nested && current->pid == eb->lock_owner) {
+                       eb->lock_nested = 0;
+                       read_unlock(&eb->lock);
+                       return;
+               }
+               read_unlock(&eb->lock);
+       }
        btrfs_assert_tree_read_locked(eb);
        WARN_ON(atomic_read(&eb->spinning_readers) == 0);
        atomic_dec(&eb->spinning_readers);
@@ -149,6 +188,15 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb)
  */
 void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
 {
+       if (eb->lock_nested) {
+               read_lock(&eb->lock);
+               if (eb->lock_nested && current->pid == eb->lock_owner) {
+                       eb->lock_nested = 0;
+                       read_unlock(&eb->lock);
+                       return;
+               }
+               read_unlock(&eb->lock);
+       }
        btrfs_assert_tree_read_locked(eb);
        WARN_ON(atomic_read(&eb->blocking_readers) == 0);
        if (atomic_dec_and_test(&eb->blocking_readers))
@@ -181,6 +229,7 @@ again:
        WARN_ON(atomic_read(&eb->spinning_writers));
        atomic_inc(&eb->spinning_writers);
        atomic_inc(&eb->write_locks);
+       eb->lock_owner = current->pid;
        return 0;
 }
 
index cfb5543..8c1aae2 100644 (file)
@@ -1604,12 +1604,12 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
                ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
                                           num_bytes, parent,
                                           btrfs_header_owner(leaf),
-                                          key.objectid, key.offset);
+                                          key.objectid, key.offset, 1);
                BUG_ON(ret);
 
                ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
                                        parent, btrfs_header_owner(leaf),
-                                       key.objectid, key.offset);
+                                       key.objectid, key.offset, 1);
                BUG_ON(ret);
        }
        if (dirty)
@@ -1778,21 +1778,23 @@ again:
 
                ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize,
                                        path->nodes[level]->start,
-                                       src->root_key.objectid, level - 1, 0);
+                                       src->root_key.objectid, level - 1, 0,
+                                       1);
                BUG_ON(ret);
                ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize,
                                        0, dest->root_key.objectid, level - 1,
-                                       0);
+                                       0, 1);
                BUG_ON(ret);
 
                ret = btrfs_free_extent(trans, src, new_bytenr, blocksize,
                                        path->nodes[level]->start,
-                                       src->root_key.objectid, level - 1, 0);
+                                       src->root_key.objectid, level - 1, 0,
+                                       1);
                BUG_ON(ret);
 
                ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize,
                                        0, dest->root_key.objectid, level - 1,
-                                       0);
+                                       0, 1);
                BUG_ON(ret);
 
                btrfs_unlock_up_safe(path, 0);
@@ -2244,7 +2246,7 @@ again:
                } else {
                        list_del_init(&reloc_root->root_list);
                }
-               btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0);
+               btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
        }
 
        if (found) {
@@ -2558,7 +2560,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
                                                node->eb->start, blocksize,
                                                upper->eb->start,
                                                btrfs_header_owner(upper->eb),
-                                               node->level, 0);
+                                               node->level, 0, 1);
                        BUG_ON(ret);
 
                        ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
@@ -2947,9 +2949,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
        index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
        last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
        while (index <= last_index) {
-               mutex_lock(&inode->i_mutex);
                ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE);
-               mutex_unlock(&inode->i_mutex);
                if (ret)
                        goto out;
 
index ddf2c90..9770cc5 100644 (file)
@@ -25,6 +25,7 @@
 #include "transaction.h"
 #include "backref.h"
 #include "extent_io.h"
+#include "check-integrity.h"
 
 /*
  * This is only the first step towards a full-features scrub. It reads all
@@ -309,7 +310,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
        u8 ref_level;
        unsigned long ptr = 0;
        const int bufsize = 4096;
-       u64 extent_offset;
+       u64 extent_item_pos;
 
        path = btrfs_alloc_path();
 
@@ -329,12 +330,13 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
        if (ret < 0)
                goto out;
 
-       extent_offset = swarn.logical - found_key.objectid;
+       extent_item_pos = swarn.logical - found_key.objectid;
        swarn.extent_item_size = found_key.offset;
 
        eb = path->nodes[0];
        ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
        item_size = btrfs_item_size_nr(eb, path->slots[0]);
+       btrfs_release_path(path);
 
        if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
                do {
@@ -351,7 +353,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
        } else {
                swarn.path = path;
                iterate_extent_inodes(fs_info, path, found_key.objectid,
-                                       extent_offset,
+                                       extent_item_pos,
                                        scrub_print_warning_inode, &swarn);
        }
 
@@ -732,7 +734,7 @@ static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
        bio_add_page(bio, page, PAGE_SIZE, 0);
        bio->bi_end_io = scrub_fixup_end_io;
        bio->bi_private = &complete;
-       submit_bio(rw, bio);
+       btrfsic_submit_bio(rw, bio);
 
        /* this will also unplug the queue */
        wait_for_completion(&complete);
@@ -958,7 +960,7 @@ static int scrub_submit(struct scrub_dev *sdev)
        sdev->curr = -1;
        atomic_inc(&sdev->in_flight);
 
-       submit_bio(READ, sbio->bio);
+       btrfsic_submit_bio(READ, sbio->bio);
 
        return 0;
 }
index ae488aa..3ce97b2 100644 (file)
@@ -147,13 +147,13 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
 
 static void btrfs_put_super(struct super_block *sb)
 {
-       struct btrfs_root *root = btrfs_sb(sb);
-       int ret;
-
-       ret = close_ctree(root);
-       sb->s_fs_info = NULL;
-
-       (void)ret; /* FIXME: need to fix VFS to return error? */
+       (void)close_ctree(btrfs_sb(sb)->tree_root);
+       /* FIXME: need to fix VFS to return error? */
+       /* AV: return it _where_?  ->put_super() can be triggered by any number
+        * of async events, up to and including delivery of SIGKILL to the
+        * last process that kept it busy.  Or segfault in the aforementioned
+        * process...  Whom would you report that to?
+        */
 }
 
 enum {
@@ -163,8 +163,11 @@ enum {
        Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
        Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
        Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
-       Opt_enospc_debug, Opt_subvolrootid, Opt_defrag,
-       Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err,
+       Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache,
+       Opt_no_space_cache, Opt_recovery, Opt_skip_balance,
+       Opt_check_integrity, Opt_check_integrity_including_extent_data,
+       Opt_check_integrity_print_mask,
+       Opt_err,
 };
 
 static match_table_t tokens = {
@@ -199,6 +202,10 @@ static match_table_t tokens = {
        {Opt_inode_cache, "inode_cache"},
        {Opt_no_space_cache, "nospace_cache"},
        {Opt_recovery, "recovery"},
+       {Opt_skip_balance, "skip_balance"},
+       {Opt_check_integrity, "check_int"},
+       {Opt_check_integrity_including_extent_data, "check_int_data"},
+       {Opt_check_integrity_print_mask, "check_int_print_mask=%d"},
        {Opt_err, NULL},
 };
 
@@ -397,6 +404,40 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
                        printk(KERN_INFO "btrfs: enabling auto recovery");
                        btrfs_set_opt(info->mount_opt, RECOVERY);
                        break;
+               case Opt_skip_balance:
+                       btrfs_set_opt(info->mount_opt, SKIP_BALANCE);
+                       break;
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+               case Opt_check_integrity_including_extent_data:
+                       printk(KERN_INFO "btrfs: enabling check integrity"
+                              " including extent data\n");
+                       btrfs_set_opt(info->mount_opt,
+                                     CHECK_INTEGRITY_INCLUDING_EXTENT_DATA);
+                       btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
+                       break;
+               case Opt_check_integrity:
+                       printk(KERN_INFO "btrfs: enabling check integrity\n");
+                       btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
+                       break;
+               case Opt_check_integrity_print_mask:
+                       intarg = 0;
+                       match_int(&args[0], &intarg);
+                       if (intarg) {
+                               info->check_integrity_print_mask = intarg;
+                               printk(KERN_INFO "btrfs:"
+                                      " check_integrity_print_mask 0x%x\n",
+                                      info->check_integrity_print_mask);
+                       }
+                       break;
+#else
+               case Opt_check_integrity_including_extent_data:
+               case Opt_check_integrity:
+               case Opt_check_integrity_print_mask:
+                       printk(KERN_ERR "btrfs: support for check_integrity*"
+                              " not compiled in!\n");
+                       ret = -EINVAL;
+                       goto out;
+#endif
                case Opt_err:
                        printk(KERN_INFO "btrfs: unrecognized mount option "
                               "'%s'\n", p);
@@ -500,7 +541,8 @@ out:
 static struct dentry *get_default_root(struct super_block *sb,
                                       u64 subvol_objectid)
 {
-       struct btrfs_root *root = sb->s_fs_info;
+       struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+       struct btrfs_root *root = fs_info->tree_root;
        struct btrfs_root *new_root;
        struct btrfs_dir_item *di;
        struct btrfs_path *path;
@@ -530,7 +572,7 @@ static struct dentry *get_default_root(struct super_block *sb,
         * will mount by default if we haven't been given a specific subvolume
         * to mount.
         */
-       dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
+       dir_id = btrfs_super_root_dir(fs_info->super_copy);
        di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
        if (IS_ERR(di)) {
                btrfs_free_path(path);
@@ -544,7 +586,7 @@ static struct dentry *get_default_root(struct super_block *sb,
                 */
                btrfs_free_path(path);
                dir_id = BTRFS_FIRST_FREE_OBJECTID;
-               new_root = root->fs_info->fs_root;
+               new_root = fs_info->fs_root;
                goto setup_root;
        }
 
@@ -552,7 +594,7 @@ static struct dentry *get_default_root(struct super_block *sb,
        btrfs_free_path(path);
 
 find_root:
-       new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
+       new_root = btrfs_read_fs_root_no_name(fs_info, &location);
        if (IS_ERR(new_root))
                return ERR_CAST(new_root);
 
@@ -588,7 +630,7 @@ static int btrfs_fill_super(struct super_block *sb,
 {
        struct inode *inode;
        struct dentry *root_dentry;
-       struct btrfs_root *tree_root;
+       struct btrfs_fs_info *fs_info = btrfs_sb(sb);
        struct btrfs_key key;
        int err;
 
@@ -603,18 +645,16 @@ static int btrfs_fill_super(struct super_block *sb,
        sb->s_flags |= MS_POSIXACL;
 #endif
 
-       tree_root = open_ctree(sb, fs_devices, (char *)data);
-
-       if (IS_ERR(tree_root)) {
+       err = open_ctree(sb, fs_devices, (char *)data);
+       if (err) {
                printk("btrfs: open_ctree failed\n");
-               return PTR_ERR(tree_root);
+               return err;
        }
-       sb->s_fs_info = tree_root;
 
        key.objectid = BTRFS_FIRST_FREE_OBJECTID;
        key.type = BTRFS_INODE_ITEM_KEY;
        key.offset = 0;
-       inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root, NULL);
+       inode = btrfs_iget(sb, &key, fs_info->fs_root, NULL);
        if (IS_ERR(inode)) {
                err = PTR_ERR(inode);
                goto fail_close;
@@ -631,23 +671,25 @@ static int btrfs_fill_super(struct super_block *sb,
 
        save_mount_options(sb, data);
        cleancache_init_fs(sb);
+       sb->s_flags |= MS_ACTIVE;
        return 0;
 
 fail_close:
-       close_ctree(tree_root);
+       close_ctree(fs_info->tree_root);
        return err;
 }
 
 int btrfs_sync_fs(struct super_block *sb, int wait)
 {
        struct btrfs_trans_handle *trans;
-       struct btrfs_root *root = btrfs_sb(sb);
+       struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+       struct btrfs_root *root = fs_info->tree_root;
        int ret;
 
        trace_btrfs_sync_fs(wait);
 
        if (!wait) {
-               filemap_flush(root->fs_info->btree_inode->i_mapping);
+               filemap_flush(fs_info->btree_inode->i_mapping);
                return 0;
        }
 
@@ -663,8 +705,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
 
 static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
 {
-       struct btrfs_root *root = btrfs_sb(dentry->d_sb);
-       struct btrfs_fs_info *info = root->fs_info;
+       struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb);
+       struct btrfs_root *root = info->tree_root;
        char *compress_type;
 
        if (btrfs_test_opt(root, DEGRADED))
@@ -722,28 +764,25 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
                seq_puts(seq, ",autodefrag");
        if (btrfs_test_opt(root, INODE_MAP_CACHE))
                seq_puts(seq, ",inode_cache");
+       if (btrfs_test_opt(root, SKIP_BALANCE))
+               seq_puts(seq, ",skip_balance");
        return 0;
 }
 
 static int btrfs_test_super(struct super_block *s, void *data)
 {
-       struct btrfs_root *test_root = data;
-       struct btrfs_root *root = btrfs_sb(s);
+       struct btrfs_fs_info *p = data;
+       struct btrfs_fs_info *fs_info = btrfs_sb(s);
 
-       /*
-        * If this super block is going away, return false as it
-        * can't match as an existing super block.
-        */
-       if (!atomic_read(&s->s_active))
-               return 0;
-       return root->fs_info->fs_devices == test_root->fs_info->fs_devices;
+       return fs_info->fs_devices == p->fs_devices;
 }
 
 static int btrfs_set_super(struct super_block *s, void *data)
 {
-       s->s_fs_info = data;
-
-       return set_anon_super(s, data);
+       int err = set_anon_super(s, data);
+       if (!err)
+               s->s_fs_info = data;
+       return err;
 }
 
 /*
@@ -903,12 +942,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
        if (!fs_info)
                return ERR_PTR(-ENOMEM);
 
-       fs_info->tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
-       if (!fs_info->tree_root) {
-               error = -ENOMEM;
-               goto error_fs_info;
-       }
-       fs_info->tree_root->fs_info = fs_info;
        fs_info->fs_devices = fs_devices;
 
        fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
@@ -928,43 +961,30 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
        }
 
        bdev = fs_devices->latest_bdev;
-       s = sget(fs_type, btrfs_test_super, btrfs_set_super,
-                fs_info->tree_root);
+       s = sget(fs_type, btrfs_test_super, btrfs_set_super, fs_info);
        if (IS_ERR(s)) {
                error = PTR_ERR(s);
                goto error_close_devices;
        }
 
        if (s->s_root) {
-               if ((flags ^ s->s_flags) & MS_RDONLY) {
-                       deactivate_locked_super(s);
-                       error = -EBUSY;
-                       goto error_close_devices;
-               }
-
                btrfs_close_devices(fs_devices);
                free_fs_info(fs_info);
+               if ((flags ^ s->s_flags) & MS_RDONLY)
+                       error = -EBUSY;
        } else {
                char b[BDEVNAME_SIZE];
 
                s->s_flags = flags | MS_NOSEC;
                strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
-               btrfs_sb(s)->fs_info->bdev_holder = fs_type;
+               btrfs_sb(s)->bdev_holder = fs_type;
                error = btrfs_fill_super(s, fs_devices, data,
                                         flags & MS_SILENT ? 1 : 0);
-               if (error) {
-                       deactivate_locked_super(s);
-                       return ERR_PTR(error);
-               }
-
-               s->s_flags |= MS_ACTIVE;
        }
 
-       root = get_default_root(s, subvol_objectid);
-       if (IS_ERR(root)) {
+       root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error);
+       if (IS_ERR(root))
                deactivate_locked_super(s);
-               return root;
-       }
 
        return root;
 
@@ -977,7 +997,8 @@ error_fs_info:
 
 static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 {
-       struct btrfs_root *root = btrfs_sb(sb);
+       struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+       struct btrfs_root *root = fs_info->tree_root;
        int ret;
 
        ret = btrfs_parse_options(root, data);
@@ -993,13 +1014,13 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
                ret =  btrfs_commit_super(root);
                WARN_ON(ret);
        } else {
-               if (root->fs_info->fs_devices->rw_devices == 0)
+               if (fs_info->fs_devices->rw_devices == 0)
                        return -EACCES;
 
-               if (btrfs_super_log_root(root->fs_info->super_copy) != 0)
+               if (btrfs_super_log_root(fs_info->super_copy) != 0)
                        return -EINVAL;
 
-               ret = btrfs_cleanup_fs_roots(root->fs_info);
+               ret = btrfs_cleanup_fs_roots(fs_info);
                WARN_ON(ret);
 
                /* recover relocation */
@@ -1168,18 +1189,18 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
 
 static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-       struct btrfs_root *root = btrfs_sb(dentry->d_sb);
-       struct btrfs_super_block *disk_super = root->fs_info->super_copy;
-       struct list_head *head = &root->fs_info->space_info;
+       struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
+       struct btrfs_super_block *disk_super = fs_info->super_copy;
+       struct list_head *head = &fs_info->space_info;
        struct btrfs_space_info *found;
        u64 total_used = 0;
        u64 total_free_data = 0;
        int bits = dentry->d_sb->s_blocksize_bits;
-       __be32 *fsid = (__be32 *)root->fs_info->fsid;
+       __be32 *fsid = (__be32 *)fs_info->fsid;
        int ret;
 
        /* holding chunk_muext to avoid allocating new chunks */
-       mutex_lock(&root->fs_info->chunk_mutex);
+       mutex_lock(&fs_info->chunk_mutex);
        rcu_read_lock();
        list_for_each_entry_rcu(found, head, list) {
                if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
@@ -1198,14 +1219,14 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
        buf->f_bsize = dentry->d_sb->s_blocksize;
        buf->f_type = BTRFS_SUPER_MAGIC;
        buf->f_bavail = total_free_data;
-       ret = btrfs_calc_avail_data_space(root, &total_free_data);
+       ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data);
        if (ret) {
-               mutex_unlock(&root->fs_info->chunk_mutex);
+               mutex_unlock(&fs_info->chunk_mutex);
                return ret;
        }
        buf->f_bavail += total_free_data;
        buf->f_bavail = buf->f_bavail >> bits;
-       mutex_unlock(&root->fs_info->chunk_mutex);
+       mutex_unlock(&fs_info->chunk_mutex);
 
        /* We treat it as constant endianness (it doesn't matter _which_)
           because we want the fsid to come out the same whether mounted
@@ -1219,11 +1240,18 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
        return 0;
 }
 
+static void btrfs_kill_super(struct super_block *sb)
+{
+       struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+       kill_anon_super(sb);
+       free_fs_info(fs_info);
+}
+
 static struct file_system_type btrfs_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "btrfs",
        .mount          = btrfs_mount,
-       .kill_sb        = kill_anon_super,
+       .kill_sb        = btrfs_kill_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
 
@@ -1257,17 +1285,17 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
 
 static int btrfs_freeze(struct super_block *sb)
 {
-       struct btrfs_root *root = btrfs_sb(sb);
-       mutex_lock(&root->fs_info->transaction_kthread_mutex);
-       mutex_lock(&root->fs_info->cleaner_mutex);
+       struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+       mutex_lock(&fs_info->transaction_kthread_mutex);
+       mutex_lock(&fs_info->cleaner_mutex);
        return 0;
 }
 
 static int btrfs_unfreeze(struct super_block *sb)
 {
-       struct btrfs_root *root = btrfs_sb(sb);
-       mutex_unlock(&root->fs_info->cleaner_mutex);
-       mutex_unlock(&root->fs_info->transaction_kthread_mutex);
+       struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+       mutex_unlock(&fs_info->cleaner_mutex);
+       mutex_unlock(&fs_info->transaction_kthread_mutex);
        return 0;
 }
 
index 81376d9..287a672 100644 (file)
@@ -36,6 +36,8 @@ static noinline void put_transaction(struct btrfs_transaction *transaction)
        WARN_ON(atomic_read(&transaction->use_count) == 0);
        if (atomic_dec_and_test(&transaction->use_count)) {
                BUG_ON(!list_empty(&transaction->list));
+               WARN_ON(transaction->delayed_refs.root.rb_node);
+               WARN_ON(!list_empty(&transaction->delayed_refs.seq_head));
                memset(transaction, 0, sizeof(*transaction));
                kmem_cache_free(btrfs_transaction_cachep, transaction);
        }
@@ -108,8 +110,11 @@ loop:
        cur_trans->delayed_refs.num_heads = 0;
        cur_trans->delayed_refs.flushing = 0;
        cur_trans->delayed_refs.run_delayed_start = 0;
+       cur_trans->delayed_refs.seq = 1;
+       init_waitqueue_head(&cur_trans->delayed_refs.seq_wait);
        spin_lock_init(&cur_trans->commit_lock);
        spin_lock_init(&cur_trans->delayed_refs.lock);
+       INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head);
 
        INIT_LIST_HEAD(&cur_trans->pending_snapshots);
        list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
@@ -321,6 +326,8 @@ again:
        }
 
        if (num_bytes) {
+               trace_btrfs_space_reservation(root->fs_info, "transaction",
+                                             (u64)h, num_bytes, 1);
                h->block_rsv = &root->fs_info->trans_block_rsv;
                h->bytes_reserved = num_bytes;
        }
@@ -467,19 +474,12 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 
        btrfs_trans_release_metadata(trans, root);
        trans->block_rsv = NULL;
-       while (count < 4) {
+       while (count < 2) {
                unsigned long cur = trans->delayed_ref_updates;
                trans->delayed_ref_updates = 0;
                if (cur &&
                    trans->transaction->delayed_refs.num_heads_ready > 64) {
                        trans->delayed_ref_updates = 0;
-
-                       /*
-                        * do a full flush if the transaction is trying
-                        * to close
-                        */
-                       if (trans->transaction->delayed_refs.flushing)
-                               cur = 0;
                        btrfs_run_delayed_refs(trans, root, cur);
                } else {
                        break;
@@ -1393,9 +1393,9 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
 
                if (btrfs_header_backref_rev(root->node) <
                    BTRFS_MIXED_BACKREF_REV)
-                       btrfs_drop_snapshot(root, NULL, 0);
+                       btrfs_drop_snapshot(root, NULL, 0, 0);
                else
-                       btrfs_drop_snapshot(root, NULL, 1);
+                       btrfs_drop_snapshot(root, NULL, 1, 0);
        }
        return 0;
 }
index 3568374..cb877e0 100644 (file)
@@ -589,7 +589,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
                                ret = btrfs_inc_extent_ref(trans, root,
                                                ins.objectid, ins.offset,
                                                0, root->root_key.objectid,
-                                               key->objectid, offset);
+                                               key->objectid, offset, 0);
                                BUG_ON(ret);
                        } else {
                                /*
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
new file mode 100644 (file)
index 0000000..12f5147
--- /dev/null
@@ -0,0 +1,220 @@
+/*
+ * Copyright (C) 2011 STRATO AG
+ * written by Arne Jansen <sensille@gmx.net>
+ * Distributed under the GNU GPL license version 2.
+ */
+
+#include <linux/slab.h>
+#include <linux/module.h>
+#include "ulist.h"
+
+/*
+ * ulist is a generic data structure to hold a collection of unique u64
+ * values. The only operations it supports are adding to the list and
+ * enumerating it.
+ * It is possible to store an auxiliary value along with the key.
+ *
+ * The implementation is preliminary and can probably be sped up
+ * significantly. A first step would be to store the values in an rbtree
+ * as soon as ULIST_SIZE is exceeded.
+ *
+ * A sample usage for ulists is the enumeration of directed graphs without
+ * visiting a node twice. The pseudo-code could look like this:
+ *
+ * ulist = ulist_alloc();
+ * ulist_add(ulist, root);
+ * elem = NULL;
+ *
+ * while ((elem = ulist_next(ulist, elem))) {
+ *     for (all child nodes n in elem)
+ *             ulist_add(ulist, n);
+ *     do something useful with the node;
+ * }
+ * ulist_free(ulist);
+ *
+ * This assumes the graph nodes are addressable by u64. This stems from the
+ * usage for tree enumeration in btrfs, where the logical addresses are
+ * 64 bit.
+ *
+ * It is also useful for tree enumeration which could be done elegantly
+ * recursively, but is not possible due to kernel stack limitations. The
+ * loop would be similar to the above.
+ */
+
+/**
+ * ulist_init - freshly initialize a ulist
+ * @ulist:     the ulist to initialize
+ *
+ * Note: don't use this function to init an already used ulist, use
+ * ulist_reinit instead.
+ */
+void ulist_init(struct ulist *ulist)
+{
+       ulist->nnodes = 0;
+       ulist->nodes = ulist->int_nodes;
+       ulist->nodes_alloced = ULIST_SIZE;
+}
+EXPORT_SYMBOL(ulist_init);
+
+/**
+ * ulist_fini - free up additionally allocated memory for the ulist
+ * @ulist:     the ulist from which to free the additional memory
+ *
+ * This is useful in cases where the base 'struct ulist' has been statically
+ * allocated.
+ */
+void ulist_fini(struct ulist *ulist)
+{
+       /*
+        * The first ULIST_SIZE elements are stored inline in struct ulist.
+        * Only if more elements are allocated do they need to be freed.
+        */
+       if (ulist->nodes_alloced > ULIST_SIZE)
+               kfree(ulist->nodes);
+       ulist->nodes_alloced = 0;       /* in case ulist_fini is called twice */
+}
+EXPORT_SYMBOL(ulist_fini);
+
+/**
+ * ulist_reinit - prepare a ulist for reuse
+ * @ulist:     ulist to be reused
+ *
+ * Free up all additional memory allocated for the list elements and reinit
+ * the ulist.
+ */
+void ulist_reinit(struct ulist *ulist)
+{
+       ulist_fini(ulist);
+       ulist_init(ulist);
+}
+EXPORT_SYMBOL(ulist_reinit);
+
+/**
+ * ulist_alloc - dynamically allocate a ulist
+ * @gfp_mask:  allocation flags to use for the base allocation
+ *
+ * The allocated ulist will be returned in an initialized state.
+ */
+struct ulist *ulist_alloc(unsigned long gfp_mask)
+{
+       struct ulist *ulist = kmalloc(sizeof(*ulist), gfp_mask);
+
+       if (!ulist)
+               return NULL;
+
+       ulist_init(ulist);
+
+       return ulist;
+}
+EXPORT_SYMBOL(ulist_alloc);
+
+/**
+ * ulist_free - free dynamically allocated ulist
+ * @ulist:     ulist to free
+ *
+ * It is not necessary to call ulist_fini before.
+ */
+void ulist_free(struct ulist *ulist)
+{
+       if (!ulist)
+               return;
+       ulist_fini(ulist);
+       kfree(ulist);
+}
+EXPORT_SYMBOL(ulist_free);
+
+/**
+ * ulist_add - add an element to the ulist
+ * @ulist:     ulist to add the element to
+ * @val:       value to add to ulist
+ * @aux:       auxiliary value to store along with val
+ * @gfp_mask:  flags to use for allocation
+ *
+ * Note: locking must be provided by the caller. In case of rwlocks write
+ *       locking is needed
+ *
+ * Add an element to a ulist. The @val will only be added if it doesn't
+ * already exist. If it is added, the auxiliary value @aux is stored along with
+ * it. In case @val already exists in the ulist, @aux is ignored, even if
+ * it differs from the already stored value.
+ *
+ * ulist_add returns 0 if @val already exists in ulist and 1 if @val has been
+ * inserted.
+ * In case of allocation failure -ENOMEM is returned and the ulist stays
+ * unaltered.
+ */
+int ulist_add(struct ulist *ulist, u64 val, unsigned long aux,
+             unsigned long gfp_mask)
+{
+       int i;
+
+       for (i = 0; i < ulist->nnodes; ++i) {
+               if (ulist->nodes[i].val == val)
+                       return 0;
+       }
+
+       if (ulist->nnodes >= ulist->nodes_alloced) {
+               u64 new_alloced = ulist->nodes_alloced + 128;
+               struct ulist_node *new_nodes;
+               void *old = NULL;
+
+               /*
+                * if nodes_alloced == ULIST_SIZE no memory has been allocated
+                * yet, so pass NULL to krealloc
+                */
+               if (ulist->nodes_alloced > ULIST_SIZE)
+                       old = ulist->nodes;
+
+               new_nodes = krealloc(old, sizeof(*new_nodes) * new_alloced,
+                                    gfp_mask);
+               if (!new_nodes)
+                       return -ENOMEM;
+
+               if (!old)
+                       memcpy(new_nodes, ulist->int_nodes,
+                              sizeof(ulist->int_nodes));
+
+               ulist->nodes = new_nodes;
+               ulist->nodes_alloced = new_alloced;
+       }
+       ulist->nodes[ulist->nnodes].val = val;
+       ulist->nodes[ulist->nnodes].aux = aux;
+       ++ulist->nnodes;
+
+       return 1;
+}
+EXPORT_SYMBOL(ulist_add);
+
+/**
+ * ulist_next - iterate ulist
+ * @ulist:     ulist to iterate
+ * @prev:      previously returned element or %NULL to start iteration
+ *
+ * Note: locking must be provided by the caller. In case of rwlocks only read
+ *       locking is needed
+ *
+ * This function is used to iterate an ulist. The iteration is started with
+ * @prev = %NULL. It returns the next element from the ulist or %NULL when the
+ * end is reached. No guarantee is made with respect to the order in which
+ * the elements are returned. They might neither be returned in order of
+ * addition nor in ascending order.
+ * It is allowed to call ulist_add during an enumeration. Newly added items
+ * are guaranteed to show up in the running enumeration.
+ */
+struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev)
+{
+       int next;
+
+       if (ulist->nnodes == 0)
+               return NULL;
+
+       if (!prev)
+               return &ulist->nodes[0];
+
+       next = (prev - ulist->nodes) + 1;
+       if (next < 0 || next >= ulist->nnodes)
+               return NULL;
+
+       return &ulist->nodes[next];
+}
+EXPORT_SYMBOL(ulist_next);
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
new file mode 100644 (file)
index 0000000..2e25dec
--- /dev/null
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 2011 STRATO AG
+ * written by Arne Jansen <sensille@gmx.net>
+ * Distributed under the GNU GPL license version 2.
+ *
+ */
+
+#ifndef __ULIST__
+#define __ULIST__
+
+/*
+ * ulist is a generic data structure to hold a collection of unique u64
+ * values. The only operations it supports are adding to the list and
+ * enumerating it.
+ * It is possible to store an auxiliary value along with the key.
+ *
+ * The implementation is preliminary and can probably be sped up
+ * significantly. A first step would be to store the values in an rbtree
+ * as soon as ULIST_SIZE is exceeded.
+ */
+
+/*
+ * number of elements statically allocated inside struct ulist
+ */
+#define ULIST_SIZE 16
+
+/*
+ * element of the list
+ */
+struct ulist_node {
+       u64 val;                /* value to store */
+       unsigned long aux;      /* auxiliary value saved along with the val */
+};
+
+struct ulist {
+       /*
+        * number of elements stored in list
+        */
+       unsigned long nnodes;
+
+       /*
+        * number of nodes we already have room for
+        */
+       unsigned long nodes_alloced;
+
+       /*
+        * pointer to the array storing the elements. The first ULIST_SIZE
+        * elements are stored inline. In this case it points to int_nodes.
+        * After exceeding ULIST_SIZE, dynamic memory is allocated.
+        */
+       struct ulist_node *nodes;
+
+       /*
+        * inline storage space for the first ULIST_SIZE entries
+        */
+       struct ulist_node int_nodes[ULIST_SIZE];
+};
+
+void ulist_init(struct ulist *ulist);
+void ulist_fini(struct ulist *ulist);
+void ulist_reinit(struct ulist *ulist);
+struct ulist *ulist_alloc(unsigned long gfp_mask);
+void ulist_free(struct ulist *ulist);
+int ulist_add(struct ulist *ulist, u64 val, unsigned long aux,
+             unsigned long gfp_mask);
+struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev);
+
+#endif
index f4b839f..0b4e2af 100644 (file)
@@ -23,6 +23,7 @@
 #include <linux/random.h>
 #include <linux/iocontext.h>
 #include <linux/capability.h>
+#include <linux/kthread.h>
 #include <asm/div64.h>
 #include "compat.h"
 #include "ctree.h"
@@ -32,6 +33,7 @@
 #include "print-tree.h"
 #include "volumes.h"
 #include "async-thread.h"
+#include "check-integrity.h"
 
 static int init_first_rw_device(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
@@ -246,7 +248,7 @@ loop_lock:
                        sync_pending = 0;
                }
 
-               submit_bio(cur->bi_rw, cur);
+               btrfsic_submit_bio(cur->bi_rw, cur);
                num_run++;
                batch_run++;
                if (need_resched())
@@ -706,8 +708,6 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
        u64 devid;
        u64 transid;
 
-       mutex_lock(&uuid_mutex);
-
        flags |= FMODE_EXCL;
        bdev = blkdev_get_by_path(path, flags, holder);
 
@@ -716,6 +716,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
                goto error;
        }
 
+       mutex_lock(&uuid_mutex);
        ret = set_blocksize(bdev, 4096);
        if (ret)
                goto error_close;
@@ -737,9 +738,9 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 
        brelse(bh);
 error_close:
+       mutex_unlock(&uuid_mutex);
        blkdev_put(bdev, flags);
 error:
-       mutex_unlock(&uuid_mutex);
        return ret;
 }
 
@@ -829,7 +830,6 @@ out:
 
 /*
  * find_free_dev_extent - find free space in the specified device
- * @trans:     transaction handler
  * @device:    the device which we search the free space in
  * @num_bytes: the size of the free space that we need
  * @start:     store the start of the free space.
@@ -848,8 +848,7 @@ out:
  * But if we don't find suitable free space, it is used to store the size of
  * the max free space.
  */
-int find_free_dev_extent(struct btrfs_trans_handle *trans,
-                        struct btrfs_device *device, u64 num_bytes,
+int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
                         u64 *start, u64 *len)
 {
        struct btrfs_key key;
@@ -893,7 +892,7 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
        key.offset = search_start;
        key.type = BTRFS_DEV_EXTENT_KEY;
 
-       ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
+       ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        if (ret > 0) {
@@ -1282,7 +1281,6 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
        bool clear_super = false;
 
        mutex_lock(&uuid_mutex);
-       mutex_lock(&root->fs_info->volume_mutex);
 
        all_avail = root->fs_info->avail_data_alloc_bits |
                root->fs_info->avail_system_alloc_bits |
@@ -1452,7 +1450,6 @@ error_close:
        if (bdev)
                blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
 out:
-       mutex_unlock(&root->fs_info->volume_mutex);
        mutex_unlock(&uuid_mutex);
        return ret;
 error_undo:
@@ -1469,8 +1466,7 @@ error_undo:
 /*
  * does all the dirty work required for changing file system's UUID.
  */
-static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
-                               struct btrfs_root *root)
+static int btrfs_prepare_sprout(struct btrfs_root *root)
 {
        struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
        struct btrfs_fs_devices *old_devices;
@@ -1629,7 +1625,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
        }
 
        filemap_write_and_wait(bdev->bd_inode->i_mapping);
-       mutex_lock(&root->fs_info->volume_mutex);
 
        devices = &root->fs_info->fs_devices->devices;
        /*
@@ -1695,7 +1690,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 
        if (seeding_dev) {
                sb->s_flags &= ~MS_RDONLY;
-               ret = btrfs_prepare_sprout(trans, root);
+               ret = btrfs_prepare_sprout(root);
                BUG_ON(ret);
        }
 
@@ -1757,8 +1752,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
                ret = btrfs_relocate_sys_chunks(root);
                BUG_ON(ret);
        }
-out:
-       mutex_unlock(&root->fs_info->volume_mutex);
+
        return ret;
 error:
        blkdev_put(bdev, FMODE_EXCL);
@@ -1766,7 +1760,7 @@ error:
                mutex_unlock(&uuid_mutex);
                up_write(&sb->s_umount);
        }
-       goto out;
+       return ret;
 }
 
 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
@@ -2077,6 +2071,362 @@ error:
        return ret;
 }
 
+static int insert_balance_item(struct btrfs_root *root,
+                              struct btrfs_balance_control *bctl)
+{
+       struct btrfs_trans_handle *trans;
+       struct btrfs_balance_item *item;
+       struct btrfs_disk_balance_args disk_bargs;
+       struct btrfs_path *path;
+       struct extent_buffer *leaf;
+       struct btrfs_key key;
+       int ret, err;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       trans = btrfs_start_transaction(root, 0);
+       if (IS_ERR(trans)) {
+               btrfs_free_path(path);
+               return PTR_ERR(trans);
+       }
+
+       key.objectid = BTRFS_BALANCE_OBJECTID;
+       key.type = BTRFS_BALANCE_ITEM_KEY;
+       key.offset = 0;
+
+       ret = btrfs_insert_empty_item(trans, root, path, &key,
+                                     sizeof(*item));
+       if (ret)
+               goto out;
+
+       leaf = path->nodes[0];
+       item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
+
+       memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
+
+       btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
+       btrfs_set_balance_data(leaf, item, &disk_bargs);
+       btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
+       btrfs_set_balance_meta(leaf, item, &disk_bargs);
+       btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
+       btrfs_set_balance_sys(leaf, item, &disk_bargs);
+
+       btrfs_set_balance_flags(leaf, item, bctl->flags);
+
+       btrfs_mark_buffer_dirty(leaf);
+out:
+       btrfs_free_path(path);
+       err = btrfs_commit_transaction(trans, root);
+       if (err && !ret)
+               ret = err;
+       return ret;
+}
+
+static int del_balance_item(struct btrfs_root *root)
+{
+       struct btrfs_trans_handle *trans;
+       struct btrfs_path *path;
+       struct btrfs_key key;
+       int ret, err;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       trans = btrfs_start_transaction(root, 0);
+       if (IS_ERR(trans)) {
+               btrfs_free_path(path);
+               return PTR_ERR(trans);
+       }
+
+       key.objectid = BTRFS_BALANCE_OBJECTID;
+       key.type = BTRFS_BALANCE_ITEM_KEY;
+       key.offset = 0;
+
+       ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+       if (ret < 0)
+               goto out;
+       if (ret > 0) {
+               ret = -ENOENT;
+               goto out;
+       }
+
+       ret = btrfs_del_item(trans, root, path);
+out:
+       btrfs_free_path(path);
+       err = btrfs_commit_transaction(trans, root);
+       if (err && !ret)
+               ret = err;
+       return ret;
+}
+
+/*
+ * This is a heuristic used to reduce the number of chunks balanced on
+ * resume after balance was interrupted.
+ */
+static void update_balance_args(struct btrfs_balance_control *bctl)
+{
+       /*
+        * Turn on soft mode for chunk types that were being converted.
+        */
+       if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
+               bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
+       if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
+               bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
+       if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
+               bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
+
+       /*
+        * Turn on usage filter if it is not already used.  The idea is
+        * that chunks that we have already balanced should be
+        * reasonably full.  Don't do it for chunks that are being
+        * converted - that will keep us from relocating unconverted
+        * (albeit full) chunks.
+        */
+       if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+           !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
+               bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
+               bctl->data.usage = 90;
+       }
+       if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+           !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
+               bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
+               bctl->sys.usage = 90;
+       }
+       if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+           !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
+               bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
+               bctl->meta.usage = 90;
+       }
+}
+
+/*
+ * Should be called with both balance and volume mutexes held to
+ * serialize other volume operations (add_dev/rm_dev/resize) with
+ * restriper.  Same goes for unset_balance_control.
+ */
+static void set_balance_control(struct btrfs_balance_control *bctl)
+{
+       struct btrfs_fs_info *fs_info = bctl->fs_info;
+
+       BUG_ON(fs_info->balance_ctl);
+
+       spin_lock(&fs_info->balance_lock);
+       fs_info->balance_ctl = bctl;
+       spin_unlock(&fs_info->balance_lock);
+}
+
+static void unset_balance_control(struct btrfs_fs_info *fs_info)
+{
+       struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+
+       BUG_ON(!fs_info->balance_ctl);
+
+       spin_lock(&fs_info->balance_lock);
+       fs_info->balance_ctl = NULL;
+       spin_unlock(&fs_info->balance_lock);
+
+       kfree(bctl);
+}
+
+/*
+ * Balance filters.  Return 1 if chunk should be filtered out
+ * (should not be balanced).
+ */
+static int chunk_profiles_filter(u64 chunk_profile,
+                                struct btrfs_balance_args *bargs)
+{
+       chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+       if (chunk_profile == 0)
+               chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+
+       if (bargs->profiles & chunk_profile)
+               return 0;
+
+       return 1;
+}
+
+static u64 div_factor_fine(u64 num, int factor)
+{
+       if (factor <= 0)
+               return 0;
+       if (factor >= 100)
+               return num;
+
+       num *= factor;
+       do_div(num, 100);
+       return num;
+}
+
+static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
+                             struct btrfs_balance_args *bargs)
+{
+       struct btrfs_block_group_cache *cache;
+       u64 chunk_used, user_thresh;
+       int ret = 1;
+
+       cache = btrfs_lookup_block_group(fs_info, chunk_offset);
+       chunk_used = btrfs_block_group_used(&cache->item);
+
+       user_thresh = div_factor_fine(cache->key.offset, bargs->usage);
+       if (chunk_used < user_thresh)
+               ret = 0;
+
+       btrfs_put_block_group(cache);
+       return ret;
+}
+
+static int chunk_devid_filter(struct extent_buffer *leaf,
+                             struct btrfs_chunk *chunk,
+                             struct btrfs_balance_args *bargs)
+{
+       struct btrfs_stripe *stripe;
+       int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+       int i;
+
+       for (i = 0; i < num_stripes; i++) {
+               stripe = btrfs_stripe_nr(chunk, i);
+               if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
+                       return 0;
+       }
+
+       return 1;
+}
+
+/* [pstart, pend) */
+static int chunk_drange_filter(struct extent_buffer *leaf,
+                              struct btrfs_chunk *chunk,
+                              u64 chunk_offset,
+                              struct btrfs_balance_args *bargs)
+{
+       struct btrfs_stripe *stripe;
+       int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+       u64 stripe_offset;
+       u64 stripe_length;
+       int factor;
+       int i;
+
+       if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
+               return 0;
+
+       if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
+            BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))
+               factor = 2;
+       else
+               factor = 1;
+       factor = num_stripes / factor;
+
+       for (i = 0; i < num_stripes; i++) {
+               stripe = btrfs_stripe_nr(chunk, i);
+               if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
+                       continue;
+
+               stripe_offset = btrfs_stripe_offset(leaf, stripe);
+               stripe_length = btrfs_chunk_length(leaf, chunk);
+               do_div(stripe_length, factor);
+
+               if (stripe_offset < bargs->pend &&
+                   stripe_offset + stripe_length > bargs->pstart)
+                       return 0;
+       }
+
+       return 1;
+}
+
+/* [vstart, vend) */
+static int chunk_vrange_filter(struct extent_buffer *leaf,
+                              struct btrfs_chunk *chunk,
+                              u64 chunk_offset,
+                              struct btrfs_balance_args *bargs)
+{
+       if (chunk_offset < bargs->vend &&
+           chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
+               /* at least part of the chunk is inside this vrange */
+               return 0;
+
+       return 1;
+}
+
+static int chunk_soft_convert_filter(u64 chunk_profile,
+                                    struct btrfs_balance_args *bargs)
+{
+       if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
+               return 0;
+
+       chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+       if (chunk_profile == 0)
+               chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+
+       if (bargs->target & chunk_profile)
+               return 1;
+
+       return 0;
+}
+
+static int should_balance_chunk(struct btrfs_root *root,
+                               struct extent_buffer *leaf,
+                               struct btrfs_chunk *chunk, u64 chunk_offset)
+{
+       struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
+       struct btrfs_balance_args *bargs = NULL;
+       u64 chunk_type = btrfs_chunk_type(leaf, chunk);
+
+       /* type filter */
+       if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
+             (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
+               return 0;
+       }
+
+       if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
+               bargs = &bctl->data;
+       else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
+               bargs = &bctl->sys;
+       else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
+               bargs = &bctl->meta;
+
+       /* profiles filter */
+       if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
+           chunk_profiles_filter(chunk_type, bargs)) {
+               return 0;
+       }
+
+       /* usage filter */
+       if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
+           chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) {
+               return 0;
+       }
+
+       /* devid filter */
+       if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
+           chunk_devid_filter(leaf, chunk, bargs)) {
+               return 0;
+       }
+
+       /* drange filter, makes sense only with devid filter */
+       if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
+           chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) {
+               return 0;
+       }
+
+       /* vrange filter */
+       if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
+           chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
+               return 0;
+       }
+
+       /* soft profile changing mode */
+       if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
+           chunk_soft_convert_filter(chunk_type, bargs)) {
+               return 0;
+       }
+
+       return 1;
+}
+
 static u64 div_factor(u64 num, int factor)
 {
        if (factor == 10)
@@ -2086,29 +2436,28 @@ static u64 div_factor(u64 num, int factor)
        return num;
 }
 
-int btrfs_balance(struct btrfs_root *dev_root)
+static int __btrfs_balance(struct btrfs_fs_info *fs_info)
 {
-       int ret;
-       struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
+       struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+       struct btrfs_root *chunk_root = fs_info->chunk_root;
+       struct btrfs_root *dev_root = fs_info->dev_root;
+       struct list_head *devices;
        struct btrfs_device *device;
        u64 old_size;
        u64 size_to_free;
+       struct btrfs_chunk *chunk;
        struct btrfs_path *path;
        struct btrfs_key key;
-       struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
-       struct btrfs_trans_handle *trans;
        struct btrfs_key found_key;
-
-       if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
-               return -EROFS;
-
-       if (!capable(CAP_SYS_ADMIN))
-               return -EPERM;
-
-       mutex_lock(&dev_root->fs_info->volume_mutex);
-       dev_root = dev_root->fs_info->dev_root;
+       struct btrfs_trans_handle *trans;
+       struct extent_buffer *leaf;
+       int slot;
+       int ret;
+       int enospc_errors = 0;
+       bool counting = true;
 
        /* step one make some room on all the devices */
+       devices = &fs_info->fs_devices->devices;
        list_for_each_entry(device, devices, dev_list) {
                old_size = device->total_bytes;
                size_to_free = div_factor(old_size, 1);
@@ -2137,11 +2486,23 @@ int btrfs_balance(struct btrfs_root *dev_root)
                ret = -ENOMEM;
                goto error;
        }
+
+       /* zero out stat counters */
+       spin_lock(&fs_info->balance_lock);
+       memset(&bctl->stat, 0, sizeof(bctl->stat));
+       spin_unlock(&fs_info->balance_lock);
+again:
        key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
        key.offset = (u64)-1;
        key.type = BTRFS_CHUNK_ITEM_KEY;
 
        while (1) {
+               if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
+                   atomic_read(&fs_info->balance_cancel_req)) {
+                       ret = -ECANCELED;
+                       goto error;
+               }
+
                ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
                if (ret < 0)
                        goto error;
@@ -2151,15 +2512,19 @@ int btrfs_balance(struct btrfs_root *dev_root)
                 * failed
                 */
                if (ret == 0)
-                       break;
+                       BUG(); /* FIXME break ? */
 
                ret = btrfs_previous_item(chunk_root, path, 0,
                                          BTRFS_CHUNK_ITEM_KEY);
-               if (ret)
+               if (ret) {
+                       ret = 0;
                        break;
+               }
+
+               leaf = path->nodes[0];
+               slot = path->slots[0];
+               btrfs_item_key_to_cpu(leaf, &found_key, slot);
 
-               btrfs_item_key_to_cpu(path->nodes[0], &found_key,
-                                     path->slots[0]);
                if (found_key.objectid != key.objectid)
                        break;
 
@@ -2167,22 +2532,375 @@ int btrfs_balance(struct btrfs_root *dev_root)
                if (found_key.offset == 0)
                        break;
 
+               chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
+
+               if (!counting) {
+                       spin_lock(&fs_info->balance_lock);
+                       bctl->stat.considered++;
+                       spin_unlock(&fs_info->balance_lock);
+               }
+
+               ret = should_balance_chunk(chunk_root, leaf, chunk,
+                                          found_key.offset);
                btrfs_release_path(path);
+               if (!ret)
+                       goto loop;
+
+               if (counting) {
+                       spin_lock(&fs_info->balance_lock);
+                       bctl->stat.expected++;
+                       spin_unlock(&fs_info->balance_lock);
+                       goto loop;
+               }
+
                ret = btrfs_relocate_chunk(chunk_root,
                                           chunk_root->root_key.objectid,
                                           found_key.objectid,
                                           found_key.offset);
                if (ret && ret != -ENOSPC)
                        goto error;
+               if (ret == -ENOSPC) {
+                       enospc_errors++;
+               } else {
+                       spin_lock(&fs_info->balance_lock);
+                       bctl->stat.completed++;
+                       spin_unlock(&fs_info->balance_lock);
+               }
+loop:
                key.offset = found_key.offset - 1;
        }
-       ret = 0;
+
+       if (counting) {
+               btrfs_release_path(path);
+               counting = false;
+               goto again;
+       }
 error:
        btrfs_free_path(path);
-       mutex_unlock(&dev_root->fs_info->volume_mutex);
+       if (enospc_errors) {
+               printk(KERN_INFO "btrfs: %d enospc errors during balance\n",
+                      enospc_errors);
+               if (!ret)
+                       ret = -ENOSPC;
+       }
+
        return ret;
 }
 
+/*
+ * Tell the caller whether the balance should be torn down: yes when a
+ * cancel has been requested, or when neither a pause nor a cancel is
+ * pending (the normal exit path).  Returns non-zero for "close it".
+ */
+static inline int balance_need_close(struct btrfs_fs_info *fs_info)
+{
+       /* cancel requested */
+       if (atomic_read(&fs_info->balance_cancel_req))
+               return 1;
+
+       /* normal exit path: nobody asked for a pause or a cancel */
+       return atomic_read(&fs_info->balance_pause_req) == 0 &&
+               atomic_read(&fs_info->balance_cancel_req) == 0;
+}
+
+/*
+ * Tear down the current balance: unset the balance control, then delete
+ * the balance item through the tree root.  Failing to delete the item is
+ * treated as fatal.
+ */
+static void __cancel_balance(struct btrfs_fs_info *fs_info)
+{
+       int err;
+
+       unset_balance_control(fs_info);
+
+       err = del_balance_item(fs_info->tree_root);
+       BUG_ON(err);
+}
+
+void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
+                              struct btrfs_ioctl_balance_args *bargs);
+
+/*
+ * Validate a balance request, record it through insert_balance_item() and
+ * run it.
+ *
+ * Should be called with both balance and volume mutexes held.  The balance
+ * mutex is dropped while __btrfs_balance() runs and is re-taken before
+ * returning.
+ *
+ * @bctl:  balance control describing the request.  If the request is
+ *         rejected before it starts, @bctl is disposed of here: through
+ *         __cancel_balance() when BTRFS_BALANCE_RESUME is set, with kfree()
+ *         otherwise.
+ * @bargs: optional; when non-NULL it is zeroed and refilled by
+ *         update_ioctl_balance_args() once the balance has stopped.
+ *
+ * Returns -EINVAL when the request is rejected, otherwise the error from
+ * insert_balance_item() or the result of __btrfs_balance().
+ */
+int btrfs_balance(struct btrfs_balance_control *bctl,
+                 struct btrfs_ioctl_balance_args *bargs)
+{
+       struct btrfs_fs_info *fs_info = bctl->fs_info;
+       u64 allowed;
+       int ret;
+
+       /* refuse to start while closing or with a pause/cancel pending */
+       if (btrfs_fs_closing(fs_info) ||
+           atomic_read(&fs_info->balance_pause_req) ||
+           atomic_read(&fs_info->balance_cancel_req)) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       /*
+        * In case of mixed groups both data and meta should be picked,
+        * and identical options should be given for both of them.
+        */
+       allowed = btrfs_super_incompat_flags(fs_info->super_copy);
+       if ((allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
+           (bctl->flags & (BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA))) {
+               if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
+                   !(bctl->flags & BTRFS_BALANCE_METADATA) ||
+                   memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
+                       printk(KERN_ERR "btrfs: with mixed groups data and "
+                              "metadata balance options must be the same\n");
+                       ret = -EINVAL;
+                       goto out;
+               }
+       }
+
+       /*
+        * Profile changing sanity checks.  Skip them if a simple
+        * balance is requested.
+        */
+       if (!((bctl->data.flags | bctl->sys.flags | bctl->meta.flags) &
+             BTRFS_BALANCE_ARGS_CONVERT))
+               goto do_balance;
+
+       /* target profiles permitted for the current number of devices */
+       allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+       if (fs_info->fs_devices->num_devices == 1)
+               allowed |= BTRFS_BLOCK_GROUP_DUP;
+       else if (fs_info->fs_devices->num_devices < 4)
+               allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
+       else
+               allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
+                               BTRFS_BLOCK_GROUP_RAID10);
+
+       /*
+        * Only look at the target of a type that is actually being
+        * converted, as the force check below already does.  Otherwise an
+        * unused target field could make us reject a valid request.
+        */
+       if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+           (!profile_is_valid(bctl->data.target, 1) ||
+            (bctl->data.target & ~allowed))) {
+               printk(KERN_ERR "btrfs: unable to start balance with target "
+                      "data profile %llu\n",
+                      (unsigned long long)bctl->data.target);
+               ret = -EINVAL;
+               goto out;
+       }
+       if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+           (!profile_is_valid(bctl->meta.target, 1) ||
+            (bctl->meta.target & ~allowed))) {
+               printk(KERN_ERR "btrfs: unable to start balance with target "
+                      "metadata profile %llu\n",
+                      (unsigned long long)bctl->meta.target);
+               ret = -EINVAL;
+               goto out;
+       }
+       if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+           (!profile_is_valid(bctl->sys.target, 1) ||
+            (bctl->sys.target & ~allowed))) {
+               printk(KERN_ERR "btrfs: unable to start balance with target "
+                      "system profile %llu\n",
+                      (unsigned long long)bctl->sys.target);
+               ret = -EINVAL;
+               goto out;
+       }
+
+       if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+           (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) {
+               printk(KERN_ERR "btrfs: dup for data is not allowed\n");
+               ret = -EINVAL;
+               goto out;
+       }
+
+       /* allow reducing meta or sys integrity only if force is set */
+       allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
+                       BTRFS_BLOCK_GROUP_RAID10;
+       if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+            (fs_info->avail_system_alloc_bits & allowed) &&
+            !(bctl->sys.target & allowed)) ||
+           ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+            (fs_info->avail_metadata_alloc_bits & allowed) &&
+            !(bctl->meta.target & allowed))) {
+               if (bctl->flags & BTRFS_BALANCE_FORCE) {
+                       printk(KERN_INFO "btrfs: force reducing metadata "
+                              "integrity\n");
+               } else {
+                       printk(KERN_ERR "btrfs: balance will reduce metadata "
+                              "integrity, use force if you want this\n");
+                       ret = -EINVAL;
+                       goto out;
+               }
+       }
+
+do_balance:
+       ret = insert_balance_item(fs_info->tree_root, bctl);
+       if (ret && ret != -EEXIST)
+               goto out;
+
+       /*
+        * A fresh balance must not find an existing item; a resumed one
+        * must.
+        */
+       if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
+               BUG_ON(ret == -EEXIST);
+               set_balance_control(bctl);
+       } else {
+               BUG_ON(ret != -EEXIST);
+               spin_lock(&fs_info->balance_lock);
+               update_balance_args(bctl);
+               spin_unlock(&fs_info->balance_lock);
+       }
+
+       /* run without balance_mutex so pause/cancel requests can get in */
+       atomic_inc(&fs_info->balance_running);
+       mutex_unlock(&fs_info->balance_mutex);
+
+       ret = __btrfs_balance(fs_info);
+
+       mutex_lock(&fs_info->balance_mutex);
+       atomic_dec(&fs_info->balance_running);
+
+       if (bargs) {
+               memset(bargs, 0, sizeof(*bargs));
+               update_ioctl_balance_args(fs_info, 0, bargs);
+       }
+
+       /*
+        * Tear the balance down on a hard error, on cancel, or on normal
+        * completion.  A balance that was only paused is left in place.
+        */
+       if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
+           balance_need_close(fs_info)) {
+               __cancel_balance(fs_info);
+       }
+
+       /* let btrfs_pause_balance()/btrfs_cancel_balance() waiters proceed */
+       wake_up(&fs_info->balance_wait_q);
+
+       return ret;
+out:
+       if (bctl->flags & BTRFS_BALANCE_RESUME)
+               __cancel_balance(fs_info);
+       else
+               kfree(bctl);
+       return ret;
+}
+
+/*
+ * Thread function started by btrfs_recover_balance().  @data is the
+ * struct btrfs_balance_control to install.  Takes volume_mutex and then
+ * balance_mutex, sets the balance control and, unless the SKIP_BALANCE
+ * option is set, continues the balance through btrfs_balance().
+ *
+ * Returns the result of btrfs_balance(), or 0 when the balance is skipped.
+ */
+static int balance_kthread(void *data)
+{
+       struct btrfs_balance_control *bctl = data;
+       struct btrfs_fs_info *fs_info = bctl->fs_info;
+       int ret = 0;
+
+       mutex_lock(&fs_info->volume_mutex);
+       mutex_lock(&fs_info->balance_mutex);
+
+       set_balance_control(bctl);
+
+       if (!btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
+               printk(KERN_INFO "btrfs: continuing balance\n");
+               ret = btrfs_balance(bctl, NULL);
+       } else {
+               printk(KERN_INFO "btrfs: force skipping balance\n");
+       }
+
+       mutex_unlock(&fs_info->balance_mutex);
+       mutex_unlock(&fs_info->volume_mutex);
+       return ret;
+}
+
+/*
+ * Look for a balance item in @tree_root and, if one exists, rebuild a
+ * balance control from it (with BTRFS_BALANCE_RESUME set) and hand it to a
+ * "btrfs-balance" kthread.
+ *
+ * Returns 0 when there is no balance item or the kthread was started,
+ * -ENOMEM on allocation failure, or the error from btrfs_search_slot() /
+ * kthread_run().
+ */
+int btrfs_recover_balance(struct btrfs_root *tree_root)
+{
+       struct btrfs_balance_control *bctl;
+       struct btrfs_disk_balance_args disk_bargs;
+       struct btrfs_balance_item *bitem;
+       struct extent_buffer *eb;
+       struct btrfs_path *path;
+       struct task_struct *tsk;
+       struct btrfs_key key;
+       int ret;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
+       if (!bctl) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       key.objectid = BTRFS_BALANCE_OBJECTID;
+       key.type = BTRFS_BALANCE_ITEM_KEY;
+       key.offset = 0;
+
+       ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
+       if (ret) {
+               /* > 0: no balance item, nothing to resume; < 0: real error */
+               if (ret > 0)
+                       ret = 0;
+               goto out_bctl;
+       }
+
+       eb = path->nodes[0];
+       bitem = btrfs_item_ptr(eb, path->slots[0], struct btrfs_balance_item);
+
+       bctl->fs_info = tree_root->fs_info;
+       bctl->flags = btrfs_balance_flags(eb, bitem) | BTRFS_BALANCE_RESUME;
+
+       /* copy the per-type args out of the item, one type at a time */
+       btrfs_balance_data(eb, bitem, &disk_bargs);
+       btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
+       btrfs_balance_meta(eb, bitem, &disk_bargs);
+       btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
+       btrfs_balance_sys(eb, bitem, &disk_bargs);
+       btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
+
+       tsk = kthread_run(balance_kthread, bctl, "btrfs-balance");
+       if (!IS_ERR(tsk))
+               goto out;       /* bctl was handed to the kthread, keep it */
+
+       ret = PTR_ERR(tsk);
+
+out_bctl:
+       kfree(bctl);
+out:
+       btrfs_free_path(path);
+       return ret;
+}
+
+/*
+ * Ask a running balance to pause and wait until it has stopped.
+ *
+ * Returns 0 once the balance is no longer running, or -ENOTCONN when there
+ * is no balance control or the balance is not currently running.
+ */
+int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
+{
+       int ret = -ENOTCONN;
+
+       mutex_lock(&fs_info->balance_mutex);
+
+       /* nothing to pause: no balance set up, or it is not running */
+       if (!fs_info->balance_ctl ||
+           !atomic_read(&fs_info->balance_running))
+               goto out;
+
+       atomic_inc(&fs_info->balance_pause_req);
+       mutex_unlock(&fs_info->balance_mutex);
+
+       /* balance_mutex is not held while we sleep here */
+       wait_event(fs_info->balance_wait_q,
+                  atomic_read(&fs_info->balance_running) == 0);
+
+       mutex_lock(&fs_info->balance_mutex);
+       /* we are good with balance_ctl ripped off from under us */
+       BUG_ON(atomic_read(&fs_info->balance_running));
+       atomic_dec(&fs_info->balance_pause_req);
+       ret = 0;
+out:
+       mutex_unlock(&fs_info->balance_mutex);
+       return ret;
+}
+
+/*
+ * Cancel the current balance, whether or not it is running at the moment.
+ *
+ * Returns 0 once the balance control is gone, or -ENOTCONN when there was
+ * no balance control to begin with.
+ */
+int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
+{
+       mutex_lock(&fs_info->balance_mutex);
+       if (!fs_info->balance_ctl) {
+               mutex_unlock(&fs_info->balance_mutex);
+               return -ENOTCONN;
+       }
+
+       atomic_inc(&fs_info->balance_cancel_req);
+
+       if (!atomic_read(&fs_info->balance_running)) {
+               /*
+                * __cancel_balance needs volume_mutex: drop balance_mutex
+                * and take both again, volume_mutex first.
+                */
+               mutex_unlock(&fs_info->balance_mutex);
+               mutex_lock(&fs_info->volume_mutex);
+               mutex_lock(&fs_info->balance_mutex);
+
+               /* re-check, balance_mutex was released for a moment */
+               if (fs_info->balance_ctl)
+                       __cancel_balance(fs_info);
+
+               mutex_unlock(&fs_info->volume_mutex);
+       } else {
+               /*
+                * if we are running just wait and return, balance item is
+                * deleted in btrfs_balance in this case
+                */
+               mutex_unlock(&fs_info->balance_mutex);
+               wait_event(fs_info->balance_wait_q,
+                          atomic_read(&fs_info->balance_running) == 0);
+               mutex_lock(&fs_info->balance_mutex);
+       }
+
+       BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
+       atomic_dec(&fs_info->balance_cancel_req);
+       mutex_unlock(&fs_info->balance_mutex);
+       return 0;
+}
+
 /*
  * shrinking a device means finding all of the device extents past
  * the new size, and then following the back refs to the chunks.
@@ -2323,8 +3041,7 @@ done:
        return ret;
 }
 
-static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
-                          struct btrfs_root *root,
+static int btrfs_add_system_chunk(struct btrfs_root *root,
                           struct btrfs_key *key,
                           struct btrfs_chunk *chunk, int item_size)
 {
@@ -2441,10 +3158,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
                max_stripe_size = 1024 * 1024 * 1024;
                max_chunk_size = 10 * max_stripe_size;
        } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
-               max_stripe_size = 256 * 1024 * 1024;
+               /* for larger filesystems, use larger metadata chunks */
+               if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024)
+                       max_stripe_size = 1024 * 1024 * 1024;
+               else
+                       max_stripe_size = 256 * 1024 * 1024;
                max_chunk_size = max_stripe_size;
        } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
-               max_stripe_size = 8 * 1024 * 1024;
+               max_stripe_size = 32 * 1024 * 1024;
                max_chunk_size = 2 * max_stripe_size;
        } else {
                printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n",
@@ -2496,7 +3217,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
                if (total_avail == 0)
                        continue;
 
-               ret = find_free_dev_extent(trans, device,
+               ret = find_free_dev_extent(device,
                                           max_stripe_size * dev_stripes,
                                           &dev_offset, &max_avail);
                if (ret && ret != -ENOSPC)
@@ -2687,7 +3408,7 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
        BUG_ON(ret);
 
        if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
-               ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk,
+               ret = btrfs_add_system_chunk(chunk_root, &key, chunk,
                                             item_size);
                BUG_ON(ret);
        }
@@ -2752,8 +3473,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
                return ret;
 
        alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
-                       (fs_info->metadata_alloc_profile &
-                        fs_info->avail_metadata_alloc_bits);
+                               fs_info->avail_metadata_alloc_bits;
        alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
 
        ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
@@ -2763,8 +3483,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
        sys_chunk_offset = chunk_offset + chunk_size;
 
        alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
-                       (fs_info->system_alloc_profile &
-                        fs_info->avail_system_alloc_bits);
+                               fs_info->avail_system_alloc_bits;
        alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
 
        ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
@@ -2901,26 +3620,13 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
        u64 stripe_nr;
        u64 stripe_nr_orig;
        u64 stripe_nr_end;
-       int stripes_allocated = 8;
-       int stripes_required = 1;
        int stripe_index;
        int i;
+       int ret = 0;
        int num_stripes;
        int max_errors = 0;
        struct btrfs_bio *bbio = NULL;
 
-       if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
-               stripes_allocated = 1;
-again:
-       if (bbio_ret) {
-               bbio = kzalloc(btrfs_bio_size(stripes_allocated),
-                               GFP_NOFS);
-               if (!bbio)
-                       return -ENOMEM;
-
-               atomic_set(&bbio->error, 0);
-       }
-
        read_lock(&em_tree->lock);
        em = lookup_extent_mapping(em_tree, logical, *length);
        read_unlock(&em_tree->lock);
@@ -2939,32 +3645,6 @@ again:
        if (mirror_num > map->num_stripes)
                mirror_num = 0;
 
-       /* if our btrfs_bio struct is too small, back off and try again */
-       if (rw & REQ_WRITE) {
-               if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
-                                BTRFS_BLOCK_GROUP_DUP)) {
-                       stripes_required = map->num_stripes;
-                       max_errors = 1;
-               } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
-                       stripes_required = map->sub_stripes;
-                       max_errors = 1;
-               }
-       }
-       if (rw & REQ_DISCARD) {
-               if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
-                                BTRFS_BLOCK_GROUP_RAID1 |
-                                BTRFS_BLOCK_GROUP_DUP |
-                                BTRFS_BLOCK_GROUP_RAID10)) {
-                       stripes_required = map->num_stripes;
-               }
-       }
-       if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
-           stripes_allocated < stripes_required) {
-               stripes_allocated = map->num_stripes;
-               free_extent_map(em);
-               kfree(bbio);
-               goto again;
-       }
        stripe_nr = offset;
        /*
         * stripe_nr counts the total number of stripes we have to stride
@@ -2980,10 +3660,7 @@ again:
 
        if (rw & REQ_DISCARD)
                *length = min_t(u64, em->len - offset, *length);
-       else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
-                             BTRFS_BLOCK_GROUP_RAID1 |
-                             BTRFS_BLOCK_GROUP_RAID10 |
-                             BTRFS_BLOCK_GROUP_DUP)) {
+       else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
                /* we limit the length of each bio to what fits in a stripe */
                *length = min_t(u64, em->len - offset,
                                map->stripe_len - stripe_offset);
@@ -3059,81 +3736,55 @@ again:
        }
        BUG_ON(stripe_index >= map->num_stripes);
 
+       bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS);
+       if (!bbio) {
+               ret = -ENOMEM;
+               goto out;
+       }
+       atomic_set(&bbio->error, 0);
+
        if (rw & REQ_DISCARD) {
+               int factor = 0;
+               int sub_stripes = 0;
+               u64 stripes_per_dev = 0;
+               u32 remaining_stripes = 0;
+
+               if (map->type &
+                   (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
+                       if (map->type & BTRFS_BLOCK_GROUP_RAID0)
+                               sub_stripes = 1;
+                       else
+                               sub_stripes = map->sub_stripes;
+
+                       factor = map->num_stripes / sub_stripes;
+                       stripes_per_dev = div_u64_rem(stripe_nr_end -
+                                                     stripe_nr_orig,
+                                                     factor,
+                                                     &remaining_stripes);
+               }
+
                for (i = 0; i < num_stripes; i++) {
                        bbio->stripes[i].physical =
                                map->stripes[stripe_index].physical +
                                stripe_offset + stripe_nr * map->stripe_len;
                        bbio->stripes[i].dev = map->stripes[stripe_index].dev;
 
-                       if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
-                               u64 stripes;
-                               u32 last_stripe = 0;
-                               int j;
-
-                               div_u64_rem(stripe_nr_end - 1,
-                                           map->num_stripes,
-                                           &last_stripe);
-
-                               for (j = 0; j < map->num_stripes; j++) {
-                                       u32 test;
-
-                                       div_u64_rem(stripe_nr_end - 1 - j,
-                                                   map->num_stripes, &test);
-                                       if (test == stripe_index)
-                                               break;
-                               }
-                               stripes = stripe_nr_end - 1 - j;
-                               do_div(stripes, map->num_stripes);
-                               bbio->stripes[i].length = map->stripe_len *
-                                       (stripes - stripe_nr + 1);
-
-                               if (i == 0) {
-                                       bbio->stripes[i].length -=
-                                               stripe_offset;
-                                       stripe_offset = 0;
-                               }
-                               if (stripe_index == last_stripe)
-                                       bbio->stripes[i].length -=
-                                               stripe_end_offset;
-                       } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
-                               u64 stripes;
-                               int j;
-                               int factor = map->num_stripes /
-                                            map->sub_stripes;
-                               u32 last_stripe = 0;
-
-                               div_u64_rem(stripe_nr_end - 1,
-                                           factor, &last_stripe);
-                               last_stripe *= map->sub_stripes;
-
-                               for (j = 0; j < factor; j++) {
-                                       u32 test;
-
-                                       div_u64_rem(stripe_nr_end - 1 - j,
-                                                   factor, &test);
-
-                                       if (test ==
-                                           stripe_index / map->sub_stripes)
-                                               break;
-                               }
-                               stripes = stripe_nr_end - 1 - j;
-                               do_div(stripes, factor);
-                               bbio->stripes[i].length = map->stripe_len *
-                                       (stripes - stripe_nr + 1);
-
-                               if (i < map->sub_stripes) {
+                       if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+                                        BTRFS_BLOCK_GROUP_RAID10)) {
+                               bbio->stripes[i].length = stripes_per_dev *
+                                                         map->stripe_len;
+                               if (i / sub_stripes < remaining_stripes)
+                                       bbio->stripes[i].length +=
+                                               map->stripe_len;
+                               if (i < sub_stripes)
                                        bbio->stripes[i].length -=
                                                stripe_offset;
-                                       if (i == map->sub_stripes - 1)
-                                               stripe_offset = 0;
-                               }
-                               if (stripe_index >= last_stripe &&
-                                   stripe_index <= (last_stripe +
-                                                    map->sub_stripes - 1)) {
+                               if ((i / sub_stripes + 1) %
+                                   sub_stripes == remaining_stripes)
                                        bbio->stripes[i].length -=
                                                stripe_end_offset;
-                               }
+                               if (i == sub_stripes - 1)
+                                       stripe_offset = 0;
                        } else
                                bbio->stripes[i].length = *length;
 
@@ -3155,15 +3806,22 @@ again:
                        stripe_index++;
                }
        }
-       if (bbio_ret) {
-               *bbio_ret = bbio;
-               bbio->num_stripes = num_stripes;
-               bbio->max_errors = max_errors;
-               bbio->mirror_num = mirror_num;
+
+       if (rw & REQ_WRITE) {
+               if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
+                                BTRFS_BLOCK_GROUP_RAID10 |
+                                BTRFS_BLOCK_GROUP_DUP)) {
+                       max_errors = 1;
+               }
        }
+
+       *bbio_ret = bbio;
+       bbio->num_stripes = num_stripes;
+       bbio->max_errors = max_errors;
+       bbio->mirror_num = mirror_num;
 out:
        free_extent_map(em);
-       return 0;
+       return ret;
 }
 
 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
@@ -3304,7 +3962,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
        /* don't bother with additional async steps for reads, right now */
        if (!(rw & REQ_WRITE)) {
                bio_get(bio);
-               submit_bio(rw, bio);
+               btrfsic_submit_bio(rw, bio);
                bio_put(bio);
                return 0;
        }
@@ -3399,7 +4057,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
                        if (async_submit)
                                schedule_bio(root, dev, rw, bio);
                        else
-                               submit_bio(rw, bio);
+                               btrfsic_submit_bio(rw, bio);
                } else {
                        bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
                        bio->bi_sector = logical >> 9;
@@ -3568,7 +4226,7 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
        struct btrfs_fs_devices *fs_devices;
        int ret;
 
-       mutex_lock(&uuid_mutex);
+       BUG_ON(!mutex_is_locked(&uuid_mutex));
 
        fs_devices = root->fs_info->fs_devices->seed;
        while (fs_devices) {
@@ -3606,7 +4264,6 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
        fs_devices->seed = root->fs_info->fs_devices->seed;
        root->fs_info->fs_devices->seed = fs_devices;
 out:
-       mutex_unlock(&uuid_mutex);
        return ret;
 }
 
@@ -3749,6 +4406,9 @@ int btrfs_read_chunk_tree(struct btrfs_root *root)
        if (!path)
                return -ENOMEM;
 
+       mutex_lock(&uuid_mutex);
+       lock_chunks(root);
+
        /* first we search for all of the device items, and then we
         * read in all of the chunk items.  This way we can create chunk
         * mappings that reference all of the devices that are afound
@@ -3799,6 +4459,9 @@ again:
        }
        ret = 0;
 error:
+       unlock_chunks(root);
+       mutex_unlock(&uuid_mutex);
+
        btrfs_free_path(path);
        return ret;
 }
index 78f2d4d..19ac950 100644 (file)
@@ -186,6 +186,51 @@ struct map_lookup {
 #define map_lookup_size(n) (sizeof(struct map_lookup) + \
                            (sizeof(struct btrfs_bio_stripe) * (n)))
 
+/*
+ * Restriper's general type filter.  These bits are tested against
+ * btrfs_balance_control::flags to tell which of the data, system and
+ * metadata types were picked for the balance.
+ */
+#define BTRFS_BALANCE_DATA             (1ULL << 0)
+#define BTRFS_BALANCE_SYSTEM           (1ULL << 1)
+#define BTRFS_BALANCE_METADATA         (1ULL << 2)
+
+#define BTRFS_BALANCE_TYPE_MASK                (BTRFS_BALANCE_DATA |       \
+                                        BTRFS_BALANCE_SYSTEM |     \
+                                        BTRFS_BALANCE_METADATA)
+
+#define BTRFS_BALANCE_FORCE            (1ULL << 3)
+#define BTRFS_BALANCE_RESUME           (1ULL << 4)
+
+/*
+ * Balance filters.  These bits are tested against the flags of a
+ * struct btrfs_balance_args.
+ */
+#define BTRFS_BALANCE_ARGS_PROFILES    (1ULL << 0)
+#define BTRFS_BALANCE_ARGS_USAGE       (1ULL << 1)
+#define BTRFS_BALANCE_ARGS_DEVID       (1ULL << 2)
+#define BTRFS_BALANCE_ARGS_DRANGE      (1ULL << 3)
+#define BTRFS_BALANCE_ARGS_VRANGE      (1ULL << 4)
+
+/*
+ * Profile changing flags.  When SOFT is set we won't relocate a chunk if
+ * it already has the target profile (even though it may be
+ * half-filled).
+ */
+#define BTRFS_BALANCE_ARGS_CONVERT     (1ULL << 8)
+#define BTRFS_BALANCE_ARGS_SOFT                (1ULL << 9)
+
+struct btrfs_balance_args;
+struct btrfs_balance_progress;
+/*
+ * Describes one balance request and carries its progress counters.  This
+ * is what btrfs_balance() takes as its first argument.
+ */
+struct btrfs_balance_control {
+       struct btrfs_fs_info *fs_info;
+
+       /* per-type arguments for data, metadata and system */
+       struct btrfs_balance_args data;
+       struct btrfs_balance_args meta;
+       struct btrfs_balance_args sys;
+
+       /* BTRFS_BALANCE_* bits: type filter plus FORCE / RESUME */
+       u64 flags;
+
+       /* progress counters */
+       struct btrfs_balance_progress stat;
+};
+
 int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
                                   u64 end, u64 *length);
 
@@ -228,9 +273,12 @@ struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
                                       u8 *uuid, u8 *fsid);
 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
 int btrfs_init_new_device(struct btrfs_root *root, char *path);
-int btrfs_balance(struct btrfs_root *dev_root);
+int btrfs_balance(struct btrfs_balance_control *bctl,
+                 struct btrfs_ioctl_balance_args *bargs);
+int btrfs_recover_balance(struct btrfs_root *tree_root);
+int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
+int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
-int find_free_dev_extent(struct btrfs_trans_handle *trans,
-                        struct btrfs_device *device, u64 num_bytes,
+int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
                         u64 *start, u64 *max_avail);
 #endif
index 3848b04..e7a5659 100644 (file)
@@ -200,7 +200,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
        ret = btrfs_update_inode(trans, root, inode);
        BUG_ON(ret);
 out:
-       btrfs_end_transaction_throttle(trans, root);
+       btrfs_end_transaction(trans, root);
        return ret;
 }
 
index c283a1e..208c6aa 100644 (file)
@@ -140,21 +140,19 @@ static int do_getname(const char __user *filename, char *page)
 
 static char *getname_flags(const char __user *filename, int flags, int *empty)
 {
-       char *tmp, *result;
-
-       result = ERR_PTR(-ENOMEM);
-       tmp = __getname();
-       if (tmp)  {
-               int retval = do_getname(filename, tmp);
-
-               result = tmp;
-               if (retval < 0) {
-                       if (retval == -ENOENT && empty)
-                               *empty = 1;
-                       if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) {
-                               __putname(tmp);
-                               result = ERR_PTR(retval);
-                       }
+       char *result = __getname();
+       int retval;
+
+       if (!result)
+               return ERR_PTR(-ENOMEM);
+
+       retval = do_getname(filename, result);
+       if (retval < 0) {
+               if (retval == -ENOENT && empty)
+                       *empty = 1;
+               if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) {
+                       __putname(result);
+                       return ERR_PTR(retval);
                }
        }
        audit_getname(result);
index 5485a53..9cde9ed 100644 (file)
@@ -198,65 +198,7 @@ static int proc_root_link(struct dentry *dentry, struct path *path)
        return result;
 }
 
-static struct mm_struct *__check_mem_permission(struct task_struct *task)
-{
-       struct mm_struct *mm;
-
-       mm = get_task_mm(task);
-       if (!mm)
-               return ERR_PTR(-EINVAL);
-
-       /*
-        * A task can always look at itself, in case it chooses
-        * to use system calls instead of load instructions.
-        */
-       if (task == current)
-               return mm;
-
-       /*
-        * If current is actively ptrace'ing, and would also be
-        * permitted to freshly attach with ptrace now, permit it.
-        */
-       if (task_is_stopped_or_traced(task)) {
-               int match;
-               rcu_read_lock();
-               match = (ptrace_parent(task) == current);
-               rcu_read_unlock();
-               if (match && ptrace_may_access(task, PTRACE_MODE_ATTACH))
-                       return mm;
-       }
-
-       /*
-        * No one else is allowed.
-        */
-       mmput(mm);
-       return ERR_PTR(-EPERM);
-}
-
-/*
- * If current may access user memory in @task return a reference to the
- * corresponding mm, otherwise ERR_PTR.
- */
-static struct mm_struct *check_mem_permission(struct task_struct *task)
-{
-       struct mm_struct *mm;
-       int err;
-
-       /*
-        * Avoid racing if task exec's as we might get a new mm but validate
-        * against old credentials.
-        */
-       err = mutex_lock_killable(&task->signal->cred_guard_mutex);
-       if (err)
-               return ERR_PTR(err);
-
-       mm = __check_mem_permission(task);
-       mutex_unlock(&task->signal->cred_guard_mutex);
-
-       return mm;
-}
-
-struct mm_struct *mm_for_maps(struct task_struct *task)
+static struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
 {
        struct mm_struct *mm;
        int err;
@@ -267,7 +209,7 @@ struct mm_struct *mm_for_maps(struct task_struct *task)
 
        mm = get_task_mm(task);
        if (mm && mm != current->mm &&
-                       !ptrace_may_access(task, PTRACE_MODE_READ)) {
+                       !ptrace_may_access(task, mode)) {
                mmput(mm);
                mm = ERR_PTR(-EACCES);
        }
@@ -276,6 +218,11 @@ struct mm_struct *mm_for_maps(struct task_struct *task)
        return mm;
 }
 
+struct mm_struct *mm_for_maps(struct task_struct *task)
+{
+       return mm_access(task, PTRACE_MODE_READ);
+}
+
 static int proc_pid_cmdline(struct task_struct *task, char * buffer)
 {
        int res = 0;
@@ -752,38 +699,39 @@ static const struct file_operations proc_single_file_operations = {
 
 static int mem_open(struct inode* inode, struct file* file)
 {
-       file->private_data = (void*)((long)current->self_exec_id);
+       struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
+       struct mm_struct *mm;
+
+       if (!task)
+               return -ESRCH;
+
+       mm = mm_access(task, PTRACE_MODE_ATTACH);
+       put_task_struct(task);
+
+       if (IS_ERR(mm))
+               return PTR_ERR(mm);
+
        /* OK to pass negative loff_t, we can catch out-of-range */
        file->f_mode |= FMODE_UNSIGNED_OFFSET;
+       file->private_data = mm;
+
        return 0;
 }
 
 static ssize_t mem_read(struct file * file, char __user * buf,
                        size_t count, loff_t *ppos)
 {
-       struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
+       int ret;
        char *page;
        unsigned long src = *ppos;
-       int ret = -ESRCH;
-       struct mm_struct *mm;
+       struct mm_struct *mm = file->private_data;
 
-       if (!task)
-               goto out_no_task;
+       if (!mm)
+               return 0;
 
-       ret = -ENOMEM;
        page = (char *)__get_free_page(GFP_TEMPORARY);
        if (!page)
-               goto out;
-
-       mm = check_mem_permission(task);
-       ret = PTR_ERR(mm);
-       if (IS_ERR(mm))
-               goto out_free;
-
-       ret = -EIO;
-       if (file->private_data != (void*)((long)current->self_exec_id))
-               goto out_put;
+               return -ENOMEM;
 
        ret = 0;
  
@@ -810,13 +758,7 @@ static ssize_t mem_read(struct file * file, char __user * buf,
        }
        *ppos = src;
 
-out_put:
-       mmput(mm);
-out_free:
        free_page((unsigned long) page);
-out:
-       put_task_struct(task);
-out_no_task:
        return ret;
 }
 
@@ -825,27 +767,15 @@ static ssize_t mem_write(struct file * file, const char __user *buf,
 {
        int copied;
        char *page;
-       struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
        unsigned long dst = *ppos;
-       struct mm_struct *mm;
+       struct mm_struct *mm = file->private_data;
 
-       copied = -ESRCH;
-       if (!task)
-               goto out_no_task;
+       if (!mm)
+               return 0;
 
-       copied = -ENOMEM;
        page = (char *)__get_free_page(GFP_TEMPORARY);
        if (!page)
-               goto out_task;
-
-       mm = check_mem_permission(task);
-       copied = PTR_ERR(mm);
-       if (IS_ERR(mm))
-               goto out_free;
-
-       copied = -EIO;
-       if (file->private_data != (void *)((long)current->self_exec_id))
-               goto out_mm;
+               return -ENOMEM;
 
        copied = 0;
        while (count > 0) {
@@ -869,13 +799,7 @@ static ssize_t mem_write(struct file * file, const char __user *buf,
        }
        *ppos = dst;
 
-out_mm:
-       mmput(mm);
-out_free:
        free_page((unsigned long) page);
-out_task:
-       put_task_struct(task);
-out_no_task:
        return copied;
 }
 
@@ -895,11 +819,20 @@ loff_t mem_lseek(struct file *file, loff_t offset, int orig)
        return file->f_pos;
 }
 
+static int mem_release(struct inode *inode, struct file *file)
+{
+       /* Put the mm pinned by mem_open(); NULL if the task had none. */
+       if (file->private_data)
+               mmput(file->private_data);
+       return 0;
+}
+
 static const struct file_operations proc_mem_operations = {
        .llseek         = mem_lseek,
        .read           = mem_read,
        .write          = mem_write,
        .open           = mem_open,
+       .release        = mem_release,
 };
 
 static ssize_t environ_read(struct file *file, char __user *buf,
@@ -1199,9 +1132,6 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
        ssize_t length;
        uid_t loginuid;
 
-       if (!capable(CAP_AUDIT_CONTROL))
-               return -EPERM;
-
        rcu_read_lock();
        if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
                rcu_read_unlock();
@@ -1230,7 +1160,7 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
                goto out_free_page;
 
        }
-       length = audit_set_loginuid(current, loginuid);
+       length = audit_set_loginuid(loginuid);
        if (likely(length == 0))
                length = count;
 
index 574d4ee..74b9baf 100644 (file)
@@ -111,8 +111,7 @@ xfs_ioend_new_eof(
        xfs_fsize_t             bsize;
 
        bsize = ioend->io_offset + ioend->io_size;
-       isize = MAX(ip->i_size, ip->i_new_size);
-       isize = MIN(isize, bsize);
+       isize = MIN(i_size_read(VFS_I(ip)), bsize);
        return isize > ip->i_d.di_size ? isize : 0;
 }
 
@@ -126,11 +125,7 @@ static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
 }
 
 /*
- * Update on-disk file size now that data has been written to disk.  The
- * current in-memory file size is i_size.  If a write is beyond eof i_new_size
- * will be the intended file size until i_size is updated.  If this write does
- * not extend all the way to the valid file size then restrict this update to
- * the end of the write.
+ * Update on-disk file size now that data has been written to disk.
  *
  * This function does not block as blocking on the inode lock in IO completion
  * can lead to IO completion order dependency deadlocks.. If it can't get the
@@ -1278,6 +1273,15 @@ xfs_end_io_direct_write(
 {
        struct xfs_ioend        *ioend = iocb->private;
 
+       /*
+        * While the generic direct I/O code updates the inode size, it does
+        * so only after the end_io handler is called, which means our
+        * end_io handler thinks the on-disk size is outside the in-core
+        * size.  To prevent this just update it a little bit earlier here.
+        */
+       if (offset + size > i_size_read(ioend->io_inode))
+               i_size_write(ioend->io_inode, offset + size);
+
        /*
         * blockdev_direct_IO can return an error even after the I/O
         * completion handler was called.  Thus we need to protect
@@ -1340,12 +1344,11 @@ xfs_vm_write_failed(
 
        if (to > inode->i_size) {
                /*
-                * punch out the delalloc blocks we have already allocated. We
-                * don't call xfs_setattr() to do this as we may be in the
-                * middle of a multi-iovec write and so the vfs inode->i_size
-                * will not match the xfs ip->i_size and so it will zero too
-                * much. Hence we jus truncate the page cache to zero what is
-                * necessary and punch the delalloc blocks directly.
+                * Punch out the delalloc blocks we have already allocated.
+                *
+                * Don't bother with xfs_setattr given that nothing can have
+                * made it to disk yet as the page is still locked at this
+                * point.
                 */
                struct xfs_inode        *ip = XFS_I(inode);
                xfs_fileoff_t           start_fsb;
index 1e5d97f..08b9ac6 100644 (file)
@@ -827,10 +827,6 @@ xfs_attr_inactive(xfs_inode_t *dp)
        if (error)
                goto out;
 
-       /*
-        * Commit the last in the sequence of transactions.
-        */
-       xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
        error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES);
        xfs_iunlock(dp, XFS_ILOCK_EXCL);
 
index c1b55e5..d25eafd 100644 (file)
@@ -271,10 +271,6 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff)
        dp = args->dp;
        mp = dp->i_mount;
        dp->i_d.di_forkoff = forkoff;
-       dp->i_df.if_ext_max =
-               XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
-       dp->i_afp->if_ext_max =
-               XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
 
        ifp = dp->i_afp;
        ASSERT(ifp->if_flags & XFS_IFINLINE);
@@ -326,7 +322,6 @@ xfs_attr_fork_reset(
        ASSERT(ip->i_d.di_anextents == 0);
        ASSERT(ip->i_afp == NULL);
 
-       ip->i_df.if_ext_max = XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t);
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 }
 
@@ -389,10 +384,6 @@ xfs_attr_shortform_remove(xfs_da_args_t *args)
                                (args->op_flags & XFS_DA_OP_ADDNAME) ||
                                !(mp->m_flags & XFS_MOUNT_ATTR2) ||
                                dp->i_d.di_format == XFS_DINODE_FMT_BTREE);
-               dp->i_afp->if_ext_max =
-                       XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
-               dp->i_df.if_ext_max =
-                       XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
                xfs_trans_log_inode(args->trans, dp,
                                        XFS_ILOG_CORE | XFS_ILOG_ADATA);
        }
index d0ab788..188ef2f 100644 (file)
@@ -249,7 +249,27 @@ xfs_bmbt_lookup_ge(
 }
 
 /*
-* Update the record referred to by cur to the value given
+ * Check if the inode needs to be converted to btree format.
+ */
+static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork)
+{
+       return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
+               XFS_IFORK_NEXTENTS(ip, whichfork) >
+                       XFS_IFORK_MAXEXT(ip, whichfork);
+}
+
+/*
+ * Check if the inode should be converted to extent format.
+ */
+static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork)
+{
+       return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
+               XFS_IFORK_NEXTENTS(ip, whichfork) <=
+                       XFS_IFORK_MAXEXT(ip, whichfork);
+}
+
+/*
+ * Update the record referred to by cur to the value given
  * by [off, bno, len, state].
  * This either works (return 0) or gets an EFSCORRUPTED error.
  */
@@ -683,8 +703,8 @@ xfs_bmap_add_extent_delay_real(
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
                }
-               if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
-                   bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) {
+
+               if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
                        error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
                                        bma->firstblock, bma->flist,
                                        &bma->cur, 1, &tmp_rval, XFS_DATA_FORK);
@@ -767,8 +787,8 @@ xfs_bmap_add_extent_delay_real(
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
                }
-               if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
-                   bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) {
+
+               if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
                        error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
                                bma->firstblock, bma->flist, &bma->cur, 1,
                                &tmp_rval, XFS_DATA_FORK);
@@ -836,8 +856,8 @@ xfs_bmap_add_extent_delay_real(
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
                }
-               if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
-                   bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) {
+
+               if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
                        error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
                                        bma->firstblock, bma->flist, &bma->cur,
                                        1, &tmp_rval, XFS_DATA_FORK);
@@ -884,8 +904,7 @@ xfs_bmap_add_extent_delay_real(
        }
 
        /* convert to a btree if necessary */
-       if (XFS_IFORK_FORMAT(bma->ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS &&
-           XFS_IFORK_NEXTENTS(bma->ip, XFS_DATA_FORK) > ifp->if_ext_max) {
+       if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
                int     tmp_logflags;   /* partial log flag return val */
 
                ASSERT(bma->cur == NULL);
@@ -1421,8 +1440,7 @@ xfs_bmap_add_extent_unwritten_real(
        }
 
        /* convert to a btree if necessary */
-       if (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS &&
-           XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > ifp->if_ext_max) {
+       if (xfs_bmap_needs_btree(ip, XFS_DATA_FORK)) {
                int     tmp_logflags;   /* partial log flag return val */
 
                ASSERT(cur == NULL);
@@ -1812,8 +1830,7 @@ xfs_bmap_add_extent_hole_real(
        }
 
        /* convert to a btree if necessary */
-       if (XFS_IFORK_FORMAT(bma->ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
-           XFS_IFORK_NEXTENTS(bma->ip, whichfork) > ifp->if_ext_max) {
+       if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
                int     tmp_logflags;   /* partial log flag return val */
 
                ASSERT(bma->cur == NULL);
@@ -3037,8 +3054,7 @@ xfs_bmap_extents_to_btree(
 
        ifp = XFS_IFORK_PTR(ip, whichfork);
        ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS);
-       ASSERT(ifp->if_ext_max ==
-              XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+
        /*
         * Make space in the inode incore.
         */
@@ -3184,13 +3200,8 @@ xfs_bmap_forkoff_reset(
            ip->i_d.di_format != XFS_DINODE_FMT_BTREE) {
                uint    dfl_forkoff = xfs_default_attroffset(ip) >> 3;
 
-               if (dfl_forkoff > ip->i_d.di_forkoff) {
+               if (dfl_forkoff > ip->i_d.di_forkoff)
                        ip->i_d.di_forkoff = dfl_forkoff;
-                       ip->i_df.if_ext_max =
-                               XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t);
-                       ip->i_afp->if_ext_max =
-                               XFS_IFORK_ASIZE(ip) / sizeof(xfs_bmbt_rec_t);
-               }
        }
 }
 
@@ -3430,8 +3441,6 @@ xfs_bmap_add_attrfork(
        int                     error;          /* error return value */
 
        ASSERT(XFS_IFORK_Q(ip) == 0);
-       ASSERT(ip->i_df.if_ext_max ==
-              XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
 
        mp = ip->i_mount;
        ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
@@ -3486,12 +3495,9 @@ xfs_bmap_add_attrfork(
                error = XFS_ERROR(EINVAL);
                goto error1;
        }
-       ip->i_df.if_ext_max =
-               XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
+
        ASSERT(ip->i_afp == NULL);
        ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
-       ip->i_afp->if_ext_max =
-               XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
        ip->i_afp->if_flags = XFS_IFEXTENTS;
        logflags = 0;
        xfs_bmap_init(&flist, &firstblock);
@@ -3535,20 +3541,17 @@ xfs_bmap_add_attrfork(
                } else
                        spin_unlock(&mp->m_sb_lock);
        }
-       if ((error = xfs_bmap_finish(&tp, &flist, &committed)))
+
+       error = xfs_bmap_finish(&tp, &flist, &committed);
+       if (error)
                goto error2;
-       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-       ASSERT(ip->i_df.if_ext_max ==
-              XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
-       return error;
+       return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 error2:
        xfs_bmap_cancel(&flist);
 error1:
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
 error0:
        xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
-       ASSERT(ip->i_df.if_ext_max ==
-              XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
        return error;
 }
 
@@ -3994,11 +3997,8 @@ xfs_bmap_one_block(
        xfs_bmbt_irec_t s;              /* internal version of extent */
 
 #ifndef DEBUG
-       if (whichfork == XFS_DATA_FORK) {
-               return S_ISREG(ip->i_d.di_mode) ?
-                       (ip->i_size == ip->i_mount->m_sb.sb_blocksize) :
-                       (ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize);
-       }
+       if (whichfork == XFS_DATA_FORK)
+               return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize;
 #endif /* !DEBUG */
        if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1)
                return 0;
@@ -4010,7 +4010,7 @@ xfs_bmap_one_block(
        xfs_bmbt_get_all(ep, &s);
        rval = s.br_startoff == 0 && s.br_blockcount == 1;
        if (rval && whichfork == XFS_DATA_FORK)
-               ASSERT(ip->i_size == ip->i_mount->m_sb.sb_blocksize);
+               ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize);
        return rval;
 }
 
@@ -4379,8 +4379,6 @@ xfs_bmapi_read(
        XFS_STATS_INC(xs_blk_mapr);
 
        ifp = XFS_IFORK_PTR(ip, whichfork);
-       ASSERT(ifp->if_ext_max ==
-              XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
 
        if (!(ifp->if_flags & XFS_IFEXTENTS)) {
                error = xfs_iread_extents(NULL, ip, whichfork);
@@ -4871,8 +4869,6 @@ xfs_bmapi_write(
                return XFS_ERROR(EIO);
 
        ifp = XFS_IFORK_PTR(ip, whichfork);
-       ASSERT(ifp->if_ext_max ==
-              XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
 
        XFS_STATS_INC(xs_blk_mapw);
 
@@ -4981,8 +4977,7 @@ xfs_bmapi_write(
        /*
         * Transform from btree to extents, give it cur.
         */
-       if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
-           XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) {
+       if (xfs_bmap_wants_extents(ip, whichfork)) {
                int             tmp_logflags = 0;
 
                ASSERT(bma.cur);
@@ -4992,10 +4987,10 @@ xfs_bmapi_write(
                if (error)
                        goto error0;
        }
-       ASSERT(ifp->if_ext_max ==
-              XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+
        ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE ||
-              XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max);
+              XFS_IFORK_NEXTENTS(ip, whichfork) >
+               XFS_IFORK_MAXEXT(ip, whichfork));
        error = 0;
 error0:
        /*
@@ -5095,8 +5090,7 @@ xfs_bunmapi(
 
        ASSERT(len > 0);
        ASSERT(nexts >= 0);
-       ASSERT(ifp->if_ext_max ==
-              XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+
        if (!(ifp->if_flags & XFS_IFEXTENTS) &&
            (error = xfs_iread_extents(tp, ip, whichfork)))
                return error;
@@ -5322,7 +5316,8 @@ xfs_bunmapi(
                 */
                if (!wasdel && xfs_trans_get_block_res(tp) == 0 &&
                    XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
-                   XFS_IFORK_NEXTENTS(ip, whichfork) >= ifp->if_ext_max &&
+                   XFS_IFORK_NEXTENTS(ip, whichfork) >= /* Note the >= */
+                       XFS_IFORK_MAXEXT(ip, whichfork) &&
                    del.br_startoff > got.br_startoff &&
                    del.br_startoff + del.br_blockcount <
                    got.br_startoff + got.br_blockcount) {
@@ -5353,13 +5348,11 @@ nodelete:
                }
        }
        *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0;
-       ASSERT(ifp->if_ext_max ==
-              XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+
        /*
         * Convert to a btree if necessary.
         */
-       if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
-           XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) {
+       if (xfs_bmap_needs_btree(ip, whichfork)) {
                ASSERT(cur == NULL);
                error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist,
                        &cur, 0, &tmp_logflags, whichfork);
@@ -5370,8 +5363,7 @@ nodelete:
        /*
         * transform from btree to extents, give it cur
         */
-       else if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
-                XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) {
+       else if (xfs_bmap_wants_extents(ip, whichfork)) {
                ASSERT(cur != NULL);
                error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags,
                        whichfork);
@@ -5382,8 +5374,6 @@ nodelete:
        /*
         * transform from extents to local?
         */
-       ASSERT(ifp->if_ext_max ==
-              XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
        error = 0;
 error0:
        /*
@@ -5434,7 +5424,7 @@ xfs_getbmapx_fix_eof_hole(
        if (startblock == HOLESTARTBLOCK) {
                mp = ip->i_mount;
                out->bmv_block = -1;
-               fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, ip->i_size));
+               fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip)));
                fixlen -= out->bmv_offset;
                if (prealloced && out->bmv_offset + out->bmv_length == end) {
                        /* Came to hole at EOF. Trim it. */
@@ -5522,7 +5512,7 @@ xfs_getbmap(
                        fixlen = XFS_MAXIOFFSET(mp);
                } else {
                        prealloced = 0;
-                       fixlen = ip->i_size;
+                       fixlen = XFS_ISIZE(ip);
                }
        }
 
@@ -5551,7 +5541,7 @@ xfs_getbmap(
 
        xfs_ilock(ip, XFS_IOLOCK_SHARED);
        if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) {
-               if (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size) {
+               if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) {
                        error = xfs_flush_pages(ip, 0, -1, 0, FI_REMAPF);
                        if (error)
                                goto out_unlock_iolock;
index 654dc6f..dd974a5 100644 (file)
@@ -163,12 +163,14 @@ xfs_swap_extents_check_format(
 
        /* Check temp in extent form to max in target */
        if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
-           XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > ip->i_df.if_ext_max)
+           XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) >
+                       XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
                return EINVAL;
 
        /* Check target in extent form to max in temp */
        if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
-           XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max)
+           XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) >
+                       XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
                return EINVAL;
 
        /*
@@ -180,18 +182,25 @@ xfs_swap_extents_check_format(
         * (a common defrag case) which will occur when the temp inode is in
         * extent format...
         */
-       if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
-           ((XFS_IFORK_BOFF(ip) &&
-             tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) ||
-            XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= ip->i_df.if_ext_max))
-               return EINVAL;
+       if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
+               if (XFS_IFORK_BOFF(ip) &&
+                   tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip))
+                       return EINVAL;
+               if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
+                   XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
+                       return EINVAL;
+       }
 
        /* Reciprocal target->temp btree format checks */
-       if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
-           ((XFS_IFORK_BOFF(tip) &&
-             ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) ||
-            XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= tip->i_df.if_ext_max))
-               return EINVAL;
+       if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
+               if (XFS_IFORK_BOFF(tip) &&
+                   ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip))
+                       return EINVAL;
+
+               if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
+                   XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
+                       return EINVAL;
+       }
 
        return 0;
 }
@@ -348,16 +357,6 @@ xfs_swap_extents(
        *ifp = *tifp;           /* struct copy */
        *tifp = *tempifp;       /* struct copy */
 
-       /*
-        * Fix the in-memory data fork values that are dependent on the fork
-        * offset in the inode. We can't assume they remain the same as attr2
-        * has dynamic fork offsets.
-        */
-       ifp->if_ext_max = XFS_IFORK_SIZE(ip, XFS_DATA_FORK) /
-                                       (uint)sizeof(xfs_bmbt_rec_t);
-       tifp->if_ext_max = XFS_IFORK_SIZE(tip, XFS_DATA_FORK) /
-                                       (uint)sizeof(xfs_bmbt_rec_t);
-
        /*
         * Fix the on-disk inode values
         */
index f675f3d..7e5bc87 100644 (file)
@@ -327,7 +327,7 @@ xfs_file_aio_read(
                                mp->m_rtdev_targp : mp->m_ddev_targp;
                if ((iocb->ki_pos & target->bt_smask) ||
                    (size & target->bt_smask)) {
-                       if (iocb->ki_pos == ip->i_size)
+                       if (iocb->ki_pos == i_size_read(inode))
                                return 0;
                        return -XFS_ERROR(EINVAL);
                }
@@ -412,51 +412,6 @@ xfs_file_splice_read(
        return ret;
 }
 
-STATIC void
-xfs_aio_write_isize_update(
-       struct inode    *inode,
-       loff_t          *ppos,
-       ssize_t         bytes_written)
-{
-       struct xfs_inode        *ip = XFS_I(inode);
-       xfs_fsize_t             isize = i_size_read(inode);
-
-       if (bytes_written > 0)
-               XFS_STATS_ADD(xs_write_bytes, bytes_written);
-
-       if (unlikely(bytes_written < 0 && bytes_written != -EFAULT &&
-                                       *ppos > isize))
-               *ppos = isize;
-
-       if (*ppos > ip->i_size) {
-               xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
-               if (*ppos > ip->i_size)
-                       ip->i_size = *ppos;
-               xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
-       }
-}
-
-/*
- * If this was a direct or synchronous I/O that failed (such as ENOSPC) then
- * part of the I/O may have been written to disk before the error occurred.  In
- * this case the on-disk file size may have been adjusted beyond the in-memory
- * file size and now needs to be truncated back.
- */
-STATIC void
-xfs_aio_write_newsize_update(
-       struct xfs_inode        *ip,
-       xfs_fsize_t             new_size)
-{
-       if (new_size == ip->i_new_size) {
-               xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
-               if (new_size == ip->i_new_size)
-                       ip->i_new_size = 0;
-               if (ip->i_d.di_size > ip->i_size)
-                       ip->i_d.di_size = ip->i_size;
-               xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
-       }
-}
-
 /*
  * xfs_file_splice_write() does not use xfs_rw_ilock() because
  * generic_file_splice_write() takes the i_mutex itself. This, in theory,
@@ -475,7 +430,6 @@ xfs_file_splice_write(
 {
        struct inode            *inode = outfilp->f_mapping->host;
        struct xfs_inode        *ip = XFS_I(inode);
-       xfs_fsize_t             new_size;
        int                     ioflags = 0;
        ssize_t                 ret;
 
@@ -489,19 +443,12 @@ xfs_file_splice_write(
 
        xfs_ilock(ip, XFS_IOLOCK_EXCL);
 
-       new_size = *ppos + count;
-
-       xfs_ilock(ip, XFS_ILOCK_EXCL);
-       if (new_size > ip->i_size)
-               ip->i_new_size = new_size;
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
        trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
 
        ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
+       if (ret > 0)
+               XFS_STATS_ADD(xs_write_bytes, ret);
 
-       xfs_aio_write_isize_update(inode, ppos, ret);
-       xfs_aio_write_newsize_update(ip, new_size);
        xfs_iunlock(ip, XFS_IOLOCK_EXCL);
        return ret;
 }
@@ -689,28 +636,26 @@ out_lock:
 /*
  * Common pre-write limit and setup checks.
  *
- * Returns with iolock held according to @iolock.
+ * Called with the iolock held either shared or exclusive according to
+ * @iolock, and returns with it held.  Might upgrade the iolock to exclusive
+ * if called for a direct write beyond i_size.
  */
 STATIC ssize_t
 xfs_file_aio_write_checks(
        struct file             *file,
        loff_t                  *pos,
        size_t                  *count,
-       xfs_fsize_t             *new_sizep,
        int                     *iolock)
 {
        struct inode            *inode = file->f_mapping->host;
        struct xfs_inode        *ip = XFS_I(inode);
-       xfs_fsize_t             new_size;
        int                     error = 0;
 
        xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
-       *new_sizep = 0;
 restart:
        error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
        if (error) {
-               xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
-               *iolock = 0;
+               xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
                return error;
        }
 
@@ -720,36 +665,21 @@ restart:
        /*
         * If the offset is beyond the size of the file, we need to zero any
         * blocks that fall between the existing EOF and the start of this
-        * write. There is no need to issue zeroing if another in-flght IO ends
-        * at or before this one If zeronig is needed and we are currently
-        * holding the iolock shared, we need to update it to exclusive which
-        * involves dropping all locks and relocking to maintain correct locking
-        * order. If we do this, restart the function to ensure all checks and
-        * values are still valid.
+        * write.  If zeroing is needed and we are currently holding the
+        * iolock shared, we need to upgrade it to exclusive which involves
+        * dropping all locks and relocking to maintain correct locking order.
+        * If we do this, restart the function to ensure all checks and values
+        * are still valid.
         */
-       if ((ip->i_new_size && *pos > ip->i_new_size) ||
-           (!ip->i_new_size && *pos > ip->i_size)) {
+       if (*pos > i_size_read(inode)) {
                if (*iolock == XFS_IOLOCK_SHARED) {
                        xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
                        *iolock = XFS_IOLOCK_EXCL;
                        xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
                        goto restart;
                }
-               error = -xfs_zero_eof(ip, *pos, ip->i_size);
+               error = -xfs_zero_eof(ip, *pos, i_size_read(inode));
        }
-
-       /*
-        * If this IO extends beyond EOF, we may need to update ip->i_new_size.
-        * We have already zeroed space beyond EOF (if necessary).  Only update
-        * ip->i_new_size if this IO ends beyond any other in-flight writes.
-        */
-       new_size = *pos + *count;
-       if (new_size > ip->i_size) {
-               if (new_size > ip->i_new_size)
-                       ip->i_new_size = new_size;
-               *new_sizep = new_size;
-       }
-
        xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
        if (error)
                return error;
@@ -794,9 +724,7 @@ xfs_file_dio_aio_write(
        const struct iovec      *iovp,
        unsigned long           nr_segs,
        loff_t                  pos,
-       size_t                  ocount,
-       xfs_fsize_t             *new_size,
-       int                     *iolock)
+       size_t                  ocount)
 {
        struct file             *file = iocb->ki_filp;
        struct address_space    *mapping = file->f_mapping;
@@ -806,10 +734,10 @@ xfs_file_dio_aio_write(
        ssize_t                 ret = 0;
        size_t                  count = ocount;
        int                     unaligned_io = 0;
+       int                     iolock;
        struct xfs_buftarg      *target = XFS_IS_REALTIME_INODE(ip) ?
                                        mp->m_rtdev_targp : mp->m_ddev_targp;
 
-       *iolock = 0;
        if ((pos & target->bt_smask) || (count & target->bt_smask))
                return -XFS_ERROR(EINVAL);
 
@@ -824,31 +752,31 @@ xfs_file_dio_aio_write(
         * EOF zeroing cases and fill out the new inode size as appropriate.
         */
        if (unaligned_io || mapping->nrpages)
-               *iolock = XFS_IOLOCK_EXCL;
+               iolock = XFS_IOLOCK_EXCL;
        else
-               *iolock = XFS_IOLOCK_SHARED;
-       xfs_rw_ilock(ip, *iolock);
+               iolock = XFS_IOLOCK_SHARED;
+       xfs_rw_ilock(ip, iolock);
 
        /*
         * Recheck if there are cached pages that need invalidate after we got
         * the iolock to protect against other threads adding new pages while
         * we were waiting for the iolock.
         */
-       if (mapping->nrpages && *iolock == XFS_IOLOCK_SHARED) {
-               xfs_rw_iunlock(ip, *iolock);
-               *iolock = XFS_IOLOCK_EXCL;
-               xfs_rw_ilock(ip, *iolock);
+       if (mapping->nrpages && iolock == XFS_IOLOCK_SHARED) {
+               xfs_rw_iunlock(ip, iolock);
+               iolock = XFS_IOLOCK_EXCL;
+               xfs_rw_ilock(ip, iolock);
        }
 
-       ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock);
+       ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock);
        if (ret)
-               return ret;
+               goto out;
 
        if (mapping->nrpages) {
                ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1,
                                                        FI_REMAPF_LOCKED);
                if (ret)
-                       return ret;
+                       goto out;
        }
 
        /*
@@ -857,15 +785,18 @@ xfs_file_dio_aio_write(
         */
        if (unaligned_io)
                inode_dio_wait(inode);
-       else if (*iolock == XFS_IOLOCK_EXCL) {
+       else if (iolock == XFS_IOLOCK_EXCL) {
                xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
-               *iolock = XFS_IOLOCK_SHARED;
+               iolock = XFS_IOLOCK_SHARED;
        }
 
        trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
        ret = generic_file_direct_write(iocb, iovp,
                        &nr_segs, pos, &iocb->ki_pos, count, ocount);
 
+out:
+       xfs_rw_iunlock(ip, iolock);
+
        /* No fallback to buffered IO on errors for XFS. */
        ASSERT(ret < 0 || ret == count);
        return ret;
@@ -877,9 +808,7 @@ xfs_file_buffered_aio_write(
        const struct iovec      *iovp,
        unsigned long           nr_segs,
        loff_t                  pos,
-       size_t                  ocount,
-       xfs_fsize_t             *new_size,
-       int                     *iolock)
+       size_t                  ocount)
 {
        struct file             *file = iocb->ki_filp;
        struct address_space    *mapping = file->f_mapping;
@@ -887,14 +816,14 @@ xfs_file_buffered_aio_write(
        struct xfs_inode        *ip = XFS_I(inode);
        ssize_t                 ret;
        int                     enospc = 0;
+       int                     iolock = XFS_IOLOCK_EXCL;
        size_t                  count = ocount;
 
-       *iolock = XFS_IOLOCK_EXCL;
-       xfs_rw_ilock(ip, *iolock);
+       xfs_rw_ilock(ip, iolock);
 
-       ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock);
+       ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock);
        if (ret)
-               return ret;
+               goto out;
 
        /* We can write back this queue in page reclaim */
        current->backing_dev_info = mapping->backing_dev_info;
@@ -908,13 +837,15 @@ write_retry:
         * page locks and retry *once*
         */
        if (ret == -ENOSPC && !enospc) {
-               ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
-               if (ret)
-                       return ret;
                enospc = 1;
-               goto write_retry;
+               ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
+               if (!ret)
+                       goto write_retry;
        }
+
        current->backing_dev_info = NULL;
+out:
+       xfs_rw_iunlock(ip, iolock);
        return ret;
 }
 
@@ -930,9 +861,7 @@ xfs_file_aio_write(
        struct inode            *inode = mapping->host;
        struct xfs_inode        *ip = XFS_I(inode);
        ssize_t                 ret;
-       int                     iolock;
        size_t                  ocount = 0;
-       xfs_fsize_t             new_size = 0;
 
        XFS_STATS_INC(xs_write_calls);
 
@@ -951,33 +880,22 @@ xfs_file_aio_write(
                return -EIO;
 
        if (unlikely(file->f_flags & O_DIRECT))
-               ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos,
-                                               ocount, &new_size, &iolock);
+               ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, ocount);
        else
                ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
-                                               ocount, &new_size, &iolock);
-
-       xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret);
+                                                 ocount);
 
-       if (ret <= 0)
-               goto out_unlock;
+       if (ret > 0) {
+               ssize_t err;
 
-       /* Handle various SYNC-type writes */
-       if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
-               loff_t end = pos + ret - 1;
-               int error;
+               XFS_STATS_ADD(xs_write_bytes, ret);
 
-               xfs_rw_iunlock(ip, iolock);
-               error = xfs_file_fsync(file, pos, end,
-                                     (file->f_flags & __O_SYNC) ? 0 : 1);
-               xfs_rw_ilock(ip, iolock);
-               if (error)
-                       ret = error;
+               /* Handle various SYNC-type writes */
+               err = generic_write_sync(file, pos, ret);
+               if (err < 0)
+                       ret = err;
        }
 
-out_unlock:
-       xfs_aio_write_newsize_update(ip, new_size);
-       xfs_rw_iunlock(ip, iolock);
        return ret;
 }
 
index ed88ed1..652b875 100644 (file)
@@ -90,7 +90,7 @@ xfs_wait_on_pages(
 
        if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
                return -filemap_fdatawait_range(mapping, first,
-                                       last == -1 ? ip->i_size - 1 : last);
+                                       last == -1 ? XFS_ISIZE(ip) - 1 : last);
        }
        return 0;
 }
index 3960a06..8c3e463 100644 (file)
@@ -77,7 +77,7 @@ xfs_inode_alloc(
 
        ASSERT(atomic_read(&ip->i_pincount) == 0);
        ASSERT(!spin_is_locked(&ip->i_flags_lock));
-       ASSERT(completion_done(&ip->i_flush));
+       ASSERT(!xfs_isiflocked(ip));
        ASSERT(ip->i_ino == 0);
 
        mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
@@ -94,8 +94,6 @@ xfs_inode_alloc(
        ip->i_update_core = 0;
        ip->i_delayed_blks = 0;
        memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
-       ip->i_size = 0;
-       ip->i_new_size = 0;
 
        return ip;
 }
@@ -150,7 +148,7 @@ xfs_inode_free(
        /* asserts to verify all state is correct here */
        ASSERT(atomic_read(&ip->i_pincount) == 0);
        ASSERT(!spin_is_locked(&ip->i_flags_lock));
-       ASSERT(completion_done(&ip->i_flush));
+       ASSERT(!xfs_isiflocked(ip));
 
        /*
         * Because we use RCU freeing we need to ensure the inode always
@@ -450,8 +448,6 @@ again:
 
        *ipp = ip;
 
-       ASSERT(ip->i_df.if_ext_max ==
-              XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t));
        /*
         * If we have a real type for an on-disk inode, we can set ops(&unlock)
         * now.  If it's a new inode being created, xfs_ialloc will handle it.
@@ -715,3 +711,19 @@ xfs_isilocked(
        return 0;
 }
 #endif
+
+void
+__xfs_iflock(
+       struct xfs_inode        *ip)
+{
+       wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
+       DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
+
+       do {
+               prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+               if (xfs_isiflocked(ip))
+                       io_schedule();
+       } while (!xfs_iflock_nowait(ip));
+
+       finish_wait(wq, &wait.wait);
+}
index 9dda7cc..b210224 100644 (file)
@@ -299,11 +299,8 @@ xfs_iformat(
 {
        xfs_attr_shortform_t    *atp;
        int                     size;
-       int                     error;
+       int                     error = 0;
        xfs_fsize_t             di_size;
-       ip->i_df.if_ext_max =
-               XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
-       error = 0;
 
        if (unlikely(be32_to_cpu(dip->di_nextents) +
                     be16_to_cpu(dip->di_anextents) >
@@ -350,7 +347,6 @@ xfs_iformat(
                        return XFS_ERROR(EFSCORRUPTED);
                }
                ip->i_d.di_size = 0;
-               ip->i_size = 0;
                ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
                break;
 
@@ -409,10 +405,10 @@ xfs_iformat(
        }
        if (!XFS_DFORK_Q(dip))
                return 0;
+
        ASSERT(ip->i_afp == NULL);
        ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS);
-       ip->i_afp->if_ext_max =
-               XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
+
        switch (dip->di_aformat) {
        case XFS_DINODE_FMT_LOCAL:
                atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
@@ -604,10 +600,11 @@ xfs_iformat_btree(
         * or the number of extents is greater than the number of
         * blocks.
         */
-       if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max
-           || XFS_BMDR_SPACE_CALC(nrecs) >
-                       XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)
-           || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
+       if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <=
+                       XFS_IFORK_MAXEXT(ip, whichfork) ||
+                    XFS_BMDR_SPACE_CALC(nrecs) >
+                       XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) ||
+                    XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
                xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).",
                        (unsigned long long) ip->i_ino);
                XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
@@ -835,12 +832,6 @@ xfs_iread(
                 * with the uninitialized part of it.
                 */
                ip->i_d.di_mode = 0;
-               /*
-                * Initialize the per-fork minima and maxima for a new
-                * inode here.  xfs_iformat will do it for old inodes.
-                */
-               ip->i_df.if_ext_max =
-                       XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
        }
 
        /*
@@ -861,7 +852,6 @@ xfs_iread(
        }
 
        ip->i_delayed_blks = 0;
-       ip->i_size = ip->i_d.di_size;
 
        /*
         * Mark the buffer containing the inode as something to keep
@@ -1051,7 +1041,6 @@ xfs_ialloc(
        }
 
        ip->i_d.di_size = 0;
-       ip->i_size = 0;
        ip->i_d.di_nextents = 0;
        ASSERT(ip->i_d.di_nblocks == 0);
 
@@ -1165,52 +1154,6 @@ xfs_ialloc(
        return 0;
 }
 
-/*
- * Check to make sure that there are no blocks allocated to the
- * file beyond the size of the file.  We don't check this for
- * files with fixed size extents or real time extents, but we
- * at least do it for regular files.
- */
-#ifdef DEBUG
-STATIC void
-xfs_isize_check(
-       struct xfs_inode        *ip,
-       xfs_fsize_t             isize)
-{
-       struct xfs_mount        *mp = ip->i_mount;
-       xfs_fileoff_t           map_first;
-       int                     nimaps;
-       xfs_bmbt_irec_t         imaps[2];
-       int                     error;
-
-       if (!S_ISREG(ip->i_d.di_mode))
-               return;
-
-       if (XFS_IS_REALTIME_INODE(ip))
-               return;
-
-       if (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE)
-               return;
-
-       nimaps = 2;
-       map_first = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
-       /*
-        * The filesystem could be shutting down, so bmapi may return
-        * an error.
-        */
-       error = xfs_bmapi_read(ip, map_first,
-                        (XFS_B_TO_FSB(mp,
-                              (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) - map_first),
-                        imaps, &nimaps, XFS_BMAPI_ENTIRE);
-       if (error)
-               return;
-       ASSERT(nimaps == 1);
-       ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK);
-}
-#else  /* DEBUG */
-#define xfs_isize_check(ip, isize)
-#endif /* DEBUG */
-
 /*
  * Free up the underlying blocks past new_size.  The new size must be smaller
  * than the current size.  This routine can be used both for the attribute and
@@ -1252,12 +1195,14 @@ xfs_itruncate_extents(
        int                     done = 0;
 
        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
-       ASSERT(new_size <= ip->i_size);
+       ASSERT(new_size <= XFS_ISIZE(ip));
        ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
        ASSERT(ip->i_itemp != NULL);
        ASSERT(ip->i_itemp->ili_lock_flags == 0);
        ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
 
+       trace_xfs_itruncate_extents_start(ip, new_size);
+
        /*
         * Since it is possible for space to become allocated beyond
         * the end of the file (in a crash where the space is allocated
@@ -1325,6 +1270,14 @@ xfs_itruncate_extents(
                        goto out;
        }
 
+       /*
+        * Always re-log the inode so that our permanent transaction can keep
+        * on rolling it forward in the log.
+        */
+       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+       trace_xfs_itruncate_extents_end(ip, new_size);
+
 out:
        *tpp = tp;
        return error;
@@ -1338,74 +1291,6 @@ out_bmap_cancel:
        goto out;
 }
 
-int
-xfs_itruncate_data(
-       struct xfs_trans        **tpp,
-       struct xfs_inode        *ip,
-       xfs_fsize_t             new_size)
-{
-       int                     error;
-
-       trace_xfs_itruncate_data_start(ip, new_size);
-
-       /*
-        * The first thing we do is set the size to new_size permanently on
-        * disk.  This way we don't have to worry about anyone ever being able
-        * to look at the data being freed even in the face of a crash.
-        * What we're getting around here is the case where we free a block, it
-        * is allocated to another file, it is written to, and then we crash.
-        * If the new data gets written to the file but the log buffers
-        * containing the free and reallocation don't, then we'd end up with
-        * garbage in the blocks being freed.  As long as we make the new_size
-        * permanent before actually freeing any blocks it doesn't matter if
-        * they get written to.
-        */
-       if (ip->i_d.di_nextents > 0) {
-               /*
-                * If we are not changing the file size then do not update
-                * the on-disk file size - we may be called from
-                * xfs_inactive_free_eofblocks().  If we update the on-disk
-                * file size and then the system crashes before the contents
-                * of the file are flushed to disk then the files may be
-                * full of holes (ie NULL files bug).
-                */
-               if (ip->i_size != new_size) {
-                       ip->i_d.di_size = new_size;
-                       ip->i_size = new_size;
-                       xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
-               }
-       }
-
-       error = xfs_itruncate_extents(tpp, ip, XFS_DATA_FORK, new_size);
-       if (error)
-               return error;
-
-       /*
-        * If we are not changing the file size then do not update the on-disk
-        * file size - we may be called from xfs_inactive_free_eofblocks().
-        * If we update the on-disk file size and then the system crashes
-        * before the contents of the file are flushed to disk then the files
-        * may be full of holes (ie NULL files bug).
-        */
-       xfs_isize_check(ip, new_size);
-       if (ip->i_size != new_size) {
-               ip->i_d.di_size = new_size;
-               ip->i_size = new_size;
-       }
-
-       ASSERT(new_size != 0 || ip->i_delayed_blks == 0);
-       ASSERT(new_size != 0 || ip->i_d.di_nextents == 0);
-
-       /*
-        * Always re-log the inode so that our permanent transaction can keep
-        * on rolling it forward in the log.
-        */
-       xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
-
-       trace_xfs_itruncate_data_end(ip, new_size);
-       return 0;
-}
-
 /*
  * This is called when the inode's link count goes to 0.
  * We place the on-disk inode on a list in the AGI.  It
@@ -1824,8 +1709,7 @@ xfs_ifree(
        ASSERT(ip->i_d.di_nlink == 0);
        ASSERT(ip->i_d.di_nextents == 0);
        ASSERT(ip->i_d.di_anextents == 0);
-       ASSERT((ip->i_d.di_size == 0 && ip->i_size == 0) ||
-              (!S_ISREG(ip->i_d.di_mode)));
+       ASSERT(ip->i_d.di_size == 0 || !S_ISREG(ip->i_d.di_mode));
        ASSERT(ip->i_d.di_nblocks == 0);
 
        /*
@@ -1844,8 +1728,6 @@ xfs_ifree(
        ip->i_d.di_flags = 0;
        ip->i_d.di_dmevmask = 0;
        ip->i_d.di_forkoff = 0;         /* mark the attr fork not in use */
-       ip->i_df.if_ext_max =
-               XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
        ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
        ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
        /*
@@ -2151,7 +2033,7 @@ xfs_idestroy_fork(
  * once someone is waiting for it to be unpinned.
  */
 static void
-xfs_iunpin_nowait(
+xfs_iunpin(
        struct xfs_inode        *ip)
 {
        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
@@ -2163,14 +2045,29 @@ xfs_iunpin_nowait(
 
 }
 
+static void
+__xfs_iunpin_wait(
+       struct xfs_inode        *ip)
+{
+       wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT);
+       DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT);
+
+       xfs_iunpin(ip);
+
+       do {
+               prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+               if (xfs_ipincount(ip))
+                       io_schedule();
+       } while (xfs_ipincount(ip));
+       finish_wait(wq, &wait.wait);
+}
+
 void
 xfs_iunpin_wait(
        struct xfs_inode        *ip)
 {
-       if (xfs_ipincount(ip)) {
-               xfs_iunpin_nowait(ip);
-               wait_event(ip->i_ipin_wait, (xfs_ipincount(ip) == 0));
-       }
+       if (xfs_ipincount(ip))
+               __xfs_iunpin_wait(ip);
 }
 
 /*
@@ -2510,9 +2407,9 @@ xfs_iflush(
        XFS_STATS_INC(xs_iflush_count);
 
        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
-       ASSERT(!completion_done(&ip->i_flush));
+       ASSERT(xfs_isiflocked(ip));
        ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
-              ip->i_d.di_nextents > ip->i_df.if_ext_max);
+              ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
 
        iip = ip->i_itemp;
        mp = ip->i_mount;
@@ -2529,7 +2426,7 @@ xfs_iflush(
         * out for us if they occur after the log force completes.
         */
        if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) {
-               xfs_iunpin_nowait(ip);
+               xfs_iunpin(ip);
                xfs_ifunlock(ip);
                return EAGAIN;
        }
@@ -2626,9 +2523,9 @@ xfs_iflush_int(
 #endif
 
        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
-       ASSERT(!completion_done(&ip->i_flush));
+       ASSERT(xfs_isiflocked(ip));
        ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
-              ip->i_d.di_nextents > ip->i_df.if_ext_max);
+              ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
 
        iip = ip->i_itemp;
        mp = ip->i_mount;
index f0e6b15..2f27b74 100644 (file)
@@ -66,7 +66,6 @@ typedef struct xfs_ifork {
        struct xfs_btree_block  *if_broot;      /* file's incore btree root */
        short                   if_broot_bytes; /* bytes allocated for root */
        unsigned char           if_flags;       /* per-fork flags */
-       unsigned char           if_ext_max;     /* max # of extent records */
        union {
                xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */
                xfs_ext_irec_t  *if_ext_irec;   /* irec map file exts */
@@ -206,12 +205,12 @@ typedef struct xfs_icdinode {
        ((w) == XFS_DATA_FORK ? \
                ((ip)->i_d.di_nextents = (n)) : \
                ((ip)->i_d.di_anextents = (n)))
-
+#define XFS_IFORK_MAXEXT(ip, w) \
+       (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t))
 
 
 #ifdef __KERNEL__
 
-struct bhv_desc;
 struct xfs_buf;
 struct xfs_bmap_free;
 struct xfs_bmbt_irec;
@@ -220,12 +219,6 @@ struct xfs_mount;
 struct xfs_trans;
 struct xfs_dquot;
 
-typedef struct dm_attrs_s {
-       __uint32_t      da_dmevmask;    /* DMIG event mask */
-       __uint16_t      da_dmstate;     /* DMIG state info */
-       __uint16_t      da_pad;         /* DMIG extra padding */
-} dm_attrs_t;
-
 typedef struct xfs_inode {
        /* Inode linking and identification information. */
        struct xfs_mount        *i_mount;       /* fs mount struct ptr */
@@ -244,27 +237,19 @@ typedef struct xfs_inode {
        struct xfs_inode_log_item *i_itemp;     /* logging information */
        mrlock_t                i_lock;         /* inode lock */
        mrlock_t                i_iolock;       /* inode IO lock */
-       struct completion       i_flush;        /* inode flush completion q */
        atomic_t                i_pincount;     /* inode pin count */
-       wait_queue_head_t       i_ipin_wait;    /* inode pinning wait queue */
        spinlock_t              i_flags_lock;   /* inode i_flags lock */
        /* Miscellaneous state. */
-       unsigned short          i_flags;        /* see defined flags below */
+       unsigned long           i_flags;        /* see defined flags below */
        unsigned char           i_update_core;  /* timestamps/size is dirty */
        unsigned int            i_delayed_blks; /* count of delay alloc blks */
 
        xfs_icdinode_t          i_d;            /* most of ondisk inode */
 
-       xfs_fsize_t             i_size;         /* in-memory size */
-       xfs_fsize_t             i_new_size;     /* size when write completes */
-
        /* VFS inode */
        struct inode            i_vnode;        /* embedded VFS inode */
 } xfs_inode_t;
 
-#define XFS_ISIZE(ip)  S_ISREG((ip)->i_d.di_mode) ? \
-                               (ip)->i_size : (ip)->i_d.di_size;
-
 /* Convert from vfs inode to xfs inode */
 static inline struct xfs_inode *XFS_I(struct inode *inode)
 {
@@ -277,6 +262,18 @@ static inline struct inode *VFS_I(struct xfs_inode *ip)
        return &ip->i_vnode;
 }
 
+/*
+ * For regular files we only update the on-disk filesize when actually
+ * writing data back to disk.  Until then only the copy in the VFS inode
+ * is uptodate.
+ */
+static inline xfs_fsize_t XFS_ISIZE(struct xfs_inode *ip)
+{
+       if (S_ISREG(ip->i_d.di_mode))
+               return i_size_read(VFS_I(ip));
+       return ip->i_d.di_size;
+}
+
 /*
  * i_flags helper functions
  */
@@ -331,6 +328,19 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
        return ret;
 }
 
+static inline int
+xfs_iflags_test_and_set(xfs_inode_t *ip, unsigned short flags)
+{
+       int ret;
+
+       spin_lock(&ip->i_flags_lock);
+       ret = ip->i_flags & flags;
+       if (!ret)
+               ip->i_flags |= flags;
+       spin_unlock(&ip->i_flags_lock);
+       return ret;
+}
+
 /*
  * Project quota id helpers (previously projid was 16bit only
  * and using two 16bit values to hold new 32bit projid was chosen
@@ -350,36 +360,20 @@ xfs_set_projid(struct xfs_inode *ip,
        ip->i_d.di_projid_lo = (__uint16_t) (projid & 0xffff);
 }
 
-/*
- * Manage the i_flush queue embedded in the inode.  This completion
- * queue synchronizes processes attempting to flush the in-core
- * inode back to disk.
- */
-static inline void xfs_iflock(xfs_inode_t *ip)
-{
-       wait_for_completion(&ip->i_flush);
-}
-
-static inline int xfs_iflock_nowait(xfs_inode_t *ip)
-{
-       return try_wait_for_completion(&ip->i_flush);
-}
-
-static inline void xfs_ifunlock(xfs_inode_t *ip)
-{
-       complete(&ip->i_flush);
-}
-
 /*
  * In-core inode flags.
  */
-#define XFS_IRECLAIM           0x0001  /* started reclaiming this inode */
-#define XFS_ISTALE             0x0002  /* inode has been staled */
-#define XFS_IRECLAIMABLE       0x0004  /* inode can be reclaimed */
-#define XFS_INEW               0x0008  /* inode has just been allocated */
-#define XFS_IFILESTREAM                0x0010  /* inode is in a filestream directory */
-#define XFS_ITRUNCATED         0x0020  /* truncated down so flush-on-close */
-#define XFS_IDIRTY_RELEASE     0x0040  /* dirty release already seen */
+#define XFS_IRECLAIM           (1 << 0) /* started reclaiming this inode */
+#define XFS_ISTALE             (1 << 1) /* inode has been staled */
+#define XFS_IRECLAIMABLE       (1 << 2) /* inode can be reclaimed */
+#define XFS_INEW               (1 << 3) /* inode has just been allocated */
+#define XFS_IFILESTREAM                (1 << 4) /* inode is in a filestream dir. */
+#define XFS_ITRUNCATED         (1 << 5) /* truncated down so flush-on-close */
+#define XFS_IDIRTY_RELEASE     (1 << 6) /* dirty release already seen */
+#define __XFS_IFLOCK_BIT       7        /* inode is being flushed right now */
+#define XFS_IFLOCK             (1 << __XFS_IFLOCK_BIT)
+#define __XFS_IPINNED_BIT      8        /* wakeup key for zero pin count */
+#define XFS_IPINNED            (1 << __XFS_IPINNED_BIT)
 
 /*
  * Per-lifetime flags need to be reset when re-using a reclaimable inode during
@@ -391,6 +385,34 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
         XFS_IDIRTY_RELEASE | XFS_ITRUNCATED | \
         XFS_IFILESTREAM);
 
+/*
+ * Synchronize processes attempting to flush the in-core inode back to disk.
+ */
+
+extern void __xfs_iflock(struct xfs_inode *ip);
+
+static inline int xfs_iflock_nowait(struct xfs_inode *ip)
+{
+       return !xfs_iflags_test_and_set(ip, XFS_IFLOCK);
+}
+
+static inline void xfs_iflock(struct xfs_inode *ip)
+{
+       if (!xfs_iflock_nowait(ip))
+               __xfs_iflock(ip);
+}
+
+static inline void xfs_ifunlock(struct xfs_inode *ip)
+{
+       xfs_iflags_clear(ip, XFS_IFLOCK);
+       wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT);
+}
+
+static inline int xfs_isiflocked(struct xfs_inode *ip)
+{
+       return xfs_iflags_test(ip, XFS_IFLOCK);
+}
+
 /*
  * Flags for inode locking.
  * Bit ranges: 1<<1  - 1<<16-1 -- iolock/ilock modes (bitfield)
@@ -491,8 +513,6 @@ int         xfs_ifree(struct xfs_trans *, xfs_inode_t *,
                           struct xfs_bmap_free *);
 int            xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *,
                                      int, xfs_fsize_t);
-int            xfs_itruncate_data(struct xfs_trans **, struct xfs_inode *,
-                                  xfs_fsize_t);
 int            xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
 
 void           xfs_iext_realloc(xfs_inode_t *, int, int);
index cfd6c7f..91d71dc 100644 (file)
@@ -79,8 +79,6 @@ xfs_inode_item_size(
                break;
 
        case XFS_DINODE_FMT_BTREE:
-               ASSERT(ip->i_df.if_ext_max ==
-                      XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
                iip->ili_format.ilf_fields &=
                        ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT |
                          XFS_ILOG_DEV | XFS_ILOG_UUID);
@@ -557,7 +555,7 @@ xfs_inode_item_unpin(
        trace_xfs_inode_unpin(ip, _RET_IP_);
        ASSERT(atomic_read(&ip->i_pincount) > 0);
        if (atomic_dec_and_test(&ip->i_pincount))
-               wake_up(&ip->i_ipin_wait);
+               wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT);
 }
 
 /*
@@ -719,7 +717,7 @@ xfs_inode_item_pushbuf(
         * If a flush is not in progress anymore, chances are that the
         * inode was taken off the AIL. So, just get out.
         */
-       if (completion_done(&ip->i_flush) ||
+       if (!xfs_isiflocked(ip) ||
            !(lip->li_flags & XFS_LI_IN_AIL)) {
                xfs_iunlock(ip, XFS_ILOCK_SHARED);
                return true;
@@ -752,7 +750,7 @@ xfs_inode_item_push(
        struct xfs_inode        *ip = iip->ili_inode;
 
        ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
-       ASSERT(!completion_done(&ip->i_flush));
+       ASSERT(xfs_isiflocked(ip));
 
        /*
         * Since we were able to lock the inode's flush lock and
index 9afa282..246c7d5 100644 (file)
@@ -57,26 +57,26 @@ xfs_iomap_eof_align_last_fsb(
        xfs_fileoff_t   *last_fsb)
 {
        xfs_fileoff_t   new_last_fsb = 0;
-       xfs_extlen_t    align;
+       xfs_extlen_t    align = 0;
        int             eof, error;
 
-       if (XFS_IS_REALTIME_INODE(ip))
-               ;
-       /*
-        * If mounted with the "-o swalloc" option, roundup the allocation
-        * request to a stripe width boundary if the file size is >=
-        * stripe width and we are allocating past the allocation eof.
-        */
-       else if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC) &&
-               (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_swidth)))
-               new_last_fsb = roundup_64(*last_fsb, mp->m_swidth);
-       /*
-        * Roundup the allocation request to a stripe unit (m_dalign) boundary
-        * if the file size is >= stripe unit size, and we are allocating past
-        * the allocation eof.
-        */
-       else if (mp->m_dalign && (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_dalign)))
-               new_last_fsb = roundup_64(*last_fsb, mp->m_dalign);
+       if (!XFS_IS_REALTIME_INODE(ip)) {
+               /*
+                * Round up the allocation request to a stripe unit
+                * (m_dalign) boundary if the file size is >= stripe unit
+                * size, and we are allocating past the allocation eof.
+                *
+                * If mounted with the "-o swalloc" option the alignment is
+                * increased from the stripe unit size to the stripe width.
+                */
+               if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC))
+                       align = mp->m_swidth;
+               else if (mp->m_dalign)
+                       align = mp->m_dalign;
+
+               if (align && XFS_ISIZE(ip) >= XFS_FSB_TO_B(mp, align))
+                       new_last_fsb = roundup_64(*last_fsb, align);
+       }
 
        /*
         * Always round up the allocation request to an extent boundary
@@ -154,7 +154,7 @@ xfs_iomap_write_direct(
 
        offset_fsb = XFS_B_TO_FSBT(mp, offset);
        last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
-       if ((offset + count) > ip->i_size) {
+       if ((offset + count) > XFS_ISIZE(ip)) {
                error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
                if (error)
                        goto error_out;
@@ -211,7 +211,7 @@ xfs_iomap_write_direct(
        xfs_trans_ijoin(tp, ip, 0);
 
        bmapi_flag = 0;
-       if (offset < ip->i_size || extsz)
+       if (offset < XFS_ISIZE(ip) || extsz)
                bmapi_flag |= XFS_BMAPI_PREALLOC;
 
        /*
@@ -286,7 +286,7 @@ xfs_iomap_eof_want_preallocate(
        int             found_delalloc = 0;
 
        *prealloc = 0;
-       if ((offset + count) <= ip->i_size)
+       if (offset + count <= XFS_ISIZE(ip))
                return 0;
 
        /*
@@ -340,7 +340,7 @@ xfs_iomap_prealloc_size(
                 * if we pass in alloc_blocks = 0. Hence the "+ 1" to
                 * ensure we always pass in a non-zero value.
                 */
-               alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size) + 1;
+               alloc_blocks = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)) + 1;
                alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN,
                                        rounddown_pow_of_two(alloc_blocks));
 
@@ -564,7 +564,7 @@ xfs_iomap_write_allocate(
                         * back....
                         */
                        nimaps = 1;
-                       end_fsb = XFS_B_TO_FSB(mp, ip->i_size);
+                       end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
                        error = xfs_bmap_last_offset(NULL, ip, &last_block,
                                                        XFS_DATA_FORK);
                        if (error)
index f9babd1..ab30253 100644 (file)
@@ -750,6 +750,7 @@ xfs_setattr_size(
        struct xfs_mount        *mp = ip->i_mount;
        struct inode            *inode = VFS_I(ip);
        int                     mask = iattr->ia_valid;
+       xfs_off_t               oldsize, newsize;
        struct xfs_trans        *tp;
        int                     error;
        uint                    lock_flags;
@@ -777,11 +778,13 @@ xfs_setattr_size(
                lock_flags |= XFS_IOLOCK_EXCL;
        xfs_ilock(ip, lock_flags);
 
+       oldsize = inode->i_size;
+       newsize = iattr->ia_size;
+
        /*
         * Short circuit the truncate case for zero length files.
         */
-       if (iattr->ia_size == 0 &&
-           ip->i_size == 0 && ip->i_d.di_nextents == 0) {
+       if (newsize == 0 && oldsize == 0 && ip->i_d.di_nextents == 0) {
                if (!(mask & (ATTR_CTIME|ATTR_MTIME)))
                        goto out_unlock;
 
@@ -807,14 +810,14 @@ xfs_setattr_size(
         * the inode to the transaction, because the inode cannot be unlocked
         * once it is a part of the transaction.
         */
-       if (iattr->ia_size > ip->i_size) {
+       if (newsize > oldsize) {
                /*
                 * Do the first part of growing a file: zero any data in the
                 * last block that is beyond the old EOF.  We need to do this
                 * before the inode is joined to the transaction to modify
                 * i_size.
                 */
-               error = xfs_zero_eof(ip, iattr->ia_size, ip->i_size);
+               error = xfs_zero_eof(ip, newsize, oldsize);
                if (error)
                        goto out_unlock;
        }
@@ -833,8 +836,8 @@ xfs_setattr_size(
         * here and prevents waiting for other data not within the range we
         * care about here.
         */
-       if (ip->i_size != ip->i_d.di_size && iattr->ia_size > ip->i_d.di_size) {
-               error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size, 0,
+       if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) {
+               error = xfs_flush_pages(ip, ip->i_d.di_size, newsize, 0,
                                        FI_NONE);
                if (error)
                        goto out_unlock;
@@ -845,8 +848,7 @@ xfs_setattr_size(
         */
        inode_dio_wait(inode);
 
-       error = -block_truncate_page(inode->i_mapping, iattr->ia_size,
-                                    xfs_get_blocks);
+       error = -block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks);
        if (error)
                goto out_unlock;
 
@@ -857,7 +859,7 @@ xfs_setattr_size(
        if (error)
                goto out_trans_cancel;
 
-       truncate_setsize(inode, iattr->ia_size);
+       truncate_setsize(inode, newsize);
 
        commit_flags = XFS_TRANS_RELEASE_LOG_RES;
        lock_flags |= XFS_ILOCK_EXCL;
@@ -876,19 +878,29 @@ xfs_setattr_size(
         * these flags set.  For all other operations the VFS set these flags
         * explicitly if it wants a timestamp update.
         */
-       if (iattr->ia_size != ip->i_size &&
-           (!(mask & (ATTR_CTIME | ATTR_MTIME)))) {
+       if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) {
                iattr->ia_ctime = iattr->ia_mtime =
                        current_fs_time(inode->i_sb);
                mask |= ATTR_CTIME | ATTR_MTIME;
        }
 
-       if (iattr->ia_size > ip->i_size) {
-               ip->i_d.di_size = iattr->ia_size;
-               ip->i_size = iattr->ia_size;
-       } else if (iattr->ia_size <= ip->i_size ||
-                  (iattr->ia_size == 0 && ip->i_d.di_nextents)) {
-               error = xfs_itruncate_data(&tp, ip, iattr->ia_size);
+       /*
+        * The first thing we do is set the size to newsize permanently on
+        * disk.  This way we don't have to worry about anyone ever being able
+        * to look at the data being freed even in the face of a crash.
+        * What we're getting around here is the case where we free a block, it
+        * is allocated to another file, it is written to, and then we crash.
+        * If the new data gets written to the file but the log buffers
+        * containing the free and reallocation don't, then we'd end up with
+        * garbage in the blocks being freed.  As long as we make the new size
+        * permanent before actually freeing any blocks it doesn't matter if
+        * they get written to.
+        */
+       ip->i_d.di_size = newsize;
+       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+       if (newsize <= oldsize) {
+               error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize);
                if (error)
                        goto out_trans_abort;
 
index 5cc3dde..eafbcff 100644 (file)
@@ -31,6 +31,7 @@
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_inode.h"
+#include "xfs_inode_item.h"
 #include "xfs_itable.h"
 #include "xfs_bmap.h"
 #include "xfs_rtalloc.h"
@@ -263,13 +264,18 @@ xfs_qm_scall_trunc_qfile(
        xfs_ilock(ip, XFS_ILOCK_EXCL);
        xfs_trans_ijoin(tp, ip, 0);
 
-       error = xfs_itruncate_data(&tp, ip, 0);
+       ip->i_d.di_size = 0;
+       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+       error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
        if (error) {
                xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
                                     XFS_TRANS_ABORT);
                goto out_unlock;
        }
 
+       ASSERT(ip->i_d.di_nextents == 0);
+
        xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
        error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 
index 281961c..ee5b695 100644 (file)
@@ -828,14 +828,6 @@ xfs_fs_inode_init_once(
        /* xfs inode */
        atomic_set(&ip->i_pincount, 0);
        spin_lock_init(&ip->i_flags_lock);
-       init_waitqueue_head(&ip->i_ipin_wait);
-       /*
-        * Because we want to use a counting completion, complete
-        * the flush completion once to allow a single access to
-        * the flush completion without blocking.
-        */
-       init_completion(&ip->i_flush);
-       complete(&ip->i_flush);
 
        mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
                     "xfsino", ip->i_ino);
index 72c01a1..40b75ee 100644 (file)
@@ -707,14 +707,13 @@ xfs_reclaim_inode_grab(
                return 1;
 
        /*
-        * do some unlocked checks first to avoid unnecessary lock traffic.
-        * The first is a flush lock check, the second is a already in reclaim
-        * check. Only do these checks if we are not going to block on locks.
+        * If we are asked for non-blocking operation, do unlocked checks to
+        * see if the inode already is being flushed or in reclaim to avoid
+        * lock traffic.
         */
        if ((flags & SYNC_TRYLOCK) &&
-           (!ip->i_flush.done || __xfs_iflags_test(ip, XFS_IRECLAIM))) {
+           __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM))
                return 1;
-       }
 
        /*
         * The radix tree lock here protects a thread in xfs_iget from racing
index a9d5b1e..6b6df58 100644 (file)
@@ -891,7 +891,6 @@ DECLARE_EVENT_CLASS(xfs_file_class,
                __field(dev_t, dev)
                __field(xfs_ino_t, ino)
                __field(xfs_fsize_t, size)
-               __field(xfs_fsize_t, new_size)
                __field(loff_t, offset)
                __field(size_t, count)
                __field(int, flags)
@@ -900,17 +899,15 @@ DECLARE_EVENT_CLASS(xfs_file_class,
                __entry->dev = VFS_I(ip)->i_sb->s_dev;
                __entry->ino = ip->i_ino;
                __entry->size = ip->i_d.di_size;
-               __entry->new_size = ip->i_new_size;
                __entry->offset = offset;
                __entry->count = count;
                __entry->flags = flags;
        ),
-       TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
+       TP_printk("dev %d:%d ino 0x%llx size 0x%llx "
                  "offset 0x%llx count 0x%zx ioflags %s",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->ino,
                  __entry->size,
-                 __entry->new_size,
                  __entry->offset,
                  __entry->count,
                  __print_flags(__entry->flags, "|", XFS_IO_FLAGS))
@@ -978,7 +975,6 @@ DECLARE_EVENT_CLASS(xfs_imap_class,
                __field(dev_t, dev)
                __field(xfs_ino_t, ino)
                __field(loff_t, size)
-               __field(loff_t, new_size)
                __field(loff_t, offset)
                __field(size_t, count)
                __field(int, type)
@@ -990,7 +986,6 @@ DECLARE_EVENT_CLASS(xfs_imap_class,
                __entry->dev = VFS_I(ip)->i_sb->s_dev;
                __entry->ino = ip->i_ino;
                __entry->size = ip->i_d.di_size;
-               __entry->new_size = ip->i_new_size;
                __entry->offset = offset;
                __entry->count = count;
                __entry->type = type;
@@ -998,13 +993,11 @@ DECLARE_EVENT_CLASS(xfs_imap_class,
                __entry->startblock = irec ? irec->br_startblock : 0;
                __entry->blockcount = irec ? irec->br_blockcount : 0;
        ),
-       TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
-                 "offset 0x%llx count %zd type %s "
-                 "startoff 0x%llx startblock %lld blockcount 0x%llx",
+       TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count %zd "
+                 "type %s startoff 0x%llx startblock %lld blockcount 0x%llx",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->ino,
                  __entry->size,
-                 __entry->new_size,
                  __entry->offset,
                  __entry->count,
                  __print_symbolic(__entry->type, XFS_IO_TYPES),
@@ -1031,26 +1024,23 @@ DECLARE_EVENT_CLASS(xfs_simple_io_class,
                __field(xfs_ino_t, ino)
                __field(loff_t, isize)
                __field(loff_t, disize)
-               __field(loff_t, new_size)
                __field(loff_t, offset)
                __field(size_t, count)
        ),
        TP_fast_assign(
                __entry->dev = VFS_I(ip)->i_sb->s_dev;
                __entry->ino = ip->i_ino;
-               __entry->isize = ip->i_size;
+               __entry->isize = VFS_I(ip)->i_size;
                __entry->disize = ip->i_d.di_size;
-               __entry->new_size = ip->i_new_size;
                __entry->offset = offset;
                __entry->count = count;
        ),
-       TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx new_size 0x%llx "
+       TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx "
                  "offset 0x%llx count %zd",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->ino,
                  __entry->isize,
                  __entry->disize,
-                 __entry->new_size,
                  __entry->offset,
                  __entry->count)
 );
@@ -1090,8 +1080,8 @@ DECLARE_EVENT_CLASS(xfs_itrunc_class,
 DEFINE_EVENT(xfs_itrunc_class, name, \
        TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), \
        TP_ARGS(ip, new_size))
-DEFINE_ITRUNC_EVENT(xfs_itruncate_data_start);
-DEFINE_ITRUNC_EVENT(xfs_itruncate_data_end);
+DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_start);
+DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_end);
 
 TRACE_EVENT(xfs_pagecache_inval,
        TP_PROTO(struct xfs_inode *ip, xfs_off_t start, xfs_off_t finish),
@@ -1568,7 +1558,6 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class,
                __field(xfs_ino_t, ino)
                __field(int, format)
                __field(int, nex)
-               __field(int, max_nex)
                __field(int, broot_size)
                __field(int, fork_off)
        ),
@@ -1578,18 +1567,16 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class,
                __entry->ino = ip->i_ino;
                __entry->format = ip->i_d.di_format;
                __entry->nex = ip->i_d.di_nextents;
-               __entry->max_nex = ip->i_df.if_ext_max;
                __entry->broot_size = ip->i_df.if_broot_bytes;
                __entry->fork_off = XFS_IFORK_BOFF(ip);
        ),
        TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, "
-                 "Max in-fork extents %d, broot size %d, fork offset %d",
+                 "broot size %d, fork offset %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->ino,
                  __print_symbolic(__entry->which, XFS_SWAPEXT_INODES),
                  __print_symbolic(__entry->format, XFS_INODE_FORMAT_STR),
                  __entry->nex,
-                 __entry->max_nex,
                  __entry->broot_size,
                  __entry->fork_off)
 )
index f2fea86..0cf52da 100644 (file)
@@ -175,7 +175,7 @@ xfs_free_eofblocks(
         * Figure out if there are any blocks beyond the end
         * of the file.  If not, then there is nothing to do.
         */
-       end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_size));
+       end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
        last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
        if (last_fsb <= end_fsb)
                return 0;
@@ -226,7 +226,14 @@ xfs_free_eofblocks(
                xfs_ilock(ip, XFS_ILOCK_EXCL);
                xfs_trans_ijoin(tp, ip, 0);
 
-               error = xfs_itruncate_data(&tp, ip, ip->i_size);
+               /*
+                * Do not update the on-disk file size.  If we update the
+                * on-disk file size and then the system crashes before the
+                * contents of the file are flushed to disk then the files
+                * may be full of holes (i.e. the NULL files bug).
+                */
+               error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK,
+                                             XFS_ISIZE(ip));
                if (error) {
                        /*
                         * If we get an error at this point we simply don't
@@ -540,8 +547,8 @@ xfs_release(
                return 0;
 
        if ((S_ISREG(ip->i_d.di_mode) &&
-            ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
-              ip->i_delayed_blks > 0)) &&
+            (VFS_I(ip)->i_size > 0 ||
+             (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) &&
             (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
            (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
 
@@ -618,7 +625,7 @@ xfs_inactive(
         * only one with a reference to the inode.
         */
        truncate = ((ip->i_d.di_nlink == 0) &&
-           ((ip->i_d.di_size != 0) || (ip->i_size != 0) ||
+           ((ip->i_d.di_size != 0) || XFS_ISIZE(ip) != 0 ||
             (ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) &&
            S_ISREG(ip->i_d.di_mode));
 
@@ -632,12 +639,12 @@ xfs_inactive(
 
        if (ip->i_d.di_nlink != 0) {
                if ((S_ISREG(ip->i_d.di_mode) &&
-                     ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
-                       ip->i_delayed_blks > 0)) &&
-                     (ip->i_df.if_flags & XFS_IFEXTENTS) &&
-                    (!(ip->i_d.di_flags &
+                   (VFS_I(ip)->i_size > 0 ||
+                    (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) &&
+                   (ip->i_df.if_flags & XFS_IFEXTENTS) &&
+                   (!(ip->i_d.di_flags &
                                (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) ||
-                     (ip->i_delayed_blks != 0)))) {
+                    ip->i_delayed_blks != 0))) {
                        error = xfs_free_eofblocks(mp, ip, 0);
                        if (error)
                                return VN_INACTIVE_CACHE;
@@ -670,13 +677,18 @@ xfs_inactive(
                xfs_ilock(ip, XFS_ILOCK_EXCL);
                xfs_trans_ijoin(tp, ip, 0);
 
-               error = xfs_itruncate_data(&tp, ip, 0);
+               ip->i_d.di_size = 0;
+               xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+               error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
                if (error) {
                        xfs_trans_cancel(tp,
                                XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
                        xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
                        return VN_INACTIVE_CACHE;
                }
+
+               ASSERT(ip->i_d.di_nextents == 0);
        } else if (S_ISLNK(ip->i_d.di_mode)) {
 
                /*
@@ -1961,11 +1973,11 @@ xfs_zero_remaining_bytes(
         * since nothing can read beyond eof.  The space will
         * be zeroed when the file is extended anyway.
         */
-       if (startoff >= ip->i_size)
+       if (startoff >= XFS_ISIZE(ip))
                return 0;
 
-       if (endoff > ip->i_size)
-               endoff = ip->i_size;
+       if (endoff > XFS_ISIZE(ip))
+               endoff = XFS_ISIZE(ip);
 
        bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ?
                                        mp->m_rtdev_targp : mp->m_ddev_targp,
@@ -2260,7 +2272,7 @@ xfs_change_file_space(
                bf->l_start += offset;
                break;
        case 2: /*SEEK_END*/
-               bf->l_start += ip->i_size;
+               bf->l_start += XFS_ISIZE(ip);
                break;
        default:
                return XFS_ERROR(EINVAL);
@@ -2277,7 +2289,7 @@ xfs_change_file_space(
        bf->l_whence = 0;
 
        startoffset = bf->l_start;
-       fsize = ip->i_size;
+       fsize = XFS_ISIZE(ip);
 
        /*
         * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve
index 426ab9f..9ff7a2c 100644 (file)
@@ -26,6 +26,7 @@
 
 #include <linux/types.h>
 #include <linux/elf-em.h>
+#include <linux/ptrace.h>
 
 /* The netlink messages for the audit system is divided into blocks:
  * 1000 - 1099 are for commanding the audit system
  * AUDIT_UNUSED_BITS is updated if need be. */
 #define AUDIT_UNUSED_BITS      0x07FFFC00
 
+/* AUDIT_FIELD_COMPARE rule list */
+/* Each name has the form AUDIT_COMPARE_<first id>_TO_<second id>. */
+#define AUDIT_COMPARE_UID_TO_OBJ_UID   1
+#define AUDIT_COMPARE_GID_TO_OBJ_GID   2
+#define AUDIT_COMPARE_EUID_TO_OBJ_UID  3
+#define AUDIT_COMPARE_EGID_TO_OBJ_GID  4
+#define AUDIT_COMPARE_AUID_TO_OBJ_UID  5
+#define AUDIT_COMPARE_SUID_TO_OBJ_UID  6
+#define AUDIT_COMPARE_SGID_TO_OBJ_GID  7
+#define AUDIT_COMPARE_FSUID_TO_OBJ_UID 8
+#define AUDIT_COMPARE_FSGID_TO_OBJ_GID 9
+
+#define AUDIT_COMPARE_UID_TO_AUID      10
+#define AUDIT_COMPARE_UID_TO_EUID      11
+#define AUDIT_COMPARE_UID_TO_FSUID     12
+#define AUDIT_COMPARE_UID_TO_SUID      13
+
+#define AUDIT_COMPARE_AUID_TO_FSUID    14
+#define AUDIT_COMPARE_AUID_TO_SUID     15
+#define AUDIT_COMPARE_AUID_TO_EUID     16
+
+#define AUDIT_COMPARE_EUID_TO_SUID     17
+#define AUDIT_COMPARE_EUID_TO_FSUID    18
+
+#define AUDIT_COMPARE_SUID_TO_FSUID    19
+
+#define AUDIT_COMPARE_GID_TO_EGID      20
+#define AUDIT_COMPARE_GID_TO_FSGID     21
+#define AUDIT_COMPARE_GID_TO_SGID      22
+
+#define AUDIT_COMPARE_EGID_TO_FSGID    23
+#define AUDIT_COMPARE_EGID_TO_SGID     24
+#define AUDIT_COMPARE_SGID_TO_FSGID    25
+
+/* Aliases the highest-numbered entry above; update it when appending one. */
+#define AUDIT_MAX_FIELD_COMPARE                AUDIT_COMPARE_SGID_TO_FSGID
 
 /* Rule fields */
                                /* These are useful when checking the
 #define AUDIT_PERM     106
 #define AUDIT_DIR      107
 #define AUDIT_FILETYPE 108
+#define AUDIT_OBJ_UID  109
+#define AUDIT_OBJ_GID  110
+#define AUDIT_FIELD_COMPARE    111
 
 #define AUDIT_ARG0      200
 #define AUDIT_ARG1      (AUDIT_ARG0+1)
@@ -408,28 +446,24 @@ struct audit_field {
        void                            *lsm_rule;
 };
 
-#define AUDITSC_INVALID 0
-#define AUDITSC_SUCCESS 1
-#define AUDITSC_FAILURE 2
-#define AUDITSC_RESULT(x) ( ((long)(x))<0?AUDITSC_FAILURE:AUDITSC_SUCCESS )
 extern int __init audit_register_class(int class, unsigned *list);
 extern int audit_classify_syscall(int abi, unsigned syscall);
 extern int audit_classify_arch(int arch);
 #ifdef CONFIG_AUDITSYSCALL
 /* These are defined in auditsc.c */
                                /* Public API */
-extern void audit_finish_fork(struct task_struct *child);
 extern int  audit_alloc(struct task_struct *task);
-extern void audit_free(struct task_struct *task);
-extern void audit_syscall_entry(int arch,
-                               int major, unsigned long a0, unsigned long a1,
-                               unsigned long a2, unsigned long a3);
-extern void audit_syscall_exit(int failed, long return_code);
+extern void __audit_free(struct task_struct *task);
+extern void __audit_syscall_entry(int arch,
+                                 int major, unsigned long a0, unsigned long a1,
+                                 unsigned long a2, unsigned long a3);
+extern void __audit_syscall_exit(int ret_success, long ret_value);
 extern void __audit_getname(const char *name);
 extern void audit_putname(const char *name);
 extern void __audit_inode(const char *name, const struct dentry *dentry);
 extern void __audit_inode_child(const struct dentry *dentry,
                                const struct inode *parent);
+extern void __audit_seccomp(unsigned long syscall);
 extern void __audit_ptrace(struct task_struct *t);
 
 static inline int audit_dummy_context(void)
@@ -437,6 +471,27 @@ static inline int audit_dummy_context(void)
        void *p = current->audit_context;
        return !p || *(int *)p;
 }
+/* Hand the task to __audit_free() only when it carries an audit context. */
+static inline void audit_free(struct task_struct *task)
+{
+       if (likely(!task->audit_context))
+               return;
+       __audit_free(task);
+}
+/*
+ * Forward syscall entry details to __audit_syscall_entry() unless
+ * audit_dummy_context() reports a dummy context for the current task.
+ */
+static inline void audit_syscall_entry(int arch, int major, unsigned long a0,
+                                      unsigned long a1, unsigned long a2,
+                                      unsigned long a3)
+{
+       if (likely(audit_dummy_context()))
+               return;
+       __audit_syscall_entry(arch, major, a0, a1, a2, a3);
+}
+/*
+ * Report syscall completion to __audit_syscall_exit() when the current task
+ * has an audit context.  Both the success flag and the return value are
+ * derived from the saved registers passed in as pt_regs.
+ */
+static inline void audit_syscall_exit(void *pt_regs)
+{
+       if (unlikely(current->audit_context)) {
+               int success = is_syscall_success(pt_regs);
+               /*
+                * __audit_syscall_exit() takes a long; holding the value in
+                * an int would truncate return values wider than 32 bits.
+                */
+               long return_code = regs_return_value(pt_regs);
+
+               __audit_syscall_exit(success, return_code);
+       }
+}
 static inline void audit_getname(const char *name)
 {
        if (unlikely(!audit_dummy_context()))
@@ -453,6 +508,12 @@ static inline void audit_inode_child(const struct dentry *dentry,
 }
 void audit_core_dumps(long signr);
 
+/* Pass the syscall number to __audit_seccomp() unless the context is a dummy. */
+static inline void audit_seccomp(unsigned long syscall)
+{
+       if (likely(audit_dummy_context()))
+               return;
+       __audit_seccomp(syscall);
+}
+
 static inline void audit_ptrace(struct task_struct *t)
 {
        if (unlikely(!audit_dummy_context()))
@@ -463,17 +524,16 @@ static inline void audit_ptrace(struct task_struct *t)
 extern unsigned int audit_serial(void);
 extern int auditsc_get_stamp(struct audit_context *ctx,
                              struct timespec *t, unsigned int *serial);
-extern int  audit_set_loginuid(struct task_struct *task, uid_t loginuid);
+extern int  audit_set_loginuid(uid_t loginuid);
 #define audit_get_loginuid(t) ((t)->loginuid)
 #define audit_get_sessionid(t) ((t)->sessionid)
 extern void audit_log_task_context(struct audit_buffer *ab);
 extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp);
 extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode);
-extern int audit_bprm(struct linux_binprm *bprm);
-extern void audit_socketcall(int nargs, unsigned long *args);
-extern int audit_sockaddr(int len, void *addr);
+extern int __audit_bprm(struct linux_binprm *bprm);
+extern void __audit_socketcall(int nargs, unsigned long *args);
+extern int __audit_sockaddr(int len, void *addr);
 extern void __audit_fd_pair(int fd1, int fd2);
-extern int audit_set_macxattr(const char *name);
 extern void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr);
 extern void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec *abs_timeout);
 extern void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification);
@@ -499,6 +559,23 @@ static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid
        if (unlikely(!audit_dummy_context()))
                __audit_ipc_set_perm(qbytes, uid, gid, mode);
 }
+/*
+ * Returns 0 without doing anything for a dummy audit context; otherwise
+ * returns whatever __audit_bprm() returns.
+ */
+static inline int audit_bprm(struct linux_binprm *bprm)
+{
+       if (likely(audit_dummy_context()))
+               return 0;
+       return __audit_bprm(bprm);
+}
+/* Forward the socketcall arguments unless the audit context is a dummy. */
+static inline void audit_socketcall(int nargs, unsigned long *args)
+{
+       if (likely(audit_dummy_context()))
+               return;
+       __audit_socketcall(nargs, args);
+}
+/*
+ * Returns 0 without doing anything for a dummy audit context; otherwise
+ * returns whatever __audit_sockaddr() returns.
+ */
+static inline int audit_sockaddr(int len, void *addr)
+{
+       if (likely(audit_dummy_context()))
+               return 0;
+       return __audit_sockaddr(len, addr);
+}
 static inline void audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr)
 {
        if (unlikely(!audit_dummy_context()))
@@ -544,12 +621,11 @@ static inline void audit_mmap_fd(int fd, int flags)
 
 extern int audit_n_rules;
 extern int audit_signals;
-#else
-#define audit_finish_fork(t)
+#else /* CONFIG_AUDITSYSCALL */
 #define audit_alloc(t) ({ 0; })
 #define audit_free(t) do { ; } while (0)
 #define audit_syscall_entry(ta,a,b,c,d,e) do { ; } while (0)
-#define audit_syscall_exit(f,r) do { ; } while (0)
+#define audit_syscall_exit(r) do { ; } while (0)
 #define audit_dummy_context() 1
 #define audit_getname(n) do { ; } while (0)
 #define audit_putname(n) do { ; } while (0)
@@ -558,6 +634,7 @@ extern int audit_signals;
 #define audit_inode(n,d) do { (void)(d); } while (0)
 #define audit_inode_child(i,p) do { ; } while (0)
 #define audit_core_dumps(i) do { ; } while (0)
+#define audit_seccomp(i) do { ; } while (0)
 #define auditsc_get_stamp(c,t,s) (0)
 #define audit_get_loginuid(t) (-1)
 #define audit_get_sessionid(t) (-1)
@@ -568,7 +645,6 @@ extern int audit_signals;
 #define audit_socketcall(n,a) ((void)0)
 #define audit_fd_pair(n,a) ((void)0)
 #define audit_sockaddr(len, addr) ({ 0; })
-#define audit_set_macxattr(n) do { ; } while (0)
 #define audit_mq_open(o,m,a) ((void)0)
 #define audit_mq_sendrecv(d,l,p,t) ((void)0)
 #define audit_mq_notify(d,n) ((void)0)
@@ -579,7 +655,7 @@ extern int audit_signals;
 #define audit_ptrace(t) ((void)0)
 #define audit_n_rules 0
 #define audit_signals 0
-#endif
+#endif /* CONFIG_AUDITSYSCALL */
 
 #ifdef CONFIG_AUDIT
 /* These are defined in audit.c */
index abc0120..9c07dce 100644 (file)
@@ -17,6 +17,7 @@
 
 #include <linux/bug.h>
 #include <linux/atomic.h>
+#include <linux/kernel.h>
 
 struct kref {
        atomic_t refcount;
index a27e56c..c2f1f6a 100644 (file)
 
 #include <linux/compiler.h>            /* For unlikely.  */
 #include <linux/sched.h>               /* For struct task_struct.  */
+#include <linux/err.h>                 /* for IS_ERR_VALUE */
 
 
 extern long arch_ptrace(struct task_struct *child, long request,
@@ -266,6 +267,15 @@ static inline void ptrace_release_task(struct task_struct *task)
 #define force_successful_syscall_return() do { } while (0)
 #endif
 
+#ifndef is_syscall_success
+/*
+ * On most systems we can tell whether a syscall succeeded by checking whether
+ * the retval is an error value.  Some systems, like ia64 and powerpc, have
+ * different indicators of success/failure and must define their own.
+ */
+#define is_syscall_success(regs) (!IS_ERR_VALUE((unsigned long)(regs_return_value(regs))))
+#endif
+
 /*
  * <asm/ptrace.h> should define the following things inside #ifdef __KERNEL__.
  *
index ecdaeb9..5cf6850 100644 (file)
@@ -312,7 +312,6 @@ struct tty_driver {
         */
        struct tty_struct **ttys;
        struct ktermios **termios;
-       struct ktermios **termios_locked;
        void *driver_state;
 
        /*
index b31702a..84f3001 100644 (file)
@@ -16,6 +16,8 @@ struct btrfs_delayed_ref_node;
 struct btrfs_delayed_tree_ref;
 struct btrfs_delayed_data_ref;
 struct btrfs_delayed_ref_head;
+struct btrfs_block_group_cache;
+struct btrfs_free_cluster;
 struct map_lookup;
 struct extent_buffer;
 
@@ -44,6 +46,17 @@ struct extent_buffer;
        obj, ((obj >= BTRFS_DATA_RELOC_TREE_OBJECTID) ||                \
              (obj <= BTRFS_CSUM_TREE_OBJECTID )) ? __show_root_type(obj) : "-"
 
+#define BTRFS_GROUP_FLAGS      \
+       { BTRFS_BLOCK_GROUP_DATA,       "DATA"}, \
+       { BTRFS_BLOCK_GROUP_SYSTEM,     "SYSTEM"}, \
+       { BTRFS_BLOCK_GROUP_METADATA,   "METADATA"}, \
+       { BTRFS_BLOCK_GROUP_RAID0,      "RAID0"}, \
+       { BTRFS_BLOCK_GROUP_RAID1,      "RAID1"}, \
+       { BTRFS_BLOCK_GROUP_DUP,        "DUP"}, \
+       { BTRFS_BLOCK_GROUP_RAID10,     "RAID10"}
+
+#define BTRFS_UUID_SIZE 16
+
 TRACE_EVENT(btrfs_transaction_commit,
 
        TP_PROTO(struct btrfs_root *root),
@@ -621,6 +634,34 @@ TRACE_EVENT(btrfs_cow_block,
                  __entry->cow_level)
 );
 
+TRACE_EVENT(btrfs_space_reservation,
+
+       TP_PROTO(struct btrfs_fs_info *fs_info, char *type, u64 val,
+                u64 bytes, int reserve),
+
+       TP_ARGS(fs_info, type, val, bytes, reserve),
+
+       TP_STRUCT__entry(
+               __array(        u8,     fsid,   BTRFS_UUID_SIZE )
+               __string(       type,   type                    )
+               __field(        u64,    val                     )
+               __field(        u64,    bytes                   )
+               __field(        int,    reserve                 )
+       ),
+
+       TP_fast_assign(
+               memcpy(__entry->fsid, fs_info->fsid, BTRFS_UUID_SIZE);
+               __assign_str(type, type);
+               __entry->val            = val;
+               __entry->bytes          = bytes;
+               __entry->reserve        = reserve;
+       ),
+
+       TP_printk("%pU: %s: %Lu %s %Lu", __entry->fsid, __get_str(type),
+                 __entry->val, __entry->reserve ? "reserve" : "release",
+                 __entry->bytes)
+);
+
 DECLARE_EVENT_CLASS(btrfs__reserved_extent,
 
        TP_PROTO(struct btrfs_root *root, u64 start, u64 len),
@@ -659,6 +700,168 @@ DEFINE_EVENT(btrfs__reserved_extent,  btrfs_reserved_extent_free,
        TP_ARGS(root, start, len)
 );
 
+TRACE_EVENT(find_free_extent,
+
+       TP_PROTO(struct btrfs_root *root, u64 num_bytes, u64 empty_size,
+                u64 data),
+
+       TP_ARGS(root, num_bytes, empty_size, data),
+
+       TP_STRUCT__entry(
+               __field(        u64,    root_objectid           )
+               __field(        u64,    num_bytes               )
+               __field(        u64,    empty_size              )
+               __field(        u64,    data                    )
+       ),
+
+       TP_fast_assign(
+               __entry->root_objectid  = root->root_key.objectid;
+               __entry->num_bytes      = num_bytes;
+               __entry->empty_size     = empty_size;
+               __entry->data           = data;
+       ),
+
+       TP_printk("root = %Lu(%s), len = %Lu, empty_size = %Lu, "
+                 "flags = %Lu(%s)", show_root_type(__entry->root_objectid),
+                 __entry->num_bytes, __entry->empty_size, __entry->data,
+                 __print_flags((unsigned long)__entry->data, "|",
+                                BTRFS_GROUP_FLAGS))
+);
+
+DECLARE_EVENT_CLASS(btrfs__reserve_extent,
+
+       TP_PROTO(struct btrfs_root *root,
+                struct btrfs_block_group_cache *block_group, u64 start,
+                u64 len),
+
+       TP_ARGS(root, block_group, start, len),
+
+       TP_STRUCT__entry(
+               __field(        u64,    root_objectid           )
+               __field(        u64,    bg_objectid             )
+               __field(        u64,    flags                   )
+               __field(        u64,    start                   )
+               __field(        u64,    len                     )
+       ),
+
+       TP_fast_assign(
+               __entry->root_objectid  = root->root_key.objectid;
+               __entry->bg_objectid    = block_group->key.objectid;
+               __entry->flags          = block_group->flags;
+               __entry->start          = start;
+               __entry->len            = len;
+       ),
+
+       TP_printk("root = %Lu(%s), block_group = %Lu, flags = %Lu(%s), "
+                 "start = %Lu, len = %Lu",
+                 show_root_type(__entry->root_objectid), __entry->bg_objectid,
+                 __entry->flags, __print_flags((unsigned long)__entry->flags,
+                                               "|", BTRFS_GROUP_FLAGS),
+                 __entry->start, __entry->len)
+);
+
+DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent,
+
+       TP_PROTO(struct btrfs_root *root,
+                struct btrfs_block_group_cache *block_group, u64 start,
+                u64 len),
+
+       TP_ARGS(root, block_group, start, len)
+);
+
+DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent_cluster,
+
+       TP_PROTO(struct btrfs_root *root,
+                struct btrfs_block_group_cache *block_group, u64 start,
+                u64 len),
+
+       TP_ARGS(root, block_group, start, len)
+);
+
+TRACE_EVENT(btrfs_find_cluster,
+
+       TP_PROTO(struct btrfs_block_group_cache *block_group, u64 start,
+                u64 bytes, u64 empty_size, u64 min_bytes),
+
+       TP_ARGS(block_group, start, bytes, empty_size, min_bytes),
+
+       TP_STRUCT__entry(
+               __field(        u64,    bg_objectid             )
+               __field(        u64,    flags                   )
+               __field(        u64,    start                   )
+               __field(        u64,    bytes                   )
+               __field(        u64,    empty_size              )
+               __field(        u64,    min_bytes               )
+       ),
+
+       TP_fast_assign(
+               __entry->bg_objectid    = block_group->key.objectid;
+               __entry->flags          = block_group->flags;
+               __entry->start          = start;
+               __entry->bytes          = bytes;
+               __entry->empty_size     = empty_size;
+               __entry->min_bytes      = min_bytes;
+       ),
+
+       TP_printk("block_group = %Lu, flags = %Lu(%s), start = %Lu, len = %Lu,"
+                 " empty_size = %Lu, min_bytes = %Lu", __entry->bg_objectid,
+                 __entry->flags,
+                 __print_flags((unsigned long)__entry->flags, "|",
+                               BTRFS_GROUP_FLAGS), __entry->start,
+                 __entry->bytes, __entry->empty_size,  __entry->min_bytes)
+);
+
+TRACE_EVENT(btrfs_failed_cluster_setup,
+
+       TP_PROTO(struct btrfs_block_group_cache *block_group),
+
+       TP_ARGS(block_group),
+
+       TP_STRUCT__entry(
+               __field(        u64,    bg_objectid             )
+       ),
+
+       TP_fast_assign(
+               __entry->bg_objectid    = block_group->key.objectid;
+       ),
+
+       TP_printk("block_group = %Lu", __entry->bg_objectid)
+);
+
+TRACE_EVENT(btrfs_setup_cluster,
+
+       TP_PROTO(struct btrfs_block_group_cache *block_group,
+                struct btrfs_free_cluster *cluster, u64 size, int bitmap),
+
+       TP_ARGS(block_group, cluster, size, bitmap),
+
+       TP_STRUCT__entry(
+               __field(        u64,    bg_objectid             )
+               __field(        u64,    flags                   )
+               __field(        u64,    start                   )
+               __field(        u64,    max_size                )
+               __field(        u64,    size                    )
+               __field(        int,    bitmap                  )
+       ),
+
+       TP_fast_assign(
+               __entry->bg_objectid    = block_group->key.objectid;
+               __entry->flags          = block_group->flags;
+               __entry->start          = cluster->window_start;
+               __entry->max_size       = cluster->max_size;
+               __entry->size           = size;
+               __entry->bitmap         = bitmap;
+       ),
+
+       TP_printk("block_group = %Lu, flags = %Lu(%s), window_start = %Lu, "
+                 "size = %Lu, max_size = %Lu, bitmap = %d",
+                 __entry->bg_objectid,
+                 __entry->flags,
+                 __print_flags((unsigned long)__entry->flags, "|",
+                               BTRFS_GROUP_FLAGS), __entry->start,
+                 __entry->size, __entry->max_size, __entry->bitmap)
+);
+
 #endif /* _TRACE_BTRFS_H */
 
 /* This part must be outside protection */
index 6ac2236..3f42cd6 100644 (file)
@@ -355,7 +355,7 @@ config AUDIT
 
 config AUDITSYSCALL
        bool "Enable system-call auditing support"
-       depends on AUDIT && (X86 || PPC || S390 || IA64 || UML || SPARC64 || SUPERH)
+       depends on AUDIT && (X86 || PPC || S390 || IA64 || UML || SPARC64 || SUPERH || ARM)
        default y if SECURITY_SELINUX
        help
          Enable low-overhead system-call auditing infrastructure that
@@ -372,6 +372,20 @@ config AUDIT_TREE
        depends on AUDITSYSCALL
        select FSNOTIFY
 
+config AUDIT_LOGINUID_IMMUTABLE
+       bool "Make audit loginuid immutable"
+       depends on AUDIT
+       help
+         The config option toggles if a task setting its loginuid requires
+         CAP_AUDIT_CONTROL or if that task should require no special permissions
+         but should instead only allow setting its loginuid if it was never
+         previously set.  On systems which use systemd or a similar central
+         process to restart login services this should be set to true.  On older
+         systems in which an admin would typically have to directly stop and
+         start processes this should be set to false.  Setting this to true allows
+         one to drop potentially dangerous capabilities from the login tasks,
+         but may not be backwards compatible with older init systems.
+
 source "kernel/irq/Kconfig"
 
 menu "RCU Subsystem"
index 57e3f51..bb0eb5b 100644 (file)
@@ -631,7 +631,7 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
        }
 
        *ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
-       audit_log_format(*ab, "user pid=%d uid=%u auid=%u ses=%u",
+       audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u",
                         pid, uid, auid, ses);
        if (sid) {
                rc = security_secid_to_secctx(sid, &ctx, &len);
@@ -1423,7 +1423,7 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
        char *p, *pathname;
 
        if (prefix)
-               audit_log_format(ab, " %s", prefix);
+               audit_log_format(ab, "%s", prefix);
 
        /* We will allow 11 spaces for ' (deleted)' to be appended */
        pathname = kmalloc(PATH_MAX+11, ab->gfp_mask);
index 91e7071..8167668 100644 (file)
@@ -36,12 +36,8 @@ enum audit_state {
        AUDIT_DISABLED,         /* Do not create per-task audit_context.
                                 * No syscall-specific audit records can
                                 * be generated. */
-       AUDIT_SETUP_CONTEXT,    /* Create the per-task audit_context,
-                                * but don't necessarily fill it in at
-                                * syscall entry time (i.e., filter
-                                * instead). */
        AUDIT_BUILD_CONTEXT,    /* Create the per-task audit_context,
-                                * and always fill it in at syscall
+                                * and fill it in at syscall
                                 * entry time.  This makes a full
                                 * syscall record available if some
                                 * other part of the kernel decides it
index f8277c8..a6c3f1a 100644 (file)
@@ -235,13 +235,15 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
        switch(listnr) {
        default:
                goto exit_err;
-       case AUDIT_FILTER_USER:
-       case AUDIT_FILTER_TYPE:
 #ifdef CONFIG_AUDITSYSCALL
        case AUDIT_FILTER_ENTRY:
+               if (rule->action == AUDIT_ALWAYS)
+                       goto exit_err;
        case AUDIT_FILTER_EXIT:
        case AUDIT_FILTER_TASK:
 #endif
+       case AUDIT_FILTER_USER:
+       case AUDIT_FILTER_TYPE:
                ;
        }
        if (unlikely(rule->action == AUDIT_POSSIBLE)) {
@@ -385,7 +387,7 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
                                goto exit_free;
                        break;
                case AUDIT_FILETYPE:
-                       if ((f->val & ~S_IFMT) > S_IFMT)
+                       if (f->val & ~S_IFMT)
                                goto exit_free;
                        break;
                case AUDIT_INODE:
@@ -459,6 +461,8 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
                case AUDIT_ARG1:
                case AUDIT_ARG2:
                case AUDIT_ARG3:
+               case AUDIT_OBJ_UID:
+               case AUDIT_OBJ_GID:
                        break;
                case AUDIT_ARCH:
                        entry->rule.arch_f = f;
@@ -522,7 +526,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
                                goto exit_free;
                        break;
                case AUDIT_FILTERKEY:
-                       err = -EINVAL;
                        if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN)
                                goto exit_free;
                        str = audit_unpack_string(&bufp, &remain, f->val);
@@ -536,7 +539,11 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
                                goto exit_free;
                        break;
                case AUDIT_FILETYPE:
-                       if ((f->val & ~S_IFMT) > S_IFMT)
+                       if (f->val & ~S_IFMT)
+                               goto exit_free;
+                       break;
+               case AUDIT_FIELD_COMPARE:
+                       if (f->val > AUDIT_MAX_FIELD_COMPARE)
                                goto exit_free;
                        break;
                default:
index e7fe2b0..caaea6e 100644 (file)
 
 #include "audit.h"
 
+/* flags stating the success for a syscall */
+#define AUDITSC_INVALID 0
+#define AUDITSC_SUCCESS 1
+#define AUDITSC_FAILURE 2
+
 /* AUDIT_NAMES is the number of slots we reserve in the audit_context
- * for saving names from getname(). */
-#define AUDIT_NAMES    20
+ * for saving names from getname().  If we get more names we will allocate
+ * a name dynamically and also add those to the list anchored by names_list. */
+#define AUDIT_NAMES    5
 
 /* Indicates that audit should log the full pathname. */
 #define AUDIT_NAME_FULL -1
@@ -101,9 +107,8 @@ struct audit_cap_data {
  *
  * Further, in fs/namei.c:path_lookup() we store the inode and device. */
 struct audit_names {
+       struct list_head list;          /* audit_context->names_list */
        const char      *name;
-       int             name_len;       /* number of name's characters to log */
-       unsigned        name_put;       /* call __putname() for this name */
        unsigned long   ino;
        dev_t           dev;
        umode_t         mode;
@@ -113,6 +118,14 @@ struct audit_names {
        u32             osid;
        struct audit_cap_data fcap;
        unsigned int    fcap_ver;
+       int             name_len;       /* number of name's characters to log */
+       bool            name_put;       /* call __putname() for this name */
+       /*
+        * This audit_names was allocated dynamically rather than taken
+        * from the array preallocated in the task audit context.  Thus
+        * it should be freed on syscall exit.
+        */
+       bool            should_free;
 };
 
 struct audit_aux_data {
@@ -174,8 +187,17 @@ struct audit_context {
        long                return_code;/* syscall return code */
        u64                 prio;
        int                 return_valid; /* return code is valid */
-       int                 name_count;
-       struct audit_names  names[AUDIT_NAMES];
+       /*
+        * The names_list is the list of all audit_names collected during this
+        * syscall.  The first AUDIT_NAMES entries in the names_list will
+        * actually be from the preallocated_names array for performance
+        * reasons.  Except during allocation they should never be referenced
+        * through the preallocated_names array and should only be found/used
+        * by running the names_list.
+        */
+       struct audit_names  preallocated_names[AUDIT_NAMES];
+       int                 name_count; /* total records in names_list */
+       struct list_head    names_list; /* anchor for struct audit_names->list */
        char *              filterkey;  /* key for rule that triggered record */
        struct path         pwd;
        struct audit_context *previous; /* For nested syscalls */
@@ -305,21 +327,21 @@ static int audit_match_perm(struct audit_context *ctx, int mask)
        }
 }
 
-static int audit_match_filetype(struct audit_context *ctx, int which)
+static int audit_match_filetype(struct audit_context *ctx, int val)
 {
-       unsigned index = which & ~S_IFMT;
-       umode_t mode = which & S_IFMT;
+       struct audit_names *n;
+       umode_t mode = (umode_t)val;
 
        if (unlikely(!ctx))
                return 0;
 
-       if (index >= ctx->name_count)
-               return 0;
-       if (ctx->names[index].ino == -1)
-               return 0;
-       if ((ctx->names[index].mode ^ mode) & S_IFMT)
-               return 0;
-       return 1;
+       list_for_each_entry(n, &ctx->names_list, list) {
+               if ((n->ino != -1) &&
+                   ((n->mode & S_IFMT) == mode))
+                       return 1;
+       }
+
+       return 0;
 }
 
 /*
@@ -441,6 +463,134 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree)
        return 0;
 }
 
+static int audit_compare_id(uid_t uid1,
+                           struct audit_names *name,
+                           unsigned long name_offset,
+                           struct audit_field *f,
+                           struct audit_context *ctx)
+{
+       struct audit_names *n;
+       unsigned long addr;
+       uid_t uid2;
+       int rc;
+
+       BUILD_BUG_ON(sizeof(uid_t) != sizeof(gid_t));
+
+       if (name) {
+               addr = (unsigned long)name;
+               addr += name_offset;
+
+               uid2 = *(uid_t *)addr;
+               rc = audit_comparator(uid1, f->op, uid2);
+               if (rc)
+                       return rc;
+       }
+
+       if (ctx) {
+               list_for_each_entry(n, &ctx->names_list, list) {
+                       addr = (unsigned long)n;
+                       addr += name_offset;
+
+                       uid2 = *(uid_t *)addr;
+
+                       rc = audit_comparator(uid1, f->op, uid2);
+                       if (rc)
+                               return rc;
+               }
+       }
+       return 0;
+}
+
+static int audit_field_compare(struct task_struct *tsk,
+                              const struct cred *cred,
+                              struct audit_field *f,
+                              struct audit_context *ctx,
+                              struct audit_names *name)
+{
+       switch (f->val) {
+       /* process to file object comparisons */
+       case AUDIT_COMPARE_UID_TO_OBJ_UID:
+               return audit_compare_id(cred->uid,
+                                       name, offsetof(struct audit_names, uid),
+                                       f, ctx);
+       case AUDIT_COMPARE_GID_TO_OBJ_GID:
+               return audit_compare_id(cred->gid,
+                                       name, offsetof(struct audit_names, gid),
+                                       f, ctx);
+       case AUDIT_COMPARE_EUID_TO_OBJ_UID:
+               return audit_compare_id(cred->euid,
+                                       name, offsetof(struct audit_names, uid),
+                                       f, ctx);
+       case AUDIT_COMPARE_EGID_TO_OBJ_GID:
+               return audit_compare_id(cred->egid,
+                                       name, offsetof(struct audit_names, gid),
+                                       f, ctx);
+       case AUDIT_COMPARE_AUID_TO_OBJ_UID:
+               return audit_compare_id(tsk->loginuid,
+                                       name, offsetof(struct audit_names, uid),
+                                       f, ctx);
+       case AUDIT_COMPARE_SUID_TO_OBJ_UID:
+               return audit_compare_id(cred->suid,
+                                       name, offsetof(struct audit_names, uid),
+                                       f, ctx);
+       case AUDIT_COMPARE_SGID_TO_OBJ_GID:
+               return audit_compare_id(cred->sgid,
+                                       name, offsetof(struct audit_names, gid),
+                                       f, ctx);
+       case AUDIT_COMPARE_FSUID_TO_OBJ_UID:
+               return audit_compare_id(cred->fsuid,
+                                       name, offsetof(struct audit_names, uid),
+                                       f, ctx);
+       case AUDIT_COMPARE_FSGID_TO_OBJ_GID:
+               return audit_compare_id(cred->fsgid,
+                                       name, offsetof(struct audit_names, gid),
+                                       f, ctx);
+       /* uid comparisons */
+       case AUDIT_COMPARE_UID_TO_AUID:
+               return audit_comparator(cred->uid, f->op, tsk->loginuid);
+       case AUDIT_COMPARE_UID_TO_EUID:
+               return audit_comparator(cred->uid, f->op, cred->euid);
+       case AUDIT_COMPARE_UID_TO_SUID:
+               return audit_comparator(cred->uid, f->op, cred->suid);
+       case AUDIT_COMPARE_UID_TO_FSUID:
+               return audit_comparator(cred->uid, f->op, cred->fsuid);
+       /* auid comparisons */
+       case AUDIT_COMPARE_AUID_TO_EUID:
+               return audit_comparator(tsk->loginuid, f->op, cred->euid);
+       case AUDIT_COMPARE_AUID_TO_SUID:
+               return audit_comparator(tsk->loginuid, f->op, cred->suid);
+       case AUDIT_COMPARE_AUID_TO_FSUID:
+               return audit_comparator(tsk->loginuid, f->op, cred->fsuid);
+       /* euid comparisons */
+       case AUDIT_COMPARE_EUID_TO_SUID:
+               return audit_comparator(cred->euid, f->op, cred->suid);
+       case AUDIT_COMPARE_EUID_TO_FSUID:
+               return audit_comparator(cred->euid, f->op, cred->fsuid);
+       /* suid comparisons */
+       case AUDIT_COMPARE_SUID_TO_FSUID:
+               return audit_comparator(cred->suid, f->op, cred->fsuid);
+       /* gid comparisons */
+       case AUDIT_COMPARE_GID_TO_EGID:
+               return audit_comparator(cred->gid, f->op, cred->egid);
+       case AUDIT_COMPARE_GID_TO_SGID:
+               return audit_comparator(cred->gid, f->op, cred->sgid);
+       case AUDIT_COMPARE_GID_TO_FSGID:
+               return audit_comparator(cred->gid, f->op, cred->fsgid);
+       /* egid comparisons */
+       case AUDIT_COMPARE_EGID_TO_SGID:
+               return audit_comparator(cred->egid, f->op, cred->sgid);
+       case AUDIT_COMPARE_EGID_TO_FSGID:
+               return audit_comparator(cred->egid, f->op, cred->fsgid);
+       /* sgid comparison */
+       case AUDIT_COMPARE_SGID_TO_FSGID:
+               return audit_comparator(cred->sgid, f->op, cred->fsgid);
+       default:
+               WARN(1, "Missing AUDIT_COMPARE define.  Report as a bug\n");
+               return 0;
+       }
+       return 0;
+}
+
 /* Determine if any context name data matches a rule's watch data */
 /* Compare a task_struct with an audit_rule.  Return 1 on match, 0
  * otherwise.
@@ -457,13 +607,14 @@ static int audit_filter_rules(struct task_struct *tsk,
                              bool task_creation)
 {
        const struct cred *cred;
-       int i, j, need_sid = 1;
+       int i, need_sid = 1;
        u32 sid;
 
        cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation);
 
        for (i = 0; i < rule->field_count; i++) {
                struct audit_field *f = &rule->fields[i];
+               struct audit_names *n;
                int result = 0;
 
                switch (f->type) {
@@ -522,12 +673,14 @@ static int audit_filter_rules(struct task_struct *tsk,
                        }
                        break;
                case AUDIT_DEVMAJOR:
-                       if (name)
-                               result = audit_comparator(MAJOR(name->dev),
-                                                         f->op, f->val);
-                       else if (ctx) {
-                               for (j = 0; j < ctx->name_count; j++) {
-                                       if (audit_comparator(MAJOR(ctx->names[j].dev),  f->op, f->val)) {
+                       if (name) {
+                               if (audit_comparator(MAJOR(name->dev), f->op, f->val) ||
+                                   audit_comparator(MAJOR(name->rdev), f->op, f->val))
+                                       ++result;
+                       } else if (ctx) {
+                               list_for_each_entry(n, &ctx->names_list, list) {
+                                       if (audit_comparator(MAJOR(n->dev), f->op, f->val) ||
+                                           audit_comparator(MAJOR(n->rdev), f->op, f->val)) {
                                                ++result;
                                                break;
                                        }
@@ -535,12 +688,14 @@ static int audit_filter_rules(struct task_struct *tsk,
                        }
                        break;
                case AUDIT_DEVMINOR:
-                       if (name)
-                               result = audit_comparator(MINOR(name->dev),
-                                                         f->op, f->val);
-                       else if (ctx) {
-                               for (j = 0; j < ctx->name_count; j++) {
-                                       if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) {
+                       if (name) {
+                               if (audit_comparator(MINOR(name->dev), f->op, f->val) ||
+                                   audit_comparator(MINOR(name->rdev), f->op, f->val))
+                                       ++result;
+                       } else if (ctx) {
+                               list_for_each_entry(n, &ctx->names_list, list) {
+                                       if (audit_comparator(MINOR(n->dev), f->op, f->val) ||
+                                           audit_comparator(MINOR(n->rdev), f->op, f->val)) {
                                                ++result;
                                                break;
                                        }
@@ -551,8 +706,32 @@ static int audit_filter_rules(struct task_struct *tsk,
                        if (name)
                                result = (name->ino == f->val);
                        else if (ctx) {
-                               for (j = 0; j < ctx->name_count; j++) {
-                                       if (audit_comparator(ctx->names[j].ino, f->op, f->val)) {
+                               list_for_each_entry(n, &ctx->names_list, list) {
+                                       if (audit_comparator(n->ino, f->op, f->val)) {
+                                               ++result;
+                                               break;
+                                       }
+                               }
+                       }
+                       break;
+               case AUDIT_OBJ_UID:
+                       if (name) {
+                               result = audit_comparator(name->uid, f->op, f->val);
+                       } else if (ctx) {
+                               list_for_each_entry(n, &ctx->names_list, list) {
+                                       if (audit_comparator(n->uid, f->op, f->val)) {
+                                               ++result;
+                                               break;
+                                       }
+                               }
+                       }
+                       break;
+               case AUDIT_OBJ_GID:
+                       if (name) {
+                               result = audit_comparator(name->gid, f->op, f->val);
+                       } else if (ctx) {
+                               list_for_each_entry(n, &ctx->names_list, list) {
+                                       if (audit_comparator(n->gid, f->op, f->val)) {
                                                ++result;
                                                break;
                                        }
@@ -607,11 +786,10 @@ static int audit_filter_rules(struct task_struct *tsk,
                                                   name->osid, f->type, f->op,
                                                   f->lsm_rule, ctx);
                                } else if (ctx) {
-                                       for (j = 0; j < ctx->name_count; j++) {
-                                               if (security_audit_rule_match(
-                                                     ctx->names[j].osid,
-                                                     f->type, f->op,
-                                                     f->lsm_rule, ctx)) {
+                                       list_for_each_entry(n, &ctx->names_list, list) {
+                                               if (security_audit_rule_match(n->osid, f->type,
+                                                                             f->op, f->lsm_rule,
+                                                                             ctx)) {
                                                        ++result;
                                                        break;
                                                }
@@ -643,8 +821,10 @@ static int audit_filter_rules(struct task_struct *tsk,
                case AUDIT_FILETYPE:
                        result = audit_match_filetype(ctx, f->val);
                        break;
+               case AUDIT_FIELD_COMPARE:
+                       result = audit_field_compare(tsk, cred, f, ctx, name);
+                       break;
                }
-
                if (!result)
                        return 0;
        }
@@ -722,40 +902,53 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
        return AUDIT_BUILD_CONTEXT;
 }
 
-/* At syscall exit time, this filter is called if any audit_names[] have been
+/*
+ * Check one audit_names entry against the rules in its inode's hash bucket.
+ * Called with the RCU read lock held to protect the use of audit_inode_hash.
+ */
+static int audit_filter_inode_name(struct task_struct *tsk,
+                                  struct audit_names *n,
+                                  struct audit_context *ctx) {
+       int word, bit;
+       int h = audit_hash_ino((u32)n->ino);
+       struct list_head *list = &audit_inode_hash[h];
+       struct audit_entry *e;
+       enum audit_state state;
+
+       word = AUDIT_WORD(ctx->major);
+       bit  = AUDIT_BIT(ctx->major);
+
+       if (list_empty(list))
+               return 0;
+
+       list_for_each_entry_rcu(e, list, list) {
+               if ((e->rule.mask[word] & bit) == bit &&
+                   audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) {
+                       ctx->current_state = state;
+                       return 1;
+               }
+       }
+
+       return 0;
+}
+
+/* At syscall exit time, this filter is called if any audit_names have been
  * collected during syscall processing.  We only check rules in sublists at hash
- * buckets applicable to the inode numbers in audit_names[].
+ * buckets applicable to the inode numbers in audit_names.
  * Regarding audit_state, same rules apply as for audit_filter_syscall().
  */
 void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
 {
-       int i;
-       struct audit_entry *e;
-       enum audit_state state;
+       struct audit_names *n;
 
        if (audit_pid && tsk->tgid == audit_pid)
                return;
 
        rcu_read_lock();
-       for (i = 0; i < ctx->name_count; i++) {
-               int word = AUDIT_WORD(ctx->major);
-               int bit  = AUDIT_BIT(ctx->major);
-               struct audit_names *n = &ctx->names[i];
-               int h = audit_hash_ino((u32)n->ino);
-               struct list_head *list = &audit_inode_hash[h];
-
-               if (list_empty(list))
-                       continue;
 
-               list_for_each_entry_rcu(e, list, list) {
-                       if ((e->rule.mask[word] & bit) == bit &&
-                           audit_filter_rules(tsk, &e->rule, ctx, n,
-                                              &state, false)) {
-                               rcu_read_unlock();
-                               ctx->current_state = state;
-                               return;
-                       }
-               }
+       list_for_each_entry(n, &ctx->names_list, list) {
+               if (audit_filter_inode_name(tsk, n, ctx))
+                       break;
        }
        rcu_read_unlock();
 }
@@ -766,7 +959,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
 {
        struct audit_context *context = tsk->audit_context;
 
-       if (likely(!context))
+       if (!context)
                return NULL;
        context->return_valid = return_valid;
 
@@ -799,7 +992,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
 
 static inline void audit_free_names(struct audit_context *context)
 {
-       int i;
+       struct audit_names *n, *next;
 
 #if AUDIT_DEBUG == 2
        if (context->put_count + context->ino_count != context->name_count) {
@@ -810,10 +1003,9 @@ static inline void audit_free_names(struct audit_context *context)
                       context->serial, context->major, context->in_syscall,
                       context->name_count, context->put_count,
                       context->ino_count);
-               for (i = 0; i < context->name_count; i++) {
+               list_for_each_entry(n, &context->names_list, list) {
                        printk(KERN_ERR "names[%d] = %p = %s\n", i,
-                              context->names[i].name,
-                              context->names[i].name ?: "(null)");
+                              n->name, n->name ?: "(null)");
                }
                dump_stack();
                return;
@@ -824,9 +1016,12 @@ static inline void audit_free_names(struct audit_context *context)
        context->ino_count  = 0;
 #endif
 
-       for (i = 0; i < context->name_count; i++) {
-               if (context->names[i].name && context->names[i].name_put)
-                       __putname(context->names[i].name);
+       list_for_each_entry_safe(n, next, &context->names_list, list) {
+               list_del(&n->list);
+               if (n->name && n->name_put)
+                       __putname(n->name);
+               if (n->should_free)
+                       kfree(n);
        }
        context->name_count = 0;
        path_put(&context->pwd);
@@ -864,6 +1059,7 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state)
                return NULL;
        audit_zero_context(context, state);
        INIT_LIST_HEAD(&context->killed_trees);
+       INIT_LIST_HEAD(&context->names_list);
        return context;
 }
 
@@ -886,7 +1082,7 @@ int audit_alloc(struct task_struct *tsk)
                return 0; /* Return if not auditing. */
 
        state = audit_filter_task(tsk, &key);
-       if (likely(state == AUDIT_DISABLED))
+       if (state == AUDIT_DISABLED)
                return 0;
 
        if (!(context = audit_alloc_context(state))) {
@@ -975,7 +1171,7 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk
                while (vma) {
                        if ((vma->vm_flags & VM_EXECUTABLE) &&
                            vma->vm_file) {
-                               audit_log_d_path(ab, "exe=",
+                               audit_log_d_path(ab, " exe=",
                                                 &vma->vm_file->f_path);
                                break;
                        }
@@ -1166,8 +1362,8 @@ static void audit_log_execve_info(struct audit_context *context,
                                  struct audit_buffer **ab,
                                  struct audit_aux_data_execve *axi)
 {
-       int i;
-       size_t len, len_sent = 0;
+       int i, len;
+       size_t len_sent = 0;
        const char __user *p;
        char *buf;
 
@@ -1324,6 +1520,68 @@ static void show_special(struct audit_context *context, int *call_panic)
        audit_log_end(ab);
 }
 
+static void audit_log_name(struct audit_context *context, struct audit_names *n,
+                          int record_num, int *call_panic)
+{
+       struct audit_buffer *ab;
+       ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
+       if (!ab)
+               return; /* audit_panic has been called */
+
+       audit_log_format(ab, "item=%d", record_num);
+
+       if (n->name) {
+               switch (n->name_len) {
+               case AUDIT_NAME_FULL:
+                       /* log the full path */
+                       audit_log_format(ab, " name=");
+                       audit_log_untrustedstring(ab, n->name);
+                       break;
+               case 0:
+                       /* name was specified as a relative path and the
+                        * directory component is the cwd */
+                       audit_log_d_path(ab, " name=", &context->pwd);
+                       break;
+               default:
+                       /* log the name's directory component */
+                       audit_log_format(ab, " name=");
+                       audit_log_n_untrustedstring(ab, n->name,
+                                                   n->name_len);
+               }
+       } else
+               audit_log_format(ab, " name=(null)");
+
+       if (n->ino != (unsigned long)-1) {
+               audit_log_format(ab, " inode=%lu"
+                                " dev=%02x:%02x mode=%#ho"
+                                " ouid=%u ogid=%u rdev=%02x:%02x",
+                                n->ino,
+                                MAJOR(n->dev),
+                                MINOR(n->dev),
+                                n->mode,
+                                n->uid,
+                                n->gid,
+                                MAJOR(n->rdev),
+                                MINOR(n->rdev));
+       }
+       if (n->osid != 0) {
+               char *ctx = NULL;
+               u32 len;
+               if (security_secid_to_secctx(
+                       n->osid, &ctx, &len)) {
+                       audit_log_format(ab, " osid=%u", n->osid);
+                       *call_panic = 2;
+               } else {
+                       audit_log_format(ab, " obj=%s", ctx);
+                       security_release_secctx(ctx, len);
+               }
+       }
+
+       audit_log_fcaps(ab, n);
+
+       audit_log_end(ab);
+}
+
 static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
 {
        const struct cred *cred;
@@ -1331,6 +1589,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
        struct audit_buffer *ab;
        struct audit_aux_data *aux;
        const char *tty;
+       struct audit_names *n;
 
        /* tsk == current */
        context->pid = tsk->pid;
@@ -1466,70 +1725,14 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
        if (context->pwd.dentry && context->pwd.mnt) {
                ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD);
                if (ab) {
-                       audit_log_d_path(ab, "cwd=", &context->pwd);
+                       audit_log_d_path(ab, " cwd=", &context->pwd);
                        audit_log_end(ab);
                }
        }
-       for (i = 0; i < context->name_count; i++) {
-               struct audit_names *n = &context->names[i];
 
-               ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
-               if (!ab)
-                       continue; /* audit_panic has been called */
-
-               audit_log_format(ab, "item=%d", i);
-
-               if (n->name) {
-                       switch(n->name_len) {
-                       case AUDIT_NAME_FULL:
-                               /* log the full path */
-                               audit_log_format(ab, " name=");
-                               audit_log_untrustedstring(ab, n->name);
-                               break;
-                       case 0:
-                               /* name was specified as a relative path and the
-                                * directory component is the cwd */
-                               audit_log_d_path(ab, "name=", &context->pwd);
-                               break;
-                       default:
-                               /* log the name's directory component */
-                               audit_log_format(ab, " name=");
-                               audit_log_n_untrustedstring(ab, n->name,
-                                                           n->name_len);
-                       }
-               } else
-                       audit_log_format(ab, " name=(null)");
-
-               if (n->ino != (unsigned long)-1) {
-                       audit_log_format(ab, " inode=%lu"
-                                        " dev=%02x:%02x mode=%#ho"
-                                        " ouid=%u ogid=%u rdev=%02x:%02x",
-                                        n->ino,
-                                        MAJOR(n->dev),
-                                        MINOR(n->dev),
-                                        n->mode,
-                                        n->uid,
-                                        n->gid,
-                                        MAJOR(n->rdev),
-                                        MINOR(n->rdev));
-               }
-               if (n->osid != 0) {
-                       char *ctx = NULL;
-                       u32 len;
-                       if (security_secid_to_secctx(
-                               n->osid, &ctx, &len)) {
-                               audit_log_format(ab, " osid=%u", n->osid);
-                               call_panic = 2;
-                       } else {
-                               audit_log_format(ab, " obj=%s", ctx);
-                               security_release_secctx(ctx, len);
-                       }
-               }
-
-               audit_log_fcaps(ab, n);
-
-               audit_log_end(ab);
-       }
+       i = 0;
+       list_for_each_entry(n, &context->names_list, list)
+               audit_log_name(context, n, i++, &call_panic);
 
        /* Send end of event record to help user space know we are finished */
        ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE);
@@ -1545,12 +1748,12 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
  *
  * Called from copy_process and do_exit
  */
-void audit_free(struct task_struct *tsk)
+void __audit_free(struct task_struct *tsk)
 {
        struct audit_context *context;
 
        context = audit_get_context(tsk, 0, 0);
-       if (likely(!context))
+       if (!context)
                return;
 
        /* Check for system calls that do not go through the exit
@@ -1583,7 +1786,7 @@ void audit_free(struct task_struct *tsk)
  * will only be written if another part of the kernel requests that it
  * be written).
  */
-void audit_syscall_entry(int arch, int major,
+void __audit_syscall_entry(int arch, int major,
                         unsigned long a1, unsigned long a2,
                         unsigned long a3, unsigned long a4)
 {
@@ -1591,7 +1794,7 @@ void audit_syscall_entry(int arch, int major,
        struct audit_context *context = tsk->audit_context;
        enum audit_state     state;
 
-       if (unlikely(!context))
+       if (!context)
                return;
 
        /*
@@ -1648,7 +1851,7 @@ void audit_syscall_entry(int arch, int major,
                context->prio = 0;
                state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]);
        }
-       if (likely(state == AUDIT_DISABLED))
+       if (state == AUDIT_DISABLED)
                return;
 
        context->serial     = 0;
@@ -1658,30 +1861,9 @@ void audit_syscall_entry(int arch, int major,
        context->ppid       = 0;
 }
 
-void audit_finish_fork(struct task_struct *child)
-{
-       struct audit_context *ctx = current->audit_context;
-       struct audit_context *p = child->audit_context;
-       if (!p || !ctx)
-               return;
-       if (!ctx->in_syscall || ctx->current_state != AUDIT_RECORD_CONTEXT)
-               return;
-       p->arch = ctx->arch;
-       p->major = ctx->major;
-       memcpy(p->argv, ctx->argv, sizeof(ctx->argv));
-       p->ctime = ctx->ctime;
-       p->dummy = ctx->dummy;
-       p->in_syscall = ctx->in_syscall;
-       p->filterkey = kstrdup(ctx->filterkey, GFP_KERNEL);
-       p->ppid = current->pid;
-       p->prio = ctx->prio;
-       p->current_state = ctx->current_state;
-}
-
 /**
  * audit_syscall_exit - deallocate audit context after a system call
- * @valid: success/failure flag
- * @return_code: syscall return value
+ * @success: nonzero if the syscall succeeded; @return_code: the syscall's return value
  *
  * Tear down after system call.  If the audit context has been marked as
  * auditable (either because of the AUDIT_RECORD_CONTEXT state from
@@ -1689,14 +1871,18 @@ void audit_finish_fork(struct task_struct *child)
  * message), then write out the syscall information.  In call cases,
  * free the names stored from getname().
  */
-void audit_syscall_exit(int valid, long return_code)
+void __audit_syscall_exit(int success, long return_code)
 {
        struct task_struct *tsk = current;
        struct audit_context *context;
 
-       context = audit_get_context(tsk, valid, return_code);
+       if (success)
+               success = AUDITSC_SUCCESS;
+       else
+               success = AUDITSC_FAILURE;
 
-       if (likely(!context))
+       context = audit_get_context(tsk, success, return_code);
+       if (!context)
                return;
 
        if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
@@ -1821,6 +2007,30 @@ retry:
 #endif
 }
 
+static struct audit_names *audit_alloc_name(struct audit_context *context)
+{
+       struct audit_names *aname;
+
+       if (context->name_count < AUDIT_NAMES) {
+               aname = &context->preallocated_names[context->name_count];
+               memset(aname, 0, sizeof(*aname));
+       } else {
+               aname = kzalloc(sizeof(*aname), GFP_NOFS);
+               if (!aname)
+                       return NULL;
+               aname->should_free = true;
+       }
+
+       aname->ino = (unsigned long)-1;
+       list_add_tail(&aname->list, &context->names_list);
+
+       context->name_count++;
+#if AUDIT_DEBUG
+       context->ino_count++;
+#endif
+       return aname;
+}
+
 /**
  * audit_getname - add a name to the list
  * @name: name to add
@@ -1831,9 +2041,7 @@ retry:
 void __audit_getname(const char *name)
 {
        struct audit_context *context = current->audit_context;
-
-       if (IS_ERR(name) || !name)
-               return;
+       struct audit_names *n;
 
        if (!context->in_syscall) {
 #if AUDIT_DEBUG == 2
@@ -1843,13 +2051,15 @@ void __audit_getname(const char *name)
 #endif
                return;
        }
-       BUG_ON(context->name_count >= AUDIT_NAMES);
-       context->names[context->name_count].name = name;
-       context->names[context->name_count].name_len = AUDIT_NAME_FULL;
-       context->names[context->name_count].name_put = 1;
-       context->names[context->name_count].ino  = (unsigned long)-1;
-       context->names[context->name_count].osid = 0;
-       ++context->name_count;
+
+       n = audit_alloc_name(context);
+       if (!n)
+               return;
+
+       n->name = name;
+       n->name_len = AUDIT_NAME_FULL;
+       n->name_put = true;
+
        if (!context->pwd.dentry)
                get_fs_pwd(current->fs, &context->pwd);
 }
@@ -1871,12 +2081,13 @@ void audit_putname(const char *name)
                printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n",
                       __FILE__, __LINE__, context->serial, name);
                if (context->name_count) {
+                       struct audit_names *n;
                        int i;
-                       for (i = 0; i < context->name_count; i++)
+
+                       list_for_each_entry(n, &context->names_list, list)
                                printk(KERN_ERR "name[%d] = %p = %s\n", i,
-                                      context->names[i].name,
-                                      context->names[i].name ?: "(null)");
-               }
+                                      n->name, n->name ?: "(null)");
+                       }
 #endif
                __putname(name);
        }
@@ -1897,39 +2108,11 @@ void audit_putname(const char *name)
 #endif
 }
 
-static int audit_inc_name_count(struct audit_context *context,
-                               const struct inode *inode)
-{
-       if (context->name_count >= AUDIT_NAMES) {
-               if (inode)
-                       printk(KERN_DEBUG "audit: name_count maxed, losing inode data: "
-                              "dev=%02x:%02x, inode=%lu\n",
-                              MAJOR(inode->i_sb->s_dev),
-                              MINOR(inode->i_sb->s_dev),
-                              inode->i_ino);
-
-               else
-                       printk(KERN_DEBUG "name_count maxed, losing inode data\n");
-               return 1;
-       }
-       context->name_count++;
-#if AUDIT_DEBUG
-       context->ino_count++;
-#endif
-       return 0;
-}
-
-
 static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry)
 {
        struct cpu_vfs_cap_data caps;
        int rc;
 
-       memset(&name->fcap.permitted, 0, sizeof(kernel_cap_t));
-       memset(&name->fcap.inheritable, 0, sizeof(kernel_cap_t));
-       name->fcap.fE = 0;
-       name->fcap_ver = 0;
-
        if (!dentry)
                return 0;
 
@@ -1969,30 +2152,25 @@ static void audit_copy_inode(struct audit_names *name, const struct dentry *dent
  */
 void __audit_inode(const char *name, const struct dentry *dentry)
 {
-       int idx;
        struct audit_context *context = current->audit_context;
        const struct inode *inode = dentry->d_inode;
+       struct audit_names *n;
 
        if (!context->in_syscall)
                return;
-       if (context->name_count
-           && context->names[context->name_count-1].name
-           && context->names[context->name_count-1].name == name)
-               idx = context->name_count - 1;
-       else if (context->name_count > 1
-                && context->names[context->name_count-2].name
-                && context->names[context->name_count-2].name == name)
-               idx = context->name_count - 2;
-       else {
-               /* FIXME: how much do we care about inodes that have no
-                * associated name? */
-               if (audit_inc_name_count(context, inode))
-                       return;
-               idx = context->name_count - 1;
-               context->names[idx].name = NULL;
+
+       list_for_each_entry_reverse(n, &context->names_list, list) {
+               if (n->name && (n->name == name))
+                       goto out;
        }
+
+       /* unable to find the name from a previous getname() */
+       n = audit_alloc_name(context);
+       if (!n)
+               return;
+out:
        handle_path(dentry);
-       audit_copy_inode(&context->names[idx], dentry, inode);
+       audit_copy_inode(n, dentry, inode);
 }
 
 /**
@@ -2011,11 +2189,11 @@ void __audit_inode(const char *name, const struct dentry *dentry)
 void __audit_inode_child(const struct dentry *dentry,
                         const struct inode *parent)
 {
-       int idx;
        struct audit_context *context = current->audit_context;
        const char *found_parent = NULL, *found_child = NULL;
        const struct inode *inode = dentry->d_inode;
        const char *dname = dentry->d_name.name;
+       struct audit_names *n;
        int dirlen = 0;
 
        if (!context->in_syscall)
@@ -2025,9 +2203,7 @@ void __audit_inode_child(const struct dentry *dentry,
                handle_one(inode);
 
        /* parent is more likely, look for it first */
-       for (idx = 0; idx < context->name_count; idx++) {
-               struct audit_names *n = &context->names[idx];
-
+       list_for_each_entry(n, &context->names_list, list) {
                if (!n->name)
                        continue;
 
@@ -2040,9 +2216,7 @@ void __audit_inode_child(const struct dentry *dentry,
        }
 
        /* no matching parent, look for matching child */
-       for (idx = 0; idx < context->name_count; idx++) {
-               struct audit_names *n = &context->names[idx];
-
+       list_for_each_entry(n, &context->names_list, list) {
                if (!n->name)
                        continue;
 
@@ -2060,34 +2234,29 @@ void __audit_inode_child(const struct dentry *dentry,
 
 add_names:
        if (!found_parent) {
-               if (audit_inc_name_count(context, parent))
+               n = audit_alloc_name(context);
+               if (!n)
                        return;
-               idx = context->name_count - 1;
-               context->names[idx].name = NULL;
-               audit_copy_inode(&context->names[idx], NULL, parent);
+               audit_copy_inode(n, NULL, parent);
        }
 
        if (!found_child) {
-               if (audit_inc_name_count(context, inode))
+               n = audit_alloc_name(context);
+               if (!n)
                        return;
-               idx = context->name_count - 1;
 
                /* Re-use the name belonging to the slot for a matching parent
                 * directory. All names for this context are relinquished in
                 * audit_free_names() */
                if (found_parent) {
-                       context->names[idx].name = found_parent;
-                       context->names[idx].name_len = AUDIT_NAME_FULL;
+                       n->name = found_parent;
+                       n->name_len = AUDIT_NAME_FULL;
                        /* don't call __putname() */
-                       context->names[idx].name_put = 0;
-               } else {
-                       context->names[idx].name = NULL;
+                       n->name_put = false;
                }
 
                if (inode)
-                       audit_copy_inode(&context->names[idx], NULL, inode);
-               else
-                       context->names[idx].ino = (unsigned long)-1;
+                       audit_copy_inode(n, NULL, inode);
        }
 }
 EXPORT_SYMBOL_GPL(__audit_inode_child);
@@ -2121,19 +2290,28 @@ int auditsc_get_stamp(struct audit_context *ctx,
 static atomic_t session_id = ATOMIC_INIT(0);
 
 /**
- * audit_set_loginuid - set a task's audit_context loginuid
- * @task: task whose audit context is being modified
+ * audit_set_loginuid - set current task's audit_context loginuid
  * @loginuid: loginuid value
  *
  * Returns 0.
  *
  * Called (set) from fs/proc/base.c::proc_loginuid_write().
  */
-int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
+int audit_set_loginuid(uid_t loginuid)
 {
-       unsigned int sessionid = atomic_inc_return(&session_id);
+       struct task_struct *task = current;
        struct audit_context *context = task->audit_context;
+       unsigned int sessionid;
+
+#ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE
+       if (task->loginuid != -1)
+               return -EPERM;
+#else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */
+       if (!capable(CAP_AUDIT_CONTROL))
+               return -EPERM;
+#endif  /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */
 
+       sessionid = atomic_inc_return(&session_id);
        if (context && context->in_syscall) {
                struct audit_buffer *ab;
 
@@ -2271,14 +2449,11 @@ void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mo
        context->ipc.has_perm = 1;
 }
 
-int audit_bprm(struct linux_binprm *bprm)
+int __audit_bprm(struct linux_binprm *bprm)
 {
        struct audit_aux_data_execve *ax;
        struct audit_context *context = current->audit_context;
 
-       if (likely(!audit_enabled || !context || context->dummy))
-               return 0;
-
        ax = kmalloc(sizeof(*ax), GFP_KERNEL);
        if (!ax)
                return -ENOMEM;
@@ -2299,13 +2474,10 @@ int audit_bprm(struct linux_binprm *bprm)
  * @args: args array
  *
  */
-void audit_socketcall(int nargs, unsigned long *args)
+void __audit_socketcall(int nargs, unsigned long *args)
 {
        struct audit_context *context = current->audit_context;
 
-       if (likely(!context || context->dummy))
-               return;
-
        context->type = AUDIT_SOCKETCALL;
        context->socketcall.nargs = nargs;
        memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long));
@@ -2331,13 +2503,10 @@ void __audit_fd_pair(int fd1, int fd2)
  *
  * Returns 0 for success or NULL context or < 0 on error.
  */
-int audit_sockaddr(int len, void *a)
+int __audit_sockaddr(int len, void *a)
 {
        struct audit_context *context = current->audit_context;
 
-       if (likely(!context || context->dummy))
-               return 0;
-
        if (!context->sockaddr) {
                void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL);
                if (!p)
@@ -2499,6 +2668,25 @@ void __audit_mmap_fd(int fd, int flags)
        context->type = AUDIT_MMAP;
 }
 
+static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr)
+{
+       uid_t auid, uid;
+       gid_t gid;
+       unsigned int sessionid;
+
+       auid = audit_get_loginuid(current);
+       sessionid = audit_get_sessionid(current);
+       current_uid_gid(&uid, &gid);
+
+       audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u",
+                        auid, uid, gid, sessionid);
+       audit_log_task_context(ab);
+       audit_log_format(ab, " pid=%d comm=", current->pid);
+       audit_log_untrustedstring(ab, current->comm);
+       audit_log_format(ab, " reason=");
+       audit_log_string(ab, reason);
+       audit_log_format(ab, " sig=%ld", signr);
+}
 /**
  * audit_core_dumps - record information about processes that end abnormally
  * @signr: signal value
@@ -2509,10 +2697,6 @@ void __audit_mmap_fd(int fd, int flags)
 void audit_core_dumps(long signr)
 {
        struct audit_buffer *ab;
-       u32 sid;
-       uid_t auid = audit_get_loginuid(current), uid;
-       gid_t gid;
-       unsigned int sessionid = audit_get_sessionid(current);
 
        if (!audit_enabled)
                return;
@@ -2521,24 +2705,17 @@ void audit_core_dumps(long signr)
                return;
 
        ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
-       current_uid_gid(&uid, &gid);
-       audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u",
-                        auid, uid, gid, sessionid);
-       security_task_getsecid(current, &sid);
-       if (sid) {
-               char *ctx = NULL;
-               u32 len;
+       audit_log_abend(ab, "memory violation", signr);
+       audit_log_end(ab);
+}
 
-               if (security_secid_to_secctx(sid, &ctx, &len))
-                       audit_log_format(ab, " ssid=%u", sid);
-               else {
-                       audit_log_format(ab, " subj=%s", ctx);
-                       security_release_secctx(ctx, len);
-               }
-       }
-       audit_log_format(ab, " pid=%d comm=", current->pid);
-       audit_log_untrustedstring(ab, current->comm);
-       audit_log_format(ab, " sig=%ld", signr);
+void __audit_seccomp(unsigned long syscall)
+{
+       struct audit_buffer *ab;
+
+       ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
+       audit_log_abend(ab, "seccomp", SIGKILL);
+       audit_log_format(ab, " syscall=%ld", syscall);
        audit_log_end(ab);
 }
 
index 0fcf1c1..3f1adb6 100644 (file)
@@ -384,7 +384,7 @@ bool ns_capable(struct user_namespace *ns, int cap)
                BUG();
        }
 
-       if (has_ns_capability(current, ns, cap)) {
+       if (security_capable(current_cred(), ns, cap) == 0) {
                current->flags |= PF_SUPERPRIV;
                return true;
        }
index c447382..294b170 100644 (file)
@@ -964,8 +964,7 @@ void do_exit(long code)
        acct_collect(code, group_dead);
        if (group_dead)
                tty_audit_exit();
-       if (unlikely(tsk->audit_context))
-               audit_free(tsk);
+       audit_free(tsk);
 
        tsk->exit_code = code;
        taskstats_exit(tsk, group_dead);
index f3fa188..051f090 100644 (file)
@@ -1527,8 +1527,6 @@ long do_fork(unsigned long clone_flags,
                        init_completion(&vfork);
                }
 
-               audit_finish_fork(p);
-
                /*
                 * We set PF_STARTING at creation in case tracing wants to
                 * use this to distinguish a fully live task from one that
index 57d4b13..e8d76c5 100644 (file)
@@ -6,6 +6,7 @@
  * This defines a simple but solid secure-computing mode.
  */
 
+#include <linux/audit.h>
 #include <linux/seccomp.h>
 #include <linux/sched.h>
 #include <linux/compat.h>
@@ -54,6 +55,7 @@ void __secure_computing(int this_syscall)
 #ifdef SECCOMP_DEBUG
        dump_stack();
 #endif
+       audit_seccomp(this_syscall);
        do_exit(SIGKILL);
 }
 
index c5c5a72..2ad942f 100644 (file)
@@ -56,9 +56,11 @@ void integrity_audit_msg(int audit_msgno, struct inode *inode,
                audit_log_format(ab, " name=");
                audit_log_untrustedstring(ab, fname);
        }
-       if (inode)
-               audit_log_format(ab, " dev=%s ino=%lu",
-                                inode->i_sb->s_id, inode->i_ino);
+       if (inode) {
+               audit_log_format(ab, " dev=");
+               audit_log_untrustedstring(ab, inode->i_sb->s_id);
+               audit_log_format(ab, " ino=%lu", inode->i_ino);
+       }
        audit_log_format(ab, " res=%d", !result ? 0 : 1);
        audit_log_end(ab);
 }
index 7bd6f13..293b8c4 100644 (file)
@@ -232,13 +232,14 @@ static void dump_common_audit_data(struct audit_buffer *ab,
        case LSM_AUDIT_DATA_PATH: {
                struct inode *inode;
 
-               audit_log_d_path(ab, "path=", &a->u.path);
+               audit_log_d_path(ab, " path=", &a->u.path);
 
                inode = a->u.path.dentry->d_inode;
-               if (inode)
-                       audit_log_format(ab, " dev=%s ino=%lu",
-                                       inode->i_sb->s_id,
-                                       inode->i_ino);
+               if (inode) {
+                       audit_log_format(ab, " dev=");
+                       audit_log_untrustedstring(ab, inode->i_sb->s_id);
+                       audit_log_format(ab, " ino=%lu", inode->i_ino);
+               }
                break;
        }
        case LSM_AUDIT_DATA_DENTRY: {
@@ -248,10 +249,11 @@ static void dump_common_audit_data(struct audit_buffer *ab,
                audit_log_untrustedstring(ab, a->u.dentry->d_name.name);
 
                inode = a->u.dentry->d_inode;
-               if (inode)
-                       audit_log_format(ab, " dev=%s ino=%lu",
-                                       inode->i_sb->s_id,
-                                       inode->i_ino);
+               if (inode) {
+                       audit_log_format(ab, " dev=");
+                       audit_log_untrustedstring(ab, inode->i_sb->s_id);
+                       audit_log_format(ab, " ino=%lu", inode->i_ino);
+               }
                break;
        }
        case LSM_AUDIT_DATA_INODE: {
@@ -266,8 +268,9 @@ static void dump_common_audit_data(struct audit_buffer *ab,
                                         dentry->d_name.name);
                        dput(dentry);
                }
-               audit_log_format(ab, " dev=%s ino=%lu", inode->i_sb->s_id,
-                                inode->i_ino);
+               audit_log_format(ab, " dev=");
+               audit_log_untrustedstring(ab, inode->i_sb->s_id);
+               audit_log_format(ab, " ino=%lu", inode->i_ino);
                break;
        }
        case LSM_AUDIT_DATA_TASK:
@@ -315,7 +318,7 @@ static void dump_common_audit_data(struct audit_buffer *ab,
                                                .dentry = u->dentry,
                                                .mnt = u->mnt
                                        };
-                                       audit_log_d_path(ab, "path=", &path);
+                                       audit_log_d_path(ab, " path=", &path);
                                        break;
                                }
                                if (!u->addr)
index ad40938..b413ed0 100644 (file)
@@ -12,6 +12,9 @@ config SND_HWDEP
 config SND_RAWMIDI
        tristate
 
+config SND_COMPRESS_OFFLOAD
+       tristate
+
 # To be effective this also requires INPUT - users should say:
 #    select SND_JACK if INPUT=y || INPUT=SND
 # to avoid having to force INPUT on.
@@ -154,16 +157,6 @@ config SND_DYNAMIC_MINORS
 
          If you are unsure about this, say N here.
 
-config SND_COMPRESS_OFFLOAD
-       tristate "ALSA Compressed audio offload support"
-       default n
-       help
-         If you want support for offloading compressed audio and have such
-         a hardware, then you should say Y here and also to the DSP driver
-         of your platform.
-
-         If you are unsure about this, say N here.
-
 config SND_SUPPORT_OLD_API
        bool "Support old ALSA API"
        default y
index 762bb10..f13ad53 100644 (file)
@@ -268,8 +268,14 @@ snd_vortex_probe(struct pci_dev *pci, const struct pci_device_id *pci_id)
                card->shortname, chip->io, chip->irq);
 
        // (4) Alloc components.
+       err = snd_vortex_mixer(chip);
+       if (err < 0) {
+               snd_card_free(card);
+               return err;
+       }
        // ADB pcm.
-       if ((err = snd_vortex_new_pcm(chip, VORTEX_PCM_ADB, NR_ADB)) < 0) {
+       err = snd_vortex_new_pcm(chip, VORTEX_PCM_ADB, NR_PCM);
+       if (err < 0) {
                snd_card_free(card);
                return err;
        }
@@ -299,11 +305,6 @@ snd_vortex_probe(struct pci_dev *pci, const struct pci_device_id *pci_id)
                return err;
        }
 #endif
-       // snd_ac97_mixer and Vortex mixer.
-       if ((err = snd_vortex_mixer(chip)) < 0) {
-               snd_card_free(card);
-               return err;
-       }
        if ((err = snd_vortex_midi(chip)) < 0) {
                snd_card_free(card);
                return err;
index 02f6e08..bb93815 100644 (file)
 #define MIX_SPDIF(x) (vortex->mixspdif[x])
 
 #define NR_WTPB 0x20           /* WT channels per each bank. */
+#define NR_PCM 0x10
 
 /* Structs */
 typedef struct {
index 0488633..0ef2f97 100644 (file)
@@ -168,6 +168,7 @@ static int snd_vortex_pcm_open(struct snd_pcm_substream *substream)
                        runtime->hw = snd_vortex_playback_hw_adb;
 #ifdef CHIP_AU8830
                if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK &&
+                       VORTEX_IS_QUAD(vortex) &&
                        VORTEX_PCM_TYPE(substream->pcm) == VORTEX_PCM_ADB) {
                        runtime->hw.channels_max = 4;
                        snd_pcm_hw_constraint_list(runtime, 0,
index 0852e20..fb35474 100644 (file)
@@ -2498,6 +2498,7 @@ static struct snd_pci_quirk position_fix_list[] __devinitdata = {
        SND_PCI_QUIRK(0x1043, 0x81b3, "ASUS", POS_FIX_LPIB),
        SND_PCI_QUIRK(0x1043, 0x81e7, "ASUS M2V", POS_FIX_LPIB),
        SND_PCI_QUIRK(0x104d, 0x9069, "Sony VPCS11V9E", POS_FIX_LPIB),
+       SND_PCI_QUIRK(0x10de, 0xcb89, "Macbook Pro 7,1", POS_FIX_LPIB),
        SND_PCI_QUIRK(0x1297, 0x3166, "Shuttle", POS_FIX_LPIB),
        SND_PCI_QUIRK(0x1458, 0xa022, "ga-ma770-ud3", POS_FIX_LPIB),
        SND_PCI_QUIRK(0x1462, 0x1002, "MSI Wind U115", POS_FIX_LPIB),
index 87e684f..3556408 100644 (file)
@@ -1596,7 +1596,7 @@ static const struct snd_pci_quirk stac92hd73xx_cfg_tbl[] = {
        SND_PCI_QUIRK(PCI_VENDOR_ID_DELL, 0x02bd,
                                "Dell Studio 1557", STAC_DELL_M6_DMIC),
        SND_PCI_QUIRK(PCI_VENDOR_ID_DELL, 0x02fe,
-                               "Dell Studio XPS 1645", STAC_DELL_M6_BOTH),
+                               "Dell Studio XPS 1645", STAC_DELL_M6_DMIC),
        SND_PCI_QUIRK(PCI_VENDOR_ID_DELL, 0x0413,
                                "Dell Studio 1558", STAC_DELL_M6_DMIC),
        {} /* terminator */
index 478303e..63cff90 100644 (file)
@@ -177,6 +177,7 @@ static void wm8776_registers_init(struct oxygen *chip)
        struct xonar_wm87x6 *data = chip->model_data;
 
        wm8776_write(chip, WM8776_RESET, 0);
+       wm8776_write(chip, WM8776_PHASESWAP, WM8776_PH_MASK);
        wm8776_write(chip, WM8776_DACCTRL1, WM8776_DZCEN |
                     WM8776_PL_LEFT_LEFT | WM8776_PL_RIGHT_RIGHT);
        wm8776_write(chip, WM8776_DACMUTE, chip->dac_mute ? WM8776_DMUTE : 0);