From: Grazvydas Ignotas Date: Thu, 23 May 2013 21:42:23 +0000 (+0300) Subject: Merge branch 'stable-3.2' into pandora-3.2 X-Git-Tag: sz_155~29 X-Git-Url: https://git.openpandora.org/cgi-bin/gitweb.cgi?p=pandora-kernel.git;a=commitdiff_plain;h=8fa23b093b442c8fc5fc8d0090eeb7b120d97a6a;hp=-c Merge branch 'stable-3.2' into pandora-3.2 --- 8fa23b093b442c8fc5fc8d0090eeb7b120d97a6a diff --combined Documentation/kernel-parameters.txt index 45182ac0d42a,897f223dcf3e..74f6fdd3669e --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@@ -503,11 -503,6 +503,11 @@@ bytes respectively. Such letter suffixe Also note the kernel might malfunction if you disable some critical bits. + cma=nn[MG] [ARM,KNL] + Sets the size of kernel global memory area for contiguous + memory allocations. For more information, see + include/linux/dma-contiguous.h + cmo_free_hint= [PPC] Format: { yes | no } Specify whether pages are marked as being inactive when they are freed. This is used in CMO environments @@@ -515,10 -510,6 +515,10 @@@ a hypervisor. Default: yes + coherent_pool=nn[KMG] [ARM,KNL] + Sets the size of memory pool for coherent, atomic dma + allocations if Contiguous Memory Allocator (CMA) is used. + code_bytes [X86] How many bytes of object code to print in an oops report. Range: 0 - 8192 @@@ -561,6 -552,8 +561,8 @@@ UART at the specified I/O port or MMIO address, switching to the matching ttyS device later. The options are the same as for ttyS, above. + hvc Use the hypervisor console device . This is for + both Xen and PowerPC hypervisors. If the device connected to the port is not a TTY but a braille device, prepend "brl," before the device type, for instance @@@ -632,25 -625,6 +634,25 @@@ no_debug_objects [KNL] Disable object debugging + debug_guardpage_minorder= + [KNL] When CONFIG_DEBUG_PAGEALLOC is set, this + parameter allows control of the order of pages that will + be intentionally kept free (and hence protected) by the + buddy allocator. Bigger value increase the probability + of catching random memory corruption, but reduce the + amount of memory for normal system use. The maximum + possible value is MAX_ORDER/2. Setting this parameter + to 1 or 2 should be enough to identify most random + memory corruption problems caused by bugs in kernel or + driver code when a CPU writes to (or reads from) a + random memory location. Note that there exists a class + of memory corruptions problems caused by buggy H/W or + F/W or by drivers badly programing DMA (basically when + memory is written at bus level and the CPU MMU is + bypassed) which are not detectable by + CONFIG_DEBUG_PAGEALLOC, hence this option will not help + tracking down these problems. + debugpat [X86] Enable PAT debugging decnet.addr= [HW,NET] @@@ -731,6 -705,7 +733,7 @@@ earlyprintk= [X86,SH,BLACKFIN] earlyprintk=vga + earlyprintk=xen earlyprintk=serial[,ttySn[,baudrate]] earlyprintk=ttySn[,baudrate] earlyprintk=dbgp[debugController#] @@@ -748,6 -723,8 +751,8 @@@ The VGA output is eventually overwritten by the real console. + The xen output can only be used by Xen PV guests. + ekgdboc= [X86,KGDB] Allow early kernel console debugging ekgdboc=kbd @@@ -971,6 -948,20 +976,20 @@@ i8k.restricted [HW] Allow controlling fans only if SYS_ADMIN capability is set. + i915.invert_brightness= + [DRM] Invert the sense of the variable that is used to + set the brightness of the panel backlight. 
Normally a + brightness value of 0 indicates backlight switched off, + and the maximum of the brightness value sets the backlight + to maximum brightness. If this parameter is set to 0 + (default) and the machine requires it, or this parameter + is set to 1, a brightness value of 0 sets the backlight + to maximum brightness, and the maximum of the brightness + value switches the backlight off. + -1 -- never invert brightness + 0 -- machine default + 1 -- force brightness inversion + icn= [HW,ISDN] Format: [,[,[,]]] diff --combined arch/arm/mm/alignment.c index caf14dc059e5,a125c4bfa7ef..13e96ab54254 --- a/arch/arm/mm/alignment.c +++ b/arch/arm/mm/alignment.c @@@ -749,7 -749,6 +749,6 @@@ do_alignment(unsigned long addr, unsign unsigned long instr = 0, instrptr; int (*handler)(unsigned long addr, unsigned long instr, struct pt_regs *regs); unsigned int type; - mm_segment_t fs; unsigned int fault; u16 tinstr = 0; int isize = 4; @@@ -760,16 -759,15 +759,15 @@@ instrptr = instruction_pointer(regs); - fs = get_fs(); - set_fs(KERNEL_DS); if (thumb_mode(regs)) { - fault = __get_user(tinstr, (u16 *)(instrptr & ~1)); + u16 *ptr = (u16 *)(instrptr & ~1); + fault = probe_kernel_address(ptr, tinstr); if (!fault) { if (cpu_architecture() >= CPU_ARCH_ARMv7 && IS_T32(tinstr)) { /* Thumb-2 32-bit */ u16 tinst2 = 0; - fault = __get_user(tinst2, (u16 *)(instrptr+2)); + fault = probe_kernel_address(ptr + 1, tinst2); instr = (tinstr << 16) | tinst2; thumb2_32b = 1; } else { @@@ -778,8 -776,7 +776,7 @@@ } } } else - fault = __get_user(instr, (u32 *)instrptr); - set_fs(fs); + fault = probe_kernel_address(instrptr, instr); if (fault) { type = TYPE_FAULT; @@@ -968,7 -965,7 +965,7 @@@ static int __init alignment_init(void ai_usermode = safe_usermode(ai_usermode, false); } - hook_fault_code(1, do_alignment, SIGBUS, BUS_ADRALN, + hook_fault_code(FAULT_CODE_ALIGNMENT, do_alignment, SIGBUS, BUS_ADRALN, "alignment exception"); /* diff --combined arch/arm/mm/proc-arm920.S index cb941ae95f66,927a639133ce..aeeb12658603 --- a/arch/arm/mm/proc-arm920.S +++ b/arch/arm/mm/proc-arm920.S @@@ -85,7 -85,6 +85,7 @@@ ENTRY(cpu_arm920_proc_fin * loc: location to jump to for soft reset */ .align 5 + .pushsection .idmap.text, "ax" ENTRY(cpu_arm920_reset) mov ip, #0 mcr p15, 0, ip, c7, c7, 0 @ invalidate I,D caches @@@ -98,8 -97,6 +98,8 @@@ bic ip, ip, #0x1100 @ ...i...s........ mcr p15, 0, ip, c1, c0, 0 @ ctrl register mov pc, r0 +ENDPROC(cpu_arm920_reset) + .popsection /* * cpu_arm920_do_idle() @@@ -383,7 -380,7 +383,7 @@@ ENTRY(cpu_arm920_set_pte_ext /* Suspend/resume support: taken from arch/arm/plat-s3c24xx/sleep.S */ .globl cpu_arm920_suspend_size .equ cpu_arm920_suspend_size, 4 * 3 - #ifdef CONFIG_PM_SLEEP + #ifdef CONFIG_ARM_CPU_SUSPEND ENTRY(cpu_arm920_do_suspend) stmfd sp!, {r4 - r6, lr} mrc p15, 0, r4, c13, c0, 0 @ PID diff --combined arch/arm/mm/proc-arm926.S index 820259b81a1f,090f18f1dba4..ee29dc465371 --- a/arch/arm/mm/proc-arm926.S +++ b/arch/arm/mm/proc-arm926.S @@@ -77,7 -77,6 +77,7 @@@ ENTRY(cpu_arm926_proc_fin * loc: location to jump to for soft reset */ .align 5 + .pushsection .idmap.text, "ax" ENTRY(cpu_arm926_reset) mov ip, #0 mcr p15, 0, ip, c7, c7, 0 @ invalidate I,D caches @@@ -90,8 -89,6 +90,8 @@@ bic ip, ip, #0x1100 @ ...i...s........ 
mcr p15, 0, ip, c1, c0, 0 @ ctrl register mov pc, r0 +ENDPROC(cpu_arm926_reset) + .popsection /* * cpu_arm926_do_idle() @@@ -398,7 -395,7 +398,7 @@@ ENTRY(cpu_arm926_set_pte_ext /* Suspend/resume support: taken from arch/arm/plat-s3c24xx/sleep.S */ .globl cpu_arm926_suspend_size .equ cpu_arm926_suspend_size, 4 * 3 - #ifdef CONFIG_PM_SLEEP + #ifdef CONFIG_ARM_CPU_SUSPEND ENTRY(cpu_arm926_do_suspend) stmfd sp!, {r4 - r6, lr} mrc p15, 0, r4, c13, c0, 0 @ PID diff --combined arch/arm/mm/proc-sa1100.S index 3aa0da11fd84,6594aef83269..d92dfd081429 --- a/arch/arm/mm/proc-sa1100.S +++ b/arch/arm/mm/proc-sa1100.S @@@ -70,7 -70,6 +70,7 @@@ ENTRY(cpu_sa1100_proc_fin * loc: location to jump to for soft reset */ .align 5 + .pushsection .idmap.text, "ax" ENTRY(cpu_sa1100_reset) mov ip, #0 mcr p15, 0, ip, c7, c7, 0 @ invalidate I,D caches @@@ -83,8 -82,6 +83,8 @@@ bic ip, ip, #0x1100 @ ...i...s........ mcr p15, 0, ip, c1, c0, 0 @ ctrl register mov pc, r0 +ENDPROC(cpu_sa1100_reset) + .popsection /* * cpu_sa1100_do_idle(type) @@@ -172,7 -169,7 +172,7 @@@ ENTRY(cpu_sa1100_set_pte_ext .globl cpu_sa1100_suspend_size .equ cpu_sa1100_suspend_size, 4 * 3 - #ifdef CONFIG_PM_SLEEP + #ifdef CONFIG_ARM_CPU_SUSPEND ENTRY(cpu_sa1100_do_suspend) stmfd sp!, {r4 - r6, lr} mrc p15, 0, r4, c3, c0, 0 @ domain ID diff --combined arch/arm/mm/proc-v6.S index 5900cd520e84,8168d99444b0..897486c5d5f4 --- a/arch/arm/mm/proc-v6.S +++ b/arch/arm/mm/proc-v6.S @@@ -55,7 -55,6 +55,7 @@@ ENTRY(cpu_v6_proc_fin * - loc - location to jump to for soft reset */ .align 5 + .pushsection .idmap.text, "ax" ENTRY(cpu_v6_reset) mrc p15, 0, r1, c1, c0, 0 @ ctrl register bic r1, r1, #0x1 @ ...............m @@@ -63,8 -62,6 +63,8 @@@ mov r1, #0 mcr p15, 0, r1, c7, c5, 4 @ ISB mov pc, r0 +ENDPROC(cpu_v6_reset) + .popsection /* * cpu_v6_do_idle() @@@ -132,7 -129,7 +132,7 @@@ ENTRY(cpu_v6_set_pte_ext /* Suspend/resume support: taken from arch/arm/mach-s3c64xx/sleep.S */ .globl cpu_v6_suspend_size .equ cpu_v6_suspend_size, 4 * 6 - #ifdef CONFIG_PM_SLEEP + #ifdef CONFIG_ARM_CPU_SUSPEND ENTRY(cpu_v6_do_suspend) stmfd sp!, {r4 - r9, lr} mrc p15, 0, r4, c13, c0, 0 @ FCSE/PID diff --combined arch/arm/mm/proc-xsc3.S index b0d57869da2d,5c4969dab571..a2d1e8646efc --- a/arch/arm/mm/proc-xsc3.S +++ b/arch/arm/mm/proc-xsc3.S @@@ -105,7 -105,6 +105,7 @@@ ENTRY(cpu_xsc3_proc_fin * loc: location to jump to for soft reset */ .align 5 + .pushsection .idmap.text, "ax" ENTRY(cpu_xsc3_reset) mov r1, #PSR_F_BIT|PSR_I_BIT|SVC_MODE msr cpsr_c, r1 @ reset CPSR @@@ -120,8 -119,6 +120,8 @@@ @ already containing those two last instructions to survive. mcr p15, 0, ip, c8, c7, 0 @ invalidate I and D TLBs mov pc, r0 +ENDPROC(cpu_xsc3_reset) + .popsection /* * cpu_xsc3_do_idle() @@@ -410,7 -407,7 +410,7 @@@ ENTRY(cpu_xsc3_set_pte_ext .globl cpu_xsc3_suspend_size .equ cpu_xsc3_suspend_size, 4 * 6 - #ifdef CONFIG_PM_SLEEP + #ifdef CONFIG_ARM_CPU_SUSPEND ENTRY(cpu_xsc3_do_suspend) stmfd sp!, {r4 - r9, lr} mrc p14, 0, r4, c6, c0, 0 @ clock configuration, for turbo mode diff --combined arch/arm/mm/proc-xscale.S index 4ffebaa595ee,b09d036e15e2..98821530ae70 --- a/arch/arm/mm/proc-xscale.S +++ b/arch/arm/mm/proc-xscale.S @@@ -142,7 -142,6 +142,7 @@@ ENTRY(cpu_xscale_proc_fin * Beware PXA270 erratum E7. */ .align 5 + .pushsection .idmap.text, "ax" ENTRY(cpu_xscale_reset) mov r1, #PSR_F_BIT|PSR_I_BIT|SVC_MODE msr cpsr_c, r1 @ reset CPSR @@@ -161,8 -160,6 +161,8 @@@ @ already containing those two last instructions to survive. 
mcr p15, 0, ip, c8, c7, 0 @ invalidate I & D TLBs mov pc, r0 +ENDPROC(cpu_xscale_reset) + .popsection /* * cpu_xscale_do_idle() @@@ -524,7 -521,7 +524,7 @@@ ENTRY(cpu_xscale_set_pte_ext .globl cpu_xscale_suspend_size .equ cpu_xscale_suspend_size, 4 * 6 - #ifdef CONFIG_PM_SLEEP + #ifdef CONFIG_ARM_CPU_SUSPEND ENTRY(cpu_xscale_do_suspend) stmfd sp!, {r4 - r9, lr} mrc p14, 0, r4, c6, c0, 0 @ clock configuration, for turbo mode diff --combined arch/x86/Kconfig index 98ddd098e5b5,9a4270315dac..2b1b88ee420a --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@@ -73,7 -73,6 +73,7 @@@ config X8 select IRQ_FORCED_THREADING select USE_GENERIC_SMP_HELPERS if SMP select HAVE_BPF_JIT if (X86_64 && NET) + select HAVE_ARCH_TRANSPARENT_HUGEPAGE select CLKEVT_I8253 select ARCH_HAVE_NMI_SAFE_CMPXCHG @@@ -1151,7 -1150,7 +1151,7 @@@ config DIRECT_GBPAGE config NUMA bool "Numa Memory Allocation and Scheduler Support" depends on SMP - depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI) && EXPERIMENTAL) + depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI) && BROKEN) default y if (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP) ---help--- Enable NUMA (Non Uniform Memory Access) support. diff --combined fs/cifs/cifsfs.c index 833345324418,25bb97fc0d54..a5c996f0c43b --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@@ -54,7 -54,6 +54,7 @@@ int cifsFYI = 0 int cifsERROR = 1; int traceSMB = 0; bool enable_oplocks = true; +bool no_serverino_autodisable = false; unsigned int linuxExtEnabled = 1; unsigned int lookupCacheEnabled = 1; unsigned int multiuser_mount = 0; @@@ -86,12 -85,35 +86,36 @@@ MODULE_PARM_DESC(echo_retries, "Number module_param(enable_oplocks, bool, 0644); MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks (bool). Default:" "y/Y/1"); +module_param(no_serverino_autodisable, bool, 0644); extern mempool_t *cifs_sm_req_poolp; extern mempool_t *cifs_req_poolp; extern mempool_t *cifs_mid_poolp; + /* + * Bumps refcount for cifs super block. + * Note that it should be only called if a referece to VFS super block is + * already held, e.g. in open-type syscalls context. Otherwise it can race with + * atomic_dec_and_test in deactivate_locked_super. + */ + void + cifs_sb_active(struct super_block *sb) + { + struct cifs_sb_info *server = CIFS_SB(sb); + + if (atomic_inc_return(&server->active) == 1) + atomic_inc(&sb->s_active); + } + + void + cifs_sb_deactive(struct super_block *sb) + { + struct cifs_sb_info *server = CIFS_SB(sb); + + if (atomic_dec_and_test(&server->active)) + deactivate_super(sb); + } + static int cifs_read_super(struct super_block *sb) { @@@ -563,6 -585,11 +587,11 @@@ cifs_get_root(struct smb_vol *vol, stru dentry = ERR_PTR(-ENOENT); break; } + if (!S_ISDIR(dir->i_mode)) { + dput(dentry); + dentry = ERR_PTR(-ENOTDIR); + break; + } /* skip separators */ while (*s == sep) diff --combined fs/exec.c index d4ff5ed2359c,312e297c4871..3aa5c56c3954 --- a/fs/exec.c +++ b/fs/exec.c @@@ -1385,6 -1385,10 +1385,10 @@@ int search_binary_handler(struct linux_ struct linux_binfmt *fmt; pid_t old_pid; + /* This allows 4 levels of binfmt rewrites before failing hard. 
*/ + if (depth > 5) + return -ELOOP; + retval = security_bprm_check(bprm); if (retval) return retval; @@@ -1408,12 -1412,8 +1412,8 @@@ if (!try_module_get(fmt->module)) continue; read_unlock(&binfmt_lock); + bprm->recursion_depth = depth + 1; retval = fn(bprm, regs); - /* - * Restore the depth counter to its starting value - * in this call, so we don't have to rely on every - * load_binary function to restore it on return. - */ bprm->recursion_depth = depth; if (retval >= 0) { if (depth == 0) @@@ -2092,8 -2092,8 +2092,8 @@@ static int umh_pipe_setup(struct subpro fd_install(0, rp); spin_lock(&cf->file_lock); fdt = files_fdtable(cf); - FD_SET(0, fdt->open_fds); - FD_CLR(0, fdt->close_on_exec); + __set_open_fd(0, fdt); + __clear_close_on_exec(0, fdt); spin_unlock(&cf->file_lock); /* and disallow core files too */ diff --combined fs/inode.c index 728042bff2f2,e2d3633a9c05..d62cb51261bf --- a/fs/inode.c +++ b/fs/inode.c @@@ -65,7 -65,6 +65,7 @@@ static struct hlist_head *inode_hashtab static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock); +EXPORT_SYMBOL(inode_sb_list_lock); /* * Empty aops. Can be used for the cases where the user does not @@@ -635,7 -634,7 +635,7 @@@ void prune_icache_sb(struct super_bloc * inode to the back of the list so we don't spin on it. */ if (!spin_trylock(&inode->i_lock)) { - list_move_tail(&inode->i_lru, &sb->s_inode_lru); + list_move(&inode->i_lru, &sb->s_inode_lru); continue; } diff --combined fs/ubifs/orphan.c index f3b1f9493e86,f9c90b552452..574d7a0a1eb7 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@@ -52,7 -52,11 +52,7 @@@ * than the maximum number of orphans allowed. */ -#ifdef CONFIG_UBIFS_FS_DEBUG static int dbg_check_orphans(struct ubifs_info *c); -#else -#define dbg_check_orphans(c) 0 -#endif /** * ubifs_add_orphan - add an orphan. 
@@@ -88,7 -92,7 +88,7 @@@ int ubifs_add_orphan(struct ubifs_info else if (inum > o->inum) p = &(*p)->rb_right; else { - dbg_err("orphaned twice"); + ubifs_err("orphaned twice"); spin_unlock(&c->orphan_lock); kfree(orphan); return 0; @@@ -126,13 -130,14 +126,14 @@@ void ubifs_delete_orphan(struct ubifs_i else if (inum > o->inum) p = p->rb_right; else { - if (o->dnext) { + if (o->del) { spin_unlock(&c->orphan_lock); dbg_gen("deleted twice ino %lu", (unsigned long)inum); return; } if (o->cnext) { + o->del = 1; o->dnext = c->orph_dnext; c->orph_dnext = o; spin_unlock(&c->orphan_lock); @@@ -154,8 -159,8 +155,8 @@@ } } spin_unlock(&c->orphan_lock); - dbg_err("missing orphan ino %lu", (unsigned long)inum); - dbg_dump_stack(); + ubifs_err("missing orphan ino %lu", (unsigned long)inum); + dump_stack(); } /** @@@ -244,7 -249,8 +245,7 @@@ static int do_write_orph_node(struct ub ubifs_assert(c->ohead_offs == 0); ubifs_prepare_node(c, c->orph_buf, len, 1); len = ALIGN(len, c->min_io_size); - err = ubifs_leb_change(c, c->ohead_lnum, c->orph_buf, len, - UBI_SHORTTERM); + err = ubifs_leb_change(c, c->ohead_lnum, c->orph_buf, len); } else { if (c->ohead_offs == 0) { /* Ensure LEB has been unmapped */ @@@ -253,7 -259,7 +254,7 @@@ return err; } err = ubifs_write_node(c, c->orph_buf, len, c->ohead_lnum, - c->ohead_offs, UBI_SHORTTERM); + c->ohead_offs); } return err; } @@@ -442,6 -448,7 +443,7 @@@ static void erase_deleted(struct ubifs_ orphan = dnext; dnext = orphan->dnext; ubifs_assert(!orphan->new); + ubifs_assert(orphan->del); rb_erase(&orphan->rb, &c->orph_tree); list_del(&orphan->list); c->tot_orphans -= 1; @@@ -531,6 -538,7 +533,7 @@@ static int insert_dead_orphan(struct ub rb_link_node(&orphan->rb, parent, p); rb_insert_color(&orphan->rb, &c->orph_tree); list_add_tail(&orphan->list, &c->orph_list); + orphan->del = 1; orphan->dnext = c->orph_dnext; c->orph_dnext = orphan; dbg_mnt("ino %lu, new %d, tot %d", (unsigned long)inum, @@@ -562,9 -570,9 +565,9 @@@ static int do_kill_orphans(struct ubifs list_for_each_entry(snod, &sleb->nodes, list) { if (snod->type != UBIFS_ORPH_NODE) { - ubifs_err("invalid node type %d in orphan area at " - "%d:%d", snod->type, sleb->lnum, snod->offs); - dbg_dump_node(c, snod->node); + ubifs_err("invalid node type %d in orphan area at %d:%d", + snod->type, sleb->lnum, snod->offs); + ubifs_dump_node(c, snod->node); return -EINVAL; } @@@ -589,9 -597,10 +592,9 @@@ * number. That makes this orphan node, out of date. */ if (!first) { - ubifs_err("out of order commit number %llu in " - "orphan node at %d:%d", + ubifs_err("out of order commit number %llu in orphan node at %d:%d", cmt_no, sleb->lnum, snod->offs); - dbg_dump_node(c, snod->node); + ubifs_dump_node(c, snod->node); return -EINVAL; } dbg_rcvry("out of date LEB %d", sleb->lnum); @@@ -719,9 -728,7 +722,9 @@@ int ubifs_mount_orphans(struct ubifs_in return err; } -#ifdef CONFIG_UBIFS_FS_DEBUG +/* + * Everything below is related to debugging. 
+ */ struct check_orphan { struct rb_node rb; @@@ -964,3 -971,5 +967,3 @@@ out kfree(ci.node); return err; } - -#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --combined fs/ubifs/super.c index 6b9b96620a02,2f467e56cf95..3a22e6d1bc90 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@@ -89,8 -89,9 +89,8 @@@ static int validate_inode(struct ubifs_ return 5; if (!ubifs_compr_present(ui->compr_type)) { - ubifs_warn("inode %lu uses '%s' compression, but it was not " - "compiled in", inode->i_ino, - ubifs_compr_name(ui->compr_type)); + ubifs_warn("inode %lu uses '%s' compression, but it was not compiled in", + inode->i_ino, ubifs_compr_name(ui->compr_type)); } err = dbg_check_dir(c, inode); @@@ -245,8 -246,8 +245,8 @@@ struct inode *ubifs_iget(struct super_b out_invalid: ubifs_err("inode %lu validation failed, error %d", inode->i_ino, err); - dbg_dump_node(c, ino); - dbg_dump_inode(c, inode); + ubifs_dump_node(c, ino); + ubifs_dump_inode(c, inode); err = -EINVAL; out_ino: kfree(ino); @@@ -668,8 -669,8 +668,8 @@@ static int init_constants_sb(struct ubi tmp = UBIFS_CS_NODE_SZ + UBIFS_REF_NODE_SZ * c->jhead_cnt; tmp = ALIGN(tmp, c->min_io_size); if (tmp > c->leb_size) { - dbg_err("too small LEB size %d, at least %d needed", - c->leb_size, tmp); + ubifs_err("too small LEB size %d, at least %d needed", + c->leb_size, tmp); return -EINVAL; } @@@ -683,8 -684,8 +683,8 @@@ tmp /= c->leb_size; tmp += 1; if (c->log_lebs < tmp) { - dbg_err("too small log %d LEBs, required min. %d LEBs", - c->log_lebs, tmp); + ubifs_err("too small log %d LEBs, required min. %d LEBs", + c->log_lebs, tmp); return -EINVAL; } @@@ -813,10 -814,13 +813,10 @@@ static int alloc_wbufs(struct ubifs_inf c->jheads[i].grouped = 1; } - c->jheads[BASEHD].wbuf.dtype = UBI_SHORTTERM; /* - * Garbage Collector head likely contains long-term data and - * does not need to be synchronized by timer. Also GC head nodes are - * not grouped. + * Garbage Collector head does not need to be synchronized by timer. + * Also GC head nodes are not grouped. 
*/ - c->jheads[GCHD].wbuf.dtype = UBI_LONGTERM; c->jheads[GCHD].wbuf.no_timer = 1; c->jheads[GCHD].grouped = 0; @@@ -860,7 -864,7 +860,7 @@@ static void free_orphans(struct ubifs_i orph = list_entry(c->orph_list.next, struct ubifs_orphan, list); list_del(&orph->list); kfree(orph); - dbg_err("orphan list not empty at unmount"); + ubifs_err("orphan list not empty at unmount"); } vfree(c->orph_buf); @@@ -1061,8 -1065,8 +1061,8 @@@ static int ubifs_parse_options(struct u flag = parse_standard_option(p); if (!flag) { - ubifs_err("unrecognized mount option \"%s\" " - "or missing value", p); + ubifs_err("unrecognized mount option \"%s\" or missing value", + p); return -EINVAL; } sb->s_flags |= flag; @@@ -1124,8 -1128,8 +1124,8 @@@ again } /* Just disable bulk-read */ - ubifs_warn("Cannot allocate %d bytes of memory for bulk-read, " - "disabling it", c->max_bu_buf_len); + ubifs_warn("cannot allocate %d bytes of memory for bulk-read, disabling it", + c->max_bu_buf_len); c->mount_opts.bulk_read = 1; c->bulk_read = 0; return; @@@ -1144,8 -1148,8 +1144,8 @@@ static int check_free_space(struct ubif ubifs_assert(c->dark_wm > 0); if (c->lst.total_free + c->lst.total_dirty < c->dark_wm) { ubifs_err("insufficient free space to mount in R/W mode"); - dbg_dump_budg(c, &c->bi); - dbg_dump_lprops(c); + ubifs_dump_budg(c, &c->bi); + ubifs_dump_lprops(c); return -ENOSPC; } return 0; @@@ -1164,7 -1168,7 +1164,7 @@@ static int mount_ubifs(struct ubifs_info *c) { int err; - long long x; + long long x, y; size_t sz; c->ro_mount = !!(c->vfs_sb->s_flags & MS_RDONLY); @@@ -1298,7 -1302,7 +1298,7 @@@ if (!c->ro_mount && c->space_fixup) { err = ubifs_fixup_free_space(c); if (err) - goto out_master; + goto out_lpt; } if (!c->ro_mount) { @@@ -1414,69 -1418,75 +1414,69 @@@ c->mounting = 0; - ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"", - c->vi.ubi_num, c->vi.vol_id, c->vi.name); - if (c->ro_mount) - ubifs_msg("mounted read-only"); + ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"%s", + c->vi.ubi_num, c->vi.vol_id, c->vi.name, + c->ro_mount ? ", R/O mode" : NULL); x = (long long)c->main_lebs * c->leb_size; - ubifs_msg("file system size: %lld bytes (%lld KiB, %lld MiB, %d " - "LEBs)", x, x >> 10, x >> 20, c->main_lebs); - x = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes; - ubifs_msg("journal size: %lld bytes (%lld KiB, %lld MiB, %d " - "LEBs)", x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt); - ubifs_msg("media format: w%d/r%d (latest is w%d/r%d)", + y = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes; + ubifs_msg("LEB size: %d bytes (%d KiB), min./max. I/O unit sizes: %d bytes/%d bytes", + c->leb_size, c->leb_size >> 10, c->min_io_size, + c->max_write_size); + ubifs_msg("FS size: %lld bytes (%lld MiB, %d LEBs), journal size %lld bytes (%lld MiB, %d LEBs)", + x, x >> 20, c->main_lebs, + y, y >> 20, c->log_lebs + c->max_bud_cnt); + ubifs_msg("reserved for root: %llu bytes (%llu KiB)", + c->report_rp_size, c->report_rp_size >> 10); + ubifs_msg("media format: w%d/r%d (latest is w%d/r%d), UUID %pUB%s", c->fmt_version, c->ro_compat_version, - UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION); - ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr)); - ubifs_msg("reserved for root: %llu bytes (%llu KiB)", - c->report_rp_size, c->report_rp_size >> 10); - - dbg_msg("compiled on: " __DATE__ " at " __TIME__); - dbg_msg("min. I/O unit size: %d bytes", c->min_io_size); - dbg_msg("max. 
write size: %d bytes", c->max_write_size); - dbg_msg("LEB size: %d bytes (%d KiB)", - c->leb_size, c->leb_size >> 10); - dbg_msg("data journal heads: %d", + UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION, c->uuid, + c->big_lpt ? ", big LPT model" : ", small LPT model"); + + dbg_gen("default compressor: %s", ubifs_compr_name(c->default_compr)); + dbg_gen("data journal heads: %d", c->jhead_cnt - NONDATA_JHEADS_CNT); - dbg_msg("UUID: %pUB", c->uuid); - dbg_msg("big_lpt %d", c->big_lpt); - dbg_msg("log LEBs: %d (%d - %d)", + dbg_gen("log LEBs: %d (%d - %d)", c->log_lebs, UBIFS_LOG_LNUM, c->log_last); - dbg_msg("LPT area LEBs: %d (%d - %d)", + dbg_gen("LPT area LEBs: %d (%d - %d)", c->lpt_lebs, c->lpt_first, c->lpt_last); - dbg_msg("orphan area LEBs: %d (%d - %d)", + dbg_gen("orphan area LEBs: %d (%d - %d)", c->orph_lebs, c->orph_first, c->orph_last); - dbg_msg("main area LEBs: %d (%d - %d)", + dbg_gen("main area LEBs: %d (%d - %d)", c->main_lebs, c->main_first, c->leb_cnt - 1); - dbg_msg("index LEBs: %d", c->lst.idx_lebs); - dbg_msg("total index bytes: %lld (%lld KiB, %lld MiB)", + dbg_gen("index LEBs: %d", c->lst.idx_lebs); + dbg_gen("total index bytes: %lld (%lld KiB, %lld MiB)", c->bi.old_idx_sz, c->bi.old_idx_sz >> 10, c->bi.old_idx_sz >> 20); - dbg_msg("key hash type: %d", c->key_hash_type); - dbg_msg("tree fanout: %d", c->fanout); - dbg_msg("reserved GC LEB: %d", c->gc_lnum); - dbg_msg("first main LEB: %d", c->main_first); - dbg_msg("max. znode size %d", c->max_znode_sz); - dbg_msg("max. index node size %d", c->max_idx_node_sz); - dbg_msg("node sizes: data %zu, inode %zu, dentry %zu", + dbg_gen("key hash type: %d", c->key_hash_type); + dbg_gen("tree fanout: %d", c->fanout); + dbg_gen("reserved GC LEB: %d", c->gc_lnum); + dbg_gen("max. znode size %d", c->max_znode_sz); + dbg_gen("max. index node size %d", c->max_idx_node_sz); + dbg_gen("node sizes: data %zu, inode %zu, dentry %zu", UBIFS_DATA_NODE_SZ, UBIFS_INO_NODE_SZ, UBIFS_DENT_NODE_SZ); - dbg_msg("node sizes: trun %zu, sb %zu, master %zu", + dbg_gen("node sizes: trun %zu, sb %zu, master %zu", UBIFS_TRUN_NODE_SZ, UBIFS_SB_NODE_SZ, UBIFS_MST_NODE_SZ); - dbg_msg("node sizes: ref %zu, cmt. start %zu, orph %zu", + dbg_gen("node sizes: ref %zu, cmt. start %zu, orph %zu", UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ); - dbg_msg("max. node sizes: data %zu, inode %zu dentry %zu, idx %d", + dbg_gen("max. node sizes: data %zu, inode %zu dentry %zu, idx %d", UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ, UBIFS_MAX_DENT_NODE_SZ, ubifs_idx_node_sz(c, c->fanout)); - dbg_msg("dead watermark: %d", c->dead_wm); - dbg_msg("dark watermark: %d", c->dark_wm); - dbg_msg("LEB overhead: %d", c->leb_overhead); + dbg_gen("dead watermark: %d", c->dead_wm); + dbg_gen("dark watermark: %d", c->dark_wm); + dbg_gen("LEB overhead: %d", c->leb_overhead); x = (long long)c->main_lebs * c->dark_wm; - dbg_msg("max. dark space: %lld (%lld KiB, %lld MiB)", + dbg_gen("max. 
dark space: %lld (%lld KiB, %lld MiB)", x, x >> 10, x >> 20); - dbg_msg("maximum bud bytes: %lld (%lld KiB, %lld MiB)", + dbg_gen("maximum bud bytes: %lld (%lld KiB, %lld MiB)", c->max_bud_bytes, c->max_bud_bytes >> 10, c->max_bud_bytes >> 20); - dbg_msg("BG commit bud bytes: %lld (%lld KiB, %lld MiB)", + dbg_gen("BG commit bud bytes: %lld (%lld KiB, %lld MiB)", c->bg_bud_bytes, c->bg_bud_bytes >> 10, c->bg_bud_bytes >> 20); - dbg_msg("current bud bytes %lld (%lld KiB, %lld MiB)", + dbg_gen("current bud bytes %lld (%lld KiB, %lld MiB)", c->bud_bytes, c->bud_bytes >> 10, c->bud_bytes >> 20); - dbg_msg("max. seq. number: %llu", c->max_sqnum); - dbg_msg("commit number: %llu", c->cmt_no); + dbg_gen("max. seq. number: %llu", c->max_sqnum); + dbg_gen("commit number: %llu", c->cmt_no); return 0; @@@ -1561,9 -1571,10 +1561,9 @@@ static int ubifs_remount_rw(struct ubif if (c->rw_incompat) { ubifs_err("the file-system is not R/W-compatible"); - ubifs_msg("on-flash format version is w%d/r%d, but software " - "only supports up to version w%d/r%d", c->fmt_version, - c->ro_compat_version, UBIFS_FORMAT_VERSION, - UBIFS_RO_COMPAT_VERSION); + ubifs_msg("on-flash format version is w%d/r%d, but software only supports up to version w%d/r%d", + c->fmt_version, c->ro_compat_version, + UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION); return -EROFS; } @@@ -1572,6 -1583,12 +1572,12 @@@ c->remounting_rw = 1; c->ro_mount = 0; + if (c->space_fixup) { + err = ubifs_fixup_free_space(c); + if (err) + return err; + } + err = check_free_space(c); if (err) goto out; @@@ -1688,12 -1705,6 +1694,6 @@@ err = dbg_check_space_info(c); } - if (c->space_fixup) { - err = ubifs_fixup_free_space(c); - if (err) - goto out; - } - mutex_unlock(&c->umount_mutex); return err; @@@ -1824,8 -1835,8 +1824,8 @@@ static void ubifs_put_super(struct supe * next mount, so we just print a message and * continue to unmount normally. */ - ubifs_err("failed to write master node, " - "error %d", err); + ubifs_err("failed to write master node, error %d", + err); } else { for (i = 0; i < c->jhead_cnt; i++) /* Make sure write-buffer timers are canceled */ @@@ -2118,8 -2129,8 +2118,8 @@@ static struct dentry *ubifs_mount(struc */ ubi = open_ubi(name, UBI_READONLY); if (IS_ERR(ubi)) { - dbg_err("cannot open \"%s\", error %d", - name, (int)PTR_ERR(ubi)); + ubifs_err("cannot open \"%s\", error %d", + name, (int)PTR_ERR(ubi)); return ERR_CAST(ubi); } @@@ -2247,7 -2258,8 +2247,7 @@@ static int __init ubifs_init(void * UBIFS_BLOCK_SIZE. It is assumed that both are powers of 2. */ if (PAGE_CACHE_SIZE < UBIFS_BLOCK_SIZE) { - ubifs_err("VFS page cache size is %u bytes, but UBIFS requires" - " at least 4096 bytes", + ubifs_err("VFS page cache size is %u bytes, but UBIFS requires at least 4096 bytes", (unsigned int)PAGE_CACHE_SIZE); return -EINVAL; } diff --combined fs/ubifs/ubifs.h index 8e3596c08cea,a39fce5c107e..fad0807b8b2a --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@@ -42,15 -42,16 +42,15 @@@ #define UBIFS_VERSION 1 /* Normal UBIFS messages */ -#define ubifs_msg(fmt, ...) \ - printk(KERN_NOTICE "UBIFS: " fmt "\n", ##__VA_ARGS__) +#define ubifs_msg(fmt, ...) pr_notice("UBIFS: " fmt "\n", ##__VA_ARGS__) /* UBIFS error messages */ -#define ubifs_err(fmt, ...) \ - printk(KERN_ERR "UBIFS error (pid %d): %s: " fmt "\n", current->pid, \ +#define ubifs_err(fmt, ...) \ + pr_err("UBIFS error (pid %d): %s: " fmt "\n", current->pid, \ __func__, ##__VA_ARGS__) /* UBIFS warning messages */ -#define ubifs_warn(fmt, ...) 
\ - printk(KERN_WARNING "UBIFS warning (pid %d): %s: " fmt "\n", \ - current->pid, __func__, ##__VA_ARGS__) +#define ubifs_warn(fmt, ...) \ + pr_warn("UBIFS warning (pid %d): %s: " fmt "\n", \ + current->pid, __func__, ##__VA_ARGS__) /* UBIFS file system VFS magic number */ #define UBIFS_SUPER_MAGIC 0x24051905 @@@ -83,6 -84,9 +83,6 @@@ #define INUM_WARN_WATERMARK 0xFFF00000 #define INUM_WATERMARK 0xFFFFFF00 -/* Largest key size supported in this implementation */ -#define CUR_MAX_KEY_LEN UBIFS_SK_LEN - /* Maximum number of entries in each LPT (LEB category) heap */ #define LPT_HEAP_SZ 256 @@@ -273,10 -277,10 +273,10 @@@ struct ubifs_old_idx /* The below union makes it easier to deal with keys */ union ubifs_key { - uint8_t u8[CUR_MAX_KEY_LEN]; - uint32_t u32[CUR_MAX_KEY_LEN/4]; - uint64_t u64[CUR_MAX_KEY_LEN/8]; - __le32 j32[CUR_MAX_KEY_LEN/4]; + uint8_t u8[UBIFS_SK_LEN]; + uint32_t u32[UBIFS_SK_LEN/4]; + uint64_t u64[UBIFS_SK_LEN/8]; + __le32 j32[UBIFS_SK_LEN/4]; }; /** @@@ -649,6 -653,8 +649,6 @@@ typedef int (*ubifs_lpt_scan_callback)( * @avail: number of bytes available in the write-buffer * @used: number of used bytes in the write-buffer * @size: write-buffer size (in [@c->min_io_size, @c->max_write_size] range) - * @dtype: type of data stored in this LEB (%UBI_LONGTERM, %UBI_SHORTTERM, - * %UBI_UNKNOWN) * @jhead: journal head the mutex belongs to (note, needed only to shut lockdep * up by 'mutex_lock_nested()). * @sync_callback: write-buffer synchronization callback @@@ -682,6 -688,7 +682,6 @@@ struct ubifs_wbuf int avail; int used; int size; - int dtype; int jhead; int (*sync_callback)(struct ubifs_info *c, int lnum, int free, int pad); struct mutex io_mutex; @@@ -758,9 -765,6 +758,9 @@@ struct ubifs_zbranch * @offs: offset of the corresponding indexing node * @len: length of the corresponding indexing node * @zbranch: array of znode branches (@c->fanout elements) + * + * Note! The @lnum, @offs, and @len fields are not really needed - we have them + * only for internal consistency check. They could be removed to save some RAM. 
*/ struct ubifs_znode { struct ubifs_znode *parent; @@@ -771,9 -775,9 +771,9 @@@ int child_cnt; int iip; int alt; -#ifdef CONFIG_UBIFS_FS_DEBUG - int lnum, offs, len; -#endif + int lnum; + int offs; + int len; struct ubifs_zbranch zbranch[]; }; @@@ -904,6 -908,7 +904,7 @@@ struct ubifs_budget_req * @dnext: next orphan to delete * @inum: inode number * @new: %1 => added since the last commit, otherwise %0 + * @del: %1 => delete pending, otherwise %0 */ struct ubifs_orphan { struct rb_node rb; @@@ -913,6 -918,7 +914,7 @@@ struct ubifs_orphan *dnext; ino_t inum; int new; + unsigned del:1; }; /** @@@ -1446,7 -1452,9 +1448,7 @@@ struct ubifs_info struct rb_root size_tree; struct ubifs_mount_opts mount_opts; -#ifdef CONFIG_UBIFS_FS_DEBUG struct ubifs_debug_info *dbg; -#endif }; extern struct list_head ubifs_infos; @@@ -1468,20 -1476,22 +1470,20 @@@ void ubifs_ro_mode(struct ubifs_info *c int ubifs_leb_read(const struct ubifs_info *c, int lnum, void *buf, int offs, int len, int even_ebadmsg); int ubifs_leb_write(struct ubifs_info *c, int lnum, const void *buf, int offs, - int len, int dtype); -int ubifs_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len, - int dtype); + int len); +int ubifs_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len); int ubifs_leb_unmap(struct ubifs_info *c, int lnum); -int ubifs_leb_map(struct ubifs_info *c, int lnum, int dtype); +int ubifs_leb_map(struct ubifs_info *c, int lnum); int ubifs_is_mapped(const struct ubifs_info *c, int lnum); int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len); -int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs, - int dtype); +int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs); int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf); int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len, int lnum, int offs); int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len, int lnum, int offs); int ubifs_write_node(struct ubifs_info *c, void *node, int len, int lnum, - int offs, int dtype); + int offs); int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, int offs, int quiet, int must_chk_crc); void ubifs_prepare_node(struct ubifs_info *c, void *buf, int len, int pad); diff --combined include/linux/mm.h index 01f614b2fbc7,d0493f6064f3..e5f83b1602f3 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@@ -1509,6 -1509,8 +1509,8 @@@ int vm_insert_pfn(struct vm_area_struc unsigned long pfn); int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); + int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len); + struct page *follow_page(struct vm_area_struct *, unsigned long address, unsigned int foll_flags); @@@ -1628,22 -1630,5 +1630,22 @@@ extern void copy_user_huge_page(struct unsigned int pages_per_huge_page); #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ +#ifdef CONFIG_DEBUG_PAGEALLOC +extern unsigned int _debug_guardpage_minorder; + +static inline unsigned int debug_guardpage_minorder(void) +{ + return _debug_guardpage_minorder; +} + +static inline bool page_is_guard(struct page *page) +{ + return test_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); +} +#else +static inline unsigned int debug_guardpage_minorder(void) { return 0; } +static inline bool page_is_guard(struct page *page) { return false; } +#endif /* CONFIG_DEBUG_PAGEALLOC */ + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --combined 
include/linux/writeback.h index 34a005515fef,7e85d454c163..7570cdb66e3c --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@@ -138,6 -138,8 +138,6 @@@ extern int vm_highmem_is_dirtyable extern int block_dump; extern int laptop_mode; -extern unsigned long determine_dirtyable_memory(void); - extern int dirty_background_ratio_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); @@@ -193,6 -195,8 +193,8 @@@ void writeback_set_ratelimit(void) void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end); + void account_page_redirty(struct page *page); + /* pdflush.c */ extern int nr_pdflush_threads; /* Global so it can be exported to sysctl read-only. */ diff --combined kernel/sched.c index 525ad5204d21,d08c9f4b9fce..6ab532c453a8 --- a/kernel/sched.c +++ b/kernel/sched.c @@@ -2889,8 -2889,10 +2889,10 @@@ static void try_to_wake_up_local(struc { struct rq *rq = task_rq(p); - BUG_ON(rq != this_rq()); - BUG_ON(p == current); + if (WARN_ON_ONCE(rq != this_rq()) || + WARN_ON_ONCE(p == current)) + return; + lockdep_assert_held(&rq->lock); if (!raw_spin_trylock(&p->pi_lock)) { @@@ -5294,7 -5296,6 +5296,7 @@@ int can_nice(const struct task_struct * return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || capable(CAP_SYS_NICE)); } +EXPORT_SYMBOL_GPL(can_nice); #ifdef __ARCH_WANT_SYS_NICE diff --combined kernel/signal.c index dc870f47b444,3ecf57489376..6c4cc9433ee4 --- a/kernel/signal.c +++ b/kernel/signal.c @@@ -481,6 -481,9 +481,9 @@@ flush_signal_handlers(struct task_struc if (force_default || ka->sa.sa_handler != SIG_IGN) ka->sa.sa_handler = SIG_DFL; ka->sa.sa_flags = 0; + #ifdef __ARCH_HAS_SA_RESTORER + ka->sa.sa_restorer = NULL; + #endif sigemptyset(&ka->sa.sa_mask); ka++; } @@@ -1267,7 -1270,6 +1270,7 @@@ struct sighand_struct *__lock_task_sigh return sighand; } +EXPORT_SYMBOL_GPL(__lock_task_sighand); /* * send signal info to all the members of a group @@@ -2788,7 -2790,7 +2791,7 @@@ do_send_specific(pid_t tgid, pid_t pid static int do_tkill(pid_t tgid, pid_t pid, int sig) { - struct siginfo info; + struct siginfo info = {}; info.si_signo = sig; info.si_errno = 0; diff --combined kernel/trace/trace.c index 53c3b3290e8b,0ec6c349bbc9..0c99b156cb5a --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@@ -652,7 -652,7 +652,7 @@@ __update_max_tr(struct trace_array *tr void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) { - struct ring_buffer *buf = tr->buffer; + struct ring_buffer *buf; if (trace_stop_count) return; @@@ -664,6 -664,7 +664,7 @@@ } arch_spin_lock(&ftrace_max_lock); + buf = tr->buffer; tr->buffer = max_tr.buffer; max_tr.buffer = buf; @@@ -2635,11 -2636,25 +2636,25 @@@ static int set_tracer_option(struct tra return -EINVAL; } - static void set_tracer_flags(unsigned int mask, int enabled) + /* Some tracers require overwrite to stay enabled */ + int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set) + { + if (tracer->enabled && (mask & TRACE_ITER_OVERWRITE) && !set) + return -1; + + return 0; + } + + int set_tracer_flag(unsigned int mask, int enabled) { /* do nothing if flag is already set */ if (!!(trace_flags & mask) == !!enabled) - return; + return 0; + + /* Give the tracer a chance to approve the change */ + if (current_trace->flag_changed) + if (current_trace->flag_changed(current_trace, mask, !!enabled)) + return -EINVAL; if (enabled) trace_flags |= mask; @@@ -2649,8 -2664,14 +2664,14 @@@ if (mask == TRACE_ITER_RECORD_CMD) trace_event_enable_cmd_record(enabled); - if 
(mask == TRACE_ITER_OVERWRITE) + if (mask == TRACE_ITER_OVERWRITE) { ring_buffer_change_overwrite(global_trace.buffer, enabled); + #ifdef CONFIG_TRACER_MAX_TRACE + ring_buffer_change_overwrite(max_tr.buffer, enabled); + #endif + } + + return 0; } static ssize_t @@@ -2660,7 -2681,7 +2681,7 @@@ tracing_trace_options_write(struct fil char buf[64]; char *cmp; int neg = 0; - int ret; + int ret = 0; int i; if (cnt >= sizeof(buf)) @@@ -2677,21 -2698,23 +2698,23 @@@ cmp += 2; } + mutex_lock(&trace_types_lock); + for (i = 0; trace_options[i]; i++) { if (strcmp(cmp, trace_options[i]) == 0) { - set_tracer_flags(1 << i, !neg); + ret = set_tracer_flag(1 << i, !neg); break; } } /* If no option could be set, test the specific tracer options */ - if (!trace_options[i]) { - mutex_lock(&trace_types_lock); + if (!trace_options[i]) ret = set_tracer_option(current_trace, cmp, neg); - mutex_unlock(&trace_types_lock); - if (ret) - return ret; - } + + mutex_unlock(&trace_types_lock); + + if (ret) + return ret; *ppos += cnt; @@@ -3015,6 -3038,9 +3038,9 @@@ static int tracing_set_tracer(const cha goto out; trace_branch_disable(); + + current_trace->enabled = false; + if (current_trace && current_trace->reset) current_trace->reset(tr); if (current_trace && current_trace->use_max_tr) { @@@ -3044,6 -3070,7 +3070,7 @@@ goto out; } + current_trace->enabled = true; trace_branch_enable(tr); out: mutex_unlock(&trace_types_lock); @@@ -4378,7 -4405,13 +4405,13 @@@ trace_options_core_write(struct file *f if (val != 0 && val != 1) return -EINVAL; - set_tracer_flags(1 << index, val); + + mutex_lock(&trace_types_lock); + ret = set_tracer_flag(1 << index, val); + mutex_unlock(&trace_types_lock); + + if (ret < 0) + return ret; *ppos += cnt; @@@ -4393,7 -4426,7 +4426,7 @@@ static const struct file_operations tra }; struct dentry *trace_create_file(const char *name, - mode_t mode, + umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops) @@@ -4530,6 -4563,8 +4563,8 @@@ static __init int tracer_init_debugfs(v trace_access_lock_init(); d_tracer = tracing_init_dentry(); + if (!d_tracer) + return 0; trace_create_file("tracing_enabled", 0644, d_tracer, &global_trace, &tracing_ctrl_fops); @@@ -4663,36 -4698,32 +4698,32 @@@ void trace_init_global_iter(struct trac iter->cpu_file = TRACE_PIPE_ALL_CPU; } - static void - __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) + void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { - static arch_spinlock_t ftrace_dump_lock = - (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; /* use static because iter can be a bit big for the stack */ static struct trace_iterator iter; + static atomic_t dump_running; unsigned int old_userobj; - static int dump_ran; unsigned long flags; int cnt = 0, cpu; - /* only one dump */ - local_irq_save(flags); - arch_spin_lock(&ftrace_dump_lock); - if (dump_ran) - goto out; - - dump_ran = 1; + /* Only allow one dump user at a time. */ + if (atomic_inc_return(&dump_running) != 1) { + atomic_dec(&dump_running); + return; + } + /* + * Always turn off tracing when we dump. + * We don't need to show trace output of what happens + * between multiple crashes. + * + * If the user does a sysrq-z, then they can re-enable + * tracing with echo 1 > tracing_on. + */ tracing_off(); - /* Did function tracer already get disabled? 
*/ - if (ftrace_is_dead()) { - printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n"); - printk("# MAY BE MISSING FUNCTION EVENTS\n"); - } - - if (disable_tracing) - ftrace_kill(); + local_irq_save(flags); trace_init_global_iter(&iter); @@@ -4725,6 -4756,12 +4756,12 @@@ printk(KERN_TRACE "Dumping ftrace buffer:\n"); + /* Did function tracer already get disabled? */ + if (ftrace_is_dead()) { + printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n"); + printk("# MAY BE MISSING FUNCTION EVENTS\n"); + } + /* * We need to stop all tracing on all CPUS to read the * the next buffer. This is a bit expensive, but is @@@ -4763,26 -4800,15 +4800,15 @@@ printk(KERN_TRACE "---------------------------------\n"); out_enable: - /* Re-enable tracing if requested */ - if (!disable_tracing) { - trace_flags |= old_userobj; + trace_flags |= old_userobj; - for_each_tracing_cpu(cpu) { - atomic_dec(&iter.tr->data[cpu]->disabled); - } - tracing_on(); + for_each_tracing_cpu(cpu) { + atomic_dec(&iter.tr->data[cpu]->disabled); } - - out: - arch_spin_unlock(&ftrace_dump_lock); + atomic_dec(&dump_running); local_irq_restore(flags); } - - /* By default: disable tracing after the dump */ - void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) - { - __ftrace_dump(true, oops_dump_mode); - } + EXPORT_SYMBOL_GPL(ftrace_dump); __init static int tracer_alloc_buffers(void) { diff --combined kernel/trace/trace.h index 0154c0b850de,c3c3f6b398d3..2f2fd4683378 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@@ -271,10 -271,14 +271,14 @@@ struct tracer enum print_line_t (*print_line)(struct trace_iterator *iter); /* If you handled the flag setting, return 0 */ int (*set_flag)(u32 old_flags, u32 bit, int set); + /* Return 0 if OK with change, else return non-zero */ + int (*flag_changed)(struct tracer *tracer, + u32 mask, int set); struct tracer *next; struct tracer_flags *flags; int print_max; int use_max_tr; + bool enabled; }; @@@ -312,7 -316,7 +316,7 @@@ void tracing_reset_current(int cpu) void tracing_reset_current_online_cpus(void); int tracing_open_generic(struct inode *inode, struct file *filp); struct dentry *trace_create_file(const char *name, - mode_t mode, + umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops); @@@ -815,6 -819,9 +819,9 @@@ extern struct list_head ftrace_events extern const char *__start___trace_bprintk_fmt[]; extern const char *__stop___trace_bprintk_fmt[]; + int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); + int set_tracer_flag(unsigned int mask, int enabled); + #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ extern struct ftrace_event_call \ diff --combined mm/hugetlb.c index bdb5f174ed21,70b473338da0..581d98554637 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@@ -633,7 -633,6 +633,7 @@@ static void free_huge_page(struct page h->surplus_huge_pages--; h->surplus_huge_pages_node[nid]--; } else { + arch_clear_hugepage_flags(page); enqueue_huge_page(h, page); } spin_unlock(&hugetlb_lock); @@@ -1678,9 -1677,9 +1678,9 @@@ static void __init hugetlb_sysfs_init(v /* * node_hstate/s - associate per node hstate attributes, via their kobjects, - * with node sysdevs in node_devices[] using a parallel array. The array - * index of a node sysdev or _hstate == node id. - * This is here to avoid any static dependency of the node sysdev driver, in + * with node devices in node_devices[] using a parallel array. The array + * index of a node device or _hstate == node id. 
+ * This is here to avoid any static dependency of the node device driver, in * the base kernel, on the hugetlb module. */ struct node_hstate { @@@ -1690,7 -1689,7 +1690,7 @@@ struct node_hstate node_hstates[MAX_NUMNODES]; /* - * A subset of global hstate attributes for node sysdevs + * A subset of global hstate attributes for node devices */ static struct attribute *per_node_hstate_attrs[] = { &nr_hugepages_attr.attr, @@@ -1704,7 -1703,7 +1704,7 @@@ static struct attribute_group per_node_ }; /* - * kobj_to_node_hstate - lookup global hstate for node sysdev hstate attr kobj. + * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj. * Returns node id via non-NULL nidp. */ static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) @@@ -1727,13 -1726,13 +1727,13 @@@ } /* - * Unregister hstate attributes from a single node sysdev. + * Unregister hstate attributes from a single node device. * No-op if no hstate attributes attached. */ void hugetlb_unregister_node(struct node *node) { struct hstate *h; - struct node_hstate *nhs = &node_hstates[node->sysdev.id]; + struct node_hstate *nhs = &node_hstates[node->dev.id]; if (!nhs->hugepages_kobj) return; /* no hstate attributes */ @@@ -1749,7 -1748,7 +1749,7 @@@ } /* - * hugetlb module exit: unregister hstate attributes from node sysdevs + * hugetlb module exit: unregister hstate attributes from node devices * that have them. */ static void hugetlb_unregister_all_nodes(void) @@@ -1757,7 -1756,7 +1757,7 @@@ int nid; /* - * disable node sysdev registrations. + * disable node device registrations. */ register_hugetlbfs_with_node(NULL, NULL); @@@ -1769,20 -1768,20 +1769,20 @@@ } /* - * Register hstate attributes for a single node sysdev. + * Register hstate attributes for a single node device. * No-op if attributes already registered. */ void hugetlb_register_node(struct node *node) { struct hstate *h; - struct node_hstate *nhs = &node_hstates[node->sysdev.id]; + struct node_hstate *nhs = &node_hstates[node->dev.id]; int err; if (nhs->hugepages_kobj) return; /* already allocated */ nhs->hugepages_kobj = kobject_create_and_add("hugepages", - &node->sysdev.kobj); + &node->dev.kobj); if (!nhs->hugepages_kobj) return; @@@ -1793,7 -1792,7 +1793,7 @@@ if (err) { printk(KERN_ERR "Hugetlb: Unable to add hstate %s" " for node %d\n", - h->name, node->sysdev.id); + h->name, node->dev.id); hugetlb_unregister_node(node); break; } @@@ -1802,8 -1801,8 +1802,8 @@@ /* * hugetlb init time: register hstate attributes for all registered node - * sysdevs of nodes that have memory. All on-line nodes should have - * registered their associated sysdev by this time. + * devices of nodes that have memory. All on-line nodes should have + * registered their associated device by this time. */ static void hugetlb_register_all_nodes(void) { @@@ -1811,12 -1810,12 +1811,12 @@@ for_each_node_state(nid, N_HIGH_MEMORY) { struct node *node = &node_devices[nid]; - if (node->sysdev.id == nid) + if (node->dev.id == nid) hugetlb_register_node(node); } /* - * Let the node sysdev driver know we're here so it can + * Let the node device driver know we're here so it can * [un]register hstate attributes on node hotplug. */ register_hugetlbfs_with_node(hugetlb_register_node, @@@ -2093,8 -2092,12 +2093,12 @@@ int hugetlb_report_node_meminfo(int nid /* Return the number pages of memory we physically have, in PAGE_SIZE units. 
*/ unsigned long hugetlb_total_pages(void) { - struct hstate *h = &default_hstate; - return h->nr_huge_pages * pages_per_huge_page(h); + struct hstate *h; + unsigned long nr_total_pages = 0; + + for_each_hstate(h) + nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h); + return nr_total_pages; } static int hugetlb_acct_memory(struct hstate *h, long delta) @@@ -2886,7 -2889,17 +2890,17 @@@ int follow_hugetlb_page(struct mm_struc break; } - if (absent || + /* + * We need call hugetlb_fault for both hugepages under migration + * (in which case hugetlb_fault waits for the migration,) and + * hwpoisoned hugepages (in which case we need to prevent the + * caller from accessing to them.) In order to do this, we use + * here is_swap_pte instead of is_hugetlb_entry_migration and + * is_hugetlb_entry_hwpoisoned. This is because it simply covers + * both cases, and because we can't follow correct pages + * directly from any kind of swap entries. + */ + if (absent || is_swap_pte(huge_ptep_get(pte)) || ((flags & FOLL_WRITE) && !pte_write(huge_ptep_get(pte)))) { int ret; diff --combined mm/memory.c index bf6777555ab8,d5f913b9fae4..f0b43f612073 --- a/mm/memory.c +++ b/mm/memory.c @@@ -1401,7 -1401,6 +1401,7 @@@ unsigned long zap_page_range(struct vm_ tlb_finish_mmu(&tlb, address, end); return end; } +EXPORT_SYMBOL_GPL(zap_page_range); /** * zap_vma_ptes - remove ptes mapping the vma @@@ -2310,6 -2309,53 +2310,53 @@@ int remap_pfn_range(struct vm_area_stru } EXPORT_SYMBOL(remap_pfn_range); + /** + * vm_iomap_memory - remap memory to userspace + * @vma: user vma to map to + * @start: start of area + * @len: size of area + * + * This is a simplified io_remap_pfn_range() for common driver use. The + * driver just needs to give us the physical memory range to be mapped, + * we'll figure out the rest from the vma information. + * + * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get + * whatever write-combining details or similar. + */ + int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len) + { + unsigned long vm_len, pfn, pages; + + /* Check that the physical memory area passed in looks valid */ + if (start + len < start) + return -EINVAL; + /* + * You *really* shouldn't map things that aren't page-aligned, + * but we've historically allowed it because IO memory might + * just have smaller alignment. + */ + len += start & ~PAGE_MASK; + pfn = start >> PAGE_SHIFT; + pages = (len + ~PAGE_MASK) >> PAGE_SHIFT; + if (pfn + pages < pfn) + return -EINVAL; + + /* We start the mapping 'vm_pgoff' pages into the area */ + if (vma->vm_pgoff > pages) + return -EINVAL; + pfn += vma->vm_pgoff; + pages -= vma->vm_pgoff; + + /* Can we fit all of the mapping? 
*/ + vm_len = vma->vm_end - vma->vm_start; + if (vm_len >> PAGE_SHIFT > pages) + return -EINVAL; + + /* Ok, let it rip */ + return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot); + } + EXPORT_SYMBOL(vm_iomap_memory); + static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, unsigned long end, pte_fn_t fn, void *data) @@@ -3493,9 -3539,8 +3540,9 @@@ retry barrier(); if (pmd_trans_huge(orig_pmd)) { - if (flags & FAULT_FLAG_WRITE && - !pmd_write(orig_pmd) && + unsigned int dirty = flags & FAULT_FLAG_WRITE; + + if (dirty && !pmd_write(orig_pmd) && !pmd_trans_splitting(orig_pmd)) { ret = do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd); @@@ -3507,9 -3552,6 +3554,9 @@@ if (unlikely(ret & VM_FAULT_OOM)) goto retry; return ret; + } else { + huge_pmd_set_accessed(mm, vma, address, pmd, + orig_pmd, dirty); } return 0; } diff --combined mm/memory_hotplug.c index 77ad30613a3d,09d87b709179..872794f13c09 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@@ -515,19 -515,20 +515,20 @@@ int __ref online_pages(unsigned long pf zone->present_pages += onlined_pages; zone->zone_pgdat->node_present_pages += onlined_pages; - if (need_zonelists_rebuild) - build_all_zonelists(zone); - else - zone_pcp_update(zone); + if (onlined_pages) { + node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); + if (need_zonelists_rebuild) + build_all_zonelists(zone); + else + zone_pcp_update(zone); + } mutex_unlock(&zonelists_mutex); init_per_zone_wmark_min(); - if (onlined_pages) { + if (onlined_pages) kswapd_run(zone_to_nid(zone)); - node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); - } vm_total_pages = nr_free_pagecache_pages(); @@@ -895,7 -896,7 +896,7 @@@ static int __ref offline_pages(unsigne nr_pages = end_pfn - start_pfn; /* set above range as isolated */ - ret = start_isolate_page_range(start_pfn, end_pfn); + ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); if (ret) goto out; @@@ -960,7 -961,7 +961,7 @@@ repeat We cannot do rollback at this point. */ offline_isolated_pages(start_pfn, end_pfn); /* reset pagetype flags and makes migrate type to be MOVABLE */ - undo_isolate_page_range(start_pfn, end_pfn); + undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); /* removal success */ zone->present_pages -= offlined_pages; zone->zone_pgdat->node_present_pages -= offlined_pages; @@@ -985,7 -986,7 +986,7 @@@ failed_removal start_pfn, end_pfn); memory_notify(MEM_CANCEL_OFFLINE, &arg); /* pushback to free area */ - undo_isolate_page_range(start_pfn, end_pfn); + undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); out: unlock_memory_hotplug(); diff --combined mm/page-writeback.c index a4f563889184,ea3f83b21197..8861ad46ac18 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@@ -129,67 -129,6 +129,67 @@@ unsigned long global_dirty_limit */ static struct prop_descriptor vm_completions; +/* + * Work out the current dirty-memory clamping and background writeout + * thresholds. + * + * The main aim here is to lower them aggressively if there is a lot of mapped + * memory around. To avoid stressing page reclaim with lots of unreclaimable + * pages. It is better to clamp down on writers than to start swapping, and + * performing lots of scanning. + * + * We only allow 1/2 of the currently-unmapped memory to be dirtied. + * + * We don't permit the clamping level to fall below 5% - that is getting rather + * excessive. + * + * We make sure that the background writeout level is below the adjusted + * clamping level. 
+ */ +static unsigned long highmem_dirtyable_memory(unsigned long total) +{ +#ifdef CONFIG_HIGHMEM + int node; + unsigned long x = 0; + + for_each_node_state(node, N_HIGH_MEMORY) { + struct zone *z = + &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; + + x += zone_page_state(z, NR_FREE_PAGES) + + zone_reclaimable_pages(z) - z->dirty_balance_reserve; + } + /* + * Make sure that the number of highmem pages is never larger + * than the number of the total dirtyable memory. This can only + * occur in very strange VM situations but we want to make sure + * that this does not occur. + */ + return min(x, total); +#else + return 0; +#endif +} + +/** + * determine_dirtyable_memory - amount of memory that may be used + * + * Returns the numebr of pages that can currently be freed and used + * by the kernel for direct mappings. + */ +static unsigned long determine_dirtyable_memory(void) +{ + unsigned long x; + + x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages() - + dirty_balance_reserve; + + if (!vm_highmem_is_dirtyable) + x -= highmem_dirtyable_memory(x); + + return x + 1; /* Ensure that we never return 0 */ +} + /* * couple the period to the dirty_ratio: * @@@ -257,6 -196,7 +257,6 @@@ int dirty_ratio_handler(struct ctl_tabl return ret; } - int dirty_bytes_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@@ -351,6 -291,67 +351,6 @@@ int bdi_set_max_ratio(struct backing_de } EXPORT_SYMBOL(bdi_set_max_ratio); -/* - * Work out the current dirty-memory clamping and background writeout - * thresholds. - * - * The main aim here is to lower them aggressively if there is a lot of mapped - * memory around. To avoid stressing page reclaim with lots of unreclaimable - * pages. It is better to clamp down on writers than to start swapping, and - * performing lots of scanning. - * - * We only allow 1/2 of the currently-unmapped memory to be dirtied. - * - * We don't permit the clamping level to fall below 5% - that is getting rather - * excessive. - * - * We make sure that the background writeout level is below the adjusted - * clamping level. - */ - -static unsigned long highmem_dirtyable_memory(unsigned long total) -{ -#ifdef CONFIG_HIGHMEM - int node; - unsigned long x = 0; - - for_each_node_state(node, N_HIGH_MEMORY) { - struct zone *z = - &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; - - x += zone_page_state(z, NR_FREE_PAGES) + - zone_reclaimable_pages(z); - } - /* - * Make sure that the number of highmem pages is never larger - * than the number of the total dirtyable memory. This can only - * occur in very strange VM situations but we want to make sure - * that this does not occur. - */ - return min(x, total); -#else - return 0; -#endif -} - -/** - * determine_dirtyable_memory - amount of memory that may be used - * - * Returns the numebr of pages that can currently be freed and used - * by the kernel for direct mappings. 
- */ -unsigned long determine_dirtyable_memory(void) -{ - unsigned long x; - - x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages(); - - if (!vm_highmem_is_dirtyable) - x -= highmem_dirtyable_memory(x); - - return x + 1; /* Ensure that we never return 0 */ -} - static unsigned long dirty_freerun_ceiling(unsigned long thresh, unsigned long bg_thresh) { @@@ -1799,6 -1800,24 +1799,24 @@@ int __set_page_dirty_nobuffers(struct p } EXPORT_SYMBOL(__set_page_dirty_nobuffers); + /* + * Call this whenever redirtying a page, to de-account the dirty counters + * (NR_DIRTIED, BDI_DIRTIED, tsk->nr_dirtied), so that they match the written + * counters (NR_WRITTEN, BDI_WRITTEN) in long term. The mismatches will lead to + * systematic errors in balanced_dirty_ratelimit and the dirty pages position + * control. + */ + void account_page_redirty(struct page *page) + { + struct address_space *mapping = page->mapping; + if (mapping && mapping_cap_account_dirty(mapping)) { + current->nr_dirtied--; + dec_zone_page_state(page, NR_DIRTIED); + dec_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED); + } + } + EXPORT_SYMBOL(account_page_redirty); + /* * When a writepage implementation decides that it doesn't want to write this * page for some reason, it should redirty the locked page via @@@ -1807,6 -1826,7 +1825,7 @@@ int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) { wbc->pages_skipped++; + account_page_redirty(page); return __set_page_dirty_nobuffers(page); } EXPORT_SYMBOL(redirty_page_for_writepage); diff --combined mm/page_alloc.c index ad8cb2fce163,5c028e2af06a..dceb1be6af34 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@@ -57,8 -57,6 +57,8 @@@ #include #include #include +#include +#include #include #include @@@ -98,14 -96,6 +98,14 @@@ EXPORT_SYMBOL(node_states) unsigned long totalram_pages __read_mostly; unsigned long totalreserve_pages __read_mostly; +/* + * When calculating the number of globally allowed dirty pages, there + * is a certain number of per-zone reserves that should not be + * considered dirtyable memory. This is the sum of those reserves + * over all existing zones that contribute dirtyable memory. 
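+ *
+ * The value is recomputed by calculate_totalreserve_pages() whenever
+ * the per-zone lowmem reserves or watermarks change.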
+ */ +unsigned long dirty_balance_reserve __read_mostly; + int percpu_pagelist_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; @@@ -137,13 -127,6 +137,13 @@@ void pm_restrict_gfp_mask(void saved_gfp_mask = gfp_allowed_mask; gfp_allowed_mask &= ~GFP_IOFS; } + +bool pm_suspended_storage(void) +{ + if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS) + return false; + return true; +} #endif /* CONFIG_PM_SLEEP */ #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE @@@ -420,37 -403,6 +420,37 @@@ static inline void prep_zero_page(struc clear_highpage(page + i); } +#ifdef CONFIG_DEBUG_PAGEALLOC +unsigned int _debug_guardpage_minorder; + +static int __init debug_guardpage_minorder_setup(char *buf) +{ + unsigned long res; + + if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { + printk(KERN_ERR "Bad debug_guardpage_minorder value\n"); + return 0; + } + _debug_guardpage_minorder = res; + printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res); + return 0; +} +__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); + +static inline void set_page_guard_flag(struct page *page) +{ + __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); +} + +static inline void clear_page_guard_flag(struct page *page) +{ + __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); +} +#else +static inline void set_page_guard_flag(struct page *page) { } +static inline void clear_page_guard_flag(struct page *page) { } +#endif + static inline void set_page_order(struct page *page, int order) { set_page_private(page, order); @@@ -508,11 -460,6 +508,11 @@@ static inline int page_is_buddy(struct if (page_zone_id(page) != page_zone_id(buddy)) return 0; + if (page_is_guard(buddy) && page_order(buddy) == order) { + VM_BUG_ON(page_count(buddy) != 0); + return 1; + } + if (PageBuddy(buddy) && page_order(buddy) == order) { VM_BUG_ON(page_count(buddy) != 0); return 1; @@@ -536,10 -483,10 +536,10 @@@ * free pages of length of (1 << order) and marked with _mapcount -2. Page's * order is recorded in page_private(page) field. * So when we are allocating or freeing one, we can derive the state of the - * other. That is, if we allocate a small block, and both were - * free, the remainder of the region must be split into blocks. + * other. That is, if we allocate a small block, and both were + * free, the remainder of the region must be split into blocks. * If a block is freed, and its buddy is also free, then this - * triggers coalescing into a block of larger size. + * triggers coalescing into a block of larger size. * * -- wli */ @@@ -569,19 -516,11 +569,19 @@@ static inline void __free_one_page(stru buddy = page + (buddy_idx - page_idx); if (!page_is_buddy(page, buddy, order)) break; - - /* Our buddy is free, merge with it and move up one order. */ - list_del(&buddy->lru); - zone->free_area[order].nr_free--; - rmv_page_order(buddy); + /* + * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, + * merge with it and move up one order. 
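+	 *
+	 * A guard page is not on any free list, so instead of unlinking it
+	 * we only clear its guard flag and credit its pages back to
+	 * NR_FREE_PAGES before merging.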
+ */ + if (page_is_guard(buddy)) { + clear_page_guard_flag(buddy); + set_page_private(page, 0); + __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); + } else { + list_del(&buddy->lru); + zone->free_area[order].nr_free--; + rmv_page_order(buddy); + } combined_idx = buddy_idx & page_idx; page = page + (combined_idx - page_idx); page_idx = combined_idx; @@@ -715,7 -654,7 +715,7 @@@ static bool free_pages_prepare(struct p int i; int bad = 0; - trace_mm_page_free_direct(page, order); + trace_mm_page_free(page, order); kmemcheck_free_shadow(page, order); if (PageAnon(page)) @@@ -781,24 -720,6 +781,24 @@@ void __meminit __free_pages_bootmem(str } } +#ifdef CONFIG_CMA +/* Free whole pageblock and set it's migration type to MIGRATE_CMA. */ +void __init init_cma_reserved_pageblock(struct page *page) +{ + unsigned i = pageblock_nr_pages; + struct page *p = page; + + do { + __ClearPageReserved(p); + set_page_count(p, 0); + } while (++p, --i); + + set_page_refcounted(page); + set_pageblock_migratetype(page, MIGRATE_CMA); + __free_pages(page, pageblock_order); + totalram_pages += pageblock_nr_pages; +} +#endif /* * The order of subdivision here is critical for the IO subsystem. @@@ -825,23 -746,6 +825,23 @@@ static inline void expand(struct zone * high--; size >>= 1; VM_BUG_ON(bad_range(zone, &page[size])); + +#ifdef CONFIG_DEBUG_PAGEALLOC + if (high < debug_guardpage_minorder()) { + /* + * Mark as guard pages (or page), that will allow to + * merge back to allocator when buddy will be freed. + * Corresponding page table entries will not be touched, + * pages will stay not present in virtual address space + */ + INIT_LIST_HEAD(&page[size].lru); + set_page_guard_flag(&page[size]); + set_page_private(&page[size], high); + /* Guard pages are not available for any usage */ + __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high)); + continue; + } +#endif list_add(&page[size].lru, &area->free_list[migratetype]); area->nr_free++; set_page_order(&page[size], high); @@@ -924,17 -828,11 +924,17 @@@ struct page *__rmqueue_smallest(struct * This array describes the order lists are fallen back to when * the free lists for the desirable migrate type are depleted */ -static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = { - [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, - [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, - [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, - [MIGRATE_RESERVE] = { MIGRATE_RESERVE, MIGRATE_RESERVE, MIGRATE_RESERVE }, /* Never used */ +static int fallbacks[MIGRATE_TYPES][4] = { + [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, + [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, +#ifdef CONFIG_CMA + [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, + [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */ +#else + [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, +#endif + [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ + [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ }; /* @@@ -1029,12 -927,12 +1029,12 @@@ __rmqueue_fallback(struct zone *zone, i /* Find the largest possible block of pages in the other list */ for (current_order = MAX_ORDER-1; current_order >= order; --current_order) { - for (i = 0; i < MIGRATE_TYPES - 1; i++) { + for (i = 0;; i++) { migratetype = fallbacks[start_migratetype][i]; /* MIGRATE_RESERVE handled later if 
necessary */ if (migratetype == MIGRATE_RESERVE) - continue; + break; area = &(zone->free_area[current_order]); if (list_empty(&area->free_list[migratetype])) @@@ -1049,18 -947,11 +1049,18 @@@ * pages to the preferred allocation list. If falling * back for a reclaimable kernel allocation, be more * aggressive about taking ownership of free pages + * + * On the other hand, never change migration + * type of MIGRATE_CMA pageblocks nor move CMA + * pages on different free lists. We don't + * want unmovable pages to be allocated from + * MIGRATE_CMA areas. */ - if (unlikely(current_order >= (pageblock_order >> 1)) || - start_migratetype == MIGRATE_RECLAIMABLE || - page_group_by_mobility_disabled) { - unsigned long pages; + if (!is_migrate_cma(migratetype) && + (unlikely(current_order >= pageblock_order / 2) || + start_migratetype == MIGRATE_RECLAIMABLE || + page_group_by_mobility_disabled)) { + int pages; pages = move_freepages_block(zone, page, start_migratetype); @@@ -1078,14 -969,11 +1078,14 @@@ rmv_page_order(page); /* Take ownership for orders >= pageblock_order */ - if (current_order >= pageblock_order) + if (current_order >= pageblock_order && + !is_migrate_cma(migratetype)) change_pageblock_range(page, current_order, start_migratetype); - expand(zone, page, order, current_order, area, migratetype); + expand(zone, page, order, current_order, area, + is_migrate_cma(migratetype) + ? migratetype : start_migratetype); trace_mm_page_alloc_extfrag(page, order, current_order, start_migratetype, migratetype); @@@ -1127,17 -1015,17 +1127,17 @@@ retry_reserve return page; } -/* +/* * Obtain a specified number of elements from the buddy allocator, all under * a single hold of the lock, for efficiency. Add them to the supplied list. * Returns the number of new pages which were placed at *list. 
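+ *
+ * With CONFIG_CMA enabled, page_private() of each page records its
+ * pageblock's migratetype when that is MIGRATE_CMA or MIGRATE_ISOLATE,
+ * so such pages are not freed back onto a regular free list later.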
*/ -static int rmqueue_bulk(struct zone *zone, unsigned int order, +static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, int migratetype, int cold) { - int i; - + int mt = migratetype, i; + spin_lock(&zone->lock); for (i = 0; i < count; ++i) { struct page *page = __rmqueue(zone, order, migratetype); @@@ -1157,12 -1045,7 +1157,12 @@@ list_add(&page->lru, list); else list_add_tail(&page->lru, list); - set_page_private(page, migratetype); + if (IS_ENABLED(CONFIG_CMA)) { + mt = get_pageblock_migratetype(page); + if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE) + mt = migratetype; + } + set_page_private(page, mt); list = &page->lru; } __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); @@@ -1327,19 -1210,6 +1327,19 @@@ out local_irq_restore(flags); } +/* + * Free a list of 0-order pages + */ +void free_hot_cold_page_list(struct list_head *list, int cold) +{ + struct page *page, *next; + + list_for_each_entry_safe(page, next, list, lru) { + trace_mm_page_free_batched(page, cold); + free_hot_cold_page(page, cold); + } +} + /* * split_page takes a non-compound higher-order page, and splits it into * n (1<= pageblock_order - 1) { struct page *endpage = page + (1 << order) - 1; - for (; page < endpage; page += pageblock_nr_pages) - set_pageblock_migratetype(page, MIGRATE_MOVABLE); + for (; page < endpage; page += pageblock_nr_pages) { + int mt = get_pageblock_migratetype(page); + if (mt != MIGRATE_ISOLATE && !is_migrate_cma(mt)) + set_pageblock_migratetype(page, + MIGRATE_MOVABLE); + } } return 1 << order; @@@ -1542,7 -1408,7 +1542,7 @@@ static int should_fail_alloc_page(gfp_ static int __init fail_page_alloc_debugfs(void) { - mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; + umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; struct dentry *dir; dir = fault_create_debugfs_attr("fail_page_alloc", NULL, @@@ -1591,7 -1457,7 +1591,7 @@@ static bool __zone_watermark_ok(struct long min = mark; int o; - free_pages -= (1 << order) + 1; + free_pages -= (1 << order) - 1; if (alloc_flags & ALLOC_HIGH) min -= min / 2; if (alloc_flags & ALLOC_HARDER) @@@ -1890,8 -1756,7 +1890,8 @@@ void warn_alloc_failed(gfp_t gfp_mask, { unsigned int filter = SHOW_MEM_FILTER_NODES; - if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) + if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || + debug_guardpage_minorder() > 0) return; /* @@@ -1930,25 -1795,12 +1930,25 @@@ static inline int should_alloc_retry(gfp_t gfp_mask, unsigned int order, + unsigned long did_some_progress, unsigned long pages_reclaimed) { /* Do not loop if specifically requested */ if (gfp_mask & __GFP_NORETRY) return 0; + /* Always retry if specifically requested */ + if (gfp_mask & __GFP_NOFAIL) + return 1; + + /* + * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim + * making forward progress without invoking OOM. Suspend also disables + * storage devices so kswapd will not help. Bail if we are suspending. + */ + if (!did_some_progress && pm_suspended_storage()) + return 0; + /* * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER * means __GFP_NOFAIL, but that may not be true in other @@@ -1967,6 -1819,13 +1967,6 @@@ if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) return 1; - /* - * Don't let big-order allocations loop unless the caller - * explicitly requests that. 
- */ - if (gfp_mask & __GFP_NOFAIL) - return 1; - return 0; } @@@ -2094,13 -1953,16 +2094,13 @@@ __alloc_pages_direct_compact(gfp_t gfp_ } #endif /* CONFIG_COMPACTION */ -/* The really slow allocator path where we enter direct reclaim */ -static inline struct page * -__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, enum zone_type high_zoneidx, - nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, unsigned long *did_some_progress) +/* Perform direct synchronous page reclaim */ +static int +__perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, + nodemask_t *nodemask) { - struct page *page = NULL; struct reclaim_state reclaim_state; - bool drained = false; + int progress; cond_resched(); @@@ -2111,7 -1973,7 +2111,7 @@@ reclaim_state.reclaimed_slab = 0; current->reclaim_state = &reclaim_state; - *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); + progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); current->reclaim_state = NULL; lockdep_clear_current_reclaim_state(); @@@ -2119,21 -1981,6 +2119,21 @@@ cond_resched(); + return progress; +} + +/* The really slow allocator path where we enter direct reclaim */ +static inline struct page * +__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, enum zone_type high_zoneidx, + nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, + int migratetype, unsigned long *did_some_progress) +{ + struct page *page = NULL; + bool drained = false; + + *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, + nodemask); if (unlikely(!(*did_some_progress))) return NULL; @@@ -2395,8 -2242,7 +2395,8 @@@ rebalance /* Check if we should retry the allocation */ pages_reclaimed += did_some_progress; - if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { + if (should_alloc_retry(gfp_mask, order, did_some_progress, + pages_reclaimed)) { /* Wait for some write requests to complete then retry */ wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); goto rebalance; @@@ -2518,6 -2364,16 +2518,6 @@@ unsigned long get_zeroed_page(gfp_t gfp } EXPORT_SYMBOL(get_zeroed_page); -void __pagevec_free(struct pagevec *pvec) -{ - int i = pagevec_count(pvec); - - while (--i >= 0) { - trace_mm_pagevec_free(pvec->pages[i], pvec->cold); - free_hot_cold_page(pvec->pages[i], pvec->cold); - } -} - void __free_pages(struct page *page, unsigned int order) { if (put_page_testzero(page)) { @@@ -4397,10 -4253,11 +4397,11 @@@ static void __meminit calculate_node_to * round what is now in bits to nearest long in bits, then return it in * bytes. 
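+ *
+ * The zone_start_pfn argument allows the usemap to also cover a leading
+ * partial pageblock when the zone does not start on a pageblock boundary.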
*/ - static unsigned long __init usemap_size(unsigned long zonesize) + static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize) { unsigned long usemapsize; + zonesize += zone_start_pfn & (pageblock_nr_pages-1); usemapsize = roundup(zonesize, pageblock_nr_pages); usemapsize = usemapsize >> pageblock_order; usemapsize *= NR_PAGEBLOCK_BITS; @@@ -4410,17 -4267,19 +4411,19 @@@ } static void __init setup_usemap(struct pglist_data *pgdat, - struct zone *zone, unsigned long zonesize) + struct zone *zone, + unsigned long zone_start_pfn, + unsigned long zonesize) { - unsigned long usemapsize = usemap_size(zonesize); + unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); zone->pageblock_flags = NULL; if (usemapsize) zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, usemapsize); } #else - static inline void setup_usemap(struct pglist_data *pgdat, - struct zone *zone, unsigned long zonesize) {} + static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, + unsigned long zone_start_pfn, unsigned long zonesize) {} #endif /* CONFIG_SPARSEMEM */ #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE @@@ -4479,7 -4338,7 +4482,7 @@@ static void __paginginit free_area_init init_waitqueue_head(&pgdat->kswapd_wait); pgdat->kswapd_max_order = 0; pgdat_page_cgroup_init(pgdat); - + for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize, memmap_pages; @@@ -4545,7 -4404,7 +4548,7 @@@ continue; set_pageblock_order(); - setup_usemap(pgdat, zone, size); + setup_usemap(pgdat, zone, zone_start_pfn, size); ret = init_currently_empty_zone(zone, zone_start_pfn, size, MEMMAP_EARLY); BUG_ON(ret); @@@ -5269,19 -5128,8 +5272,19 @@@ static void calculate_totalreserve_page if (max > zone->present_pages) max = zone->present_pages; reserve_pages += max; + /* + * Lowmem reserves are not available to + * GFP_HIGHUSER page cache allocations and + * kswapd tries to balance zones to their high + * watermark. As a result, neither should be + * regarded as dirtyable memory, to prevent a + * situation where reclaim has to clean pages + * in order to balance the zones. + */ + zone->dirty_balance_reserve = max; } } + dirty_balance_reserve = reserve_pages; totalreserve_pages = reserve_pages; } @@@ -5324,7 -5172,14 +5327,7 @@@ static void setup_per_zone_lowmem_reser calculate_totalreserve_pages(); } -/** - * setup_per_zone_wmarks - called when min_free_kbytes changes - * or when memory is hot-{added|removed} - * - * Ensures that the watermark[min,low,high] values for each zone are set - * correctly with respect to min_free_kbytes. - */ -void setup_per_zone_wmarks(void) +static void __setup_per_zone_wmarks(void) { unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); unsigned long lowmem_pages = 0; @@@ -5371,11 -5226,6 +5374,11 @@@ zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); + + zone->watermark[WMARK_MIN] += cma_wmark_pages(zone); + zone->watermark[WMARK_LOW] += cma_wmark_pages(zone); + zone->watermark[WMARK_HIGH] += cma_wmark_pages(zone); + setup_zone_migrate_reserve(zone); spin_unlock_irqrestore(&zone->lock, flags); } @@@ -5384,20 -5234,6 +5387,20 @@@ calculate_totalreserve_pages(); } +/** + * setup_per_zone_wmarks - called when min_free_kbytes changes + * or when memory is hot-{added|removed} + * + * Ensures that the watermark[min,low,high] values for each zone are set + * correctly with respect to min_free_kbytes. 
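+ *
+ * The zonelists_mutex is taken internally, so callers must not already
+ * hold it.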
+ */ +void setup_per_zone_wmarks(void) +{ + mutex_lock(&zonelists_mutex); + __setup_per_zone_wmarks(); + mutex_unlock(&zonelists_mutex); +} + /* * The inactive anon list should be small enough that the VM never has to * do too much work, but large enough that each inactive page has a chance @@@ -5771,16 -5607,14 +5774,16 @@@ static in __count_immobile_pages(struct zone *zone, struct page *page, int count) { unsigned long pfn, iter, found; + int mt; + /* * For avoiding noise data, lru_add_drain_all() should be called * If ZONE_MOVABLE, the zone never contains immobile pages */ if (zone_idx(zone) == ZONE_MOVABLE) return true; - - if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE) + mt = get_pageblock_migratetype(page); + if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) return true; pfn = page_to_pfn(page); @@@ -5890,7 -5724,7 +5893,7 @@@ out return ret; } -void unset_migratetype_isolate(struct page *page) +void unset_migratetype_isolate(struct page *page, unsigned migratetype) { struct zone *zone; unsigned long flags; @@@ -5898,264 -5732,12 +5901,264 @@@ spin_lock_irqsave(&zone->lock, flags); if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) goto out; - set_pageblock_migratetype(page, MIGRATE_MOVABLE); - move_freepages_block(zone, page, MIGRATE_MOVABLE); + set_pageblock_migratetype(page, migratetype); + move_freepages_block(zone, page, migratetype); out: spin_unlock_irqrestore(&zone->lock, flags); } +#ifdef CONFIG_CMA + +static unsigned long pfn_max_align_down(unsigned long pfn) +{ + return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, + pageblock_nr_pages) - 1); +} + +static unsigned long pfn_max_align_up(unsigned long pfn) +{ + return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES, + pageblock_nr_pages)); +} + +static struct page * +__alloc_contig_migrate_alloc(struct page *page, unsigned long private, + int **resultp) +{ + gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; + + if (PageHighMem(page)) + gfp_mask |= __GFP_HIGHMEM; + + return alloc_page(gfp_mask); +} + +/* [start, end) must belong to a single zone. */ +static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) +{ + /* This function is based on compact_zone() from compaction.c. */ + + unsigned long pfn = start; + unsigned int tries = 0; + int ret = 0; + + struct compact_control cc = { + .nr_migratepages = 0, + .order = -1, + .zone = page_zone(pfn_to_page(start)), + .sync = true, + }; + INIT_LIST_HEAD(&cc.migratepages); + + migrate_prep_local(); + + while (pfn < end || !list_empty(&cc.migratepages)) { + if (fatal_signal_pending(current)) { + ret = -EINTR; + break; + } + + if (list_empty(&cc.migratepages)) { + cc.nr_migratepages = 0; + pfn = isolate_migratepages_range(cc.zone, &cc, + pfn, end); + if (!pfn) { + ret = -EINTR; + break; + } + tries = 0; + } else if (++tries == 5) { + ret = ret < 0 ? ret : -EBUSY; + break; + } + + ret = migrate_pages(&cc.migratepages, + __alloc_contig_migrate_alloc, + 0, false, MIGRATE_SYNC); + } + + putback_lru_pages(&cc.migratepages); + return ret > 0 ? 0 : ret; +} + +/* + * Update zone's cma pages counter used for watermark level calculation. + */ +static inline void __update_cma_watermarks(struct zone *zone, int count) +{ + unsigned long flags; + spin_lock_irqsave(&zone->lock, flags); + zone->min_cma_pages += count; + spin_unlock_irqrestore(&zone->lock, flags); + setup_per_zone_wmarks(); +} + +/* + * Trigger memory pressure bump to reclaim some pages in order to be able to + * allocate 'count' pages in single page units. 
Does similar work as + *__alloc_pages_slowpath() function. + */ +static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count) +{ + enum zone_type high_zoneidx = gfp_zone(gfp_mask); + struct zonelist *zonelist = node_zonelist(0, gfp_mask); + int did_some_progress = 0; + int order = 1; + + /* + * Increase level of watermarks to force kswapd do his job + * to stabilise at new watermark level. + */ + __update_cma_watermarks(zone, count); + + /* Obey watermarks as if the page was being allocated */ + while (!zone_watermark_ok(zone, 0, low_wmark_pages(zone), 0, 0)) { + wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone)); + + did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, + NULL); + if (!did_some_progress) { + /* Exhausted what can be done so it's blamo time */ + out_of_memory(zonelist, gfp_mask, order, NULL); + } + } + + /* Restore original watermark levels. */ + __update_cma_watermarks(zone, -count); + + return count; +} + +/** + * alloc_contig_range() -- tries to allocate given range of pages + * @start: start PFN to allocate + * @end: one-past-the-last PFN to allocate + * @migratetype: migratetype of the underlaying pageblocks (either + * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks + * in range must have the same migratetype and it must + * be either of the two. + * + * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES + * aligned, however it's the caller's responsibility to guarantee that + * we are the only thread that changes migrate type of pageblocks the + * pages fall in. + * + * The PFN range must belong to a single zone. + * + * Returns zero on success or negative error code. On success all + * pages which PFN is in [start, end) are allocated for the caller and + * need to be freed with free_contig_range(). + */ +int alloc_contig_range(unsigned long start, unsigned long end, + unsigned migratetype) +{ + struct zone *zone = page_zone(pfn_to_page(start)); + unsigned long outer_start, outer_end; + int ret = 0, order; + + /* + * What we do here is we mark all pageblocks in range as + * MIGRATE_ISOLATE. Because pageblock and max order pages may + * have different sizes, and due to the way page allocator + * work, we align the range to biggest of the two pages so + * that page allocator won't try to merge buddies from + * different pageblocks and change MIGRATE_ISOLATE to some + * other migration type. + * + * Once the pageblocks are marked as MIGRATE_ISOLATE, we + * migrate the pages from an unaligned range (ie. pages that + * we are interested in). This will put all the pages in + * range back to page allocator as MIGRATE_ISOLATE. + * + * When this is done, we take the pages in range from page + * allocator removing them from the buddy system. This way + * page allocator will never consider using them. + * + * This lets us mark the pageblocks back as + * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the + * aligned range but not in the unaligned, original range are + * put back to page allocator so that buddy can use them. + */ + + ret = start_isolate_page_range(pfn_max_align_down(start), + pfn_max_align_up(end), migratetype); + if (ret) + goto done; + + ret = __alloc_contig_migrate_range(start, end); + if (ret) + goto done; + + /* + * Pages from [start, end) are within a MAX_ORDER_NR_PAGES + * aligned blocks that are marked as MIGRATE_ISOLATE. What's + * more, all pages in [start, end) are free in page allocator. 
+ * What we are going to do is to allocate all pages from + * [start, end) (that is remove them from page allocator). + * + * The only problem is that pages at the beginning and at the + * end of interesting range may be not aligned with pages that + * page allocator holds, ie. they can be part of higher order + * pages. Because of this, we reserve the bigger range and + * once this is done free the pages we are not interested in. + * + * We don't have to hold zone->lock here because the pages are + * isolated thus they won't get removed from buddy. + */ + + lru_add_drain_all(); + drain_all_pages(); + + order = 0; + outer_start = start; + while (!PageBuddy(pfn_to_page(outer_start))) { + if (++order >= MAX_ORDER) { + ret = -EBUSY; + goto done; + } + outer_start &= ~0UL << order; + } + + /* Make sure the range is really isolated. */ + if (test_pages_isolated(outer_start, end)) { + pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", + outer_start, end); + ret = -EBUSY; + goto done; + } + + /* + * Reclaim enough pages to make sure that contiguous allocation + * will not starve the system. + */ + __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start); + + /* Grab isolated pages from freelists. */ + outer_end = isolate_freepages_range(outer_start, end); + if (!outer_end) { + ret = -EBUSY; + goto done; + } + + /* Free head and tail (if any) */ + if (start != outer_start) + free_contig_range(outer_start, start - outer_start); + if (end != outer_end) + free_contig_range(end, outer_end - end); + +done: + undo_isolate_page_range(pfn_max_align_down(start), + pfn_max_align_up(end), migratetype); + return ret; +} + +void free_contig_range(unsigned long pfn, unsigned nr_pages) +{ + for (; nr_pages--; ++pfn) + __free_page(pfn_to_page(pfn)); +} +#endif + #ifdef CONFIG_MEMORY_HOTREMOVE /* * All pages in the range must be isolated before calling this. diff --combined mm/shmem.c index 139b8ad44021,a78acf073a68..97f3f8808219 --- a/mm/shmem.c +++ b/mm/shmem.c @@@ -2121,6 -2121,7 +2121,7 @@@ static int shmem_remount_fs(struct supe unsigned long inodes; int error = -EINVAL; + config.mpol = NULL; if (shmem_parse_options(data, &config, true)) return error; @@@ -2145,8 -2146,13 +2146,13 @@@ sbinfo->max_inodes = config.max_inodes; sbinfo->free_inodes = config.max_inodes - inodes; - mpol_put(sbinfo->mpol); - sbinfo->mpol = config.mpol; /* transfers initial ref */ + /* + * Preserve previous mempolicy unless mpol remount option was specified. + */ + if (config.mpol) { + mpol_put(sbinfo->mpol); + sbinfo->mpol = config.mpol; /* transfers initial ref */ + } out: spin_unlock(&sbinfo->stat_lock); return error; @@@ -2584,7 -2590,6 +2590,7 @@@ int shmem_zero_setup(struct vm_area_str vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } +EXPORT_SYMBOL_GPL(shmem_zero_setup); /** * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags.
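The CMA support merged into mm/page_alloc.c above provides alloc_contig_range() and
free_contig_range() for callers that need a physically contiguous range carved out of a
MIGRATE_CMA region. The sketch below is illustrative only and is not part of this merge:
the helper names and the PFN range are hypothetical, and real users would normally go
through the dma-contiguous layer rather than calling these functions directly.

	#include <linux/gfp.h>
	#include <linux/mm.h>

	/*
	 * Hypothetical example: grab nr_pages physically contiguous pages
	 * starting at base_pfn.  Every pageblock in [base_pfn, base_pfn +
	 * nr_pages) is assumed to have been marked MIGRATE_CMA at boot.
	 */
	static struct page *grab_contig_buffer(unsigned long base_pfn,
					       unsigned long nr_pages)
	{
		int ret = alloc_contig_range(base_pfn, base_pfn + nr_pages,
					     MIGRATE_CMA);
		if (ret)
			return NULL;	/* typically -EBUSY or -EINTR */

		return pfn_to_page(base_pfn);
	}

	static void release_contig_buffer(unsigned long base_pfn,
					  unsigned long nr_pages)
	{
		free_contig_range(base_pfn, nr_pages);
	}

Allocation can fail with -EBUSY when pages in the range cannot be isolated or migrated,
or with -EINTR on a pending fatal signal, so callers should be prepared to retry or fall
back to a non-contiguous allocation.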