Also note the kernel might malfunction if you disable
some critical bits.
+ cma=nn[MG] [ARM,KNL]
+ Sets the size of kernel global memory area for contiguous
+ memory allocations. For more information, see
+ include/linux/dma-contiguous.h
+
cmo_free_hint= [PPC] Format: { yes | no }
Specify whether pages are marked as being inactive
when they are freed. This is used in CMO environments
a hypervisor.
Default: yes
+ coherent_pool=nn[KMG] [ARM,KNL]
+ Sets the size of memory pool for coherent, atomic dma
+ allocations if Contiguous Memory Allocator (CMA) is used.
+
code_bytes [X86] How many bytes of object code to print
in an oops report.
Range: 0 - 8192
UART at the specified I/O port or MMIO address,
switching to the matching ttyS device later. The
options are the same as for ttyS, above.
+ hvc<n> Use the hypervisor console device <n>. This is for
+ both Xen and PowerPC hypervisors.
If the device connected to the port is not a TTY but a braille
device, prepend "brl," before the device type, for instance
no_debug_objects
[KNL] Disable object debugging
+ debug_guardpage_minorder=
+ [KNL] When CONFIG_DEBUG_PAGEALLOC is set, this
+ parameter allows control of the order of pages that will
+ be intentionally kept free (and hence protected) by the
+ buddy allocator. Bigger value increase the probability
+ of catching random memory corruption, but reduce the
+ amount of memory for normal system use. The maximum
+ possible value is MAX_ORDER/2. Setting this parameter
+ to 1 or 2 should be enough to identify most random
+ memory corruption problems caused by bugs in kernel or
+ driver code when a CPU writes to (or reads from) a
+ random memory location. Note that there exists a class
+ of memory corruptions problems caused by buggy H/W or
+ F/W or by drivers badly programing DMA (basically when
+ memory is written at bus level and the CPU MMU is
+ bypassed) which are not detectable by
+ CONFIG_DEBUG_PAGEALLOC, hence this option will not help
+ tracking down these problems.
+
debugpat [X86] Enable PAT debugging
decnet.addr= [HW,NET]
earlyprintk= [X86,SH,BLACKFIN]
earlyprintk=vga
+ earlyprintk=xen
earlyprintk=serial[,ttySn[,baudrate]]
earlyprintk=ttySn[,baudrate]
earlyprintk=dbgp[debugController#]
The VGA output is eventually overwritten by the real
console.
+ The xen output can only be used by Xen PV guests.
+
ekgdboc= [X86,KGDB] Allow early kernel console debugging
ekgdboc=kbd
i8k.restricted [HW] Allow controlling fans only if SYS_ADMIN
capability is set.
+ i915.invert_brightness=
+ [DRM] Invert the sense of the variable that is used to
+ set the brightness of the panel backlight. Normally a
+ brightness value of 0 indicates backlight switched off,
+ and the maximum of the brightness value sets the backlight
+ to maximum brightness. If this parameter is set to 0
+ (default) and the machine requires it, or this parameter
+ is set to 1, a brightness value of 0 sets the backlight
+ to maximum brightness, and the maximum of the brightness
+ value switches the backlight off.
+ -1 -- never invert brightness
+ 0 -- machine default
+ 1 -- force brightness inversion
+
icn= [HW,ISDN]
Format: <io>[,<membase>[,<icn_id>[,<icn_id2>]]]
unsigned long instr = 0, instrptr;
int (*handler)(unsigned long addr, unsigned long instr, struct pt_regs *regs);
unsigned int type;
- mm_segment_t fs;
unsigned int fault;
u16 tinstr = 0;
int isize = 4;
instrptr = instruction_pointer(regs);
- fs = get_fs();
- set_fs(KERNEL_DS);
if (thumb_mode(regs)) {
- fault = __get_user(tinstr, (u16 *)(instrptr & ~1));
+ u16 *ptr = (u16 *)(instrptr & ~1);
+ fault = probe_kernel_address(ptr, tinstr);
if (!fault) {
if (cpu_architecture() >= CPU_ARCH_ARMv7 &&
IS_T32(tinstr)) {
/* Thumb-2 32-bit */
u16 tinst2 = 0;
- fault = __get_user(tinst2, (u16 *)(instrptr+2));
+ fault = probe_kernel_address(ptr + 1, tinst2);
instr = (tinstr << 16) | tinst2;
thumb2_32b = 1;
} else {
}
}
} else
- fault = __get_user(instr, (u32 *)instrptr);
- set_fs(fs);
+ fault = probe_kernel_address(instrptr, instr);
if (fault) {
type = TYPE_FAULT;
ai_usermode = safe_usermode(ai_usermode, false);
}
- hook_fault_code(1, do_alignment, SIGBUS, BUS_ADRALN,
+ hook_fault_code(FAULT_CODE_ALIGNMENT, do_alignment, SIGBUS, BUS_ADRALN,
"alignment exception");
/*
* loc: location to jump to for soft reset
*/
.align 5
+ .pushsection .idmap.text, "ax"
ENTRY(cpu_arm920_reset)
mov ip, #0
mcr p15, 0, ip, c7, c7, 0 @ invalidate I,D caches
bic ip, ip, #0x1100 @ ...i...s........
mcr p15, 0, ip, c1, c0, 0 @ ctrl register
mov pc, r0
+ENDPROC(cpu_arm920_reset)
+ .popsection
/*
* cpu_arm920_do_idle()
/* Suspend/resume support: taken from arch/arm/plat-s3c24xx/sleep.S */
.globl cpu_arm920_suspend_size
.equ cpu_arm920_suspend_size, 4 * 3
- #ifdef CONFIG_PM_SLEEP
+ #ifdef CONFIG_ARM_CPU_SUSPEND
ENTRY(cpu_arm920_do_suspend)
stmfd sp!, {r4 - r6, lr}
mrc p15, 0, r4, c13, c0, 0 @ PID
* loc: location to jump to for soft reset
*/
.align 5
+ .pushsection .idmap.text, "ax"
ENTRY(cpu_arm926_reset)
mov ip, #0
mcr p15, 0, ip, c7, c7, 0 @ invalidate I,D caches
bic ip, ip, #0x1100 @ ...i...s........
mcr p15, 0, ip, c1, c0, 0 @ ctrl register
mov pc, r0
+ENDPROC(cpu_arm926_reset)
+ .popsection
/*
* cpu_arm926_do_idle()
/* Suspend/resume support: taken from arch/arm/plat-s3c24xx/sleep.S */
.globl cpu_arm926_suspend_size
.equ cpu_arm926_suspend_size, 4 * 3
- #ifdef CONFIG_PM_SLEEP
+ #ifdef CONFIG_ARM_CPU_SUSPEND
ENTRY(cpu_arm926_do_suspend)
stmfd sp!, {r4 - r6, lr}
mrc p15, 0, r4, c13, c0, 0 @ PID
* loc: location to jump to for soft reset
*/
.align 5
+ .pushsection .idmap.text, "ax"
ENTRY(cpu_sa1100_reset)
mov ip, #0
mcr p15, 0, ip, c7, c7, 0 @ invalidate I,D caches
bic ip, ip, #0x1100 @ ...i...s........
mcr p15, 0, ip, c1, c0, 0 @ ctrl register
mov pc, r0
+ENDPROC(cpu_sa1100_reset)
+ .popsection
/*
* cpu_sa1100_do_idle(type)
.globl cpu_sa1100_suspend_size
.equ cpu_sa1100_suspend_size, 4 * 3
- #ifdef CONFIG_PM_SLEEP
+ #ifdef CONFIG_ARM_CPU_SUSPEND
ENTRY(cpu_sa1100_do_suspend)
stmfd sp!, {r4 - r6, lr}
mrc p15, 0, r4, c3, c0, 0 @ domain ID
* - loc - location to jump to for soft reset
*/
.align 5
+ .pushsection .idmap.text, "ax"
ENTRY(cpu_v6_reset)
mrc p15, 0, r1, c1, c0, 0 @ ctrl register
bic r1, r1, #0x1 @ ...............m
mov r1, #0
mcr p15, 0, r1, c7, c5, 4 @ ISB
mov pc, r0
+ENDPROC(cpu_v6_reset)
+ .popsection
/*
* cpu_v6_do_idle()
/* Suspend/resume support: taken from arch/arm/mach-s3c64xx/sleep.S */
.globl cpu_v6_suspend_size
.equ cpu_v6_suspend_size, 4 * 6
- #ifdef CONFIG_PM_SLEEP
+ #ifdef CONFIG_ARM_CPU_SUSPEND
ENTRY(cpu_v6_do_suspend)
stmfd sp!, {r4 - r9, lr}
mrc p15, 0, r4, c13, c0, 0 @ FCSE/PID
* loc: location to jump to for soft reset
*/
.align 5
+ .pushsection .idmap.text, "ax"
ENTRY(cpu_xsc3_reset)
mov r1, #PSR_F_BIT|PSR_I_BIT|SVC_MODE
msr cpsr_c, r1 @ reset CPSR
@ already containing those two last instructions to survive.
mcr p15, 0, ip, c8, c7, 0 @ invalidate I and D TLBs
mov pc, r0
+ENDPROC(cpu_xsc3_reset)
+ .popsection
/*
* cpu_xsc3_do_idle()
.globl cpu_xsc3_suspend_size
.equ cpu_xsc3_suspend_size, 4 * 6
- #ifdef CONFIG_PM_SLEEP
+ #ifdef CONFIG_ARM_CPU_SUSPEND
ENTRY(cpu_xsc3_do_suspend)
stmfd sp!, {r4 - r9, lr}
mrc p14, 0, r4, c6, c0, 0 @ clock configuration, for turbo mode
* Beware PXA270 erratum E7.
*/
.align 5
+ .pushsection .idmap.text, "ax"
ENTRY(cpu_xscale_reset)
mov r1, #PSR_F_BIT|PSR_I_BIT|SVC_MODE
msr cpsr_c, r1 @ reset CPSR
@ already containing those two last instructions to survive.
mcr p15, 0, ip, c8, c7, 0 @ invalidate I & D TLBs
mov pc, r0
+ENDPROC(cpu_xscale_reset)
+ .popsection
/*
* cpu_xscale_do_idle()
.globl cpu_xscale_suspend_size
.equ cpu_xscale_suspend_size, 4 * 6
- #ifdef CONFIG_PM_SLEEP
+ #ifdef CONFIG_ARM_CPU_SUSPEND
ENTRY(cpu_xscale_do_suspend)
stmfd sp!, {r4 - r9, lr}
mrc p14, 0, r4, c6, c0, 0 @ clock configuration, for turbo mode
select IRQ_FORCED_THREADING
select USE_GENERIC_SMP_HELPERS if SMP
select HAVE_BPF_JIT if (X86_64 && NET)
+ select HAVE_ARCH_TRANSPARENT_HUGEPAGE
select CLKEVT_I8253
select ARCH_HAVE_NMI_SAFE_CMPXCHG
config NUMA
bool "Numa Memory Allocation and Scheduler Support"
depends on SMP
- depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI) && EXPERIMENTAL)
+ depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI) && BROKEN)
default y if (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP)
---help---
Enable NUMA (Non Uniform Memory Access) support.
int cifsERROR = 1;
int traceSMB = 0;
bool enable_oplocks = true;
+bool no_serverino_autodisable = false;
unsigned int linuxExtEnabled = 1;
unsigned int lookupCacheEnabled = 1;
unsigned int multiuser_mount = 0;
module_param(enable_oplocks, bool, 0644);
MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks (bool). Default:"
"y/Y/1");
+module_param(no_serverino_autodisable, bool, 0644);
extern mempool_t *cifs_sm_req_poolp;
extern mempool_t *cifs_req_poolp;
extern mempool_t *cifs_mid_poolp;
+ /*
+ * Bumps refcount for cifs super block.
+ * Note that it should be only called if a referece to VFS super block is
+ * already held, e.g. in open-type syscalls context. Otherwise it can race with
+ * atomic_dec_and_test in deactivate_locked_super.
+ */
+ void
+ cifs_sb_active(struct super_block *sb)
+ {
+ struct cifs_sb_info *server = CIFS_SB(sb);
+
+ if (atomic_inc_return(&server->active) == 1)
+ atomic_inc(&sb->s_active);
+ }
+
+ void
+ cifs_sb_deactive(struct super_block *sb)
+ {
+ struct cifs_sb_info *server = CIFS_SB(sb);
+
+ if (atomic_dec_and_test(&server->active))
+ deactivate_super(sb);
+ }
+
static int
cifs_read_super(struct super_block *sb)
{
dentry = ERR_PTR(-ENOENT);
break;
}
+ if (!S_ISDIR(dir->i_mode)) {
+ dput(dentry);
+ dentry = ERR_PTR(-ENOTDIR);
+ break;
+ }
/* skip separators */
while (*s == sep)
struct linux_binfmt *fmt;
pid_t old_pid;
+ /* This allows 4 levels of binfmt rewrites before failing hard. */
+ if (depth > 5)
+ return -ELOOP;
+
retval = security_bprm_check(bprm);
if (retval)
return retval;
if (!try_module_get(fmt->module))
continue;
read_unlock(&binfmt_lock);
+ bprm->recursion_depth = depth + 1;
retval = fn(bprm, regs);
- /*
- * Restore the depth counter to its starting value
- * in this call, so we don't have to rely on every
- * load_binary function to restore it on return.
- */
bprm->recursion_depth = depth;
if (retval >= 0) {
if (depth == 0)
fd_install(0, rp);
spin_lock(&cf->file_lock);
fdt = files_fdtable(cf);
- FD_SET(0, fdt->open_fds);
- FD_CLR(0, fdt->close_on_exec);
+ __set_open_fd(0, fdt);
+ __clear_close_on_exec(0, fdt);
spin_unlock(&cf->file_lock);
/* and disallow core files too */
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock);
+EXPORT_SYMBOL(inode_sb_list_lock);
/*
* Empty aops. Can be used for the cases where the user does not
* inode to the back of the list so we don't spin on it.
*/
if (!spin_trylock(&inode->i_lock)) {
- list_move_tail(&inode->i_lru, &sb->s_inode_lru);
+ list_move(&inode->i_lru, &sb->s_inode_lru);
continue;
}
* than the maximum number of orphans allowed.
*/
-#ifdef CONFIG_UBIFS_FS_DEBUG
static int dbg_check_orphans(struct ubifs_info *c);
-#else
-#define dbg_check_orphans(c) 0
-#endif
/**
* ubifs_add_orphan - add an orphan.
else if (inum > o->inum)
p = &(*p)->rb_right;
else {
- dbg_err("orphaned twice");
+ ubifs_err("orphaned twice");
spin_unlock(&c->orphan_lock);
kfree(orphan);
return 0;
else if (inum > o->inum)
p = p->rb_right;
else {
- if (o->dnext) {
+ if (o->del) {
spin_unlock(&c->orphan_lock);
dbg_gen("deleted twice ino %lu",
(unsigned long)inum);
return;
}
if (o->cnext) {
+ o->del = 1;
o->dnext = c->orph_dnext;
c->orph_dnext = o;
spin_unlock(&c->orphan_lock);
}
}
spin_unlock(&c->orphan_lock);
- dbg_err("missing orphan ino %lu", (unsigned long)inum);
- dbg_dump_stack();
+ ubifs_err("missing orphan ino %lu", (unsigned long)inum);
+ dump_stack();
}
/**
ubifs_assert(c->ohead_offs == 0);
ubifs_prepare_node(c, c->orph_buf, len, 1);
len = ALIGN(len, c->min_io_size);
- err = ubifs_leb_change(c, c->ohead_lnum, c->orph_buf, len,
- UBI_SHORTTERM);
+ err = ubifs_leb_change(c, c->ohead_lnum, c->orph_buf, len);
} else {
if (c->ohead_offs == 0) {
/* Ensure LEB has been unmapped */
return err;
}
err = ubifs_write_node(c, c->orph_buf, len, c->ohead_lnum,
- c->ohead_offs, UBI_SHORTTERM);
+ c->ohead_offs);
}
return err;
}
orphan = dnext;
dnext = orphan->dnext;
ubifs_assert(!orphan->new);
+ ubifs_assert(orphan->del);
rb_erase(&orphan->rb, &c->orph_tree);
list_del(&orphan->list);
c->tot_orphans -= 1;
rb_link_node(&orphan->rb, parent, p);
rb_insert_color(&orphan->rb, &c->orph_tree);
list_add_tail(&orphan->list, &c->orph_list);
+ orphan->del = 1;
orphan->dnext = c->orph_dnext;
c->orph_dnext = orphan;
dbg_mnt("ino %lu, new %d, tot %d", (unsigned long)inum,
list_for_each_entry(snod, &sleb->nodes, list) {
if (snod->type != UBIFS_ORPH_NODE) {
- ubifs_err("invalid node type %d in orphan area at "
- "%d:%d", snod->type, sleb->lnum, snod->offs);
- dbg_dump_node(c, snod->node);
+ ubifs_err("invalid node type %d in orphan area at %d:%d",
+ snod->type, sleb->lnum, snod->offs);
+ ubifs_dump_node(c, snod->node);
return -EINVAL;
}
* number. That makes this orphan node, out of date.
*/
if (!first) {
- ubifs_err("out of order commit number %llu in "
- "orphan node at %d:%d",
+ ubifs_err("out of order commit number %llu in orphan node at %d:%d",
cmt_no, sleb->lnum, snod->offs);
- dbg_dump_node(c, snod->node);
+ ubifs_dump_node(c, snod->node);
return -EINVAL;
}
dbg_rcvry("out of date LEB %d", sleb->lnum);
return err;
}
-#ifdef CONFIG_UBIFS_FS_DEBUG
+/*
+ * Everything below is related to debugging.
+ */
struct check_orphan {
struct rb_node rb;
kfree(ci.node);
return err;
}
-
-#endif /* CONFIG_UBIFS_FS_DEBUG */
return 5;
if (!ubifs_compr_present(ui->compr_type)) {
- ubifs_warn("inode %lu uses '%s' compression, but it was not "
- "compiled in", inode->i_ino,
- ubifs_compr_name(ui->compr_type));
+ ubifs_warn("inode %lu uses '%s' compression, but it was not compiled in",
+ inode->i_ino, ubifs_compr_name(ui->compr_type));
}
err = dbg_check_dir(c, inode);
out_invalid:
ubifs_err("inode %lu validation failed, error %d", inode->i_ino, err);
- dbg_dump_node(c, ino);
- dbg_dump_inode(c, inode);
+ ubifs_dump_node(c, ino);
+ ubifs_dump_inode(c, inode);
err = -EINVAL;
out_ino:
kfree(ino);
tmp = UBIFS_CS_NODE_SZ + UBIFS_REF_NODE_SZ * c->jhead_cnt;
tmp = ALIGN(tmp, c->min_io_size);
if (tmp > c->leb_size) {
- dbg_err("too small LEB size %d, at least %d needed",
- c->leb_size, tmp);
+ ubifs_err("too small LEB size %d, at least %d needed",
+ c->leb_size, tmp);
return -EINVAL;
}
tmp /= c->leb_size;
tmp += 1;
if (c->log_lebs < tmp) {
- dbg_err("too small log %d LEBs, required min. %d LEBs",
- c->log_lebs, tmp);
+ ubifs_err("too small log %d LEBs, required min. %d LEBs",
+ c->log_lebs, tmp);
return -EINVAL;
}
c->jheads[i].grouped = 1;
}
- c->jheads[BASEHD].wbuf.dtype = UBI_SHORTTERM;
/*
- * Garbage Collector head likely contains long-term data and
- * does not need to be synchronized by timer. Also GC head nodes are
- * not grouped.
+ * Garbage Collector head does not need to be synchronized by timer.
+ * Also GC head nodes are not grouped.
*/
- c->jheads[GCHD].wbuf.dtype = UBI_LONGTERM;
c->jheads[GCHD].wbuf.no_timer = 1;
c->jheads[GCHD].grouped = 0;
orph = list_entry(c->orph_list.next, struct ubifs_orphan, list);
list_del(&orph->list);
kfree(orph);
- dbg_err("orphan list not empty at unmount");
+ ubifs_err("orphan list not empty at unmount");
}
vfree(c->orph_buf);
flag = parse_standard_option(p);
if (!flag) {
- ubifs_err("unrecognized mount option \"%s\" "
- "or missing value", p);
+ ubifs_err("unrecognized mount option \"%s\" or missing value",
+ p);
return -EINVAL;
}
sb->s_flags |= flag;
}
/* Just disable bulk-read */
- ubifs_warn("Cannot allocate %d bytes of memory for bulk-read, "
- "disabling it", c->max_bu_buf_len);
+ ubifs_warn("cannot allocate %d bytes of memory for bulk-read, disabling it",
+ c->max_bu_buf_len);
c->mount_opts.bulk_read = 1;
c->bulk_read = 0;
return;
ubifs_assert(c->dark_wm > 0);
if (c->lst.total_free + c->lst.total_dirty < c->dark_wm) {
ubifs_err("insufficient free space to mount in R/W mode");
- dbg_dump_budg(c, &c->bi);
- dbg_dump_lprops(c);
+ ubifs_dump_budg(c, &c->bi);
+ ubifs_dump_lprops(c);
return -ENOSPC;
}
return 0;
static int mount_ubifs(struct ubifs_info *c)
{
int err;
- long long x;
+ long long x, y;
size_t sz;
c->ro_mount = !!(c->vfs_sb->s_flags & MS_RDONLY);
if (!c->ro_mount && c->space_fixup) {
err = ubifs_fixup_free_space(c);
if (err)
- goto out_master;
+ goto out_lpt;
}
if (!c->ro_mount) {
c->mounting = 0;
- ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"",
- c->vi.ubi_num, c->vi.vol_id, c->vi.name);
- if (c->ro_mount)
- ubifs_msg("mounted read-only");
+ ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"%s",
+ c->vi.ubi_num, c->vi.vol_id, c->vi.name,
+ c->ro_mount ? ", R/O mode" : NULL);
x = (long long)c->main_lebs * c->leb_size;
- ubifs_msg("file system size: %lld bytes (%lld KiB, %lld MiB, %d "
- "LEBs)", x, x >> 10, x >> 20, c->main_lebs);
- x = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes;
- ubifs_msg("journal size: %lld bytes (%lld KiB, %lld MiB, %d "
- "LEBs)", x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt);
- ubifs_msg("media format: w%d/r%d (latest is w%d/r%d)",
+ y = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes;
+ ubifs_msg("LEB size: %d bytes (%d KiB), min./max. I/O unit sizes: %d bytes/%d bytes",
+ c->leb_size, c->leb_size >> 10, c->min_io_size,
+ c->max_write_size);
+ ubifs_msg("FS size: %lld bytes (%lld MiB, %d LEBs), journal size %lld bytes (%lld MiB, %d LEBs)",
+ x, x >> 20, c->main_lebs,
+ y, y >> 20, c->log_lebs + c->max_bud_cnt);
+ ubifs_msg("reserved for root: %llu bytes (%llu KiB)",
+ c->report_rp_size, c->report_rp_size >> 10);
+ ubifs_msg("media format: w%d/r%d (latest is w%d/r%d), UUID %pUB%s",
c->fmt_version, c->ro_compat_version,
- UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION);
- ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr));
- ubifs_msg("reserved for root: %llu bytes (%llu KiB)",
- c->report_rp_size, c->report_rp_size >> 10);
-
- dbg_msg("compiled on: " __DATE__ " at " __TIME__);
- dbg_msg("min. I/O unit size: %d bytes", c->min_io_size);
- dbg_msg("max. write size: %d bytes", c->max_write_size);
- dbg_msg("LEB size: %d bytes (%d KiB)",
- c->leb_size, c->leb_size >> 10);
- dbg_msg("data journal heads: %d",
+ UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION, c->uuid,
+ c->big_lpt ? ", big LPT model" : ", small LPT model");
+
+ dbg_gen("default compressor: %s", ubifs_compr_name(c->default_compr));
+ dbg_gen("data journal heads: %d",
c->jhead_cnt - NONDATA_JHEADS_CNT);
- dbg_msg("UUID: %pUB", c->uuid);
- dbg_msg("big_lpt %d", c->big_lpt);
- dbg_msg("log LEBs: %d (%d - %d)",
+ dbg_gen("log LEBs: %d (%d - %d)",
c->log_lebs, UBIFS_LOG_LNUM, c->log_last);
- dbg_msg("LPT area LEBs: %d (%d - %d)",
+ dbg_gen("LPT area LEBs: %d (%d - %d)",
c->lpt_lebs, c->lpt_first, c->lpt_last);
- dbg_msg("orphan area LEBs: %d (%d - %d)",
+ dbg_gen("orphan area LEBs: %d (%d - %d)",
c->orph_lebs, c->orph_first, c->orph_last);
- dbg_msg("main area LEBs: %d (%d - %d)",
+ dbg_gen("main area LEBs: %d (%d - %d)",
c->main_lebs, c->main_first, c->leb_cnt - 1);
- dbg_msg("index LEBs: %d", c->lst.idx_lebs);
- dbg_msg("total index bytes: %lld (%lld KiB, %lld MiB)",
+ dbg_gen("index LEBs: %d", c->lst.idx_lebs);
+ dbg_gen("total index bytes: %lld (%lld KiB, %lld MiB)",
c->bi.old_idx_sz, c->bi.old_idx_sz >> 10,
c->bi.old_idx_sz >> 20);
- dbg_msg("key hash type: %d", c->key_hash_type);
- dbg_msg("tree fanout: %d", c->fanout);
- dbg_msg("reserved GC LEB: %d", c->gc_lnum);
- dbg_msg("first main LEB: %d", c->main_first);
- dbg_msg("max. znode size %d", c->max_znode_sz);
- dbg_msg("max. index node size %d", c->max_idx_node_sz);
- dbg_msg("node sizes: data %zu, inode %zu, dentry %zu",
+ dbg_gen("key hash type: %d", c->key_hash_type);
+ dbg_gen("tree fanout: %d", c->fanout);
+ dbg_gen("reserved GC LEB: %d", c->gc_lnum);
+ dbg_gen("max. znode size %d", c->max_znode_sz);
+ dbg_gen("max. index node size %d", c->max_idx_node_sz);
+ dbg_gen("node sizes: data %zu, inode %zu, dentry %zu",
UBIFS_DATA_NODE_SZ, UBIFS_INO_NODE_SZ, UBIFS_DENT_NODE_SZ);
- dbg_msg("node sizes: trun %zu, sb %zu, master %zu",
+ dbg_gen("node sizes: trun %zu, sb %zu, master %zu",
UBIFS_TRUN_NODE_SZ, UBIFS_SB_NODE_SZ, UBIFS_MST_NODE_SZ);
- dbg_msg("node sizes: ref %zu, cmt. start %zu, orph %zu",
+ dbg_gen("node sizes: ref %zu, cmt. start %zu, orph %zu",
UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ);
- dbg_msg("max. node sizes: data %zu, inode %zu dentry %zu, idx %d",
+ dbg_gen("max. node sizes: data %zu, inode %zu dentry %zu, idx %d",
UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ,
UBIFS_MAX_DENT_NODE_SZ, ubifs_idx_node_sz(c, c->fanout));
- dbg_msg("dead watermark: %d", c->dead_wm);
- dbg_msg("dark watermark: %d", c->dark_wm);
- dbg_msg("LEB overhead: %d", c->leb_overhead);
+ dbg_gen("dead watermark: %d", c->dead_wm);
+ dbg_gen("dark watermark: %d", c->dark_wm);
+ dbg_gen("LEB overhead: %d", c->leb_overhead);
x = (long long)c->main_lebs * c->dark_wm;
- dbg_msg("max. dark space: %lld (%lld KiB, %lld MiB)",
+ dbg_gen("max. dark space: %lld (%lld KiB, %lld MiB)",
x, x >> 10, x >> 20);
- dbg_msg("maximum bud bytes: %lld (%lld KiB, %lld MiB)",
+ dbg_gen("maximum bud bytes: %lld (%lld KiB, %lld MiB)",
c->max_bud_bytes, c->max_bud_bytes >> 10,
c->max_bud_bytes >> 20);
- dbg_msg("BG commit bud bytes: %lld (%lld KiB, %lld MiB)",
+ dbg_gen("BG commit bud bytes: %lld (%lld KiB, %lld MiB)",
c->bg_bud_bytes, c->bg_bud_bytes >> 10,
c->bg_bud_bytes >> 20);
- dbg_msg("current bud bytes %lld (%lld KiB, %lld MiB)",
+ dbg_gen("current bud bytes %lld (%lld KiB, %lld MiB)",
c->bud_bytes, c->bud_bytes >> 10, c->bud_bytes >> 20);
- dbg_msg("max. seq. number: %llu", c->max_sqnum);
- dbg_msg("commit number: %llu", c->cmt_no);
+ dbg_gen("max. seq. number: %llu", c->max_sqnum);
+ dbg_gen("commit number: %llu", c->cmt_no);
return 0;
if (c->rw_incompat) {
ubifs_err("the file-system is not R/W-compatible");
- ubifs_msg("on-flash format version is w%d/r%d, but software "
- "only supports up to version w%d/r%d", c->fmt_version,
- c->ro_compat_version, UBIFS_FORMAT_VERSION,
- UBIFS_RO_COMPAT_VERSION);
+ ubifs_msg("on-flash format version is w%d/r%d, but software only supports up to version w%d/r%d",
+ c->fmt_version, c->ro_compat_version,
+ UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION);
return -EROFS;
}
c->remounting_rw = 1;
c->ro_mount = 0;
+ if (c->space_fixup) {
+ err = ubifs_fixup_free_space(c);
+ if (err)
+ return err;
+ }
+
err = check_free_space(c);
if (err)
goto out;
err = dbg_check_space_info(c);
}
- if (c->space_fixup) {
- err = ubifs_fixup_free_space(c);
- if (err)
- goto out;
- }
-
mutex_unlock(&c->umount_mutex);
return err;
* next mount, so we just print a message and
* continue to unmount normally.
*/
- ubifs_err("failed to write master node, "
- "error %d", err);
+ ubifs_err("failed to write master node, error %d",
+ err);
} else {
for (i = 0; i < c->jhead_cnt; i++)
/* Make sure write-buffer timers are canceled */
*/
ubi = open_ubi(name, UBI_READONLY);
if (IS_ERR(ubi)) {
- dbg_err("cannot open \"%s\", error %d",
- name, (int)PTR_ERR(ubi));
+ ubifs_err("cannot open \"%s\", error %d",
+ name, (int)PTR_ERR(ubi));
return ERR_CAST(ubi);
}
* UBIFS_BLOCK_SIZE. It is assumed that both are powers of 2.
*/
if (PAGE_CACHE_SIZE < UBIFS_BLOCK_SIZE) {
- ubifs_err("VFS page cache size is %u bytes, but UBIFS requires"
- " at least 4096 bytes",
+ ubifs_err("VFS page cache size is %u bytes, but UBIFS requires at least 4096 bytes",
(unsigned int)PAGE_CACHE_SIZE);
return -EINVAL;
}
#define UBIFS_VERSION 1
/* Normal UBIFS messages */
-#define ubifs_msg(fmt, ...) \
- printk(KERN_NOTICE "UBIFS: " fmt "\n", ##__VA_ARGS__)
+#define ubifs_msg(fmt, ...) pr_notice("UBIFS: " fmt "\n", ##__VA_ARGS__)
/* UBIFS error messages */
-#define ubifs_err(fmt, ...) \
- printk(KERN_ERR "UBIFS error (pid %d): %s: " fmt "\n", current->pid, \
+#define ubifs_err(fmt, ...) \
+ pr_err("UBIFS error (pid %d): %s: " fmt "\n", current->pid, \
__func__, ##__VA_ARGS__)
/* UBIFS warning messages */
-#define ubifs_warn(fmt, ...) \
- printk(KERN_WARNING "UBIFS warning (pid %d): %s: " fmt "\n", \
- current->pid, __func__, ##__VA_ARGS__)
+#define ubifs_warn(fmt, ...) \
+ pr_warn("UBIFS warning (pid %d): %s: " fmt "\n", \
+ current->pid, __func__, ##__VA_ARGS__)
/* UBIFS file system VFS magic number */
#define UBIFS_SUPER_MAGIC 0x24051905
#define INUM_WARN_WATERMARK 0xFFF00000
#define INUM_WATERMARK 0xFFFFFF00
-/* Largest key size supported in this implementation */
-#define CUR_MAX_KEY_LEN UBIFS_SK_LEN
-
/* Maximum number of entries in each LPT (LEB category) heap */
#define LPT_HEAP_SZ 256
/* The below union makes it easier to deal with keys */
union ubifs_key {
- uint8_t u8[CUR_MAX_KEY_LEN];
- uint32_t u32[CUR_MAX_KEY_LEN/4];
- uint64_t u64[CUR_MAX_KEY_LEN/8];
- __le32 j32[CUR_MAX_KEY_LEN/4];
+ uint8_t u8[UBIFS_SK_LEN];
+ uint32_t u32[UBIFS_SK_LEN/4];
+ uint64_t u64[UBIFS_SK_LEN/8];
+ __le32 j32[UBIFS_SK_LEN/4];
};
/**
* @avail: number of bytes available in the write-buffer
* @used: number of used bytes in the write-buffer
* @size: write-buffer size (in [@c->min_io_size, @c->max_write_size] range)
- * @dtype: type of data stored in this LEB (%UBI_LONGTERM, %UBI_SHORTTERM,
- * %UBI_UNKNOWN)
* @jhead: journal head the mutex belongs to (note, needed only to shut lockdep
* up by 'mutex_lock_nested()).
* @sync_callback: write-buffer synchronization callback
int avail;
int used;
int size;
- int dtype;
int jhead;
int (*sync_callback)(struct ubifs_info *c, int lnum, int free, int pad);
struct mutex io_mutex;
* @offs: offset of the corresponding indexing node
* @len: length of the corresponding indexing node
* @zbranch: array of znode branches (@c->fanout elements)
+ *
+ * Note! The @lnum, @offs, and @len fields are not really needed - we have them
+ * only for internal consistency check. They could be removed to save some RAM.
*/
struct ubifs_znode {
struct ubifs_znode *parent;
int child_cnt;
int iip;
int alt;
-#ifdef CONFIG_UBIFS_FS_DEBUG
- int lnum, offs, len;
-#endif
+ int lnum;
+ int offs;
+ int len;
struct ubifs_zbranch zbranch[];
};
* @dnext: next orphan to delete
* @inum: inode number
* @new: %1 => added since the last commit, otherwise %0
+ * @del: %1 => delete pending, otherwise %0
*/
struct ubifs_orphan {
struct rb_node rb;
struct ubifs_orphan *dnext;
ino_t inum;
int new;
+ unsigned del:1;
};
/**
struct rb_root size_tree;
struct ubifs_mount_opts mount_opts;
-#ifdef CONFIG_UBIFS_FS_DEBUG
struct ubifs_debug_info *dbg;
-#endif
};
extern struct list_head ubifs_infos;
int ubifs_leb_read(const struct ubifs_info *c, int lnum, void *buf, int offs,
int len, int even_ebadmsg);
int ubifs_leb_write(struct ubifs_info *c, int lnum, const void *buf, int offs,
- int len, int dtype);
-int ubifs_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len,
- int dtype);
+ int len);
+int ubifs_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len);
int ubifs_leb_unmap(struct ubifs_info *c, int lnum);
-int ubifs_leb_map(struct ubifs_info *c, int lnum, int dtype);
+int ubifs_leb_map(struct ubifs_info *c, int lnum);
int ubifs_is_mapped(const struct ubifs_info *c, int lnum);
int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len);
-int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
- int dtype);
+int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs);
int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf);
int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len,
int lnum, int offs);
int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len,
int lnum, int offs);
int ubifs_write_node(struct ubifs_info *c, void *node, int len, int lnum,
- int offs, int dtype);
+ int offs);
int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
int offs, int quiet, int must_chk_crc);
void ubifs_prepare_node(struct ubifs_info *c, void *buf, int len, int pad);
unsigned long pfn);
int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn);
+ int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len);
+
struct page *follow_page(struct vm_area_struct *, unsigned long address,
unsigned int foll_flags);
unsigned int pages_per_huge_page);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
+#ifdef CONFIG_DEBUG_PAGEALLOC
+extern unsigned int _debug_guardpage_minorder;
+
+static inline unsigned int debug_guardpage_minorder(void)
+{
+ return _debug_guardpage_minorder;
+}
+
+static inline bool page_is_guard(struct page *page)
+{
+ return test_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
+}
+#else
+static inline unsigned int debug_guardpage_minorder(void) { return 0; }
+static inline bool page_is_guard(struct page *page) { return false; }
+#endif /* CONFIG_DEBUG_PAGEALLOC */
+
#endif /* __KERNEL__ */
#endif /* _LINUX_MM_H */
extern int block_dump;
extern int laptop_mode;
-extern unsigned long determine_dirtyable_memory(void);
-
extern int dirty_background_ratio_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos);
void tag_pages_for_writeback(struct address_space *mapping,
pgoff_t start, pgoff_t end);
+ void account_page_redirty(struct page *page);
+
/* pdflush.c */
extern int nr_pdflush_threads; /* Global so it can be exported to sysctl
read-only. */
{
struct rq *rq = task_rq(p);
- BUG_ON(rq != this_rq());
- BUG_ON(p == current);
+ if (WARN_ON_ONCE(rq != this_rq()) ||
+ WARN_ON_ONCE(p == current))
+ return;
+
lockdep_assert_held(&rq->lock);
if (!raw_spin_trylock(&p->pi_lock)) {
return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
capable(CAP_SYS_NICE));
}
+EXPORT_SYMBOL_GPL(can_nice);
#ifdef __ARCH_WANT_SYS_NICE
if (force_default || ka->sa.sa_handler != SIG_IGN)
ka->sa.sa_handler = SIG_DFL;
ka->sa.sa_flags = 0;
+ #ifdef __ARCH_HAS_SA_RESTORER
+ ka->sa.sa_restorer = NULL;
+ #endif
sigemptyset(&ka->sa.sa_mask);
ka++;
}
return sighand;
}
+EXPORT_SYMBOL_GPL(__lock_task_sighand);
/*
* send signal info to all the members of a group
static int do_tkill(pid_t tgid, pid_t pid, int sig)
{
- struct siginfo info;
+ struct siginfo info = {};
info.si_signo = sig;
info.si_errno = 0;
void
update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
{
- struct ring_buffer *buf = tr->buffer;
+ struct ring_buffer *buf;
if (trace_stop_count)
return;
}
arch_spin_lock(&ftrace_max_lock);
+ buf = tr->buffer;
tr->buffer = max_tr.buffer;
max_tr.buffer = buf;
return -EINVAL;
}
- static void set_tracer_flags(unsigned int mask, int enabled)
+ /* Some tracers require overwrite to stay enabled */
+ int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set)
+ {
+ if (tracer->enabled && (mask & TRACE_ITER_OVERWRITE) && !set)
+ return -1;
+
+ return 0;
+ }
+
+ int set_tracer_flag(unsigned int mask, int enabled)
{
/* do nothing if flag is already set */
if (!!(trace_flags & mask) == !!enabled)
- return;
+ return 0;
+
+ /* Give the tracer a chance to approve the change */
+ if (current_trace->flag_changed)
+ if (current_trace->flag_changed(current_trace, mask, !!enabled))
+ return -EINVAL;
if (enabled)
trace_flags |= mask;
if (mask == TRACE_ITER_RECORD_CMD)
trace_event_enable_cmd_record(enabled);
- if (mask == TRACE_ITER_OVERWRITE)
+ if (mask == TRACE_ITER_OVERWRITE) {
ring_buffer_change_overwrite(global_trace.buffer, enabled);
+ #ifdef CONFIG_TRACER_MAX_TRACE
+ ring_buffer_change_overwrite(max_tr.buffer, enabled);
+ #endif
+ }
+
+ return 0;
}
static ssize_t
char buf[64];
char *cmp;
int neg = 0;
- int ret;
+ int ret = 0;
int i;
if (cnt >= sizeof(buf))
cmp += 2;
}
+ mutex_lock(&trace_types_lock);
+
for (i = 0; trace_options[i]; i++) {
if (strcmp(cmp, trace_options[i]) == 0) {
- set_tracer_flags(1 << i, !neg);
+ ret = set_tracer_flag(1 << i, !neg);
break;
}
}
/* If no option could be set, test the specific tracer options */
- if (!trace_options[i]) {
- mutex_lock(&trace_types_lock);
+ if (!trace_options[i])
ret = set_tracer_option(current_trace, cmp, neg);
- mutex_unlock(&trace_types_lock);
- if (ret)
- return ret;
- }
+
+ mutex_unlock(&trace_types_lock);
+
+ if (ret)
+ return ret;
*ppos += cnt;
goto out;
trace_branch_disable();
+
+ current_trace->enabled = false;
+
if (current_trace && current_trace->reset)
current_trace->reset(tr);
if (current_trace && current_trace->use_max_tr) {
goto out;
}
+ current_trace->enabled = true;
trace_branch_enable(tr);
out:
mutex_unlock(&trace_types_lock);
if (val != 0 && val != 1)
return -EINVAL;
- set_tracer_flags(1 << index, val);
+
+ mutex_lock(&trace_types_lock);
+ ret = set_tracer_flag(1 << index, val);
+ mutex_unlock(&trace_types_lock);
+
+ if (ret < 0)
+ return ret;
*ppos += cnt;
};
struct dentry *trace_create_file(const char *name,
- mode_t mode,
+ umode_t mode,
struct dentry *parent,
void *data,
const struct file_operations *fops)
trace_access_lock_init();
d_tracer = tracing_init_dentry();
+ if (!d_tracer)
+ return 0;
trace_create_file("tracing_enabled", 0644, d_tracer,
&global_trace, &tracing_ctrl_fops);
iter->cpu_file = TRACE_PIPE_ALL_CPU;
}
- static void
- __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
+ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
{
- static arch_spinlock_t ftrace_dump_lock =
- (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
/* use static because iter can be a bit big for the stack */
static struct trace_iterator iter;
+ static atomic_t dump_running;
unsigned int old_userobj;
- static int dump_ran;
unsigned long flags;
int cnt = 0, cpu;
- /* only one dump */
- local_irq_save(flags);
- arch_spin_lock(&ftrace_dump_lock);
- if (dump_ran)
- goto out;
-
- dump_ran = 1;
+ /* Only allow one dump user at a time. */
+ if (atomic_inc_return(&dump_running) != 1) {
+ atomic_dec(&dump_running);
+ return;
+ }
+ /*
+ * Always turn off tracing when we dump.
+ * We don't need to show trace output of what happens
+ * between multiple crashes.
+ *
+ * If the user does a sysrq-z, then they can re-enable
+ * tracing with echo 1 > tracing_on.
+ */
tracing_off();
- /* Did function tracer already get disabled? */
- if (ftrace_is_dead()) {
- printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n");
- printk("# MAY BE MISSING FUNCTION EVENTS\n");
- }
-
- if (disable_tracing)
- ftrace_kill();
+ local_irq_save(flags);
trace_init_global_iter(&iter);
printk(KERN_TRACE "Dumping ftrace buffer:\n");
+ /* Did function tracer already get disabled? */
+ if (ftrace_is_dead()) {
+ printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n");
+ printk("# MAY BE MISSING FUNCTION EVENTS\n");
+ }
+
/*
* We need to stop all tracing on all CPUS to read the
* the next buffer. This is a bit expensive, but is
printk(KERN_TRACE "---------------------------------\n");
out_enable:
- /* Re-enable tracing if requested */
- if (!disable_tracing) {
- trace_flags |= old_userobj;
+ trace_flags |= old_userobj;
- for_each_tracing_cpu(cpu) {
- atomic_dec(&iter.tr->data[cpu]->disabled);
- }
- tracing_on();
+ for_each_tracing_cpu(cpu) {
+ atomic_dec(&iter.tr->data[cpu]->disabled);
}
-
- out:
- arch_spin_unlock(&ftrace_dump_lock);
+ atomic_dec(&dump_running);
local_irq_restore(flags);
}
-
- /* By default: disable tracing after the dump */
- void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
- {
- __ftrace_dump(true, oops_dump_mode);
- }
+ EXPORT_SYMBOL_GPL(ftrace_dump);
__init static int tracer_alloc_buffers(void)
{
enum print_line_t (*print_line)(struct trace_iterator *iter);
/* If you handled the flag setting, return 0 */
int (*set_flag)(u32 old_flags, u32 bit, int set);
+ /* Return 0 if OK with change, else return non-zero */
+ int (*flag_changed)(struct tracer *tracer,
+ u32 mask, int set);
struct tracer *next;
struct tracer_flags *flags;
int print_max;
int use_max_tr;
+ bool enabled;
};
void tracing_reset_current_online_cpus(void);
int tracing_open_generic(struct inode *inode, struct file *filp);
struct dentry *trace_create_file(const char *name,
- mode_t mode,
+ umode_t mode,
struct dentry *parent,
void *data,
const struct file_operations *fops);
extern const char *__start___trace_bprintk_fmt[];
extern const char *__stop___trace_bprintk_fmt[];
+ int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set);
+ int set_tracer_flag(unsigned int mask, int enabled);
+
#undef FTRACE_ENTRY
#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \
extern struct ftrace_event_call \
h->surplus_huge_pages--;
h->surplus_huge_pages_node[nid]--;
} else {
+ arch_clear_hugepage_flags(page);
enqueue_huge_page(h, page);
}
spin_unlock(&hugetlb_lock);
/*
* node_hstate/s - associate per node hstate attributes, via their kobjects,
- * with node sysdevs in node_devices[] using a parallel array. The array
- * index of a node sysdev or _hstate == node id.
- * This is here to avoid any static dependency of the node sysdev driver, in
+ * with node devices in node_devices[] using a parallel array. The array
+ * index of a node device or _hstate == node id.
+ * This is here to avoid any static dependency of the node device driver, in
* the base kernel, on the hugetlb module.
*/
struct node_hstate {
struct node_hstate node_hstates[MAX_NUMNODES];
/*
- * A subset of global hstate attributes for node sysdevs
+ * A subset of global hstate attributes for node devices
*/
static struct attribute *per_node_hstate_attrs[] = {
&nr_hugepages_attr.attr,
};
/*
- * kobj_to_node_hstate - lookup global hstate for node sysdev hstate attr kobj.
+ * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj.
* Returns node id via non-NULL nidp.
*/
static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
}
/*
- * Unregister hstate attributes from a single node sysdev.
+ * Unregister hstate attributes from a single node device.
* No-op if no hstate attributes attached.
*/
void hugetlb_unregister_node(struct node *node)
{
struct hstate *h;
- struct node_hstate *nhs = &node_hstates[node->sysdev.id];
+ struct node_hstate *nhs = &node_hstates[node->dev.id];
if (!nhs->hugepages_kobj)
return; /* no hstate attributes */
}
/*
- * hugetlb module exit: unregister hstate attributes from node sysdevs
+ * hugetlb module exit: unregister hstate attributes from node devices
* that have them.
*/
static void hugetlb_unregister_all_nodes(void)
int nid;
/*
- * disable node sysdev registrations.
+ * disable node device registrations.
*/
register_hugetlbfs_with_node(NULL, NULL);
}
/*
- * Register hstate attributes for a single node sysdev.
+ * Register hstate attributes for a single node device.
* No-op if attributes already registered.
*/
void hugetlb_register_node(struct node *node)
{
struct hstate *h;
- struct node_hstate *nhs = &node_hstates[node->sysdev.id];
+ struct node_hstate *nhs = &node_hstates[node->dev.id];
int err;
if (nhs->hugepages_kobj)
return; /* already allocated */
nhs->hugepages_kobj = kobject_create_and_add("hugepages",
- &node->sysdev.kobj);
+ &node->dev.kobj);
if (!nhs->hugepages_kobj)
return;
if (err) {
printk(KERN_ERR "Hugetlb: Unable to add hstate %s"
" for node %d\n",
- h->name, node->sysdev.id);
+ h->name, node->dev.id);
hugetlb_unregister_node(node);
break;
}
/*
* hugetlb init time: register hstate attributes for all registered node
- * sysdevs of nodes that have memory. All on-line nodes should have
- * registered their associated sysdev by this time.
+ * devices of nodes that have memory. All on-line nodes should have
+ * registered their associated device by this time.
*/
static void hugetlb_register_all_nodes(void)
{
for_each_node_state(nid, N_HIGH_MEMORY) {
struct node *node = &node_devices[nid];
- if (node->sysdev.id == nid)
+ if (node->dev.id == nid)
hugetlb_register_node(node);
}
/*
- * Let the node sysdev driver know we're here so it can
+ * Let the node device driver know we're here so it can
* [un]register hstate attributes on node hotplug.
*/
register_hugetlbfs_with_node(hugetlb_register_node,
/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
- struct hstate *h = &default_hstate;
- return h->nr_huge_pages * pages_per_huge_page(h);
+ struct hstate *h;
+ unsigned long nr_total_pages = 0;
+
+ for_each_hstate(h)
+ nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h);
+ return nr_total_pages;
}
static int hugetlb_acct_memory(struct hstate *h, long delta)
break;
}
- if (absent ||
+ /*
+ * We need call hugetlb_fault for both hugepages under migration
+ * (in which case hugetlb_fault waits for the migration,) and
+ * hwpoisoned hugepages (in which case we need to prevent the
+ * caller from accessing to them.) In order to do this, we use
+ * here is_swap_pte instead of is_hugetlb_entry_migration and
+ * is_hugetlb_entry_hwpoisoned. This is because it simply covers
+ * both cases, and because we can't follow correct pages
+ * directly from any kind of swap entries.
+ */
+ if (absent || is_swap_pte(huge_ptep_get(pte)) ||
((flags & FOLL_WRITE) && !pte_write(huge_ptep_get(pte)))) {
int ret;
tlb_finish_mmu(&tlb, address, end);
return end;
}
+EXPORT_SYMBOL_GPL(zap_page_range);
/**
* zap_vma_ptes - remove ptes mapping the vma
}
EXPORT_SYMBOL(remap_pfn_range);
+ /**
+ * vm_iomap_memory - remap memory to userspace
+ * @vma: user vma to map to
+ * @start: start of area
+ * @len: size of area
+ *
+ * This is a simplified io_remap_pfn_range() for common driver use. The
+ * driver just needs to give us the physical memory range to be mapped,
+ * we'll figure out the rest from the vma information.
+ *
+ * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
+ * whatever write-combining details or similar.
+ */
+ int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
+ {
+ unsigned long vm_len, pfn, pages;
+
+ /* Check that the physical memory area passed in looks valid */
+ if (start + len < start)
+ return -EINVAL;
+ /*
+ * You *really* shouldn't map things that aren't page-aligned,
+ * but we've historically allowed it because IO memory might
+ * just have smaller alignment.
+ */
+ len += start & ~PAGE_MASK;
+ pfn = start >> PAGE_SHIFT;
+ pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
+ if (pfn + pages < pfn)
+ return -EINVAL;
+
+ /* We start the mapping 'vm_pgoff' pages into the area */
+ if (vma->vm_pgoff > pages)
+ return -EINVAL;
+ pfn += vma->vm_pgoff;
+ pages -= vma->vm_pgoff;
+
+ /* Can we fit all of the mapping? */
+ vm_len = vma->vm_end - vma->vm_start;
+ if (vm_len >> PAGE_SHIFT > pages)
+ return -EINVAL;
+
+ /* Ok, let it rip */
+ return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
+ }
+ EXPORT_SYMBOL(vm_iomap_memory);
+
static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
unsigned long addr, unsigned long end,
pte_fn_t fn, void *data)
barrier();
if (pmd_trans_huge(orig_pmd)) {
- if (flags & FAULT_FLAG_WRITE &&
- !pmd_write(orig_pmd) &&
+ unsigned int dirty = flags & FAULT_FLAG_WRITE;
+
+ if (dirty && !pmd_write(orig_pmd) &&
!pmd_trans_splitting(orig_pmd)) {
ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
orig_pmd);
if (unlikely(ret & VM_FAULT_OOM))
goto retry;
return ret;
+ } else {
+ huge_pmd_set_accessed(mm, vma, address, pmd,
+ orig_pmd, dirty);
}
return 0;
}
zone->present_pages += onlined_pages;
zone->zone_pgdat->node_present_pages += onlined_pages;
- if (need_zonelists_rebuild)
- build_all_zonelists(zone);
- else
- zone_pcp_update(zone);
+ if (onlined_pages) {
+ node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
+ if (need_zonelists_rebuild)
+ build_all_zonelists(zone);
+ else
+ zone_pcp_update(zone);
+ }
mutex_unlock(&zonelists_mutex);
init_per_zone_wmark_min();
- if (onlined_pages) {
+ if (onlined_pages)
kswapd_run(zone_to_nid(zone));
- node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
- }
vm_total_pages = nr_free_pagecache_pages();
nr_pages = end_pfn - start_pfn;
/* set above range as isolated */
- ret = start_isolate_page_range(start_pfn, end_pfn);
+ ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
if (ret)
goto out;
We cannot do rollback at this point. */
offline_isolated_pages(start_pfn, end_pfn);
/* reset pagetype flags and makes migrate type to be MOVABLE */
- undo_isolate_page_range(start_pfn, end_pfn);
+ undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
/* removal success */
zone->present_pages -= offlined_pages;
zone->zone_pgdat->node_present_pages -= offlined_pages;
start_pfn, end_pfn);
memory_notify(MEM_CANCEL_OFFLINE, &arg);
/* pushback to free area */
- undo_isolate_page_range(start_pfn, end_pfn);
+ undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
out:
unlock_memory_hotplug();
*/
static struct prop_descriptor vm_completions;
+/*
+ * Work out the current dirty-memory clamping and background writeout
+ * thresholds.
+ *
+ * The main aim here is to lower them aggressively if there is a lot of mapped
+ * memory around. To avoid stressing page reclaim with lots of unreclaimable
+ * pages. It is better to clamp down on writers than to start swapping, and
+ * performing lots of scanning.
+ *
+ * We only allow 1/2 of the currently-unmapped memory to be dirtied.
+ *
+ * We don't permit the clamping level to fall below 5% - that is getting rather
+ * excessive.
+ *
+ * We make sure that the background writeout level is below the adjusted
+ * clamping level.
+ */
+static unsigned long highmem_dirtyable_memory(unsigned long total)
+{
+#ifdef CONFIG_HIGHMEM
+ int node;
+ unsigned long x = 0;
+
+ for_each_node_state(node, N_HIGH_MEMORY) {
+ struct zone *z =
+ &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
+
+ x += zone_page_state(z, NR_FREE_PAGES) +
+ zone_reclaimable_pages(z) - z->dirty_balance_reserve;
+ }
+ /*
+ * Make sure that the number of highmem pages is never larger
+ * than the number of the total dirtyable memory. This can only
+ * occur in very strange VM situations but we want to make sure
+ * that this does not occur.
+ */
+ return min(x, total);
+#else
+ return 0;
+#endif
+}
+
+/**
+ * determine_dirtyable_memory - amount of memory that may be used
+ *
+ * Returns the numebr of pages that can currently be freed and used
+ * by the kernel for direct mappings.
+ */
+static unsigned long determine_dirtyable_memory(void)
+{
+ unsigned long x;
+
+ x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages() -
+ dirty_balance_reserve;
+
+ if (!vm_highmem_is_dirtyable)
+ x -= highmem_dirtyable_memory(x);
+
+ return x + 1; /* Ensure that we never return 0 */
+}
+
/*
* couple the period to the dirty_ratio:
*
return ret;
}
-
int dirty_bytes_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
}
EXPORT_SYMBOL(bdi_set_max_ratio);
-/*
- * Work out the current dirty-memory clamping and background writeout
- * thresholds.
- *
- * The main aim here is to lower them aggressively if there is a lot of mapped
- * memory around. To avoid stressing page reclaim with lots of unreclaimable
- * pages. It is better to clamp down on writers than to start swapping, and
- * performing lots of scanning.
- *
- * We only allow 1/2 of the currently-unmapped memory to be dirtied.
- *
- * We don't permit the clamping level to fall below 5% - that is getting rather
- * excessive.
- *
- * We make sure that the background writeout level is below the adjusted
- * clamping level.
- */
-
-static unsigned long highmem_dirtyable_memory(unsigned long total)
-{
-#ifdef CONFIG_HIGHMEM
- int node;
- unsigned long x = 0;
-
- for_each_node_state(node, N_HIGH_MEMORY) {
- struct zone *z =
- &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
-
- x += zone_page_state(z, NR_FREE_PAGES) +
- zone_reclaimable_pages(z);
- }
- /*
- * Make sure that the number of highmem pages is never larger
- * than the number of the total dirtyable memory. This can only
- * occur in very strange VM situations but we want to make sure
- * that this does not occur.
- */
- return min(x, total);
-#else
- return 0;
-#endif
-}
-
-/**
- * determine_dirtyable_memory - amount of memory that may be used
- *
- * Returns the numebr of pages that can currently be freed and used
- * by the kernel for direct mappings.
- */
-unsigned long determine_dirtyable_memory(void)
-{
- unsigned long x;
-
- x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
-
- if (!vm_highmem_is_dirtyable)
- x -= highmem_dirtyable_memory(x);
-
- return x + 1; /* Ensure that we never return 0 */
-}
-
static unsigned long dirty_freerun_ceiling(unsigned long thresh,
unsigned long bg_thresh)
{
}
EXPORT_SYMBOL(__set_page_dirty_nobuffers);
+ /*
+ * Call this whenever redirtying a page, to de-account the dirty counters
+ * (NR_DIRTIED, BDI_DIRTIED, tsk->nr_dirtied), so that they match the written
+ * counters (NR_WRITTEN, BDI_WRITTEN) in long term. The mismatches will lead to
+ * systematic errors in balanced_dirty_ratelimit and the dirty pages position
+ * control.
+ */
+ void account_page_redirty(struct page *page)
+ {
+ struct address_space *mapping = page->mapping;
+ if (mapping && mapping_cap_account_dirty(mapping)) {
+ current->nr_dirtied--;
+ dec_zone_page_state(page, NR_DIRTIED);
+ dec_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED);
+ }
+ }
+ EXPORT_SYMBOL(account_page_redirty);
+
/*
* When a writepage implementation decides that it doesn't want to write this
* page for some reason, it should redirty the locked page via
int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
{
wbc->pages_skipped++;
+ account_page_redirty(page);
return __set_page_dirty_nobuffers(page);
}
EXPORT_SYMBOL(redirty_page_for_writepage);
#include <linux/ftrace_event.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>
+#include <linux/migrate.h>
+#include <linux/page-debug-flags.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
unsigned long totalram_pages __read_mostly;
unsigned long totalreserve_pages __read_mostly;
+/*
+ * When calculating the number of globally allowed dirty pages, there
+ * is a certain number of per-zone reserves that should not be
+ * considered dirtyable memory. This is the sum of those reserves
+ * over all existing zones that contribute dirtyable memory.
+ */
+unsigned long dirty_balance_reserve __read_mostly;
+
int percpu_pagelist_fraction;
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
saved_gfp_mask = gfp_allowed_mask;
gfp_allowed_mask &= ~GFP_IOFS;
}
+
+bool pm_suspended_storage(void)
+{
+ if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
+ return false;
+ return true;
+}
#endif /* CONFIG_PM_SLEEP */
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
clear_highpage(page + i);
}
+#ifdef CONFIG_DEBUG_PAGEALLOC
+unsigned int _debug_guardpage_minorder;
+
+static int __init debug_guardpage_minorder_setup(char *buf)
+{
+ unsigned long res;
+
+ if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
+ printk(KERN_ERR "Bad debug_guardpage_minorder value\n");
+ return 0;
+ }
+ _debug_guardpage_minorder = res;
+ printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res);
+ return 0;
+}
+__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
+
+static inline void set_page_guard_flag(struct page *page)
+{
+ __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
+}
+
+static inline void clear_page_guard_flag(struct page *page)
+{
+ __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
+}
+#else
+static inline void set_page_guard_flag(struct page *page) { }
+static inline void clear_page_guard_flag(struct page *page) { }
+#endif
+
static inline void set_page_order(struct page *page, int order)
{
set_page_private(page, order);
if (page_zone_id(page) != page_zone_id(buddy))
return 0;
+ if (page_is_guard(buddy) && page_order(buddy) == order) {
+ VM_BUG_ON(page_count(buddy) != 0);
+ return 1;
+ }
+
if (PageBuddy(buddy) && page_order(buddy) == order) {
VM_BUG_ON(page_count(buddy) != 0);
return 1;
* free pages of length of (1 << order) and marked with _mapcount -2. Page's
* order is recorded in page_private(page) field.
* So when we are allocating or freeing one, we can derive the state of the
- * other. That is, if we allocate a small block, and both were
- * free, the remainder of the region must be split into blocks.
+ * other. That is, if we allocate a small block, and both were
+ * free, the remainder of the region must be split into blocks.
* If a block is freed, and its buddy is also free, then this
- * triggers coalescing into a block of larger size.
+ * triggers coalescing into a block of larger size.
*
* -- wli
*/
buddy = page + (buddy_idx - page_idx);
if (!page_is_buddy(page, buddy, order))
break;
-
- /* Our buddy is free, merge with it and move up one order. */
- list_del(&buddy->lru);
- zone->free_area[order].nr_free--;
- rmv_page_order(buddy);
+ /*
+ * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
+ * merge with it and move up one order.
+ */
+ if (page_is_guard(buddy)) {
+ clear_page_guard_flag(buddy);
+ set_page_private(page, 0);
+ __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
+ } else {
+ list_del(&buddy->lru);
+ zone->free_area[order].nr_free--;
+ rmv_page_order(buddy);
+ }
combined_idx = buddy_idx & page_idx;
page = page + (combined_idx - page_idx);
page_idx = combined_idx;
int i;
int bad = 0;
- trace_mm_page_free_direct(page, order);
+ trace_mm_page_free(page, order);
kmemcheck_free_shadow(page, order);
if (PageAnon(page))
}
}
+#ifdef CONFIG_CMA
+/* Free whole pageblock and set it's migration type to MIGRATE_CMA. */
+void __init init_cma_reserved_pageblock(struct page *page)
+{
+ unsigned i = pageblock_nr_pages;
+ struct page *p = page;
+
+ do {
+ __ClearPageReserved(p);
+ set_page_count(p, 0);
+ } while (++p, --i);
+
+ set_page_refcounted(page);
+ set_pageblock_migratetype(page, MIGRATE_CMA);
+ __free_pages(page, pageblock_order);
+ totalram_pages += pageblock_nr_pages;
+}
+#endif
/*
* The order of subdivision here is critical for the IO subsystem.
high--;
size >>= 1;
VM_BUG_ON(bad_range(zone, &page[size]));
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ if (high < debug_guardpage_minorder()) {
+ /*
+ * Mark as guard pages (or page), that will allow to
+ * merge back to allocator when buddy will be freed.
+ * Corresponding page table entries will not be touched,
+ * pages will stay not present in virtual address space
+ */
+ INIT_LIST_HEAD(&page[size].lru);
+ set_page_guard_flag(&page[size]);
+ set_page_private(&page[size], high);
+ /* Guard pages are not available for any usage */
+ __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high));
+ continue;
+ }
+#endif
list_add(&page[size].lru, &area->free_list[migratetype]);
area->nr_free++;
set_page_order(&page[size], high);
* This array describes the order lists are fallen back to when
* the free lists for the desirable migrate type are depleted
*/
-static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = {
- [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
- [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
- [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
- [MIGRATE_RESERVE] = { MIGRATE_RESERVE, MIGRATE_RESERVE, MIGRATE_RESERVE }, /* Never used */
+static int fallbacks[MIGRATE_TYPES][4] = {
+ [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
+ [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
+#ifdef CONFIG_CMA
+ [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
+ [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */
+#else
+ [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
+#endif
+ [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */
+ [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */
};
/*
/* Find the largest possible block of pages in the other list */
for (current_order = MAX_ORDER-1; current_order >= order;
--current_order) {
- for (i = 0; i < MIGRATE_TYPES - 1; i++) {
+ for (i = 0;; i++) {
migratetype = fallbacks[start_migratetype][i];
/* MIGRATE_RESERVE handled later if necessary */
if (migratetype == MIGRATE_RESERVE)
- continue;
+ break;
area = &(zone->free_area[current_order]);
if (list_empty(&area->free_list[migratetype]))
* pages to the preferred allocation list. If falling
* back for a reclaimable kernel allocation, be more
* aggressive about taking ownership of free pages
+ *
+ * On the other hand, never change migration
+ * type of MIGRATE_CMA pageblocks nor move CMA
+ * pages on different free lists. We don't
+ * want unmovable pages to be allocated from
+ * MIGRATE_CMA areas.
*/
- if (unlikely(current_order >= (pageblock_order >> 1)) ||
- start_migratetype == MIGRATE_RECLAIMABLE ||
- page_group_by_mobility_disabled) {
- unsigned long pages;
+ if (!is_migrate_cma(migratetype) &&
+ (unlikely(current_order >= pageblock_order / 2) ||
+ start_migratetype == MIGRATE_RECLAIMABLE ||
+ page_group_by_mobility_disabled)) {
+ int pages;
pages = move_freepages_block(zone, page,
start_migratetype);
rmv_page_order(page);
/* Take ownership for orders >= pageblock_order */
- if (current_order >= pageblock_order)
+ if (current_order >= pageblock_order &&
+ !is_migrate_cma(migratetype))
change_pageblock_range(page, current_order,
start_migratetype);
- expand(zone, page, order, current_order, area, migratetype);
+ expand(zone, page, order, current_order, area,
+ is_migrate_cma(migratetype)
+ ? migratetype : start_migratetype);
trace_mm_page_alloc_extfrag(page, order, current_order,
start_migratetype, migratetype);
return page;
}
-/*
+/*
* Obtain a specified number of elements from the buddy allocator, all under
* a single hold of the lock, for efficiency. Add them to the supplied list.
* Returns the number of new pages which were placed at *list.
*/
-static int rmqueue_bulk(struct zone *zone, unsigned int order,
+static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list,
int migratetype, int cold)
{
- int i;
-
+ int mt = migratetype, i;
+
spin_lock(&zone->lock);
for (i = 0; i < count; ++i) {
struct page *page = __rmqueue(zone, order, migratetype);
list_add(&page->lru, list);
else
list_add_tail(&page->lru, list);
- set_page_private(page, migratetype);
+ if (IS_ENABLED(CONFIG_CMA)) {
+ mt = get_pageblock_migratetype(page);
+ if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE)
+ mt = migratetype;
+ }
+ set_page_private(page, mt);
list = &page->lru;
}
__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
local_irq_restore(flags);
}
+/*
+ * Free a list of 0-order pages
+ */
+void free_hot_cold_page_list(struct list_head *list, int cold)
+{
+ struct page *page, *next;
+
+ list_for_each_entry_safe(page, next, list, lru) {
+ trace_mm_page_free_batched(page, cold);
+ free_hot_cold_page(page, cold);
+ }
+}
+
/*
* split_page takes a non-compound higher-order page, and splits it into
* n (1<<order) sub-pages: page[0..n]
if (order >= pageblock_order - 1) {
struct page *endpage = page + (1 << order) - 1;
- for (; page < endpage; page += pageblock_nr_pages)
- set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+ for (; page < endpage; page += pageblock_nr_pages) {
+ int mt = get_pageblock_migratetype(page);
+ if (mt != MIGRATE_ISOLATE && !is_migrate_cma(mt))
+ set_pageblock_migratetype(page,
+ MIGRATE_MOVABLE);
+ }
}
return 1 << order;
static int __init fail_page_alloc_debugfs(void)
{
- mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+ umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
struct dentry *dir;
dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
long min = mark;
int o;
- free_pages -= (1 << order) + 1;
+ free_pages -= (1 << order) - 1;
if (alloc_flags & ALLOC_HIGH)
min -= min / 2;
if (alloc_flags & ALLOC_HARDER)
{
unsigned int filter = SHOW_MEM_FILTER_NODES;
- if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
+ if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
+ debug_guardpage_minorder() > 0)
return;
/*
static inline int
should_alloc_retry(gfp_t gfp_mask, unsigned int order,
+ unsigned long did_some_progress,
unsigned long pages_reclaimed)
{
/* Do not loop if specifically requested */
if (gfp_mask & __GFP_NORETRY)
return 0;
+ /* Always retry if specifically requested */
+ if (gfp_mask & __GFP_NOFAIL)
+ return 1;
+
+ /*
+ * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim
+ * making forward progress without invoking OOM. Suspend also disables
+ * storage devices so kswapd will not help. Bail if we are suspending.
+ */
+ if (!did_some_progress && pm_suspended_storage())
+ return 0;
+
/*
* In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
* means __GFP_NOFAIL, but that may not be true in other
if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
return 1;
- /*
- * Don't let big-order allocations loop unless the caller
- * explicitly requests that.
- */
- if (gfp_mask & __GFP_NOFAIL)
- return 1;
-
return 0;
}
}
#endif /* CONFIG_COMPACTION */
-/* The really slow allocator path where we enter direct reclaim */
-static inline struct page *
-__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist, enum zone_type high_zoneidx,
- nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
- int migratetype, unsigned long *did_some_progress)
+/* Perform direct synchronous page reclaim */
+static int
+__perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
+ nodemask_t *nodemask)
{
- struct page *page = NULL;
struct reclaim_state reclaim_state;
- bool drained = false;
+ int progress;
cond_resched();
reclaim_state.reclaimed_slab = 0;
current->reclaim_state = &reclaim_state;
- *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
+ progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
current->reclaim_state = NULL;
lockdep_clear_current_reclaim_state();
cond_resched();
+ return progress;
+}
+
+/* The really slow allocator path where we enter direct reclaim */
+static inline struct page *
+__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
+ struct zonelist *zonelist, enum zone_type high_zoneidx,
+ nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
+ int migratetype, unsigned long *did_some_progress)
+{
+ struct page *page = NULL;
+ bool drained = false;
+
+ *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
+ nodemask);
if (unlikely(!(*did_some_progress)))
return NULL;
/* Check if we should retry the allocation */
pages_reclaimed += did_some_progress;
- if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
+ if (should_alloc_retry(gfp_mask, order, did_some_progress,
+ pages_reclaimed)) {
/* Wait for some write requests to complete then retry */
wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
goto rebalance;
}
EXPORT_SYMBOL(get_zeroed_page);
-void __pagevec_free(struct pagevec *pvec)
-{
- int i = pagevec_count(pvec);
-
- while (--i >= 0) {
- trace_mm_pagevec_free(pvec->pages[i], pvec->cold);
- free_hot_cold_page(pvec->pages[i], pvec->cold);
- }
-}
-
void __free_pages(struct page *page, unsigned int order)
{
if (put_page_testzero(page)) {
* round what is now in bits to nearest long in bits, then return it in
* bytes.
*/
- static unsigned long __init usemap_size(unsigned long zonesize)
+ static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
{
unsigned long usemapsize;
+ zonesize += zone_start_pfn & (pageblock_nr_pages-1);
usemapsize = roundup(zonesize, pageblock_nr_pages);
usemapsize = usemapsize >> pageblock_order;
usemapsize *= NR_PAGEBLOCK_BITS;
}
static void __init setup_usemap(struct pglist_data *pgdat,
- struct zone *zone, unsigned long zonesize)
+ struct zone *zone,
+ unsigned long zone_start_pfn,
+ unsigned long zonesize)
{
- unsigned long usemapsize = usemap_size(zonesize);
+ unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
zone->pageblock_flags = NULL;
if (usemapsize)
zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
usemapsize);
}
#else
- static inline void setup_usemap(struct pglist_data *pgdat,
- struct zone *zone, unsigned long zonesize) {}
+ static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
+ unsigned long zone_start_pfn, unsigned long zonesize) {}
#endif /* CONFIG_SPARSEMEM */
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
init_waitqueue_head(&pgdat->kswapd_wait);
pgdat->kswapd_max_order = 0;
pgdat_page_cgroup_init(pgdat);
-
+
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
unsigned long size, realsize, memmap_pages;
continue;
set_pageblock_order();
- setup_usemap(pgdat, zone, size);
+ setup_usemap(pgdat, zone, zone_start_pfn, size);
ret = init_currently_empty_zone(zone, zone_start_pfn,
size, MEMMAP_EARLY);
BUG_ON(ret);
if (max > zone->present_pages)
max = zone->present_pages;
reserve_pages += max;
+ /*
+ * Lowmem reserves are not available to
+ * GFP_HIGHUSER page cache allocations and
+ * kswapd tries to balance zones to their high
+ * watermark. As a result, neither should be
+ * regarded as dirtyable memory, to prevent a
+ * situation where reclaim has to clean pages
+ * in order to balance the zones.
+ */
+ zone->dirty_balance_reserve = max;
}
}
+ dirty_balance_reserve = reserve_pages;
totalreserve_pages = reserve_pages;
}
calculate_totalreserve_pages();
}
-/**
- * setup_per_zone_wmarks - called when min_free_kbytes changes
- * or when memory is hot-{added|removed}
- *
- * Ensures that the watermark[min,low,high] values for each zone are set
- * correctly with respect to min_free_kbytes.
- */
-void setup_per_zone_wmarks(void)
+static void __setup_per_zone_wmarks(void)
{
unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
unsigned long lowmem_pages = 0;
zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
+
+ zone->watermark[WMARK_MIN] += cma_wmark_pages(zone);
+ zone->watermark[WMARK_LOW] += cma_wmark_pages(zone);
+ zone->watermark[WMARK_HIGH] += cma_wmark_pages(zone);
+
setup_zone_migrate_reserve(zone);
spin_unlock_irqrestore(&zone->lock, flags);
}
calculate_totalreserve_pages();
}
+/**
+ * setup_per_zone_wmarks - called when min_free_kbytes changes
+ * or when memory is hot-{added|removed}
+ *
+ * Ensures that the watermark[min,low,high] values for each zone are set
+ * correctly with respect to min_free_kbytes.
+ */
+void setup_per_zone_wmarks(void)
+{
+ mutex_lock(&zonelists_mutex);
+ __setup_per_zone_wmarks();
+ mutex_unlock(&zonelists_mutex);
+}
+
/*
* The inactive anon list should be small enough that the VM never has to
* do too much work, but large enough that each inactive page has a chance
__count_immobile_pages(struct zone *zone, struct page *page, int count)
{
unsigned long pfn, iter, found;
+ int mt;
+
/*
* For avoiding noise data, lru_add_drain_all() should be called
* If ZONE_MOVABLE, the zone never contains immobile pages
*/
if (zone_idx(zone) == ZONE_MOVABLE)
return true;
-
- if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE)
+ mt = get_pageblock_migratetype(page);
+ if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
return true;
pfn = page_to_pfn(page);
return ret;
}
-void unset_migratetype_isolate(struct page *page)
+void unset_migratetype_isolate(struct page *page, unsigned migratetype)
{
struct zone *zone;
unsigned long flags;
spin_lock_irqsave(&zone->lock, flags);
if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
goto out;
- set_pageblock_migratetype(page, MIGRATE_MOVABLE);
- move_freepages_block(zone, page, MIGRATE_MOVABLE);
+ set_pageblock_migratetype(page, migratetype);
+ move_freepages_block(zone, page, migratetype);
out:
spin_unlock_irqrestore(&zone->lock, flags);
}
+#ifdef CONFIG_CMA
+
+static unsigned long pfn_max_align_down(unsigned long pfn)
+{
+ return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
+ pageblock_nr_pages) - 1);
+}
+
+static unsigned long pfn_max_align_up(unsigned long pfn)
+{
+ return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
+ pageblock_nr_pages));
+}
+
+static struct page *
+__alloc_contig_migrate_alloc(struct page *page, unsigned long private,
+ int **resultp)
+{
+ gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
+
+ if (PageHighMem(page))
+ gfp_mask |= __GFP_HIGHMEM;
+
+ return alloc_page(gfp_mask);
+}
+
+/* [start, end) must belong to a single zone. */
+static int __alloc_contig_migrate_range(unsigned long start, unsigned long end)
+{
+ /* This function is based on compact_zone() from compaction.c. */
+
+ unsigned long pfn = start;
+ unsigned int tries = 0;
+ int ret = 0;
+
+ struct compact_control cc = {
+ .nr_migratepages = 0,
+ .order = -1,
+ .zone = page_zone(pfn_to_page(start)),
+ .sync = true,
+ };
+ INIT_LIST_HEAD(&cc.migratepages);
+
+ migrate_prep_local();
+
+ while (pfn < end || !list_empty(&cc.migratepages)) {
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ break;
+ }
+
+ if (list_empty(&cc.migratepages)) {
+ cc.nr_migratepages = 0;
+ pfn = isolate_migratepages_range(cc.zone, &cc,
+ pfn, end);
+ if (!pfn) {
+ ret = -EINTR;
+ break;
+ }
+ tries = 0;
+ } else if (++tries == 5) {
+ ret = ret < 0 ? ret : -EBUSY;
+ break;
+ }
+
+ ret = migrate_pages(&cc.migratepages,
+ __alloc_contig_migrate_alloc,
+ 0, false, MIGRATE_SYNC);
+ }
+
+ putback_lru_pages(&cc.migratepages);
+ return ret > 0 ? 0 : ret;
+}
+
+/*
+ * Update zone's cma pages counter used for watermark level calculation.
+ */
+static inline void __update_cma_watermarks(struct zone *zone, int count)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&zone->lock, flags);
+ zone->min_cma_pages += count;
+ spin_unlock_irqrestore(&zone->lock, flags);
+ setup_per_zone_wmarks();
+}
+
+/*
+ * Trigger memory pressure bump to reclaim some pages in order to be able to
+ * allocate 'count' pages in single page units. Does similar work as
+ *__alloc_pages_slowpath() function.
+ */
+static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count)
+{
+ enum zone_type high_zoneidx = gfp_zone(gfp_mask);
+ struct zonelist *zonelist = node_zonelist(0, gfp_mask);
+ int did_some_progress = 0;
+ int order = 1;
+
+ /*
+ * Increase level of watermarks to force kswapd do his job
+ * to stabilise at new watermark level.
+ */
+ __update_cma_watermarks(zone, count);
+
+ /* Obey watermarks as if the page was being allocated */
+ while (!zone_watermark_ok(zone, 0, low_wmark_pages(zone), 0, 0)) {
+ wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone));
+
+ did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
+ NULL);
+ if (!did_some_progress) {
+ /* Exhausted what can be done so it's blamo time */
+ out_of_memory(zonelist, gfp_mask, order, NULL);
+ }
+ }
+
+ /* Restore original watermark levels. */
+ __update_cma_watermarks(zone, -count);
+
+ return count;
+}
+
+/**
+ * alloc_contig_range() -- tries to allocate given range of pages
+ * @start: start PFN to allocate
+ * @end: one-past-the-last PFN to allocate
+ * @migratetype: migratetype of the underlaying pageblocks (either
+ * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
+ * in range must have the same migratetype and it must
+ * be either of the two.
+ *
+ * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
+ * aligned, however it's the caller's responsibility to guarantee that
+ * we are the only thread that changes migrate type of pageblocks the
+ * pages fall in.
+ *
+ * The PFN range must belong to a single zone.
+ *
+ * Returns zero on success or negative error code. On success all
+ * pages which PFN is in [start, end) are allocated for the caller and
+ * need to be freed with free_contig_range().
+ */
+int alloc_contig_range(unsigned long start, unsigned long end,
+ unsigned migratetype)
+{
+ struct zone *zone = page_zone(pfn_to_page(start));
+ unsigned long outer_start, outer_end;
+ int ret = 0, order;
+
+ /*
+ * What we do here is we mark all pageblocks in range as
+ * MIGRATE_ISOLATE. Because pageblock and max order pages may
+ * have different sizes, and due to the way page allocator
+ * work, we align the range to biggest of the two pages so
+ * that page allocator won't try to merge buddies from
+ * different pageblocks and change MIGRATE_ISOLATE to some
+ * other migration type.
+ *
+ * Once the pageblocks are marked as MIGRATE_ISOLATE, we
+ * migrate the pages from an unaligned range (ie. pages that
+ * we are interested in). This will put all the pages in
+ * range back to page allocator as MIGRATE_ISOLATE.
+ *
+ * When this is done, we take the pages in range from page
+ * allocator removing them from the buddy system. This way
+ * page allocator will never consider using them.
+ *
+ * This lets us mark the pageblocks back as
+ * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
+ * aligned range but not in the unaligned, original range are
+ * put back to page allocator so that buddy can use them.
+ */
+
+ ret = start_isolate_page_range(pfn_max_align_down(start),
+ pfn_max_align_up(end), migratetype);
+ if (ret)
+ goto done;
+
+ ret = __alloc_contig_migrate_range(start, end);
+ if (ret)
+ goto done;
+
+ /*
+ * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
+ * aligned blocks that are marked as MIGRATE_ISOLATE. What's
+ * more, all pages in [start, end) are free in page allocator.
+ * What we are going to do is to allocate all pages from
+ * [start, end) (that is remove them from page allocator).
+ *
+ * The only problem is that pages at the beginning and at the
+ * end of interesting range may be not aligned with pages that
+ * page allocator holds, ie. they can be part of higher order
+ * pages. Because of this, we reserve the bigger range and
+ * once this is done free the pages we are not interested in.
+ *
+ * We don't have to hold zone->lock here because the pages are
+ * isolated thus they won't get removed from buddy.
+ */
+
+ lru_add_drain_all();
+ drain_all_pages();
+
+ order = 0;
+ outer_start = start;
+ while (!PageBuddy(pfn_to_page(outer_start))) {
+ if (++order >= MAX_ORDER) {
+ ret = -EBUSY;
+ goto done;
+ }
+ outer_start &= ~0UL << order;
+ }
+
+ /* Make sure the range is really isolated. */
+ if (test_pages_isolated(outer_start, end)) {
+ pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n",
+ outer_start, end);
+ ret = -EBUSY;
+ goto done;
+ }
+
+ /*
+ * Reclaim enough pages to make sure that contiguous allocation
+ * will not starve the system.
+ */
+ __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start);
+
+ /* Grab isolated pages from freelists. */
+ outer_end = isolate_freepages_range(outer_start, end);
+ if (!outer_end) {
+ ret = -EBUSY;
+ goto done;
+ }
+
+ /* Free head and tail (if any) */
+ if (start != outer_start)
+ free_contig_range(outer_start, start - outer_start);
+ if (end != outer_end)
+ free_contig_range(end, outer_end - end);
+
+done:
+ undo_isolate_page_range(pfn_max_align_down(start),
+ pfn_max_align_up(end), migratetype);
+ return ret;
+}
+
+void free_contig_range(unsigned long pfn, unsigned nr_pages)
+{
+ for (; nr_pages--; ++pfn)
+ __free_page(pfn_to_page(pfn));
+}
+#endif
+
#ifdef CONFIG_MEMORY_HOTREMOVE
/*
* All pages in the range must be isolated before calling this.
unsigned long inodes;
int error = -EINVAL;
+ config.mpol = NULL;
if (shmem_parse_options(data, &config, true))
return error;
sbinfo->max_inodes = config.max_inodes;
sbinfo->free_inodes = config.max_inodes - inodes;
- mpol_put(sbinfo->mpol);
- sbinfo->mpol = config.mpol; /* transfers initial ref */
+ /*
+ * Preserve previous mempolicy unless mpol remount option was specified.
+ */
+ if (config.mpol) {
+ mpol_put(sbinfo->mpol);
+ sbinfo->mpol = config.mpol; /* transfers initial ref */
+ }
out:
spin_unlock(&sbinfo->stat_lock);
return error;
vma->vm_flags |= VM_CAN_NONLINEAR;
return 0;
}
+EXPORT_SYMBOL_GPL(shmem_zero_setup);
/**
* shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags.