policied. */
enum zone_type policy_zone = 0;
+/*
+ * run-time system-wide default policy => local allocation
+ */
struct mempolicy default_policy = {
.refcnt = ATOMIC_INIT(1), /* never free it */
- .mode = MPOL_DEFAULT,
+ .mode = MPOL_PREFERRED,
+ .flags = MPOL_F_LOCAL,
};
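With this change the system default is represented internally as
MPOL_PREFERRED with MPOL_F_LOCAL, while the query side below still reports
MPOL_DEFAULT to userspace. A minimal userspace sketch (not part of the
patch) of the contract this preserves:

#include <numaif.h>	/* get_mempolicy(), MPOL_DEFAULT; link with -lnuma */
#include <stdio.h>

int main(void)
{
	int mode;

	/* flags == 0: query the calling task's policy */
	if (get_mempolicy(&mode, NULL, 0, NULL, 0) == 0)
		printf("mode == MPOL_DEFAULT? %d\n", mode == MPOL_DEFAULT);
	return 0;
}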
static const struct mempolicy_operations {
static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
{
if (!nodes)
- pol->v.preferred_node = -1; /* local allocation */
+ pol->flags |= MPOL_F_LOCAL; /* local allocation */
else if (nodes_empty(*nodes))
return -EINVAL; /* no allowed nodes */
else
if (mode == MPOL_DEFAULT) {
if (nodes && !nodes_empty(*nodes))
return ERR_PTR(-EINVAL);
- return NULL;
+ return NULL; /* simply delete any existing policy */
}
VM_BUG_ON(!nodes);
return policy;
}
+/* Slow path of a mpol destructor. */
+void __mpol_put(struct mempolicy *p)
+{
+ if (!atomic_dec_and_test(&p->refcnt))
+ return;
+ kmem_cache_free(policy_cache, p);
+}
+
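__mpol_put() is now the slow path behind the conditional-reference helpers
used throughout this patch (mpol_cond_put(), mpol_needs_cond_ref()); the
companion mempolicy.h change, not shown in this hunk, adds inlines roughly
like:

static inline int mpol_needs_cond_ref(struct mempolicy *pol)
{
	return pol && (pol->flags & MPOL_F_SHARED);
}

static inline void mpol_cond_put(struct mempolicy *pol)
{
	if (mpol_needs_cond_ref(pol))
		__mpol_put(pol);	/* drop the lookup-time reference */
}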
static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
}
if (pol->flags & MPOL_F_STATIC_NODES) {
int node = first_node(pol->w.user_nodemask);
- if (node_isset(node, *nodes))
+ if (node_isset(node, *nodes)) {
pol->v.preferred_node = node;
- else
- pol->v.preferred_node = -1;
+ pol->flags &= ~MPOL_F_LOCAL;
+ } else
+ pol->flags |= MPOL_F_LOCAL;
} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
pol->v.preferred_node = first_node(tmp);
- } else if (pol->v.preferred_node != -1) {
+ } else if (!(pol->flags & MPOL_F_LOCAL)) {
pol->v.preferred_node = node_remap(pol->v.preferred_node,
pol->w.cpuset_mems_allowed,
*nodes);
return 0;
}
-/* Fill a zone bitmap for a policy */
-static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
+/*
+ * Return nodemask for policy for get_mempolicy() query
+ */
+static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
{
nodes_clear(*nodes);
+ if (p == &default_policy)
+ return;
+
switch (p->mode) {
- case MPOL_DEFAULT:
- break;
case MPOL_BIND:
/* Fall through */
case MPOL_INTERLEAVE:
*nodes = p->v.nodes;
break;
case MPOL_PREFERRED:
- /* or use current node instead of memory_map? */
- if (p->v.preferred_node < 0)
- *nodes = node_states[N_HIGH_MEMORY];
- else
+ if (!(p->flags & MPOL_F_LOCAL))
node_set(p->v.preferred_node, *nodes);
+ /* else return empty node mask for local allocation */
break;
default:
BUG();
}
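One consequence worth noting: a get_mempolicy() query against a
local-allocation policy now yields an empty node mask (illustration only):

/*
 * MPOL_PREFERRED, preferred_node 2  -> nodemask {2}
 * MPOL_PREFERRED | MPOL_F_LOCAL     -> nodemask {} (empty: local)
 * default_policy                    -> nodemask {} (empty)
 */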
if (flags & MPOL_F_ADDR) {
+ /*
+ * Do NOT fall back to task policy if the
+ * vma/shared policy at addr is NULL. We
+ * want to return MPOL_DEFAULT in this case.
+ */
down_read(&mm->mmap_sem);
vma = find_vma_intersection(mm, addr, addr+1);
if (!vma) {
return -EINVAL;
if (!pol)
- pol = &default_policy;
+ pol = &default_policy; /* indicates default behavior */
if (flags & MPOL_F_NODE) {
if (flags & MPOL_F_ADDR) {
err = -EINVAL;
goto out;
}
- } else
- *policy = pol->mode | pol->flags;
+ } else {
+ *policy = pol == &default_policy ? MPOL_DEFAULT :
+ pol->mode;
+ *policy |= pol->flags;
+ }
if (vma) {
up_read(&current->mm->mmap_sem);
err = 0;
if (nmask)
- get_zonemask(pol, nmask);
+ get_policy_nodemask(pol, nmask);
out:
+ mpol_cond_put(pol);
if (vma)
up_read(&current->mm->mmap_sem);
return err;
int err = 0;
nodemask_t tmp;
- down_read(&mm->mmap_sem);
+ down_read(&mm->mmap_sem);
err = migrate_vmas(mm, from_nodes, to_nodes, flags);
if (err)
*
* Returns effective policy for a VMA at specified address.
* Falls back to @task or system default policy, as necessary.
- * Returned policy has extra reference count if shared, vma,
- * or some other task's policy [show_numa_maps() can pass
- * @task != current]. It is the caller's responsibility to
- * free the reference in these cases.
+ * Current or other task's task mempolicy and non-shared vma policies
+ * are protected by the task's mmap_sem, which must be held for read by
+ * the caller.
+ * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
+ * count--added by the get_policy() vm_op, as appropriate--to protect against
+ * freeing by another task. It is the caller's responsibility to free the
+ * extra reference for shared policies.
*/
static struct mempolicy *get_vma_policy(struct task_struct *task,
struct vm_area_struct *vma, unsigned long addr)
{
struct mempolicy *pol = task->mempolicy;
- int shared_pol = 0;
if (vma) {
if (vma->vm_ops && vma->vm_ops->get_policy) {
addr);
if (vpol)
pol = vpol;
- shared_pol = 1; /* if pol non-NULL, add ref below */
- } else if (vma->vm_policy &&
- vma->vm_policy->mode != MPOL_DEFAULT)
+ } else if (vma->vm_policy)
pol = vma->vm_policy;
}
if (!pol)
pol = &default_policy;
- else if (!shared_pol && pol != current->mempolicy)
- mpol_get(pol); /* vma or other task's policy */
return pol;
}
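A typical caller under the new rules takes the policy with mmap_sem held
for read and drops only the conditional reference. A hypothetical helper
(not in the patch) illustrating the contract:

/* caller must hold current->mm->mmap_sem for read */
static int example_node_for_vma(struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);
	int nid = numa_node_id();

	if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
		nid = pol->v.preferred_node;
	mpol_cond_put(pol);	/* no-op unless pol carries MPOL_F_SHARED */
	return nid;
}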
-/* Return a nodemask representing a mempolicy */
-static nodemask_t *nodemask_policy(gfp_t gfp, struct mempolicy *policy)
+/*
+ * Return a nodemask representing a mempolicy for filtering nodes for
+ * page allocation
+ */
+static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
{
/* Lower zones don't get a nodemask applied for MPOL_BIND */
if (unlikely(policy->mode == MPOL_BIND) &&
return NULL;
}
-/* Return a zonelist representing a mempolicy */
-static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
+/* Return a zonelist indicated by gfp for node representing a mempolicy */
+static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
{
- int nd;
+ int nd = numa_node_id();
switch (policy->mode) {
case MPOL_PREFERRED:
- nd = policy->v.preferred_node;
- if (nd < 0)
- nd = numa_node_id();
+ if (!(policy->flags & MPOL_F_LOCAL))
+ nd = policy->v.preferred_node;
break;
case MPOL_BIND:
/*
- * Normally, MPOL_BIND allocations node-local are node-local
- * within the allowed nodemask. However, if __GFP_THISNODE is
- * set and the current node is part of the mask, we use the
- * the zonelist for the first node in the mask instead.
+ * Normally, MPOL_BIND allocations are node-local within the
+ * allowed nodemask. However, if __GFP_THISNODE is set and the
+ * current node is part of the mask, we use the zonelist for
+ * the first node in the mask instead.
*/
- nd = numa_node_id();
if (unlikely(gfp & __GFP_THISNODE) &&
unlikely(!node_isset(nd, policy->v.nodes)))
nd = first_node(policy->v.nodes);
break;
case MPOL_INTERLEAVE: /* should not happen */
- case MPOL_DEFAULT:
- nd = numa_node_id();
break;
default:
- nd = 0;
BUG();
}
return node_zonelist(nd, gfp);
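A worked example of the __GFP_THISNODE special case (illustration only):

/*
 * Task runs on node 0 with MPOL_BIND over nodes {1,3} and allocates
 * with __GFP_THISNODE.  numa_node_id() == 0 is not in policy->v.nodes,
 * so nd becomes first_node({1,3}) == 1 and node 1's zonelist is used;
 * without __GFP_THISNODE, node 0's zonelist (filtered by the bind
 * nodemask) would be used instead.
 */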
/*
* Depending on the memory policy provide a node from which to allocate the
* next slab entry.
+ * @policy must be protected against freeing by the caller.  If @policy is
+ * the current task's mempolicy, this protection is implicit, as only the
+ * task can change its policy.  The system default policy requires no
+ * such protection.
*/
unsigned slab_node(struct mempolicy *policy)
{
- unsigned short pol = policy ? policy->mode : MPOL_DEFAULT;
+ if (!policy || policy->flags & MPOL_F_LOCAL)
+ return numa_node_id();
+
+ switch (policy->mode) {
+ case MPOL_PREFERRED:
+ /*
+ * handled MPOL_F_LOCAL above
+ */
+ return policy->v.preferred_node;
- switch (pol) {
case MPOL_INTERLEAVE:
return interleave_nodes(policy);
return zone->node;
}
- case MPOL_PREFERRED:
- if (policy->v.preferred_node >= 0)
- return policy->v.preferred_node;
- /* Fall through */
-
default:
- return numa_node_id();
+ BUG();
}
}
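The implicit-protection case noted above is the common one: the slab
allocator passes current->mempolicy, e.g. (sketch of the mm/slab.c call
site):

	nid_alloc = slab_node(current->mempolicy);	/* no extra ref needed */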
* @mpol = pointer to mempolicy pointer for reference counted mempolicy
* @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
*
- * Returns a zonelist suitable for a huge page allocation.
- * If the effective policy is 'BIND, returns pointer to local node's zonelist,
- * and a pointer to the mempolicy's @nodemask for filtering the zonelist.
- * If it is also a policy for which get_vma_policy() returns an extra
- * reference, we must hold that reference until after the allocation.
- * In that case, return policy via @mpol so hugetlb allocation can drop
- * the reference. For non-'BIND referenced policies, we can/do drop the
- * reference here, so the caller doesn't need to know about the special case
- * for default and current task policy.
+ * Returns a zonelist suitable for a huge page allocation and a pointer
+ * to the struct mempolicy for conditional unref after allocation.
+ * If the effective policy is 'BIND, returns a pointer to the mempolicy's
+ * @nodemask for filtering the zonelist.
*/
struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
gfp_t gfp_flags, struct mempolicy **mpol,
nodemask_t **nodemask)
{
- struct mempolicy *pol = get_vma_policy(current, vma, addr);
struct zonelist *zl;
- *mpol = NULL; /* probably no unref needed */
+ *mpol = get_vma_policy(current, vma, addr);
*nodemask = NULL; /* assume !MPOL_BIND */
- if (pol->mode == MPOL_BIND) {
- *nodemask = &pol->v.nodes;
- } else if (pol->mode == MPOL_INTERLEAVE) {
- unsigned nid;
-
- nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
- if (unlikely(pol != &default_policy &&
- pol != current->mempolicy))
- __mpol_put(pol); /* finished with pol */
- return node_zonelist(nid, gfp_flags);
- }
- zl = zonelist_policy(GFP_HIGHUSER, pol);
- if (unlikely(pol != &default_policy && pol != current->mempolicy)) {
- if (pol->mode != MPOL_BIND)
- __mpol_put(pol); /* finished with pol */
- else
- *mpol = pol; /* unref needed after allocation */
+ if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
+ zl = node_zonelist(interleave_nid(*mpol, vma, addr,
+ HPAGE_SHIFT), gfp_flags);
+ } else {
+ zl = policy_zonelist(gfp_flags, *mpol);
+ if ((*mpol)->mode == MPOL_BIND)
+ *nodemask = &(*mpol)->v.nodes;
}
return zl;
}
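The hugetlb caller now unrefs unconditionally via mpol_cond_put() after
the allocation; roughly (a sketch, cf. dequeue_huge_page_vma()):

	struct mempolicy *mpol;
	nodemask_t *nodemask;
	struct zonelist *zl;

	zl = huge_zonelist(vma, addr, htlb_alloc_mask, &mpol, &nodemask);
	/* ... walk zl, applying *nodemask when non-NULL ... */
	mpol_cond_put(mpol);	/* drops the ref only for shared policies */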
unsigned nid;
nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
- if (unlikely(pol != &default_policy &&
- pol != current->mempolicy))
- __mpol_put(pol); /* finished with pol */
+ mpol_cond_put(pol);
return alloc_page_interleave(gfp, 0, nid);
}
- zl = zonelist_policy(gfp, pol);
- if (pol != &default_policy && pol != current->mempolicy) {
+ zl = policy_zonelist(gfp, pol);
+ if (unlikely(mpol_needs_cond_ref(pol))) {
/*
- * slow path: ref counted policy -- shared or vma
+ * slow path: ref counted shared policy
*/
struct page *page = __alloc_pages_nodemask(gfp, 0,
- zl, nodemask_policy(gfp, pol));
+ zl, policy_nodemask(gfp, pol));
__mpol_put(pol);
return page;
}
/*
* fast path: default or task policy
*/
- return __alloc_pages_nodemask(gfp, 0, zl, nodemask_policy(gfp, pol));
+ return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
}
/**
cpuset_update_task_memory_state();
if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
pol = &default_policy;
+
+ /*
+ * No reference counting needed for current->mempolicy
+ * nor system default_policy
+ */
if (pol->mode == MPOL_INTERLEAVE)
return alloc_page_interleave(gfp, order, interleave_nodes(pol));
return __alloc_pages_nodemask(gfp, order,
- zonelist_policy(gfp, pol), nodemask_policy(gfp, pol));
+ policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
}
EXPORT_SYMBOL(alloc_pages_current);
return new;
}
+/*
+ * If *frompol needs [has] an extra ref, copy *frompol to *tompol,
+ * eliminate the MPOL_F_* flags that require conditional ref and
+ * [NOTE!!!] drop the extra ref. Not safe to reference *frompol directly
+ * after return. Use the returned value.
+ *
+ * Allows use of a mempolicy for, e.g., multiple allocations with a single
+ * policy lookup, even if the policy needs/has extra ref on lookup.
+ * shmem_readahead needs this.
+ */
+struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
+ struct mempolicy *frompol)
+{
+ if (!mpol_needs_cond_ref(frompol))
+ return frompol;
+
+ *tompol = *frompol;
+ tompol->flags &= ~MPOL_F_SHARED; /* copy doesn't need unref */
+ __mpol_put(frompol);
+ return tompol;
+}
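The shmem readahead use the comment mentions looks roughly like this
(a sketch; names follow mm/shmem.c's swapin path):

	struct mempolicy mpol, *spol;

	/*
	 * Stabilize a possibly-shared lookup result into an on-stack
	 * copy so it can back several readahead allocations without
	 * holding the conditional reference across them.
	 */
	spol = __mpol_cond_copy(&mpol,
			mpol_shared_policy_lookup(&info->policy, idx));
	/* ... multiple page allocations using spol ... */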
+
static int mpol_match_intent(const struct mempolicy *a,
const struct mempolicy *b)
{
if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b))
return 0;
switch (a->mode) {
- case MPOL_DEFAULT:
- return 1;
case MPOL_BIND:
/* Fall through */
case MPOL_INTERLEAVE:
return nodes_equal(a->v.nodes, b->v.nodes);
case MPOL_PREFERRED:
- return a->v.preferred_node == b->v.preferred_node;
+ return a->v.preferred_node == b->v.preferred_node &&
+ a->flags == b->flags;
default:
BUG();
return 0;
}
}
-/* Slow path of a mpol destructor. */
-void __mpol_put(struct mempolicy *p)
-{
- if (!atomic_dec_and_test(&p->refcnt))
- return;
- p->mode = MPOL_DEFAULT;
- kmem_cache_free(policy_cache, p);
-}
-
/*
* Shared memory backing store policy support.
*
n->start = start;
n->end = end;
mpol_get(pol);
+ pol->flags |= MPOL_F_SHARED; /* for unref */
n->policy = pol;
return n;
}
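The conditional reference that MPOL_F_SHARED signals is taken at lookup
time, via the get_policy() vm_op path; for reference, the existing shared
lookup does roughly:

struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
{
	struct mempolicy *pol = NULL;
	struct sp_node *sn;

	if (!sp->root.rb_node)
		return NULL;
	spin_lock(&sp->lock);
	sn = sp_lookup(sp, idx, idx + 1);
	if (sn) {
		mpol_get(sn->policy);	/* the ref mpol_cond_put() drops */
		pol = sn->policy;
	}
	spin_unlock(&sp->lock);
	return pol;
}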
if (policy != MPOL_DEFAULT) {
struct mempolicy *newpol;
- /* Falls back to MPOL_DEFAULT on any error */
+ /* Falls back to NULL policy [MPOL_DEFAULT] on any error */
newpol = mpol_new(policy, flags, policy_nodes);
if (!IS_ERR(newpol)) {
/* Create pseudo-vma that contains just the policy */
}
/*
- * Display pages allocated per node and memory policy via /proc.
+ * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag
+ * Used only for mpol_to_str()
*/
+#define MPOL_LOCAL (MPOL_INTERLEAVE + 1)
static const char * const policy_types[] =
- { "default", "prefer", "bind", "interleave" };
+ { "default", "prefer", "bind", "interleave", "local" };
/*
* Convert a mempolicy into a string.
char *p = buffer;
int l;
nodemask_t nodes;
- unsigned short mode = pol ? pol->mode : MPOL_DEFAULT;
+ unsigned short mode;
unsigned short flags = pol ? pol->flags : 0;
+ if (!pol || pol == &default_policy)
+ mode = MPOL_DEFAULT;
+ else
+ mode = pol->mode;
+
switch (mode) {
case MPOL_DEFAULT:
nodes_clear(nodes);
case MPOL_PREFERRED:
nodes_clear(nodes);
- node_set(pol->v.preferred_node, nodes);
+ if (flags & MPOL_F_LOCAL)
+ mode = MPOL_LOCAL; /* pseudo-policy */
+ else
+ node_set(pol->v.preferred_node, nodes);
break;
case MPOL_BIND:
}
l = strlen(policy_types[mode]);
- if (buffer + maxlen < p + l + 1)
- return -ENOSPC;
+ if (buffer + maxlen < p + l + 1)
+ return -ENOSPC;
strcpy(p, policy_types[mode]);
p += l;
- if (flags) {
+ if (flags & MPOL_MODE_FLAGS) {
int need_bar = 0;
if (buffer + maxlen < p + 2)
}
#endif
+/*
+ * Display pages allocated per node and memory policy via /proc.
+ */
int show_numa_map(struct seq_file *m, void *v)
{
struct proc_maps_private *priv = m->private;
pol = get_vma_policy(priv->task, vma, vma->vm_start);
mpol_to_str(buffer, sizeof(buffer), pol);
- /*
- * unref shared or other task's mempolicy
- */
- if (pol != &default_policy && pol != current->mempolicy)
- __mpol_put(pol);
+ mpol_cond_put(pol);
seq_printf(m, "%08lx %s", vma->vm_start, buffer);