[PATCH RESEND 1/4] cpuset: convert callback

Discussion:

[PATCH RESEND 1/4] cpuset: convert callback_mutex to a spinlock

Vladimir Davydov

2014-10-20 11:50:29 UTC

The callback_mutex is only used to synchronize reads/updates of cpusets'
flags and cpu/node masks. These operations should always proceed fast so
there's no reason why we can't use a spinlock instead of the mutex.

Converting the callback_mutex into a spinlock will let us call
cpuset_zone_allowed_softwall from atomic context. This, in turn, makes
it possible to simplify the code by merging the hardwall and asoftwall
checks into the same function, which is the business of the next patch.

Suggested-by: Zefan Li <***@huawei.com>
Signed-off-by: Vladimir Davydov <***@parallels.com>
Acked-by: Christoph Lameter <***@linux.com>
Acked-by: Zefan Li <***@huawei.com>
---
kernel/cpuset.c | 107 ++++++++++++++++++++++++++++---------------------------
1 file changed, 55 insertions(+), 52 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 1f107c74087b..f21ba868f0d1 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -248,34 +248,34 @@ static struct cpuset top_cpuset = {
if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))

/*
- * There are two global mutexes guarding cpuset structures - cpuset_mutex
- * and callback_mutex. The latter may nest inside the former. We also
- * require taking task_lock() when dereferencing a task's cpuset pointer.
- * See "The task_lock() exception", at the end of this comment.
+ * There are two global locks guarding cpuset structures - cpuset_mutex and
+ * callback_lock. We also require taking task_lock() when dereferencing a
+ * task's cpuset pointer. See "The task_lock() exception", at the end of this
+ * comment.
*
- * A task must hold both mutexes to modify cpusets. If a task holds
+ * A task must hold both locks to modify cpusets. If a task holds
* cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
- * is the only task able to also acquire callback_mutex and be able to
+ * is the only task able to also acquire callback_lock and be able to
* modify cpusets. It can perform various checks on the cpuset structure
* first, knowing nothing will change. It can also allocate memory while
* just holding cpuset_mutex. While it is performing these checks, various
- * callback routines can briefly acquire callback_mutex to query cpusets.
- * Once it is ready to make the changes, it takes callback_mutex, blocking
+ * callback routines can briefly acquire callback_lock to query cpusets.
+ * Once it is ready to make the changes, it takes callback_lock, blocking
* everyone else.
*
* Calls to the kernel memory allocator can not be made while holding
- * callback_mutex, as that would risk double tripping on callback_mutex
+ * callback_lock, as that would risk double tripping on callback_lock
* from one of the callbacks into the cpuset code from within
* __alloc_pages().
*
- * If a task is only holding callback_mutex, then it has read-only
+ * If a task is only holding callback_lock, then it has read-only
* access to cpusets.
*
* Now, the task_struct fields mems_allowed and mempolicy may be changed
* by other task, we use alloc_lock in the task_struct fields to protect
* them.
*
- * The cpuset_common_file_read() handlers only hold callback_mutex across
+ * The cpuset_common_file_read() handlers only hold callback_lock across
* small pieces of code, such as when reading out possibly multi-word
* cpumasks and nodemasks.
*
@@ -284,7 +284,7 @@ static struct cpuset top_cpuset = {
*/

static DEFINE_MUTEX(cpuset_mutex);
-static DEFINE_MUTEX(callback_mutex);
+static DEFINE_SPINLOCK(callback_lock);

/*
* CPU / memory hotplug is handled asynchronously.
@@ -329,7 +329,7 @@ static struct file_system_type cpuset_fs_type = {
* One way or another, we guarantee to return some non-empty subset
* of cpu_online_mask.
*
- * Call with callback_mutex held.
+ * Call with callback_lock or cpuset_mutex held.
*/
static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
{
@@ -347,7 +347,7 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
* One way or another, we guarantee to return some non-empty subset
* of node_states[N_MEMORY].
*
- * Call with callback_mutex held.
+ * Call with callback_lock or cpuset_mutex held.
*/
static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
@@ -359,7 +359,7 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
/*
* update task's spread flag if cpuset's page/slab spread flag is set
*
- * Called with callback_mutex/cpuset_mutex held
+ * Call with callback_lock or cpuset_mutex held.
*/
static void cpuset_update_task_spread_flag(struct cpuset *cs,
struct task_struct *tsk)
@@ -876,9 +876,9 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
continue;
rcu_read_unlock();

- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
cpumask_copy(cp->effective_cpus, new_cpus);
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);

WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
@@ -943,9 +943,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (retval < 0)
return retval;

- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);

/* use trialcs->cpus_allowed as a temp variable */
update_cpumasks_hier(cs, trialcs->cpus_allowed);
@@ -1132,9 +1132,9 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
continue;
rcu_read_unlock();

- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
cp->effective_mems = *new_mems;
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);

WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
!nodes_equal(cp->mems_allowed, cp->effective_mems));
@@ -1155,7 +1155,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
* mempolicies and if the cpuset is marked 'memory_migrate',
* migrate the tasks pages to the new memory.
*
- * Call with cpuset_mutex held. May take callback_mutex during call.
+ * Call with cpuset_mutex held. May take callback_lock during call.
* Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
* lock each such tasks mm->mmap_sem, scan its vma's and rebind
* their mempolicies to the cpusets new mems_allowed.
@@ -1202,9 +1202,9 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
if (retval < 0)
goto done;

- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
cs->mems_allowed = trialcs->mems_allowed;
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);

/* use trialcs->mems_allowed as a temp variable */
update_nodemasks_hier(cs, &cs->mems_allowed);
@@ -1295,9 +1295,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
|| (is_spread_page(cs) != is_spread_page(trialcs)));

- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
cs->flags = trialcs->flags;
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);

if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
rebuild_sched_domains_locked();
@@ -1713,7 +1713,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
count = seq_get_buf(sf, &buf);
s = buf;

- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);

switch (type) {
case FILE_CPULIST:
@@ -1740,7 +1740,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
seq_commit(sf, -1);
}
out_unlock:
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);
return ret;
}

@@ -1957,12 +1957,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)

cpuset_inc();

- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
if (cgroup_on_dfl(cs->css.cgroup)) {
cpumask_copy(cs->effective_cpus, parent->effective_cpus);
cs->effective_mems = parent->effective_mems;
}
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);

if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
goto out_unlock;
@@ -1989,10 +1989,10 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
}
rcu_read_unlock();

- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
cs->mems_allowed = parent->mems_allowed;
cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
- mutex_unlock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
out_unlock:
mutex_unlock(&cpuset_mutex);
return 0;
@@ -2031,7 +2031,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
static void cpuset_bind(struct cgroup_subsys_state *root_css)
{
mutex_lock(&cpuset_mutex);
- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);

if (cgroup_on_dfl(root_css->cgroup)) {
cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
@@ -2042,7 +2042,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
top_cpuset.mems_allowed = top_cpuset.effective_mems;
}

- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);
mutex_unlock(&cpuset_mutex);
}

@@ -2127,12 +2127,12 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
{
bool is_empty;

- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
cpumask_copy(cs->cpus_allowed, new_cpus);
cpumask_copy(cs->effective_cpus, new_cpus);
cs->mems_allowed = *new_mems;
cs->effective_mems = *new_mems;
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);

/*
* Don't call update_tasks_cpumask() if the cpuset becomes empty,
@@ -2169,10 +2169,10 @@ hotplug_update_tasks(struct cpuset *cs,
if (nodes_empty(*new_mems))
*new_mems = parent_cs(cs)->effective_mems;

- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
cpumask_copy(cs->effective_cpus, new_cpus);
cs->effective_mems = *new_mems;
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);

if (cpus_updated)
update_tasks_cpumask(cs);
@@ -2258,21 +2258,21 @@ static void cpuset_hotplug_workfn(struct work_struct *work)

/* synchronize cpus_allowed to cpu_active_mask */
if (cpus_updated) {
- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
if (!on_dfl)
cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);
/* we don't mess with cpumasks of tasks in top_cpuset */
}

/* synchronize mems_allowed to N_MEMORY */
if (mems_updated) {
- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
if (!on_dfl)
top_cpuset.mems_allowed = new_mems;
top_cpuset.effective_mems = new_mems;
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);
update_tasks_nodemask(&top_cpuset);
}

@@ -2365,11 +2365,13 @@ void __init cpuset_init_smp(void)

void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
{
- mutex_lock(&callback_mutex);
+ unsigned long flags;
+
+ spin_lock_irqsave(&callback_lock, flags);
rcu_read_lock();
guarantee_online_cpus(task_cs(tsk), pmask);
rcu_read_unlock();
- mutex_unlock(&callback_mutex);
+ spin_unlock_irqrestore(&callback_lock, flags);
}

void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
@@ -2415,12 +2417,13 @@ void cpuset_init_current_mems_allowed(void)
nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
{
nodemask_t mask;
+ unsigned long flags;

- mutex_lock(&callback_mutex);
+ spin_lock_irqsave(&callback_lock, flags);
rcu_read_lock();
guarantee_online_mems(task_cs(tsk), &mask);
rcu_read_unlock();
- mutex_unlock(&callback_mutex);
+ spin_unlock_irqrestore(&callback_lock, flags);

return mask;
}
@@ -2439,7 +2442,7 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
/*
* nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
* mem_hardwall ancestor to the specified cpuset. Call holding
- * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall
+ * callback_lock. If no ancestor is mem_exclusive or mem_hardwall
* (an unusual configuration), then returns the root cpuset.
*/
static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
@@ -2481,13 +2484,12 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
* GFP_KERNEL allocations are not so marked, so can escape to the
* nearest enclosing hardwalled ancestor cpuset.
*
- * Scanning up parent cpusets requires callback_mutex. The
+ * Scanning up parent cpusets requires callback_lock. The
* __alloc_pages() routine only calls here with __GFP_HARDWALL bit
* _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
* current tasks mems_allowed came up empty on the first pass over
* the zonelist. So only GFP_KERNEL allocations, if all nodes in the
- * cpuset are short of memory, might require taking the callback_mutex
- * mutex.
+ * cpuset are short of memory, might require taking the callback_lock.
*
* The first call here from mm/page_alloc:get_page_from_freelist()
* has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
@@ -2514,6 +2516,7 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
{
struct cpuset *cs; /* current cpuset ancestors */
int allowed; /* is allocation in zone z allowed? */
+ unsigned long flags;

if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
return 1;
@@ -2533,14 +2536,14 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
return 1;

/* Not hardwall and node outside mems_allowed: scan up cpusets */
- mutex_lock(&callback_mutex);
+ spin_lock_irqsave(&callback_lock, flags);

rcu_read_lock();
cs = nearest_hardwall_ancestor(task_cs(current));
allowed = node_isset(node, cs->mems_allowed);
rcu_read_unlock();

- mutex_unlock(&callback_mutex);
+ spin_unlock_irqrestore(&callback_lock, flags);
return allowed;
}

--
1.7.10.4

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to ***@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"***@kvack.org"> ***@kvack.org </a>

Vladimir Davydov

2014-10-20 11:50:30 UTC

Permalink

Current cpuset API for checking if a zone/node is allowed to allocate
from looks rather awkward. We have hardwall and softwall versions of
cpuset_node_allowed with the softwall version doing literally the same
as the hardwall version if __GFP_HARDWALL is passed to it in gfp flags.
If it isn't, the softwall version may check the given node against the
enclosing hardwall cpuset, which it needs to take the callback lock to
do.

Such a distinction was introduced by commit 02a0e53d8227 ("cpuset:
rework cpuset_zone_allowed api"). Before, we had the only version with
the __GFP_HARDWALL flag determining its behavior. The purpose of the
commit was to avoid sleep-in-atomic bugs when someone would mistakenly
call the function without the __GFP_HARDWALL flag for an atomic
allocation. The suffixes introduced were intended to make the callers
think before using the function.

However, since the callback lock was converted from mutex to spinlock by
the previous patch, the softwall check function cannot sleep, and these
precautions are no longer necessary.

So let's simplify the API back to the single check.

Suggested-by: David Rientjes <***@google.com>
Signed-off-by: Vladimir Davydov <***@parallels.com>
Acked-by: Christoph Lameter <***@linux.com>
Acked-by: Zefan Li <***@huawei.com>
---
include/linux/cpuset.h | 37 ++++++--------------------------
kernel/cpuset.c | 55 ++----------------------------------------------
mm/hugetlb.c | 2 +-
mm/oom_kill.c | 2 +-
mm/page_alloc.c | 6 +++---
mm/slab.c | 2 +-
mm/slub.c | 3 ++-
mm/vmscan.c | 5 +++--
8 files changed, 20 insertions(+), 92 deletions(-)

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 2f073db7392e..1b357997cac5 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -48,29 +48,16 @@ extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
void cpuset_init_current_mems_allowed(void);
int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask);

-extern int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask);
-extern int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask);
+extern int __cpuset_node_allowed(int node, gfp_t gfp_mask);

-static inline int cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
+static inline int cpuset_node_allowed(int node, gfp_t gfp_mask)
{
- return nr_cpusets() <= 1 ||
- __cpuset_node_allowed_softwall(node, gfp_mask);
+ return nr_cpusets() <= 1 || __cpuset_node_allowed(node, gfp_mask);
}

-static inline int cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
+static inline int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
{
- return nr_cpusets() <= 1 ||
- __cpuset_node_allowed_hardwall(node, gfp_mask);
-}
-
-static inline int cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
-{
- return cpuset_node_allowed_softwall(zone_to_nid(z), gfp_mask);
-}
-
-static inline int cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
-{
- return cpuset_node_allowed_hardwall(zone_to_nid(z), gfp_mask);
+ return cpuset_node_allowed(zone_to_nid(z), gfp_mask);
}

extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
@@ -179,22 +166,12 @@ static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
return 1;
}

-static inline int cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
-{
- return 1;
-}
-
-static inline int cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
-{
- return 1;
-}
-
-static inline int cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
+static inline int cpuset_node_allowed(int node, gfp_t gfp_mask)
{
return 1;
}

-static inline int cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
+static inline int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
{
return 1;
}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f21ba868f0d1..38f7433c1cd2 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2453,7 +2453,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
}

/**
- * cpuset_node_allowed_softwall - Can we allocate on a memory node?
+ * cpuset_node_allowed - Can we allocate on a memory node?
* @node: is this an allowed node?
* @gfp_mask: memory allocation flags
*
@@ -2465,13 +2465,6 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
* flag, yes.
* Otherwise, no.
*
- * If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to
- * cpuset_node_allowed_hardwall(). Otherwise, cpuset_node_allowed_softwall()
- * might sleep, and might allow a node from an enclosing cpuset.
- *
- * cpuset_node_allowed_hardwall() only handles the simpler case of hardwall
- * cpusets, and never sleeps.
- *
* The __GFP_THISNODE placement logic is really handled elsewhere,
* by forcibly using a zonelist starting at a specified node, and by
* (in get_page_from_freelist()) refusing to consider the zones for
@@ -2506,13 +2499,8 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
* TIF_MEMDIE - any node ok
* GFP_KERNEL - any node in enclosing hardwalled cpuset ok
* GFP_USER - only nodes in current tasks mems allowed ok.
- *
- * Rule:
- * Don't call cpuset_node_allowed_softwall if you can't sleep, unless you
- * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
- * the code that might scan up ancestor cpusets and sleep.
*/
-int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
+int __cpuset_node_allowed(int node, gfp_t gfp_mask)
{
struct cpuset *cs; /* current cpuset ancestors */
int allowed; /* is allocation in zone z allowed? */
@@ -2520,7 +2508,6 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)

if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
return 1;
- might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
if (node_isset(node, current->mems_allowed))
return 1;
/*
@@ -2547,44 +2534,6 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
return allowed;
}

-/*
- * cpuset_node_allowed_hardwall - Can we allocate on a memory node?
- * @node: is this an allowed node?
- * @gfp_mask: memory allocation flags
- *
- * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is
- * set, yes, we can always allocate. If node is in our task's mems_allowed,
- * yes. If the task has been OOM killed and has access to memory reserves as
- * specified by the TIF_MEMDIE flag, yes.
- * Otherwise, no.
- *
- * The __GFP_THISNODE placement logic is really handled elsewhere,
- * by forcibly using a zonelist starting at a specified node, and by
- * (in get_page_from_freelist()) refusing to consider the zones for
- * any node on the zonelist except the first. By the time any such
- * calls get to this routine, we should just shut up and say 'yes'.
- *
- * Unlike the cpuset_node_allowed_softwall() variant, above,
- * this variant requires that the node be in the current task's
- * mems_allowed or that we're in interrupt. It does not scan up the
- * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset.
- * It never sleeps.
- */
-int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
-{
- if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
- return 1;
- if (node_isset(node, current->mems_allowed))
- return 1;
- /*
- * Allow tasks that have access to memory reserves because they have
- * been OOM killed to get memory anywhere.
- */
- if (unlikely(test_thread_flag(TIF_MEMDIE)))
- return 1;
- return 0;
-}
-
/**
* cpuset_mem_spread_node() - On which node to begin search for a file page
* cpuset_slab_spread_node() - On which node to begin search for a slab page
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 9fd722769927..82da930fa3f8 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -582,7 +582,7 @@ retry_cpuset:

for_each_zone_zonelist_nodemask(zone, z, zonelist,
MAX_NR_ZONES - 1, nodemask) {
- if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask(h))) {
+ if (cpuset_zone_allowed(zone, htlb_alloc_mask(h))) {
page = dequeue_huge_page_node(h, zone_to_nid(zone));
if (page) {
if (avoid_reserve)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index bbf405a3a18f..4af376f67198 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -233,7 +233,7 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
/* Check this allocation failure is caused by cpuset's wall function */
for_each_zone_zonelist_nodemask(zone, z, zonelist,
high_zoneidx, nodemask)
- if (!cpuset_zone_allowed_softwall(zone, gfp_mask))
+ if (!cpuset_zone_allowed(zone, gfp_mask))
cpuset_limited = true;

if (cpuset_limited) {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 736d8e1b6381..76deeb10317c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1962,7 +1962,7 @@ zonelist_scan:

/*
* Scan zonelist, looking for a zone with enough free.
- * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
+ * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
*/
for_each_zone_zonelist_nodemask(zone, z, zonelist,
high_zoneidx, nodemask) {
@@ -1973,7 +1973,7 @@ zonelist_scan:
continue;
if (cpusets_enabled() &&
(alloc_flags & ALLOC_CPUSET) &&
- !cpuset_zone_allowed_softwall(zone, gfp_mask))
+ !cpuset_zone_allowed(zone, gfp_mask))
continue;
/*
* Distribute pages in proportion to the individual
@@ -2506,7 +2506,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
alloc_flags |= ALLOC_HARDER;
/*
* Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
- * comment for __cpuset_node_allowed_softwall().
+ * comment for __cpuset_node_allowed().
*/
alloc_flags &= ~ALLOC_CPUSET;
} else if (unlikely(rt_task(current)) && !in_interrupt())
diff --git a/mm/slab.c b/mm/slab.c
index eb2b2ea30130..063a91bc8826 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3012,7 +3012,7 @@ retry:
for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
nid = zone_to_nid(zone);

- if (cpuset_zone_allowed_hardwall(zone, flags) &&
+ if (cpuset_zone_allowed(zone, flags | __GFP_HARDWALL) &&
get_node(cache, nid) &&
get_node(cache, nid)->free_objects) {
obj = ____cache_alloc_node(cache,
diff --git a/mm/slub.c b/mm/slub.c
index ae7b9f1ad394..7d12f51d9bac 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1662,7 +1662,8 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,

n = get_node(s, zone_to_nid(zone));

- if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
+ if (n && cpuset_zone_allowed(zone,
+ flags | __GFP_HARDWALL) &&
n->nr_partial > s->min_partial) {
object = get_partial_node(s, n, c, flags);
if (object) {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index dcb47074ae03..38878b2ab1d0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2405,7 +2405,8 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
* to global LRU.
*/
if (global_reclaim(sc)) {
- if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+ if (!cpuset_zone_allowed(zone,
+ GFP_KERNEL | __GFP_HARDWALL))
continue;

lru_pages += zone_reclaimable_pages(zone);
@@ -3388,7 +3389,7 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
if (!populated_zone(zone))
return;

- if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+ if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL))
return;
pgdat = zone->zone_pgdat;
if (pgdat->kswapd_max_order < order) {

Vladimir Davydov

2014-10-20 11:50:31 UTC

Permalink

fallback_alloc is called on kmalloc if the preferred node doesn't have
free or partial slabs and there's no pages on the node's free list
(GFP_THISNODE allocations fail). Before invoking the reclaimer it tries
to locate a free or partial slab on other allowed nodes' lists. While
iterating over the preferred node's zonelist it skips those zones which
hardwall cpuset check returns false for. That means that for a task
bound to a specific node using cpusets fallback_alloc will always ignore
free slabs on other nodes and go directly to the reclaimer, which,
however, may allocate from other nodes if cpuset.mem_hardwall is unset
(default). As a result, we may get lists of free slabs grow without
bounds on other nodes, which is bad, because inactive slabs are only
evicted by cache_reap at a very slow rate and cannot be dropped
forcefully.

To reproduce the issue, run a process that will walk over a directory
tree with lots of files inside a cpuset bound to a node that constantly
experiences memory pressure. Look at num_slabs vs active_slabs growth as
reported by /proc/slabinfo.

To avoid this we should use softwall cpuset check in fallback_alloc.

Signed-off-by: Vladimir Davydov <***@parallels.com>
Acked-by: Zefan Li <***@huawei.com>
---
mm/slab.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/slab.c b/mm/slab.c
index 063a91bc8826..c44c17478551 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3012,7 +3012,7 @@ retry:
for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
nid = zone_to_nid(zone);

- if (cpuset_zone_allowed(zone, flags | __GFP_HARDWALL) &&
+ if (cpuset_zone_allowed(zone, flags) &&
get_node(cache, nid) &&
get_node(cache, nid)->free_objects) {
obj = ____cache_alloc_node(cache,

Vladimir Davydov

2014-10-20 11:50:32 UTC

Permalink

If we fail to allocate from the current node's stock, we look for free
objects on other nodes before calling the page allocator (see
get_any_partial). While checking other nodes we respect cpuset
constraints by calling cpuset_zone_allowed. We enforce hardwall check.
As a result, we will fallback to the page allocator even if there are
some pages cached on other nodes, but the current cpuset doesn't have
them set. However, the page allocator uses softwall check for kernel
allocations, so it may allocate from one of the other nodes in this
case.

Therefore we should use softwall cpuset check in get_any_partial to
conform with the cpuset check in the page allocator.

Signed-off-by: Vladimir Davydov <***@parallels.com>
Acked-by: Zefan Li <***@huawei.com>
---
mm/slub.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/mm/slub.c b/mm/slub.c
index 7d12f51d9bac..32ec8fd91bb3 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1662,8 +1662,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,

n = get_node(s, zone_to_nid(zone));

- if (n && cpuset_zone_allowed(zone,
- flags | __GFP_HARDWALL) &&
+ if (n && cpuset_zone_allowed(zone, flags) &&
n->nr_partial > s->min_partial) {
object = get_partial_node(s, n, c, flags);
if (object) {