Discussion:
[PATCH] meminfo: show /proc/meminfo base on container's memcg
Gao feng
2012-05-29 02:56:54 UTC
Permalink
cgroup and namespaces are used for creating containers but some of
information is not isolated/virtualized. This patch is for isolating /proc/meminfo
information per container, which uses memory cgroup. By this, top,free
and other tools under container can work as expected(show container's
usage) without changes.

This patch is a trial to show memcg's info in /proc/meminfo if 'current'
is under a memcg other than root.

we show /proc/meminfo base on container's memory cgroup.
because there are lots of info can't be provide by memcg, and
the cmds such as top, free just use some entries of /proc/meminfo,
we replace those entries by memory cgroup.

if container has no memcg, we will show host's /proc/meminfo
as before.

there is no idea how to deal with Buffers,I just set it zero,
It's strange if Buffers bigger than MemTotal.

Signed-off-by: Gao feng <***@cn.fujitsu.com>
---
fs/proc/meminfo.c | 11 +++++---
include/linux/memcontrol.h | 15 +++++++++++
mm/memcontrol.c | 56 ++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 78 insertions(+), 4 deletions(-)

diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 80e4645..29a1fcd 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -13,6 +13,7 @@
#include <linux/atomic.h>
#include <asm/page.h>
#include <asm/pgtable.h>
+#include <linux/memcontrol.h>
#include "internal.h"

void __attribute__((weak)) arch_report_meminfo(struct seq_file *m)
@@ -27,7 +28,6 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
struct vmalloc_info vmi;
long cached;
unsigned long pages[NR_LRU_LISTS];
- int lru;

/*
* display in kilobytes.
@@ -39,16 +39,19 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
allowed = ((totalram_pages - hugetlb_total_pages())
* sysctl_overcommit_ratio / 100) + total_swap_pages;

+ memcg_meminfo(&i);
cached = global_page_state(NR_FILE_PAGES) -
total_swapcache_pages - i.bufferram;
+ /*
+ * If 'current' is in root memory cgroup, returns global status.
+ * If not, returns the status of memcg under which current runs.
+ */
+ sys_page_state(pages, &cached);
if (cached < 0)
cached = 0;

get_vmalloc_info(&vmi);

- for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
- pages[lru] = global_page_state(NR_LRU_BASE + lru);
-
/*
* Tagged format, for easy grepping and expansion.
*/
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 0316197..6220764 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -21,6 +21,7 @@
#define _LINUX_MEMCONTROL_H
#include <linux/cgroup.h>
#include <linux/vm_event_item.h>
+#include <linux/mm.h>

struct mem_cgroup;
struct page_cgroup;
@@ -116,6 +117,9 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
struct mem_cgroup_reclaim_cookie *);
void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);

+extern void memcg_meminfo(struct sysinfo *si);
+extern void sys_page_state(unsigned long *page, long *cached);
+
/*
* For memory reclaim.
*/
@@ -323,6 +327,17 @@ static inline void mem_cgroup_iter_break(struct mem_cgroup *root,
{
}

+static inline void memcg_meminfo(struct sysinfo *si)
+{
+}
+
+static inline void sys_page_state(unsigned long *pages, long *cached)
+{
+ int lru;
+ for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
+ pages[lru] = global_page_state(NR_LRU_BASE + lru);
+}
+
static inline bool mem_cgroup_disabled(void)
{
return true;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f142ea9..c25e160 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -52,6 +52,7 @@
#include "internal.h"
#include <net/sock.h>
#include <net/tcp_memcontrol.h>
+#include <linux/pid_namespace.h>

#include <asm/uaccess.h>

@@ -4345,6 +4346,61 @@ mem_cgroup_get_total_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s)
mem_cgroup_get_local_stat(iter, s);
}

+void memcg_meminfo(struct sysinfo *val)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_task(current);
+ __kernel_ulong_t totalram, totalswap;
+ if (current->nsproxy->pid_ns == &init_pid_ns ||
+ memcg == NULL || mem_cgroup_is_root(memcg))
+ return;
+
+ totalram = res_counter_read_u64(&memcg->res,
+ RES_LIMIT) >> PAGE_SHIFT;
+ if (totalram < val->totalram) {
+ __kernel_ulong_t usageram;
+ usageram = res_counter_read_u64(&memcg->res,
+ RES_USAGE) >> PAGE_SHIFT;
+ val->totalram = totalram;
+ val->freeram = totalram - usageram;
+ val->bufferram = 0;
+ val->totalhigh = 0;
+ val->freehigh = 0;
+ } else
+ return;
+
+ totalswap = res_counter_read_u64(&memcg->memsw,
+ RES_LIMIT) >> PAGE_SHIFT;
+ if (totalswap < val->totalswap) {
+ __kernel_ulong_t usageswap;
+ usageswap = res_counter_read_u64(&memcg->memsw,
+ RES_USAGE) >> PAGE_SHIFT;
+ val->totalswap = totalswap - val->totalram;
+ val->freeswap = totalswap - usageswap - val->freeram;
+ }
+}
+
+void sys_page_state(unsigned long *pages, long *cached)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_task(current);
+
+ if (current->nsproxy->pid_ns == &init_pid_ns ||
+ memcg == NULL || mem_cgroup_is_root(memcg)) {
+ int lru;
+ for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
+ pages[lru] = global_page_state(NR_LRU_BASE + lru);
+ } else {
+ struct mcs_total_stat s;
+ memset(&s, 0, sizeof(s));
+ mem_cgroup_get_total_stat(memcg, &s);
+ *cached = s.stat[MCS_CACHE] >> PAGE_SHIFT;
+ pages[LRU_ACTIVE_ANON] = s.stat[MCS_ACTIVE_ANON] >> PAGE_SHIFT;
+ pages[LRU_ACTIVE_FILE] = s.stat[MCS_ACTIVE_FILE] >> PAGE_SHIFT;
+ pages[LRU_INACTIVE_ANON] = s.stat[MCS_INACTIVE_ANON] >> PAGE_SHIFT;
+ pages[LRU_INACTIVE_FILE] = s.stat[MCS_INACTIVE_FILE] >> PAGE_SHIFT;
+ pages[LRU_UNEVICTABLE] = s.stat[MCS_UNEVICTABLE] >> PAGE_SHIFT;
+ }
+}
+
#ifdef CONFIG_NUMA
static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
{
--
1.7.7.6

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to ***@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"***@kvack.org"> ***@kvack.org </a>
Glauber Costa
2012-05-29 08:24:23 UTC
Permalink
On 05/29/2012 06:56 AM, Gao feng wrote:
> cgroup and namespaces are used for creating containers but some of
> information is not isolated/virtualized. This patch is for isolating /proc/meminfo
> information per container, which uses memory cgroup. By this, top,free
> and other tools under container can work as expected(show container's
> usage) without changes.
>
> This patch is a trial to show memcg's info in /proc/meminfo if 'current'
> is under a memcg other than root.
>
> we show /proc/meminfo base on container's memory cgroup.
> because there are lots of info can't be provide by memcg, and
> the cmds such as top, free just use some entries of /proc/meminfo,
> we replace those entries by memory cgroup.
>
> if container has no memcg, we will show host's /proc/meminfo
> as before.
>
> there is no idea how to deal with Buffers,I just set it zero,
> It's strange if Buffers bigger than MemTotal.
>
> Signed-off-by: Gao feng<gaofeng-***@public.gmane.org>

This is the very same problem that exists with CPU cgroup.
So I'll tell you what kind of resistance we faced, and why I sort of
agree with it.

In summary, there is no guarantee that current wants to see this. This
is true for containers environments, but doing this unconditionally can
break applications out there.

With cpu is easier to demonstrate, because you would still see all other
tasks in the system (no pid namespaces used), but the tick figures won't
match. But not only memory falls prey to the same issue,
but we really need a common solution to that.

A flag is too ugly, or mount options are too ugly, and when parenting is
in place, hard to get right.

So I've seen a lot of people advocating we should just use a userspace
filesystem that would bind mount that ontop of normal proc.

For instance: bind mount your special meminfo into /proc/meminfo inside
a container. Reads of the later would redirect to the former, that would
then assemble the proper results from the cgroup filesystem, and display it.

I do believe this is a more neutral way to go, and we have all the
tools. It also does not risk breaking anything, since only people that
want it would use it.
David Rientjes
2012-05-30 21:38:25 UTC
Permalink
On Tue, 29 May 2012, Gao feng wrote:

> cgroup and namespaces are used for creating containers but some of
> information is not isolated/virtualized. This patch is for isolating /proc/meminfo
> information per container, which uses memory cgroup. By this, top,free
> and other tools under container can work as expected(show container's
> usage) without changes.
>
> This patch is a trial to show memcg's info in /proc/meminfo if 'current'
> is under a memcg other than root.
>
> we show /proc/meminfo base on container's memory cgroup.
> because there are lots of info can't be provide by memcg, and
> the cmds such as top, free just use some entries of /proc/meminfo,
> we replace those entries by memory cgroup.
>
> if container has no memcg, we will show host's /proc/meminfo
> as before.
>
> there is no idea how to deal with Buffers,I just set it zero,
> It's strange if Buffers bigger than MemTotal.
>
> Signed-off-by: Gao feng <gaofeng-***@public.gmane.org>

Nack, this type of thing was initially tried with cpusets when a thread
was bound to a subset of nodes, i.e. only show the total amount of memory
spanned by those nodes.

For your particular interest, this information is already available
elsewhere: memory.limit_in_bytes and memory.usage_in_bytes and that should
be the interface where this is attained via /proc/cgroups.

Why? Because the information exported by /proc/meminfo is considered by
applications to be static whereas the limit of a memcg may change without
any knowledge of the application. Applications which need to know the
amount of memory they are constrained to are assuming that there are no
other consumers of memory on the system and thus they should be written to
understand memcg limits just like they should understand cpusets (through
either /proc/cgroups or /proc/cpuset).
Kirill A. Shutemov
2012-05-30 23:20:05 UTC
Permalink
On Wed, May 30, 2012 at 02:38:25PM -0700, David Rientjes wrote:
> Why? Because the information exported by /proc/meminfo is considered by
> applications to be static whereas the limit of a memcg may change without
> any knowledge of the application.

Memory hotplug does the same, right?

--
Kirill A. Shutemov

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to ***@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"***@kvack.org"> ***@kvack.org </a>
David Rientjes
2012-05-31 00:35:38 UTC
Permalink
On Thu, 31 May 2012, Kirill A. Shutemov wrote:

> > Why? Because the information exported by /proc/meminfo is considered by
> > applications to be static whereas the limit of a memcg may change without
> > any knowledge of the application.
>
> Memory hotplug does the same, right?
>

Memory hotplug is a seperate topic, it changes the amount of physical
memory that is available to the kernel, not any limitation of memory
available to a set of tasks. For memory hot-add, this does not
automatically increase the memory.limit_in_bytes of any non-root memcg,
the memory usage is still constrained as it was before the hotplug event.
Thus, applications would want to depend on memory.{limit,usage}_in_bytes
specifically to determine the amount of available memory even with
CONFIG_MEMORY_HOTPLUG.

Also, under certain cirucmstances such as when a thread is oom killed, it
may allocate memory in excess of its memcg limitation and this wouldn't be
visible as available with this patch via /proc/meminfo. Cpusets allows
softwall allocations even when a thread is simply exiting on all nodes
(and for GFP_ATOMIC allocations) and this also wouldn't be visible in
/proc/meminfo.
Kamezawa Hiroyuki
2012-05-31 00:08:44 UTC
Permalink
(2012/05/31 6:38), David Rientjes wrote:
> On Tue, 29 May 2012, Gao feng wrote:
>
>> cgroup and namespaces are used for creating containers but some of
>> information is not isolated/virtualized. This patch is for isolating /proc/meminfo
>> information per container, which uses memory cgroup. By this, top,free
>> and other tools under container can work as expected(show container's
>> usage) without changes.
>>
>> This patch is a trial to show memcg's info in /proc/meminfo if 'current'
>> is under a memcg other than root.
>>
>> we show /proc/meminfo base on container's memory cgroup.
>> because there are lots of info can't be provide by memcg, and
>> the cmds such as top, free just use some entries of /proc/meminfo,
>> we replace those entries by memory cgroup.
>>
>> if container has no memcg, we will show host's /proc/meminfo
>> as before.
>>
>> there is no idea how to deal with Buffers,I just set it zero,
>> It's strange if Buffers bigger than MemTotal.
>>
>> Signed-off-by: Gao feng<gaofeng-***@public.gmane.org>
>
> Nack, this type of thing was initially tried with cpusets when a thread
> was bound to a subset of nodes, i.e. only show the total amount of memory
> spanned by those nodes.
>

Hmm. How about having memory.meminfo under memory cgroup directory and
use it with bind mount ? (container tools will be able to help it.)
Then, container applications(top,free,etc..) can read the values they wants.
If admins don't want it, they'll not use bind mount.

Thanks,
-Kame
KOSAKI Motohiro
2012-05-31 00:22:16 UTC
Permalink
On Wed, May 30, 2012 at 8:08 PM, Kamezawa Hiroyuki
<kamezawa.hiroyu-+***@public.gmane.org> wrote:
> (2012/05/31 6:38), David Rientjes wrote:
>>
>> On Tue, 29 May 2012, Gao feng wrote:
>>
>>> cgroup and namespaces are used for creating containers but some of
>>> information is not isolated/virtualized. This patch is for isolating
>>> /proc/meminfo
>>> information per container, which uses memory cgroup. By this, top,free
>>> and other tools under container can work as expected(show container's
>>> usage) without changes.
>>>
>>> This patch is a trial to show memcg's info in /proc/meminfo if 'current'
>>> is under a memcg other than root.
>>>
>>> we show /proc/meminfo base on container's memory cgroup.
>>> because there are lots of info can't be provide by memcg, and
>>> the cmds such as top, free just use some entries of /proc/meminfo,
>>> we replace those entries by memory cgroup.
>>>
>>> if container has no memcg, we will show host's /proc/meminfo
>>> as before.
>>>
>>> there is no idea how to deal with Buffers,I just set it zero,
>>> It's strange if Buffers bigger than MemTotal.
>>>
>>> Signed-off-by: Gao feng<gaofeng-***@public.gmane.org>
>>
>>
>> Nack, this type of thing was initially tried with cpusets when a thread
>> was bound to a subset of nodes, i.e. only show the total amount of memory
>> spanned by those nodes.
>>
>
> Hmm. How about having memory.meminfo under memory cgroup directory and
> use it with bind mount ? (container tools will be able to help it.)
> Then, container applications(top,free,etc..) can read the values they wants.
> If admins don't want it, they'll not use bind mount.

+1. 50% users need namespace separation and others don't. We need a
selectability.
Kamezawa Hiroyuki
2012-05-31 00:33:02 UTC
Permalink
(2012/05/31 9:22), KOSAKI Motohiro wrote:
> On Wed, May 30, 2012 at 8:08 PM, Kamezawa Hiroyuki
> <kamezawa.hiroyu-+***@public.gmane.org> wrote:
>> (2012/05/31 6:38), David Rientjes wrote:
>>>
>>> On Tue, 29 May 2012, Gao feng wrote:
>>>
>>>> cgroup and namespaces are used for creating containers but some of
>>>> information is not isolated/virtualized. This patch is for isolating
>>>> /proc/meminfo
>>>> information per container, which uses memory cgroup. By this, top,free
>>>> and other tools under container can work as expected(show container's
>>>> usage) without changes.
>>>>
>>>> This patch is a trial to show memcg's info in /proc/meminfo if 'current'
>>>> is under a memcg other than root.
>>>>
>>>> we show /proc/meminfo base on container's memory cgroup.
>>>> because there are lots of info can't be provide by memcg, and
>>>> the cmds such as top, free just use some entries of /proc/meminfo,
>>>> we replace those entries by memory cgroup.
>>>>
>>>> if container has no memcg, we will show host's /proc/meminfo
>>>> as before.
>>>>
>>>> there is no idea how to deal with Buffers,I just set it zero,
>>>> It's strange if Buffers bigger than MemTotal.
>>>>
>>>> Signed-off-by: Gao feng<gaofeng-***@public.gmane.org>
>>>
>>>
>>> Nack, this type of thing was initially tried with cpusets when a thread
>>> was bound to a subset of nodes, i.e. only show the total amount of memory
>>> spanned by those nodes.
>>>
>>
>> Hmm. How about having memory.meminfo under memory cgroup directory and
>> use it with bind mount ? (container tools will be able to help it.)
>> Then, container applications(top,free,etc..) can read the values they wants.
>> If admins don't want it, they'll not use bind mount.
>
> +1. 50% users need namespace separation and others don't. We need a
> selectability.

My test with sysfs node's meminfo seems to work...

[***@rx100-1 qqm]# mount --bind /sys/devices/system/node/node0/meminfo /proc/meminfo
[***@rx100-1 qqm]# cat /proc/meminfo

Node 0 MemTotal: 8379636 kB
Node 0 MemFree: 4050224 kB
Node 0 MemUsed: 4329412 kB
Node 0 Active: 3010876 kB
Node 0 Inactive: 507480 kB
Node 0 Active(anon): 2671920 kB
Node 0 Inactive(anon): 111596 kB
Node 0 Active(file): 338956 kB
Node 0 Inactive(file): 395884 kB
Node 0 Unevictable: 48316 kB
Node 0 Mlocked: 11524 kB
Node 0 Dirty: 8 kB
Node 0 Writeback: 0 kB
Node 0 FilePages: 744908 kB
Node 0 Mapped: 20604 kB
Node 0 AnonPages: 1344940 kB
Node 0 Shmem: 1448 kB
Node 0 KernelStack: 3528 kB
Node 0 PageTables: 53840 kB
Node 0 NFS_Unstable: 0 kB
Node 0 Bounce: 0 kB
Node 0 WritebackTmp: 0 kB
Node 0 Slab: 184404 kB
Node 0 SReclaimable: 131060 kB
Node 0 SUnreclaim: 53344 kB
Node 0 HugePages_Total: 0
Node 0 HugePages_Free: 0
Node 0 HugePages_Surp: 0

Thanks,
-Kame
David Rientjes
2012-05-31 00:44:43 UTC
Permalink
On Thu, 31 May 2012, Kamezawa Hiroyuki wrote:

> My test with sysfs node's meminfo seems to work...
>
> [***@rx100-1 qqm]# mount --bind /sys/devices/system/node/node0/meminfo
> /proc/meminfo
> [***@rx100-1 qqm]# cat /proc/meminfo
>
> Node 0 MemTotal: 8379636 kB

This doesn't seem like a good idea unless the application supports the
"Node 0" prefix in /proc/meminfo.

If any application really cares about the amount of memory available to
it, it should be taught to be memcg aware. Then do something like

cat $(grep memory /proc/mounts | cut -d " " -f 2)/$(grep memory /proc/self/cgroup | cut -d : -f 3)/memory.limit_in_bytes

but since that value can change all the time then it doesn't seem helpful
unless we have a userspace notifier.
Kamezawa Hiroyuki
2012-05-31 00:53:37 UTC
Permalink
(2012/05/31 9:44), David Rientjes wrote:
> On Thu, 31 May 2012, Kamezawa Hiroyuki wrote:
>
>> My test with sysfs node's meminfo seems to work...
>>
>> [***@rx100-1 qqm]# mount --bind /sys/devices/system/node/node0/meminfo
>> /proc/meminfo
>> [***@rx100-1 qqm]# cat /proc/meminfo
>>
>> Node 0 MemTotal: 8379636 kB
>
> This doesn't seem like a good idea unless the application supports the
> "Node 0" prefix in /proc/meminfo.
>
Of course, /cgroup/memory/..../memory.meminfo , a new file, will use the same
format of /proc/meminfo. Above is just an example of bind-mount.

> If any application really cares about the amount of memory available to
> it, it should be taught to be memcg aware. Then do something like
>

Considering the container as a kind of virtualization, Some kind of
transparent way is required.

Thanks,
-Kame


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to ***@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"***@kvack.org"> ***@kvack.org </a>
David Rientjes
2012-05-31 01:31:41 UTC
Permalink
On Thu, 31 May 2012, Kamezawa Hiroyuki wrote:

> > > My test with sysfs node's meminfo seems to work...
> > >
> > > [***@rx100-1 qqm]# mount --bind /sys/devices/system/node/node0/meminfo
> > > /proc/meminfo
> > > [***@rx100-1 qqm]# cat /proc/meminfo
> > >
> > > Node 0 MemTotal: 8379636 kB
> >
> > This doesn't seem like a good idea unless the application supports the
> > "Node 0" prefix in /proc/meminfo.
> >
> Of course, /cgroup/memory/..../memory.meminfo , a new file, will use the same
> format of /proc/meminfo. Above is just an example of bind-mount.
>

It's not just a memcg issue, it would also be a cpusets issue.
Kamezawa Hiroyuki
2012-05-31 02:33:37 UTC
Permalink
(2012/05/31 10:31), David Rientjes wrote:
> On Thu, 31 May 2012, Kamezawa Hiroyuki wrote:
>
>>>> My test with sysfs node's meminfo seems to work...
>>>>
>>>> [***@rx100-1 qqm]# mount --bind /sys/devices/system/node/node0/meminfo
>>>> /proc/meminfo
>>>> [***@rx100-1 qqm]# cat /proc/meminfo
>>>>
>>>> Node 0 MemTotal: 8379636 kB
>>>
>>> This doesn't seem like a good idea unless the application supports the
>>> "Node 0" prefix in /proc/meminfo.
>>>
>> Of course, /cgroup/memory/..../memory.meminfo , a new file, will use the same
>> format of /proc/meminfo. Above is just an example of bind-mount.
>>
>
> It's not just a memcg issue, it would also be a cpusets issue.

I think you can add cpuset.meminfo.

Thanks,
-Kame
David Rientjes
2012-05-31 05:02:23 UTC
Permalink
On Thu, 31 May 2012, Kamezawa Hiroyuki wrote:

> > It's not just a memcg issue, it would also be a cpusets issue.
>
> I think you can add cpuset.meminfo.
>

It's simple to find the same information by reading the per-node meminfo
files in sysfs for each of the allowed cpuset mems. This is why this
approach has been nacked in the past, specifically by Paul Jackson when he
implemented cpusets.

The bottomline is that /proc/meminfo is one of many global resource state
interfaces and doesn't imply that every thread has access to the full
resources. It never has. It's very simple for another thread to consume
a large amount of memory as soon as your read() of /proc/meminfo completes
and then that information is completely bogus. We also don't want to
virtualize every single global resource state interface, it would be never
ending.

Applications that administer memory cgroups or cpusets can get this
information very easily, each application within those memory cgroups or
cpusets does not need it and should not rely on it: it provides no
guarantee about future usage nor notifies the application when the amount
of free memory changes.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to ***@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"***@kvack.org"> ***@kvack.org </a>
Kamezawa Hiroyuki
2012-05-31 05:36:21 UTC
Permalink
(2012/05/31 14:02), David Rientjes wrote:
> On Thu, 31 May 2012, Kamezawa Hiroyuki wrote:
>
>>> It's not just a memcg issue, it would also be a cpusets issue.
>>
>> I think you can add cpuset.meminfo.
>>
>
> It's simple to find the same information by reading the per-node meminfo
> files in sysfs for each of the allowed cpuset mems. This is why this
> approach has been nacked in the past, specifically by Paul Jackson when he
> implemented cpusets.
>

I don't think there was a discussion of LXC in that era.

> The bottomline is that /proc/meminfo is one of many global resource state
> interfaces and doesn't imply that every thread has access to the full
> resources. It never has. It's very simple for another thread to consume
> a large amount of memory as soon as your read() of /proc/meminfo completes
> and then that information is completely bogus.

Why you need to discuss this here ? We know all information are snapshot.

> We also don't want to
> virtualize every single global resource state interface, it would be never
> ending.
>
Just doing one by one. It will end.

> Applications that administer memory cgroups or cpusets can get this
> information very easily, each application within those memory cgroups or
> cpusets does not need it and should not rely on it: it provides no
> guarantee about future usage nor notifies the application when the amount
> of free memory changes.

If so, the admin should have know-how to get the information from the inside
of the container. If container is well-isolated, he'll need some
trick to get its own cgroup information from the inside of containers.

Hmm....maybe need to mount cgroup in the container (again) and get an access to cgroup
hierarchy and find the cgroup it belongs to......if it's allowed. I don't want to allow
it and disable it with capability or some other check. Another idea is to exchange
information by some network connection with daemon in root cgroup, like qemu-ga.
And free, top, ....misc applications should support it. It doesn't seem easy.

It may be better to think of supporting yet another FUSE procfs, which will work
with libvirt in userland if having it in the kernel is complicated.

Thanks,
-Kame

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to ***@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"***@kvack.org"> ***@kvack.org </a>
David Rientjes
2012-05-31 06:17:51 UTC
Permalink
On Thu, 31 May 2012, Kamezawa Hiroyuki wrote:

> > The bottomline is that /proc/meminfo is one of many global resource state
> > interfaces and doesn't imply that every thread has access to the full
> > resources. It never has. It's very simple for another thread to consume
> > a large amount of memory as soon as your read() of /proc/meminfo completes
> > and then that information is completely bogus.
>
> Why you need to discuss this here ? We know all information are snapshot.
>

MemTotal is usually assumed to be static from /proc/meminfo and could now
change radically without notification to the application.

> Hmm....maybe need to mount cgroup in the container (again) and get an access
> to cgroup
> hierarchy and find the cgroup it belongs to......if it's allowed.

An application should always know the cgroup that its attached to and be
able to read its state using the command that I gave earlier.
KOSAKI Motohiro
2012-05-31 06:23:26 UTC
Permalink
(5/31/12 2:17 AM), David Rientjes wrote:
> On Thu, 31 May 2012, Kamezawa Hiroyuki wrote:
>
>>> The bottomline is that /proc/meminfo is one of many global resource state
>>> interfaces and doesn't imply that every thread has access to the full
>>> resources. It never has. It's very simple for another thread to consume
>>> a large amount of memory as soon as your read() of /proc/meminfo completes
>>> and then that information is completely bogus.
>>
>> Why you need to discuss this here ? We know all information are snapshot.
>>
>
> MemTotal is usually assumed to be static from /proc/meminfo and could now
> change radically without notification to the application.
>
>> Hmm....maybe need to mount cgroup in the container (again) and get an access
>> to cgroup
>> hierarchy and find the cgroup it belongs to......if it's allowed.
>
> An application should always know the cgroup that its attached to and be
> able to read its state using the command that I gave earlier.

No. you don't need why userland folks want namespaces. Even though you don't
need namespaces. It doesn't good reason to refuse another use case.
David Rientjes
2012-05-31 06:28:54 UTC
Permalink
On Thu, 31 May 2012, KOSAKI Motohiro wrote:

> > An application should always know the cgroup that its attached to and be
> > able to read its state using the command that I gave earlier.
>
> No. you don't need why userland folks want namespaces. Even though you don't
> need namespaces. It doesn't good reason to refuse another use case.
>

This is tangent to the discussion, we need to revisit why an application
other than a daemon managing a set of memcgs would ever need to know the
information in /proc/meminfo. No use-case was ever presented in the
changelog and its not clear how this is at all relevant. So before
changing the kernel, please describe how this actually matters in a real-
world scenario.
KOSAKI Motohiro
2012-05-31 06:37:25 UTC
Permalink
(5/31/12 2:28 AM), David Rientjes wrote:
> On Thu, 31 May 2012, KOSAKI Motohiro wrote:
>
>>> An application should always know the cgroup that its attached to and be
>>> able to read its state using the command that I gave earlier.
>>
>> No. you don't need why userland folks want namespaces. Even though you don't
>> need namespaces. It doesn't good reason to refuse another use case.
>>
>
> This is tangent to the discussion, we need to revisit why an application
> other than a daemon managing a set of memcgs would ever need to know the
> information in /proc/meminfo. No use-case was ever presented in the
> changelog and its not clear how this is at all relevant. So before
> changing the kernel, please describe how this actually matters in a real-
> world scenario.

Huh? Don't you know a meanings of a namespace ISOLATION? isolation mean,
isolated container shouldn't be able to access global information. If you
want to lean container/namespace concept, tasting openvz or solaris container
is a good start.

But anyway, I dislike current implementaion. So, I NAK this patch too.
David Rientjes
2012-05-31 06:56:13 UTC
Permalink
On Thu, 31 May 2012, KOSAKI Motohiro wrote:

> > This is tangent to the discussion, we need to revisit why an application
> > other than a daemon managing a set of memcgs would ever need to know the
> > information in /proc/meminfo. No use-case was ever presented in the
> > changelog and its not clear how this is at all relevant. So before
> > changing the kernel, please describe how this actually matters in a real-
> > world scenario.
>
> Huh? Don't you know a meanings of a namespace ISOLATION? isolation mean,
> isolated container shouldn't be able to access global information. If you
> want to lean container/namespace concept, tasting openvz or solaris container
> is a good start.
>

As I said, LXC and namespace isolation is a tangent to the discussion of
faking the /proc/meminfo for the memcg context of a thread.

> But anyway, I dislike current implementaion. So, I NAK this patch too.
>

I'm glad you reached that conclusion, but I think you did so for a much
different (although unspecified) reason.

Thanks.
KOSAKI Motohiro
2012-05-31 07:09:45 UTC
Permalink
On Thu, May 31, 2012 at 2:56 AM, David Rientjes <rientjes-hpIqsD4AKlfQT0dZR+***@public.gmane.org> wrote:
> On Thu, 31 May 2012, KOSAKI Motohiro wrote:
>
>> > This is tangent to the discussion, we need to revisit why an application
>> > other than a daemon managing a set of memcgs would ever need to know the
>> > information in /proc/meminfo.  No use-case was ever presented in the
>> > changelog and its not clear how this is at all relevant.  So before
>> > changing the kernel, please describe how this actually matters in a real-
>> > world scenario.
>>
>> Huh? Don't you know a meanings of a namespace ISOLATION? isolation mean,
>> isolated container shouldn't be able to access global information. If you
>> want to lean container/namespace concept, tasting openvz or solaris container
>> is a good start.
>
> As I said, LXC and namespace isolation is a tangent to the discussion of
> faking the /proc/meminfo for the memcg context of a thread.

Because of, /proc/meminfo affect a lot of libraries behavior. So, it's not only
application issue. If you can't rewrite _all_ of userland assets, fake meminfo
can't be escaped. Again see alternative container implementation.


>
>> But anyway, I dislike current implementaion. So, I NAK this patch too.
>>
>
> I'm glad you reached that conclusion, but I think you did so for a much
> different (although unspecified) reason.
>
> Thanks.
David Rientjes
2012-05-31 07:35:49 UTC
Permalink
On Thu, 31 May 2012, KOSAKI Motohiro wrote:

> > As I said, LXC and namespace isolation is a tangent to the discussion of
> > faking the /proc/meminfo for the memcg context of a thread.
>
> Because of, /proc/meminfo affect a lot of libraries behavior. So, it's not only
> application issue. If you can't rewrite _all_ of userland assets, fake meminfo
> can't be escaped. Again see alternative container implementation.
>

It's a tangent because it isn't a complete psuedo /proc/meminfo for all
threads attached to a memcg regardless of any namespace isolation; the LXC
solution has existed for a couple of years by its procfs patchset that
overlaps procfs with fuse and can suppress or modify any output in the
context of a memory controller using things like
memory.{limit,usage}_in_bytes. I'm sure all other fields could be
modified if outputted in some structured way via memcg; it looks like
memory.stat would need to be extended to provide that. If that's mounted
prior to executing the application, then your isolation is achieved and
all libraries should see the new output that you've defined in LXC.

However, this seems like a seperate topic than the patch at hand which
does this directly to /proc/meminfo based on a thread's memcg context,
that's the part that I'm nacking. I'd recommend to Gao to expose this
information via memory.stat and then use fuse and the procfs lxc support
as your way of contextualizing the resources.
KOSAKI Motohiro
2012-05-31 07:42:38 UTC
Permalink
(5/31/12 3:35 AM), David Rientjes wrote:
> On Thu, 31 May 2012, KOSAKI Motohiro wrote:
>
>>> As I said, LXC and namespace isolation is a tangent to the discussion of
>>> faking the /proc/meminfo for the memcg context of a thread.
>>
>> Because of, /proc/meminfo affect a lot of libraries behavior. So, it's not only
>> application issue. If you can't rewrite _all_ of userland assets, fake meminfo
>> can't be escaped. Again see alternative container implementation.
>>
>
> It's a tangent because it isn't a complete psuedo /proc/meminfo for all
> threads attached to a memcg regardless of any namespace isolation; the LXC
> solution has existed for a couple of years by its procfs patchset that
> overlaps procfs with fuse and can suppress or modify any output in the
> context of a memory controller using things like
> memory.{limit,usage}_in_bytes. I'm sure all other fields could be
> modified if outputted in some structured way via memcg; it looks like
> memory.stat would need to be extended to provide that. If that's mounted
> prior to executing the application, then your isolation is achieved and
> all libraries should see the new output that you've defined in LXC.
>
> However, this seems like a seperate topic than the patch at hand which
> does this directly to /proc/meminfo based on a thread's memcg context,
> that's the part that I'm nacking.

Then, I NAKed current patch too. Yeah, current one is ugly. It assume _all_
user need namespace isolation and it clearly is not.


> I'd recommend to Gao to expose this
> information via memory.stat and then use fuse and the procfs lxc support
> as your way of contextualizing the resources.

It's one of a option. But, I seriously doubt fuse can make simpler than kamezawa-san's
idea. But yeah, I might NACK kamezawa-san's one if he will post ugly patch.


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to ***@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"***@kvack.org"> ***@kvack.org </a>
Johannes Weiner
2012-05-31 07:57:18 UTC
Permalink
On Thu, May 31, 2012 at 03:42:38AM -0400, KOSAKI Motohiro wrote:
> (5/31/12 3:35 AM), David Rientjes wrote:
> >On Thu, 31 May 2012, KOSAKI Motohiro wrote:
> >
> >>>As I said, LXC and namespace isolation is a tangent to the discussion of
> >>>faking the /proc/meminfo for the memcg context of a thread.
> >>
> >>Because of, /proc/meminfo affect a lot of libraries behavior. So, it's not only
> >>application issue. If you can't rewrite _all_ of userland assets, fake meminfo
> >>can't be escaped. Again see alternative container implementation.
> >>
> >
> >It's a tangent because it isn't a complete psuedo /proc/meminfo for all
> >threads attached to a memcg regardless of any namespace isolation; the LXC
> >solution has existed for a couple of years by its procfs patchset that
> >overlaps procfs with fuse and can suppress or modify any output in the
> >context of a memory controller using things like
> >memory.{limit,usage}_in_bytes. I'm sure all other fields could be
> >modified if outputted in some structured way via memcg; it looks like
> >memory.stat would need to be extended to provide that. If that's mounted
> >prior to executing the application, then your isolation is achieved and
> >all libraries should see the new output that you've defined in LXC.
> >
> >However, this seems like a seperate topic than the patch at hand which
> >does this directly to /proc/meminfo based on a thread's memcg context,
> >that's the part that I'm nacking.
>
> Then, I NAKed current patch too. Yeah, current one is ugly. It assume _all_
> user need namespace isolation and it clearly is not.

Actually, it only chooses the memcg version for tasks that are not in
the init pid namespace. Tying this to the pid namespace is a bit
ugly, but would probably end up doing the right thing most of the
time. A separate namespace would be better.
Gao feng
2012-05-31 07:58:41 UTC
Permalink
=E4=BA=8E 2012=E5=B9=B405=E6=9C=8831=E6=97=A5 15:42, KOSAKI Motohiro =E5=
=86=99=E9=81=93:
> (5/31/12 3:35 AM), David Rientjes wrote:
>> On Thu, 31 May 2012, KOSAKI Motohiro wrote:
>>
>>>> As I said, LXC and namespace isolation is a tangent to the discuss=
ion of
>>>> faking the /proc/meminfo for the memcg context of a thread.
>>>
>>> Because of, /proc/meminfo affect a lot of libraries behavior. So, i=
t's not only
>>> application issue. If you can't rewrite _all_ of userland assets, f=
ake meminfo
>>> can't be escaped. Again see alternative container implementation.
>>>
>>
>> It's a tangent because it isn't a complete psuedo /proc/meminfo for =
all
>> threads attached to a memcg regardless of any namespace isolation; t=
he LXC
>> solution has existed for a couple of years by its procfs patchset th=
at
>> overlaps procfs with fuse and can suppress or modify any output in t=
he
>> context of a memory controller using things like
>> memory.{limit,usage}_in_bytes. I'm sure all other fields could be
>> modified if outputted in some structured way via memcg; it looks lik=
e
>> memory.stat would need to be extended to provide that. If that's mo=
unted
>> prior to executing the application, then your isolation is achieved =
and
>> all libraries should see the new output that you've defined in LXC.
>>
>> However, this seems like a seperate topic than the patch at hand whi=
ch
>> does this directly to /proc/meminfo based on a thread's memcg contex=
t,
>> that's the part that I'm nacking.
>=20
> Then, I NAKed current patch too. Yeah, current one is ugly. It assume=
_all_
> user need namespace isolation and it clearly is not.
>=20
>=20
>> I'd recommend to Gao to expose this
>> information via memory.stat and then use fuse and the procfs lxc sup=
port
>> as your way of contextualizing the resources.
>=20
> It's one of a option. But, I seriously doubt fuse can make simpler th=
an kamezawa-san's
> idea. But yeah, I might NACK kamezawa-san's one if he will post ugly =
patch.
>=20

It seams I should do some homework to make the implement beautifully.

I think kamezawa-san's idea is more simpler.
thanks for your advice.

>=20
> --=20
> To unsubscribe from this list: send the line "unsubscribe linux-kerne=
l" in
> the body of a message to majordomo-***@public.gmane.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>=20
Glauber Costa
2012-05-31 08:32:36 UTC
Permalink
On 05/31/2012 11:58 AM, Gao feng wrote:
>> > It's one of a option. But, I seriously doubt fuse can make simpler than kamezawa-san's
>> > idea. But yeah, I might NACK kamezawa-san's one if he will post ugly patch.
>> >
> It seams I should do some homework to make the implement beautifully.
>
> I think kamezawa-san's idea is more simpler.
> thanks for your advice.
>

One think to keep in mind: A file in memcg does not need to follow the
same format of /proc/meminfo so we can bind mount. We should be able to
reconstruct that in userspace based on information available from the
kernel. You can even collect that from multiple locations, and *then*
you bind mount.

It helps to keep the churn out of the kernel, and in case of meminfo,
you might need no extra kernel patches at all. And in the case of other
files like /proc/stat, the relevant information comes from more than one
cgroup anyway, so there is not too much way around it.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to ***@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"***@kvack.org"> ***@kvack.org </a>
Kamezawa Hiroyuki
2012-05-31 08:51:28 UTC
Permalink
(2012/05/31 17:32), Glauber Costa wrote:
> On 05/31/2012 11:58 AM, Gao feng wrote:
>>> > It's one of a option. But, I seriously doubt fuse can make simpler than kamezawa-san's
>>> > idea. But yeah, I might NACK kamezawa-san's one if he will post ugly patch.
>>> >
>> It seams I should do some homework to make the implement beautifully.
>>
>> I think kamezawa-san's idea is more simpler.
>> thanks for your advice.
>>
>
> One think to keep in mind: A file in memcg does not need to follow the same format
> of /proc/meminfo so we can bind mount. We should be able to reconstruct that in
> userspace based on information available from the kernel. You can even collect that
>from multiple locations, and *then* you bind mount.

I'm sorry I couldn't fully understand. Could you explain more ?
Do you mean
- bind mount memory cgroup directory into the container for exporting information
- Some user-space apps, FUSE-procfs or some, can provide enough information

?
Thanks,
-Kame
Glauber Costa
2012-05-31 08:59:33 UTC
Permalink
On 05/31/2012 12:51 PM, Kamezawa Hiroyuki wrote:
>> One think to keep in mind: A file in memcg does not need to follow the
>> same format
>> of /proc/meminfo so we can bind mount. We should be able to
>> reconstruct that in
>> userspace based on information available from the kernel. You can
>> even collect that
>> from multiple locations, and *then* you bind mount.
>
> I'm sorry I couldn't fully understand. Could you explain more ?
> Do you mean
> - bind mount memory cgroup directory into the container for exporting
> information
> - Some user-space apps, FUSE-procfs or some, can provide enough
> information
>

Implementation details aside, the idea is to have something like FUSE to
hook the read(), and then construct the information it needs to present
in the proper format.

Alternatively, for files that doesn't change a lot, you can create a
file /container-storage-area/my_copy_of_meminfo at container creation,
and bind mount *that* file.

There is no reason to bind mount a kernel-provided file directly.
Gao feng
2012-05-31 08:55:31 UTC
Permalink
于 2012年05月31日 16:32, Glauber Costa 写道:
> On 05/31/2012 11:58 AM, Gao feng wrote:
>>> > It's one of a option. But, I seriously doubt fuse can make simpler than kamezawa-san's
>>> > idea. But yeah, I might NACK kamezawa-san's one if he will post ugly patch.
>>> >
>> It seams I should do some homework to make the implement beautifully.
>>
>> I think kamezawa-san's idea is more simpler.
>> thanks for your advice.
>>
>
> One think to keep in mind: A file in memcg does not need to follow the same format of /proc/meminfo so we can bind mount. We should be able to reconstruct that in userspace based on information
> available from the kernel. You can even collect that from multiple locations, and *then* you bind mount.
>
> It helps to keep the churn out of the kernel, and in case of meminfo, you might need no extra kernel patches at all. And in the case of other files like /proc/stat, the relevant information comes from
> more than one cgroup anyway, so there is not too much way around it.

I got it,thank you very much,indeed we need no extra kernel patch at all.
Maybe we should do this work in lxc or libvirt.

thanks Glauber!

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to ***@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"***@kvack.org"> ***@kvack.org </a>
Glauber Costa
2012-05-31 08:56:50 UTC
Permalink
>>>
>>
>> One think to keep in mind: A file in memcg does not need to follow the same format of /proc/meminfo so we can bind mount. We should be able to reconstruct that in userspace based on information
>> available from the kernel. You can even collect that from multiple locations, and *then* you bind mount.
>>
>> It helps to keep the churn out of the kernel, and in case of meminfo, you might need no extra kernel patches at all. And in the case of other files like /proc/stat, the relevant information comes from
>> more than one cgroup anyway, so there is not too much way around it.
>
> I got it,thank you very much,indeed we need no extra kernel patch at all.
> Maybe we should do this work in lxc or libvirt.
>
> thanks Glauber!
>

lxc has a fuse overlay for /proc already. I can't tell you about the
state of that, because I haven't looked at it in details yet. I need to
do something a lot similar for /proc/stat, but that is currently down in
my prio queue.

But it seems to be the way to go. My only concern is whether or not it
is usable outside of lxc. Other Container solutions like OpenVZ would
benefit from this a lot.
Kamezawa Hiroyuki
2012-05-31 07:07:38 UTC
Permalink
(2012/05/31 15:37), KOSAKI Motohiro wrote:
> (5/31/12 2:28 AM), David Rientjes wrote:
>> On Thu, 31 May 2012, KOSAKI Motohiro wrote:
>>
>>>> An application should always know the cgroup that its attached to and be
>>>> able to read its state using the command that I gave earlier.
>>>
>>> No. you don't need why userland folks want namespaces. Even though you don't
>>> need namespaces. It doesn't good reason to refuse another use case.
>>>
>>
>> This is tangent to the discussion, we need to revisit why an application
>> other than a daemon managing a set of memcgs would ever need to know the
>> information in /proc/meminfo. No use-case was ever presented in the
>> changelog and its not clear how this is at all relevant. So before
>> changing the kernel, please describe how this actually matters in a real-
>> world scenario.
>
> Huh? Don't you know a meanings of a namespace ISOLATION? isolation mean,
> isolated container shouldn't be able to access global information. If you
> want to lean container/namespace concept, tasting openvz or solaris container
> is a good start.
>
> But anyway, I dislike current implementaion. So, I NAK this patch too.

Could you give us advice for improving this ? What idea do you have ?

Thanks,
-Kame
KOSAKI Motohiro
2012-05-31 07:23:00 UTC
Permalink
(5/31/12 3:07 AM), Kamezawa Hiroyuki wrote:
> (2012/05/31 15:37), KOSAKI Motohiro wrote:
>> (5/31/12 2:28 AM), David Rientjes wrote:
>>> On Thu, 31 May 2012, KOSAKI Motohiro wrote:
>>>
>>>>> An application should always know the cgroup that its attached to and be
>>>>> able to read its state using the command that I gave earlier.
>>>>
>>>> No. you don't need why userland folks want namespaces. Even though you don't
>>>> need namespaces. It doesn't good reason to refuse another use case.
>>>>
>>>
>>> This is tangent to the discussion, we need to revisit why an application
>>> other than a daemon managing a set of memcgs would ever need to know the
>>> information in /proc/meminfo. No use-case was ever presented in the
>>> changelog and its not clear how this is at all relevant. So before
>>> changing the kernel, please describe how this actually matters in a real-
>>> world scenario.
>>
>> Huh? Don't you know a meanings of a namespace ISOLATION? isolation mean,
>> isolated container shouldn't be able to access global information. If you
>> want to lean container/namespace concept, tasting openvz or solaris container
>> is a good start.
>>
>> But anyway, I dislike current implementaion. So, I NAK this patch too.
>
> Could you give us advice for improving this ? What idea do you have ?

?
I'm ok your idea. I only NAKed current Gao's patch.
Glauber Costa
2012-05-31 08:29:01 UTC
Permalink
On 05/31/2012 10:17 AM, David Rientjes wrote:
>> > Hmm....maybe need to mount cgroup in the container (again) and get an access
>> > to cgroup
>> > hierarchy and find the cgroup it belongs to......if it's allowed.
> An application should always know the cgroup that its attached to and be
> able to read its state using the command that I gave earlier.

I disagree. For simple applications, yes. For full containers, the
cgroup in which it lives in is considered part of the external world.
It should not have any kind of access to it.

Also, tools need to work transparently.

That's the same case with /proc/stat, used for top. We can't expect
people to recode top to work on containerized environments.
Zhu Yanhai
2012-06-07 23:18:25 UTC
Permalink
2012/5/29 Gao feng <***@cn.fujitsu.com>:
> cgroup and namespaces are used for creating containers but some of
> information is not isolated/virtualized. This patch is for isolating /proc/meminfo
> information per container, which uses memory cgroup. By this, top,free
> and other tools under container can work as expected(show container's
> usage) without changes.
>
> This patch is a trial to show memcg's info in /proc/meminfo if 'current'
> is under a memcg other than root.
>
> we show /proc/meminfo base on container's memory cgroup.
> because there are lots of info can't be provide by memcg, and
> the cmds such as top, free just use some entries of /proc/meminfo,
> we replace those entries by memory cgroup.
>
> if container has no memcg, we will show host's /proc/meminfo
> as before.
>
> there is no idea how to deal with Buffers,I just set it zero,
> It's strange if Buffers bigger than MemTotal.
>
> Signed-off-by: Gao feng <***@cn.fujitsu.com>
> ---
>  fs/proc/meminfo.c          |   11 +++++---
>  include/linux/memcontrol.h |   15 +++++++++++
>  mm/memcontrol.c            |   56 ++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 78 insertions(+), 4 deletions(-)
>
> diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
> index 80e4645..29a1fcd 100644
> --- a/fs/proc/meminfo.c
> +++ b/fs/proc/meminfo.c
> @@ -13,6 +13,7 @@
>  #include <linux/atomic.h>
>  #include <asm/page.h>
>  #include <asm/pgtable.h>
> +#include <linux/memcontrol.h>
>  #include "internal.h"
>
>  void __attribute__((weak)) arch_report_meminfo(struct seq_file *m)
> @@ -27,7 +28,6 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
>        struct vmalloc_info vmi;
>        long cached;
>        unsigned long pages[NR_LRU_LISTS];
> -       int lru;
>
>  /*
>  * display in kilobytes.
> @@ -39,16 +39,19 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
>        allowed = ((totalram_pages - hugetlb_total_pages())
>                * sysctl_overcommit_ratio / 100) + total_swap_pages;
>
> +       memcg_meminfo(&i);
>        cached = global_page_state(NR_FILE_PAGES) -
>                        total_swapcache_pages - i.bufferram;
> +       /*
> +        * If 'current' is in root memory cgroup, returns global status.
> +        * If not, returns the status of memcg under which current runs.
> +        */
> +       sys_page_state(pages, &cached);
>        if (cached < 0)
>                cached = 0;
>
>        get_vmalloc_info(&vmi);
>
> -       for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
> -               pages[lru] = global_page_state(NR_LRU_BASE + lru);
> -
>        /*
>         * Tagged format, for easy grepping and expansion.
>         */
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 0316197..6220764 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -21,6 +21,7 @@
>  #define _LINUX_MEMCONTROL_H
>  #include <linux/cgroup.h>
>  #include <linux/vm_event_item.h>
> +#include <linux/mm.h>
>
>  struct mem_cgroup;
>  struct page_cgroup;
> @@ -116,6 +117,9 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
>                                   struct mem_cgroup_reclaim_cookie *);
>  void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
>
> +extern void memcg_meminfo(struct sysinfo *si);
> +extern void sys_page_state(unsigned long *page, long *cached);
> +
>  /*
>  * For memory reclaim.
>  */
> @@ -323,6 +327,17 @@ static inline void mem_cgroup_iter_break(struct mem_cgroup *root,
>  {
>  }
>
> +static inline void memcg_meminfo(struct sysinfo *si)
> +{
> +}
> +
> +static inline void sys_page_state(unsigned long *pages, long *cached)
> +{
> +       int lru;
> +       for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
> +               pages[lru] = global_page_state(NR_LRU_BASE + lru);
> +}
> +
>  static inline bool mem_cgroup_disabled(void)
>  {
>        return true;
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index f142ea9..c25e160 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -52,6 +52,7 @@
>  #include "internal.h"
>  #include <net/sock.h>
>  #include <net/tcp_memcontrol.h>
> +#include <linux/pid_namespace.h>
>
>  #include <asm/uaccess.h>
>
> @@ -4345,6 +4346,61 @@ mem_cgroup_get_total_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s)
>                mem_cgroup_get_local_stat(iter, s);
>  }
>
> +void memcg_meminfo(struct sysinfo *val)
> +{
> +       struct mem_cgroup *memcg = mem_cgroup_from_task(current);
> +       __kernel_ulong_t totalram, totalswap;
> +       if (current->nsproxy->pid_ns == &init_pid_ns ||
> +           memcg == NULL || mem_cgroup_is_root(memcg))
> +               return;
This is somehow not exactly the same with your description "if 'current'
> is under a memcg other than root." since you are checking both namespace and cgroup constraint.
> +
> +       totalram = res_counter_read_u64(&memcg->res,
> +                                       RES_LIMIT) >> PAGE_SHIFT;
> +       if (totalram < val->totalram) {
> +               __kernel_ulong_t usageram;
> +               usageram = res_counter_read_u64(&memcg->res,
> +                                               RES_USAGE) >> PAGE_SHIFT;
> +               val->totalram = totalram;
> +               val->freeram = totalram - usageram;
> +               val->bufferram = 0;
> +               val->totalhigh = 0;
> +               val->freehigh = 0;
> +       } else
> +               return;
You don't want to show the local swap usage and limit (below code) if
mem.hard_limit_in_bytes is larger than the physical memory capacity?
Why?
> +
> +       totalswap = res_counter_read_u64(&memcg->memsw,
> +                                        RES_LIMIT) >> PAGE_SHIFT;
> +       if (totalswap < val->totalswap) {
here, do you need also to check memcg->memsw_is_minimum? Local
swapping is disabled if memsw_is_minium is true.
> +               __kernel_ulong_t usageswap;
> +               usageswap = res_counter_read_u64(&memcg->memsw,
> +                                                RES_USAGE) >> PAGE_SHIFT;
> +               val->totalswap = totalswap - val->totalram;
> +               val->freeswap = totalswap - usageswap - val->freeram;
This is seriously broken, memsw.limit_in_bytes means the max swap
space you can get, and under global memory pressure all physical
memory belongs to a memcg might get dump into disk, until it reaches
the limit set by memsw.limit_in_bytes. So you should not subtract the
physical memory usage.
> +       }
And you need a 'else' statement here, although memsw.limit_in_bytes
might be larger than val->totalswap, it's still wrong to show the user
a global swap usage other than your local one, right?

> +}
> +
> +void sys_page_state(unsigned long *pages, long *cached)
> +{
> +       struct mem_cgroup *memcg = mem_cgroup_from_task(current);
> +
> +       if (current->nsproxy->pid_ns == &init_pid_ns ||
> +           memcg == NULL || mem_cgroup_is_root(memcg)) {
> +               int lru;
> +               for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
> +                       pages[lru] = global_page_state(NR_LRU_BASE + lru);
> +       } else {
> +               struct mcs_total_stat s;
> +               memset(&s, 0, sizeof(s));
> +               mem_cgroup_get_total_stat(memcg, &s);
> +               *cached = s.stat[MCS_CACHE] >> PAGE_SHIFT;
> +               pages[LRU_ACTIVE_ANON] = s.stat[MCS_ACTIVE_ANON] >> PAGE_SHIFT;
> +               pages[LRU_ACTIVE_FILE] = s.stat[MCS_ACTIVE_FILE] >> PAGE_SHIFT;
> +               pages[LRU_INACTIVE_ANON] = s.stat[MCS_INACTIVE_ANON] >> PAGE_SHIFT;
> +               pages[LRU_INACTIVE_FILE] = s.stat[MCS_INACTIVE_FILE] >> PAGE_SHIFT;
> +               pages[LRU_UNEVICTABLE] = s.stat[MCS_UNEVICTABLE] >> PAGE_SHIFT;
> +       }
> +}
> +
>  #ifdef CONFIG_NUMA
>  static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
>  {
> --
> 1.7.7.6
>
> --
> To unsubscribe from this list: send the line "unsubscribe cgroups" in
> the body of a message to ***@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

We have a highly similar patch with yours here on a large production
system, AFAICS it's absolutely unaccepted to require correcting the
userland tools to read the stat file exported by the various cgroups,
which have different formats and locations and names with the legacy
ones i.e. /proc/stat, /proc/meminfo and /proc/loadavg. There are tons
of the tools and it's impossible to do so (just think about,
top/sar/vmstat/mpstat/free/who, and various userland library read the
number directly from above files, setup their thread pool or memory
pool according to the available resource).

--
Thanks,
Zhu Yanhai
Loading...