[PATCH 0/6] mm/page_owner: use stackdepot to store stacktrace

classic Classic list List threaded Threaded
27 messages Options
12
Reply | Threaded
Open this post in threaded view
|

[PATCH 0/6] mm/page_owner: use stackdepot to store stacktrace

js1304
From: Joonsoo Kim <[hidden email]>

This patchset changes the way stacktraces are stored in page_owner in order
to reduce memory usage. Below is the motivation for this patchset, copied
from patch 6.

Currently, we store each page's allocation stacktrace on corresponding
page_ext structure and it requires a lot of memory. This causes the problem
that memory tight system doesn't work well if page_owner is enabled.
Moreover, even with this large memory consumption, we cannot get full
stacktrace because we allocate memory at boot time and just maintain
8 stacktrace slots to balance memory consumption. We could increase it
to more but it would make system unusable or change system behaviour.

To solve the problem, this patch uses stackdepot to store stacktrace.

Thanks.

Joonsoo Kim (6):
  mm/compaction: split freepages without holding the zone lock
  mm/page_owner: initialize page owner without holding the zone lock
  mm/page_owner: copy last_migrate_reason in copy_page_owner()
  mm/page_owner: introduce split_page_owner and replace manual handling
  tools/vm/page_owner: increase temporary buffer size
  mm/page_owner: use stackdepot to store stacktrace

 include/linux/mm.h         |   1 -
 include/linux/page_ext.h   |   4 +-
 include/linux/page_owner.h |  12 ++--
 lib/Kconfig.debug          |   1 +
 mm/compaction.c            |  45 +++++++++++----
 mm/page_alloc.c            |  37 +-----------
 mm/page_isolation.c        |   9 ++-
 mm/page_owner.c            | 136 ++++++++++++++++++++++++++++++++++++++-------
 tools/vm/page_owner_sort.c |   9 ++-
 9 files changed, 173 insertions(+), 81 deletions(-)

--
1.9.1

Reply | Threaded
Open this post in threaded view
|

[PATCH 1/6] mm/compaction: split freepages without holding the zone lock

js1304
From: Joonsoo Kim <[hidden email]>

We don't need to split freepages while holding the zone lock. Doing so
causes more contention on the zone lock, which is not desirable.

Signed-off-by: Joonsoo Kim <[hidden email]>
---
 include/linux/mm.h |  1 -
 mm/compaction.c    | 42 ++++++++++++++++++++++++++++++------------
 mm/page_alloc.c    | 27 ---------------------------
 3 files changed, 30 insertions(+), 40 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7b52750..9608f33 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -523,7 +523,6 @@ void __put_page(struct page *page);
 void put_pages_list(struct list_head *pages);
 
 void split_page(struct page *page, unsigned int order);
-int split_free_page(struct page *page);
 
 /*
  * Compound pages have a destructor function.  Provide a
diff --git a/mm/compaction.c b/mm/compaction.c
index c9a95c1..ecf0252 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -65,13 +65,31 @@ static unsigned long release_freepages(struct list_head *freelist)
 
 static void map_pages(struct list_head *list)
 {
- struct page *page;
+ unsigned int i, order, nr_pages;
+ struct page *page, *next;
+ LIST_HEAD(tmp_list);
+
+ list_for_each_entry_safe(page, next, list, lru) {
+ list_del(&page->lru);
+
+ order = page_private(page);
+ nr_pages = 1 << order;
+ set_page_private(page, 0);
+ set_page_refcounted(page);
+
+ arch_alloc_page(page, order);
+ kernel_map_pages(page, nr_pages, 1);
+ kasan_alloc_pages(page, order);
+ if (order)
+ split_page(page, order);
 
- list_for_each_entry(page, list, lru) {
- arch_alloc_page(page, 0);
- kernel_map_pages(page, 1, 1);
- kasan_alloc_pages(page, 0);
+ for (i = 0; i < nr_pages; i++) {
+ list_add(&page->lru, &tmp_list);
+ page++;
+ }
  }
+
+ list_splice(&tmp_list, list);
 }
 
 static inline bool migrate_async_suitable(int migratetype)
@@ -368,12 +386,13 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
  unsigned long flags = 0;
  bool locked = false;
  unsigned long blockpfn = *start_pfn;
+ unsigned int order;
 
  cursor = pfn_to_page(blockpfn);
 
  /* Isolate free pages. */
  for (; blockpfn < end_pfn; blockpfn++, cursor++) {
- int isolated, i;
+ int isolated;
  struct page *page = cursor;
 
  /*
@@ -439,13 +458,12 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
  goto isolate_fail;
  }
 
- /* Found a free page, break it into order-0 pages */
- isolated = split_free_page(page);
+ /* Found a free page, will break it into order-0 pages */
+ order = page_order(page);
+ isolated = __isolate_free_page(page, page_order(page));
+ set_page_private(page, order);
  total_isolated += isolated;
- for (i = 0; i < isolated; i++) {
- list_add(&page->lru, freelist);
- page++;
- }
+ list_add_tail(&page->lru, freelist);
 
  /* If a page was split, advance to the end of it */
  if (isolated) {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5dd65d9..60d7f10 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2532,33 +2532,6 @@ int __isolate_free_page(struct page *page, unsigned int order)
 }
 
 /*
- * Similar to split_page except the page is already free. As this is only
- * being used for migration, the migratetype of the block also changes.
- * As this is called with interrupts disabled, the caller is responsible
- * for calling arch_alloc_page() and kernel_map_page() after interrupts
- * are enabled.
- *
- * Note: this is probably too low level an operation for use in drivers.
- * Please consult with lkml before using this in your driver.
- */
-int split_free_page(struct page *page)
-{
- unsigned int order;
- int nr_pages;
-
- order = page_order(page);
-
- nr_pages = __isolate_free_page(page, order);
- if (!nr_pages)
- return 0;
-
- /* Split into individual pages */
- set_page_refcounted(page);
- split_page(page, order);
- return nr_pages;
-}
-
-/*
  * Update NUMA hit/miss statistics
  *
  * Must be called with interrupts disabled.
--
1.9.1

Reply | Threaded
Open this post in threaded view
|

[PATCH 2/6] mm/page_owner: initialize page owner without holding the zone lock

js1304
In reply to this post by js1304
From: Joonsoo Kim <[hidden email]>

It's not necessary to initialize page_owner while holding the zone lock.
It would cause more contention on the zone lock although it's not
a big problem since it is just debug feature. But, it is better
than before so do it. This is also preparation step to use stackdepot
in page owner feature. Stackdepot allocates new pages when there is no
reserved space and holding the zone lock in this case will cause deadlock.

Signed-off-by: Joonsoo Kim <[hidden email]>
---
 mm/compaction.c     | 3 +++
 mm/page_alloc.c     | 2 --
 mm/page_isolation.c | 9 ++++++---
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index ecf0252..dbad95b 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -20,6 +20,7 @@
 #include <linux/kasan.h>
 #include <linux/kthread.h>
 #include <linux/freezer.h>
+#include <linux/page_owner.h>
 #include "internal.h"
 
 #ifdef CONFIG_COMPACTION
@@ -80,6 +81,8 @@ static void map_pages(struct list_head *list)
  arch_alloc_page(page, order);
  kernel_map_pages(page, nr_pages, 1);
  kasan_alloc_pages(page, order);
+
+ set_page_owner(page, order, __GFP_MOVABLE);
  if (order)
  split_page(page, order);
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 60d7f10..5b632be 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2514,8 +2514,6 @@ int __isolate_free_page(struct page *page, unsigned int order)
  zone->free_area[order].nr_free--;
  rmv_page_order(page);
 
- set_page_owner(page, order, __GFP_MOVABLE);
-
  /* Set the pageblock if the isolated page is at least a pageblock */
  if (order >= pageblock_order - 1) {
  struct page *endpage = page + (1 << order) - 1;
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 612122b..927f5ee 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -7,6 +7,7 @@
 #include <linux/pageblock-flags.h>
 #include <linux/memory.h>
 #include <linux/hugetlb.h>
+#include <linux/page_owner.h>
 #include "internal.h"
 
 #define CREATE_TRACE_POINTS
@@ -108,8 +109,6 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
  if (pfn_valid_within(page_to_pfn(buddy)) &&
     !is_migrate_isolate_page(buddy)) {
  __isolate_free_page(page, order);
- kernel_map_pages(page, (1 << order), 1);
- set_page_refcounted(page);
  isolated_page = page;
  }
  }
@@ -128,8 +127,12 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
  zone->nr_isolate_pageblock--;
 out:
  spin_unlock_irqrestore(&zone->lock, flags);
- if (isolated_page)
+ if (isolated_page) {
+ kernel_map_pages(page, (1 << order), 1);
+ set_page_refcounted(page);
+ set_page_owner(page, order, __GFP_MOVABLE);
  __free_pages(isolated_page, order);
+ }
 }
 
 static inline struct page *
--
1.9.1

Reply | Threaded
Open this post in threaded view
|

[PATCH 3/6] mm/page_owner: copy last_migrate_reason in copy_page_owner()

js1304
In reply to this post by js1304
From: Joonsoo Kim <[hidden email]>

Currently, copy_page_owner() doesn't copy all the owner information.
It skips last_migrate_reason because copy_page_owner() is used for
migration and it will be properly set soon. But, a following patch
will use copy_page_owner() and this skip will cause the problem that
the allocated page has an uninitialized last_migrate_reason. To prevent it,
this patch also copies last_migrate_reason in copy_page_owner().

Signed-off-by: Joonsoo Kim <[hidden email]>
---
 mm/page_owner.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mm/page_owner.c b/mm/page_owner.c
index 792b56d..6693959 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -101,6 +101,7 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage)
 
  new_ext->order = old_ext->order;
  new_ext->gfp_mask = old_ext->gfp_mask;
+ new_ext->last_migrate_reason = old_ext->last_migrate_reason;
  new_ext->nr_entries = old_ext->nr_entries;
 
  for (i = 0; i < ARRAY_SIZE(new_ext->trace_entries); i++)
--
1.9.1

Reply | Threaded
Open this post in threaded view
|

[PATCH 5/6] tools/vm/page_owner: increase temporary buffer size

js1304
In reply to this post by js1304
From: Joonsoo Kim <[hidden email]>

Page owner will be changed to store deeper stacktraces,
so the current temporary buffer size isn't enough. Increase it.

Signed-off-by: Joonsoo Kim <[hidden email]>
---
 tools/vm/page_owner_sort.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c
index 77147b4..f1c055f 100644
--- a/tools/vm/page_owner_sort.c
+++ b/tools/vm/page_owner_sort.c
@@ -79,12 +79,12 @@ static void add_list(char *buf, int len)
  }
 }
 
-#define BUF_SIZE 1024
+#define BUF_SIZE (128 * 1024)
 
 int main(int argc, char **argv)
 {
  FILE *fin, *fout;
- char buf[BUF_SIZE];
+ char *buf;
  int ret, i, count;
  struct block_list *list2;
  struct stat st;
@@ -107,6 +107,11 @@ int main(int argc, char **argv)
  max_size = st.st_size / 100; /* hack ... */
 
  list = malloc(max_size * sizeof(*list));
+ buf = malloc(BUF_SIZE);
+ if (!list || !buf) {
+ printf("Out of memory\n");
+ exit(1);
+ }
 
  for ( ; ; ) {
  ret = read_block(buf, BUF_SIZE, fin);
--
1.9.1

Reply | Threaded
Open this post in threaded view
|

[PATCH 6/6] mm/page_owner: use stackdepot to store stacktrace

js1304
In reply to this post by js1304
From: Joonsoo Kim <[hidden email]>

Currently, we store each page's allocation stacktrace on corresponding
page_ext structure and it requires a lot of memory. This causes the problem
that memory tight system doesn't work well if page_owner is enabled.
Moreover, even with this large memory consumption, we cannot get full
stacktrace because we allocate memory at boot time and just maintain
8 stacktrace slots to balance memory consumption. We could increase it
to more but it would make system unusable or change system behaviour.

To solve the problem, this patch uses stackdepot to store stacktrace.
It obviously provides memory saving but there is a drawback that
stackdepot could fail.

stackdepot allocates memory at runtime so it could fail if the system does
not have enough memory. But, most allocation stacks are generated at a very
early time and there is plenty of memory at this point. So, failure would not
happen easily. And, one failure means that we miss just one page's
allocation stacktrace so it would not be a big problem. In this patch,
when a memory allocation failure happens, we store a special stacktrace
handle to the page whose stacktrace failed to be saved. With it, the user
can estimate memory usage properly even if a failure happens.

Memory saving looks as following. (Boot 4GB memory system with page_owner)

92274688 bytes -> 25165824 bytes

72% reduction in static allocation size. Even if we add up the size of
dynamically allocated memory, it would not be that big because stacktraces
are mostly duplicated.

Note that the implementation looks more complex than someone would imagine
because there is a recursion issue. stackdepot uses the page allocator and
page_owner is called at page allocation. Using stackdepot in page_owner could
re-call the page allocator and then page_owner. That is a recursion. To detect and
avoid it, whenever we obtain stacktrace, recursion is checked and
page_owner is set to dummy information if found. Dummy information means
that this page is allocated for page_owner feature itself
(such as stackdepot) and it's understandable behavior for user.

Signed-off-by: Joonsoo Kim <[hidden email]>
---
 include/linux/page_ext.h |   4 +-
 lib/Kconfig.debug        |   1 +
 mm/page_owner.c          | 128 ++++++++++++++++++++++++++++++++++++++++-------
 3 files changed, 114 insertions(+), 19 deletions(-)

diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h
index e1fe7cf..03f2a3e 100644
--- a/include/linux/page_ext.h
+++ b/include/linux/page_ext.h
@@ -3,6 +3,7 @@
 
 #include <linux/types.h>
 #include <linux/stacktrace.h>
+#include <linux/stackdepot.h>
 
 struct pglist_data;
 struct page_ext_operations {
@@ -44,9 +45,8 @@ struct page_ext {
 #ifdef CONFIG_PAGE_OWNER
  unsigned int order;
  gfp_t gfp_mask;
- unsigned int nr_entries;
  int last_migrate_reason;
- unsigned long trace_entries[8];
+ depot_stack_handle_t handle;
 #endif
 };
 
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 5d57177..a32fd24 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -248,6 +248,7 @@ config PAGE_OWNER
  depends on DEBUG_KERNEL && STACKTRACE_SUPPORT
  select DEBUG_FS
  select STACKTRACE
+ select STACKDEPOT
  select PAGE_EXTENSION
  help
   This keeps track of what call chain is the owner of a page, may
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 7b5a834..7875de5 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -7,11 +7,18 @@
 #include <linux/page_owner.h>
 #include <linux/jump_label.h>
 #include <linux/migrate.h>
+#include <linux/stackdepot.h>
+
 #include "internal.h"
 
+#define PAGE_OWNER_STACK_DEPTH (64)
+
 static bool page_owner_disabled = true;
 DEFINE_STATIC_KEY_FALSE(page_owner_inited);
 
+static depot_stack_handle_t dummy_handle;
+static depot_stack_handle_t failure_handle;
+
 static void init_early_allocated_pages(void);
 
 static int early_page_owner_param(char *buf)
@@ -34,11 +41,41 @@ static bool need_page_owner(void)
  return true;
 }
 
+static noinline void register_dummy_stack(void)
+{
+ unsigned long entries[4];
+ struct stack_trace dummy;
+
+ dummy.nr_entries = 0;
+ dummy.max_entries = ARRAY_SIZE(entries);
+ dummy.entries = &entries[0];
+ dummy.skip = 0;
+
+ save_stack_trace(&dummy);
+ dummy_handle = depot_save_stack(&dummy, GFP_KERNEL);
+}
+
+static noinline void register_failure_stack(void)
+{
+ unsigned long entries[4];
+ struct stack_trace failure;
+
+ failure.nr_entries = 0;
+ failure.max_entries = ARRAY_SIZE(entries);
+ failure.entries = &entries[0];
+ failure.skip = 0;
+
+ save_stack_trace(&failure);
+ failure_handle = depot_save_stack(&failure, GFP_KERNEL);
+}
+
 static void init_page_owner(void)
 {
  if (page_owner_disabled)
  return;
 
+ register_dummy_stack();
+ register_failure_stack();
  static_branch_enable(&page_owner_inited);
  init_early_allocated_pages();
 }
@@ -59,21 +96,56 @@ void __reset_page_owner(struct page *page, unsigned int order)
  }
 }
 
-void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask)
+static inline bool check_recursive_alloc(struct stack_trace *trace,
+ unsigned long ip)
 {
- struct page_ext *page_ext = lookup_page_ext(page);
+ int i, count;
+
+ if (!trace->nr_entries)
+ return false;
+
+ for (i = 0, count = 0; i < trace->nr_entries; i++) {
+ if (trace->entries[i] == ip && ++count == 2)
+ return true;
+ }
+
+ return false;
+}
+
+static noinline depot_stack_handle_t save_stack(gfp_t flags)
+{
+ unsigned long entries[PAGE_OWNER_STACK_DEPTH];
  struct stack_trace trace = {
  .nr_entries = 0,
- .max_entries = ARRAY_SIZE(page_ext->trace_entries),
- .entries = &page_ext->trace_entries[0],
- .skip = 3,
+ .entries = entries,
+ .max_entries = PAGE_OWNER_STACK_DEPTH,
+ .skip = 0
  };
+ depot_stack_handle_t handle;
 
  save_stack_trace(&trace);
+ if (trace.nr_entries != 0 &&
+    trace.entries[trace.nr_entries-1] == ULONG_MAX)
+ trace.nr_entries--;
+
+ if (check_recursive_alloc(&trace, _RET_IP_))
+ return dummy_handle;
+
+ handle = depot_save_stack(&trace, flags);
+ if (!handle)
+ handle = failure_handle;
+
+ return handle;
+}
 
+noinline void __set_page_owner(struct page *page, unsigned int order,
+ gfp_t gfp_mask)
+{
+ struct page_ext *page_ext = lookup_page_ext(page);
+
+ page_ext->handle = save_stack(gfp_mask);
  page_ext->order = order;
  page_ext->gfp_mask = gfp_mask;
- page_ext->nr_entries = trace.nr_entries;
  page_ext->last_migrate_reason = -1;
 
  __set_bit(PAGE_EXT_OWNER, &page_ext->flags);
@@ -100,15 +172,11 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage)
 {
  struct page_ext *old_ext = lookup_page_ext(oldpage);
  struct page_ext *new_ext = lookup_page_ext(newpage);
- int i;
 
  new_ext->order = old_ext->order;
  new_ext->gfp_mask = old_ext->gfp_mask;
  new_ext->last_migrate_reason = old_ext->last_migrate_reason;
- new_ext->nr_entries = old_ext->nr_entries;
-
- for (i = 0; i < ARRAY_SIZE(new_ext->trace_entries); i++)
- new_ext->trace_entries[i] = old_ext->trace_entries[i];
+ new_ext->handle = old_ext->handle;
 
  /*
  * We don't clear the bit on the oldpage as it's going to be freed
@@ -124,14 +192,18 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage)
 
 static ssize_t
 print_page_owner(char __user *buf, size_t count, unsigned long pfn,
- struct page *page, struct page_ext *page_ext)
+ struct page *page, struct page_ext *page_ext,
+ depot_stack_handle_t handle)
 {
  int ret;
  int pageblock_mt, page_mt;
  char *kbuf;
+ unsigned long entries[PAGE_OWNER_STACK_DEPTH];
  struct stack_trace trace = {
- .nr_entries = page_ext->nr_entries,
- .entries = &page_ext->trace_entries[0],
+ .nr_entries = 0,
+ .entries = entries,
+ .max_entries = PAGE_OWNER_STACK_DEPTH,
+ .skip = 0
  };
 
  kbuf = kmalloc(count, GFP_KERNEL);
@@ -160,6 +232,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
  if (ret >= count)
  goto err;
 
+ depot_fetch_stack(handle, &trace);
  ret += snprint_stack_trace(kbuf + ret, count - ret, &trace, 0);
  if (ret >= count)
  goto err;
@@ -190,10 +263,14 @@ err:
 void __dump_page_owner(struct page *page)
 {
  struct page_ext *page_ext = lookup_page_ext(page);
+ unsigned long entries[PAGE_OWNER_STACK_DEPTH];
  struct stack_trace trace = {
- .nr_entries = page_ext->nr_entries,
- .entries = &page_ext->trace_entries[0],
+ .nr_entries = 0,
+ .entries = entries,
+ .max_entries = PAGE_OWNER_STACK_DEPTH,
+ .skip = 0
  };
+ depot_stack_handle_t handle;
  gfp_t gfp_mask = page_ext->gfp_mask;
  int mt = gfpflags_to_migratetype(gfp_mask);
 
@@ -202,6 +279,13 @@ void __dump_page_owner(struct page *page)
  return;
  }
 
+ handle = READ_ONCE(page_ext->handle);
+ if (!handle) {
+ pr_alert("page_owner info is not active (free page?)\n");
+ return;
+ }
+
+ depot_fetch_stack(handle, &trace);
  pr_alert("page allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n",
  page_ext->order, migratetype_names[mt], gfp_mask, &gfp_mask);
  print_stack_trace(&trace, 0);
@@ -217,6 +301,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
  unsigned long pfn;
  struct page *page;
  struct page_ext *page_ext;
+ depot_stack_handle_t handle;
 
  if (!static_branch_unlikely(&page_owner_inited))
  return -EINVAL;
@@ -263,10 +348,19 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
  if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
  continue;
 
+ /*
+ * Access to page_ext->handle isn't synchronous so we should
+ * be careful to access it.
+ */
+ handle = READ_ONCE(page_ext->handle);
+ if (!handle)
+ continue;
+
  /* Record the next PFN to read in the file offset */
  *ppos = (pfn - min_low_pfn) + 1;
 
- return print_page_owner(buf, count, pfn, page, page_ext);
+ return print_page_owner(buf, count, pfn, page,
+ page_ext, handle);
  }
 
  return 0;
--
1.9.1

Reply | Threaded
Open this post in threaded view
|

[PATCH 4/6] mm/page_owner: introduce split_page_owner and replace manual handling

js1304
In reply to this post by js1304
From: Joonsoo Kim <[hidden email]>

split_page() calls set_page_owner() to set up page_owner for each page.
But, it has a drawback that the head page and the others have different
stacktraces because the callsite of set_page_owner() is slightly different.
To avoid this problem, this patch copies the head page's page_owner to
the others. It needs to introduce a new function, split_page_owner(), but
it also removes another function, get_page_owner_gfp(), so it looks like
a good trade-off.

Signed-off-by: Joonsoo Kim <[hidden email]>
---
 include/linux/page_owner.h | 12 +++++-------
 mm/page_alloc.c            |  8 ++------
 mm/page_owner.c            |  7 +++++--
 3 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/include/linux/page_owner.h b/include/linux/page_owner.h
index 46f1b93..30583ab 100644
--- a/include/linux/page_owner.h
+++ b/include/linux/page_owner.h
@@ -10,7 +10,7 @@ extern struct page_ext_operations page_owner_ops;
 extern void __reset_page_owner(struct page *page, unsigned int order);
 extern void __set_page_owner(struct page *page,
  unsigned int order, gfp_t gfp_mask);
-extern gfp_t __get_page_owner_gfp(struct page *page);
+extern void __split_page_owner(struct page *page, unsigned int order);
 extern void __copy_page_owner(struct page *oldpage, struct page *newpage);
 extern void __set_page_owner_migrate_reason(struct page *page, int reason);
 extern void __dump_page_owner(struct page *page);
@@ -28,12 +28,10 @@ static inline void set_page_owner(struct page *page,
  __set_page_owner(page, order, gfp_mask);
 }
 
-static inline gfp_t get_page_owner_gfp(struct page *page)
+static inline void split_page_owner(struct page *page, unsigned int order)
 {
  if (static_branch_unlikely(&page_owner_inited))
- return __get_page_owner_gfp(page);
- else
- return 0;
+ __split_page_owner(page, order);
 }
 static inline void copy_page_owner(struct page *oldpage, struct page *newpage)
 {
@@ -58,9 +56,9 @@ static inline void set_page_owner(struct page *page,
  unsigned int order, gfp_t gfp_mask)
 {
 }
-static inline gfp_t get_page_owner_gfp(struct page *page)
+static inline void split_page_owner(struct page *page,
+ unsigned int order)
 {
- return 0;
 }
 static inline void copy_page_owner(struct page *oldpage, struct page *newpage)
 {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5b632be..7cefc90 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2466,7 +2466,6 @@ void free_hot_cold_page_list(struct list_head *list, bool cold)
 void split_page(struct page *page, unsigned int order)
 {
  int i;
- gfp_t gfp_mask;
 
  VM_BUG_ON_PAGE(PageCompound(page), page);
  VM_BUG_ON_PAGE(!page_count(page), page);
@@ -2480,12 +2479,9 @@ void split_page(struct page *page, unsigned int order)
  split_page(virt_to_page(page[0].shadow), order);
 #endif
 
- gfp_mask = get_page_owner_gfp(page);
- set_page_owner(page, 0, gfp_mask);
- for (i = 1; i < (1 << order); i++) {
+ for (i = 1; i < (1 << order); i++)
  set_page_refcounted(page + i);
- set_page_owner(page + i, 0, gfp_mask);
- }
+ split_page_owner(page, order);
 }
 EXPORT_SYMBOL_GPL(split_page);
 
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 6693959..7b5a834 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -86,11 +86,14 @@ void __set_page_owner_migrate_reason(struct page *page, int reason)
  page_ext->last_migrate_reason = reason;
 }
 
-gfp_t __get_page_owner_gfp(struct page *page)
+void __split_page_owner(struct page *page, unsigned int order)
 {
  struct page_ext *page_ext = lookup_page_ext(page);
+ int i;
 
- return page_ext->gfp_mask;
+ page_ext->order = 0;
+ for (i = 1; i < (1 << order); i++)
+ __copy_page_owner(page, page + i);
 }
 
 void __copy_page_owner(struct page *oldpage, struct page *newpage)
--
1.9.1

Reply | Threaded
Open this post in threaded view
|

Re: [PATCH 6/6] mm/page_owner: use stackdepot to store stacktrace

Michal Hocko-4
In reply to this post by js1304
On Tue 03-05-16 14:23:04, Joonsoo Kim wrote:

> From: Joonsoo Kim <[hidden email]>
>
> Currently, we store each page's allocation stacktrace on corresponding
> page_ext structure and it requires a lot of memory. This causes the problem
> that memory tight system doesn't work well if page_owner is enabled.
> Moreover, even with this large memory consumption, we cannot get full
> stacktrace because we allocate memory at boot time and just maintain
> 8 stacktrace slots to balance memory consumption. We could increase it
> to more but it would make system unusable or change system behaviour.
>
> To solve the problem, this patch uses stackdepot to store stacktrace.
> It obviously provides memory saving but there is a drawback that
> stackdepot could fail.
>
> stackdepot allocates memory at runtime so it could fail if system has
> not enough memory. But, most of allocation stack are generated at very
> early time and there are much memory at this time. So, failure would not
> happen easily. And, one failure means that we miss just one page's
> allocation stacktrace so it would not be a big problem. In this patch,
> when memory allocation failure happens, we store special stracktrace
> handle to the page that is failed to save stacktrace. With it, user
> can guess memory usage properly even if failure happens.
>
> Memory saving looks as following. (Boot 4GB memory system with page_owner)
>
> 92274688 bytes -> 25165824 bytes

It is not clear to me whether this is after a fresh boot or some workload
which would grow the stack depot as well. What is a usual cap for the
memory consumption.

> 72% reduction in static allocation size. Even if we should add up size of
> dynamic allocation memory, it would not that big because stacktrace is
> mostly duplicated.
>
> Note that implementation looks complex than someone would imagine because
> there is recursion issue. stackdepot uses page allocator and page_owner
> is called at page allocation. Using stackdepot in page_owner could re-call
> page allcator and then page_owner. That is a recursion.

This is rather fragile. How do we check there is no lock dependency
introduced later on - e.g. split_page called from a different
locking/reclaim context than alloc_pages? Would it be safer to
use ~__GFP_DIRECT_RECLAIM for those stack allocations? Or do you think
there would be too many failed allocations? This alone wouldn't remove a
need for the recursion detection but it sounds less tricky.

> To detect and
> avoid it, whenever we obtain stacktrace, recursion is checked and
> page_owner is set to dummy information if found. Dummy information means
> that this page is allocated for page_owner feature itself
> (such as stackdepot) and it's understandable behavior for user.
>
> Signed-off-by: Joonsoo Kim <[hidden email]>

I like the idea in general I just wish this would be less subtle. Few
more comments below.

[...]

> -void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask)
> +static inline bool check_recursive_alloc(struct stack_trace *trace,
> + unsigned long ip)
>  {
> - struct page_ext *page_ext = lookup_page_ext(page);
> + int i, count;
> +
> + if (!trace->nr_entries)
> + return false;
> +
> + for (i = 0, count = 0; i < trace->nr_entries; i++) {
> + if (trace->entries[i] == ip && ++count == 2)
> + return true;
> + }

This would deserve a comment I guess. Btw, don't we have a better and
more robust way to detect the recursion? Per task_struct flag or
something like that?

[...]

> +static noinline depot_stack_handle_t save_stack(gfp_t flags)
> +{
> + unsigned long entries[PAGE_OWNER_STACK_DEPTH];
>   struct stack_trace trace = {
>   .nr_entries = 0,
> - .max_entries = ARRAY_SIZE(page_ext->trace_entries),
> - .entries = &page_ext->trace_entries[0],
> - .skip = 3,
> + .entries = entries,
> + .max_entries = PAGE_OWNER_STACK_DEPTH,
> + .skip = 0
>   };
[...]
>  void __dump_page_owner(struct page *page)
>  {
>   struct page_ext *page_ext = lookup_page_ext(page);
> + unsigned long entries[PAGE_OWNER_STACK_DEPTH];

This is worrying because of the excessive stack consumption while we
might be in a deep call chain already. Can we preallocate a hash table
for few buffers when the feature is enabled? This would require locking
of course but chances are that contention wouldn't be that large.

>   struct stack_trace trace = {
> - .nr_entries = page_ext->nr_entries,
> - .entries = &page_ext->trace_entries[0],
> + .nr_entries = 0,
> + .entries = entries,
> + .max_entries = PAGE_OWNER_STACK_DEPTH,
> + .skip = 0
>   };
> + depot_stack_handle_t handle;
>   gfp_t gfp_mask = page_ext->gfp_mask;
>   int mt = gfpflags_to_migratetype(gfp_mask);
>  

Thanks!
--
Michal Hocko
SUSE Labs
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH 6/6] mm/page_owner: use stackdepot to store stacktrace

Joonsoo Kim-2
On Tue, May 03, 2016 at 10:53:56AM +0200, Michal Hocko wrote:

> On Tue 03-05-16 14:23:04, Joonsoo Kim wrote:
> > From: Joonsoo Kim <[hidden email]>
> >
> > Currently, we store each page's allocation stacktrace on corresponding
> > page_ext structure and it requires a lot of memory. This causes the problem
> > that memory tight system doesn't work well if page_owner is enabled.
> > Moreover, even with this large memory consumption, we cannot get full
> > stacktrace because we allocate memory at boot time and just maintain
> > 8 stacktrace slots to balance memory consumption. We could increase it
> > to more but it would make system unusable or change system behaviour.
> >
> > To solve the problem, this patch uses stackdepot to store stacktrace.
> > It obviously provides memory saving but there is a drawback that
> > stackdepot could fail.
> >
> > stackdepot allocates memory at runtime so it could fail if system has
> > not enough memory. But, most of allocation stack are generated at very
> > early time and there are much memory at this time. So, failure would not
> > happen easily. And, one failure means that we miss just one page's
> > allocation stacktrace so it would not be a big problem. In this patch,
> > when memory allocation failure happens, we store special stracktrace
> > handle to the page that is failed to save stacktrace. With it, user
> > can guess memory usage properly even if failure happens.
> >
> > Memory saving looks as following. (Boot 4GB memory system with page_owner)
> >
> > 92274688 bytes -> 25165824 bytes
>
> It is not clear to me whether this is after a fresh boot or some workload
> which would grow the stack depot as well. What is a usual cap for the
> memory consumption.

It is static allocation size after a fresh boot. I didn't add size of
dynamic allocation memory so it could be larger a little. See below line.

>
> > 72% reduction in static allocation size. Even if we should add up size of
> > dynamic allocation memory, it would not that big because stacktrace is
> > mostly duplicated.
> >
> > Note that implementation looks complex than someone would imagine because
> > there is recursion issue. stackdepot uses page allocator and page_owner
> > is called at page allocation. Using stackdepot in page_owner could re-call
> > page allcator and then page_owner. That is a recursion.
>
> This is rather fragile. How do we check there is no lock dependency
> introduced later on - e.g. split_page called from a different
> locking/reclaim context than alloc_pages? Would it be safer to

There is no callsite that calls set_page_owner() with
__GFP_DIRECT_RECLAIM. So, there would be no lock/context dependency
now.

split_page() doesn't call set_page_owner(). Instead, it calls
split_page_owner() and just copies previous entry. Since it doesn't
require any new stackdepot entry, it is safe in any context.

> use ~__GFP_DIRECT_RECLAIM for those stack allocations? Or do you think
> there would be too many failed allocations? This alone wouldn't remove a
> need for the recursion detection but it sounds less tricky.
>
> > To detect and
> > avoid it, whenever we obtain stacktrace, recursion is checked and
> > page_owner is set to dummy information if found. Dummy information means
> > that this page is allocated for page_owner feature itself
> > (such as stackdepot) and it's understandable behavior for user.
> >
> > Signed-off-by: Joonsoo Kim <[hidden email]>
>
> I like the idea in general I just wish this would be less subtle. Few
> more comments below.
>
> [...]
> > -void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask)
> > +static inline bool check_recursive_alloc(struct stack_trace *trace,
> > + unsigned long ip)
> >  {
> > - struct page_ext *page_ext = lookup_page_ext(page);
> > + int i, count;
> > +
> > + if (!trace->nr_entries)
> > + return false;
> > +
> > + for (i = 0, count = 0; i < trace->nr_entries; i++) {
> > + if (trace->entries[i] == ip && ++count == 2)
> > + return true;
> > + }
>
> This would deserve a comment I guess. Btw, don't we have a better and
> more robust way to detect the recursion? Per task_struct flag or
> something like that?

Okay. I will add a comment.

I already considered a task_struct flag and I know that it is a better
solution. But, I don't think that this debugging feature deserves to
use such a precious flag. This implementation isn't efficient but I
think that it is at least robust.

> [...]
> > +static noinline depot_stack_handle_t save_stack(gfp_t flags)
> > +{
> > + unsigned long entries[PAGE_OWNER_STACK_DEPTH];
> >   struct stack_trace trace = {
> >   .nr_entries = 0,
> > - .max_entries = ARRAY_SIZE(page_ext->trace_entries),
> > - .entries = &page_ext->trace_entries[0],
> > - .skip = 3,
> > + .entries = entries,
> > + .max_entries = PAGE_OWNER_STACK_DEPTH,
> > + .skip = 0
> >   };
> [...]
> >  void __dump_page_owner(struct page *page)
> >  {
> >   struct page_ext *page_ext = lookup_page_ext(page);
> > + unsigned long entries[PAGE_OWNER_STACK_DEPTH];
>
> This is worrying because of the excessive stack consumption while we
> might be in a deep call chain already. Can we preallocate a hash table
> for few buffers when the feature is enabled? This would require locking
> of course but chances are that contention wouldn't be that large.

Makes sense, but I'm not sure that excessive stack consumption would
cause a real problem. For example, if direct reclaim is triggered during
allocation, it may go even deeper than this path. I'd like to
postpone handling this issue until stack breakage is reported due to
this feature.

Thanks.

Reply | Threaded
Open this post in threaded view
|

Re: [PATCH 6/6] mm/page_owner: use stackdepot to store stacktrace

Joonsoo Kim-2
On Wed, May 04, 2016 at 11:14:50AM +0900, Joonsoo Kim wrote:

> On Tue, May 03, 2016 at 10:53:56AM +0200, Michal Hocko wrote:
> > On Tue 03-05-16 14:23:04, Joonsoo Kim wrote:
> > > From: Joonsoo Kim <[hidden email]>
> > >
> > > Currently, we store each page's allocation stacktrace on corresponding
> > > page_ext structure and it requires a lot of memory. This causes the problem
> > > that memory tight system doesn't work well if page_owner is enabled.
> > > Moreover, even with this large memory consumption, we cannot get full
> > > stacktrace because we allocate memory at boot time and just maintain
> > > 8 stacktrace slots to balance memory consumption. We could increase it
> > > to more but it would make system unusable or change system behaviour.
> > >
> > > To solve the problem, this patch uses stackdepot to store stacktrace.
> > > It obviously provides memory saving but there is a drawback that
> > > stackdepot could fail.
> > >
> > > stackdepot allocates memory at runtime so it could fail if system has
> > > not enough memory. But, most of allocation stack are generated at very
> > > early time and there are much memory at this time. So, failure would not
> > > happen easily. And, one failure means that we miss just one page's
> > > allocation stacktrace so it would not be a big problem. In this patch,
> > > when memory allocation failure happens, we store special stracktrace
> > > handle to the page that is failed to save stacktrace. With it, user
> > > can guess memory usage properly even if failure happens.
> > >
> > > Memory saving looks as following. (Boot 4GB memory system with page_owner)
> > >
> > > 92274688 bytes -> 25165824 bytes
> >
> > It is not clear to me whether this is after a fresh boot or some workload
> > which would grow the stack depot as well. What is a usual cap for the
> > memory consumption.
>
> It is static allocation size after a fresh boot. I didn't add size of
> dynamic allocation memory so it could be larger a little. See below line.
> >
> > > 72% reduction in static allocation size. Even if we should add up size of
> > > dynamic allocation memory, it would not that big because stacktrace is
> > > mostly duplicated.
> > >
> > > Note that implementation looks complex than someone would imagine because
> > > there is recursion issue. stackdepot uses page allocator and page_owner
> > > is called at page allocation. Using stackdepot in page_owner could re-call
> > > page allcator and then page_owner. That is a recursion.
> >
> > This is rather fragile. How do we check there is no lock dependency
> > introduced later on - e.g. split_page called from a different
> > locking/reclaim context than alloc_pages? Would it be safer to
>
> There is no callsite that calls set_page_owner() with
> __GFP_DIRECT_RECLAIM. So, there would be no lock/context dependency
> now.
>
> split_page() doesn't call set_page_owner(). Instead, it calls
> split_page_owner() and just copies previous entry. Since it doesn't
> require any new stackdepot entry, it is safe in any context.
>
> > use ~__GFP_DIRECT_RECLAIM for those stack allocations? Or do you think
> > there would be too many failed allocations? This alone wouldn't remove a
> > need for the recursion detection but it sounds less tricky.
> >
> > > To detect and
> > > avoid it, whenever we obtain stacktrace, recursion is checked and
> > > page_owner is set to dummy information if found. Dummy information means
> > > that this page is allocated for page_owner feature itself
> > > (such as stackdepot) and it's understandable behavior for user.
> > >
> > > Signed-off-by: Joonsoo Kim <[hidden email]>
> >
> > I like the idea in general I just wish this would be less subtle. Few
> > more comments below.
> >
> > [...]
> > > -void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask)
> > > +static inline bool check_recursive_alloc(struct stack_trace *trace,
> > > + unsigned long ip)
> > >  {
> > > - struct page_ext *page_ext = lookup_page_ext(page);
> > > + int i, count;
> > > +
> > > + if (!trace->nr_entries)
> > > + return false;
> > > +
> > > + for (i = 0, count = 0; i < trace->nr_entries; i++) {
> > > + if (trace->entries[i] == ip && ++count == 2)
> > > + return true;
> > > + }
> >
> > This would deserve a comment I guess. Btw, don't we have a better and
> > more robust way to detect the recursion? Per task_struct flag or
> > something like that?
>
> Okay. I will add a comment.
>
> I already considered task_struct flag and I know that it is a better
> solution. But, I don't think that this debugging feature deserve to
> use such precious flag. This implementation isn't efficient but I
> think that it is at least robust.
>
> > [...]
> > > +static noinline depot_stack_handle_t save_stack(gfp_t flags)
> > > +{
> > > + unsigned long entries[PAGE_OWNER_STACK_DEPTH];
> > >   struct stack_trace trace = {
> > >   .nr_entries = 0,
> > > - .max_entries = ARRAY_SIZE(page_ext->trace_entries),
> > > - .entries = &page_ext->trace_entries[0],
> > > - .skip = 3,
> > > + .entries = entries,
> > > + .max_entries = PAGE_OWNER_STACK_DEPTH,
> > > + .skip = 0
> > >   };
> > [...]
> > >  void __dump_page_owner(struct page *page)
> > >  {
> > >   struct page_ext *page_ext = lookup_page_ext(page);
> > > + unsigned long entries[PAGE_OWNER_STACK_DEPTH];
> >
> > This is worrying because of the excessive stack consumption while we
> > might be in a deep call chain already. Can we preallocate a hash table
> > for few buffers when the feature is enabled? This would require locking
> > of course but chances are that contention wouldn't be that large.
>
> Make sense but I'm not sure that excessive stack consumption would
> cause real problem. For example, if direct reclaim is triggered during
> allocation, it may go more deeper than this path. I'd like to
> postpone to handle this issue until stack breakage is reported due to
> this feature.

Oops... I thought about it more deeply and changed my mind. In the recursion case,
more than 1KB of stack is consumed and it would be a problem. I think
that the best approach is using a preallocated per-cpu entry. It will also
close the recursion detection issue by paying interrupt on/off overhead.

Thanks.
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH 6/6] mm/page_owner: use stackdepot to store stacktrace

Michal Hocko-4
In reply to this post by Joonsoo Kim-2
On Wed 04-05-16 11:14:50, Joonsoo Kim wrote:
> On Tue, May 03, 2016 at 10:53:56AM +0200, Michal Hocko wrote:
> > On Tue 03-05-16 14:23:04, Joonsoo Kim wrote:
[...]

> > > Memory saving looks as following. (Boot 4GB memory system with page_owner)
> > >
> > > 92274688 bytes -> 25165824 bytes
> >
> > It is not clear to me whether this is after a fresh boot or some workload
> > which would grow the stack depot as well. What is a usual cap for the
> > memory consumption.
>
> It is static allocation size after a fresh boot. I didn't add size of
> dynamic allocation memory so it could be larger a little. See below line.
> >
> > > 72% reduction in static allocation size. Even if we should add up size of
> > > dynamic allocation memory, it would not that big because stacktrace is
> > > mostly duplicated.

This would be true only if most of the allocation stacks are basically
the same after the boot, which I am not really convinced is true. But you are
right that the number of duplicates will grow only a little. I was
interested in how much that little is ;)

> > > Note that implementation looks complex than someone would imagine because
> > > there is recursion issue. stackdepot uses page allocator and page_owner
> > > is called at page allocation. Using stackdepot in page_owner could re-call
> > > page allcator and then page_owner. That is a recursion.
> >
> > This is rather fragile. How do we check there is no lock dependency
> > introduced later on - e.g. split_page called from a different
> > locking/reclaim context than alloc_pages? Would it be safer to
>
> There is no callsite that calls set_page_owner() with
> __GFP_DIRECT_RECLAIM. So, there would be no lock/context dependency
> now.

I am confused now. prep_new_page is called with the gfp_mask of the
original request, no?

> split_page() doesn't call set_page_owner(). Instead, it calls
> split_page_owner() and just copies previous entry. Since it doesn't
> require any new stackdepot entry, it is safe in any context.

Ohh, you are right. I have missed patch 4
(http://lkml.kernel.org/r/1462252984-8524-5-git-send-email-iamjoonsoo.kim@...)

> > use ~__GFP_DIRECT_RECLAIM for those stack allocations? Or do you think
> > there would be too many failed allocations? This alone wouldn't remove a
> > need for the recursion detection but it sounds less tricky.
> >
> > > To detect and
> > > avoid it, whenever we obtain stacktrace, recursion is checked and
> > > page_owner is set to dummy information if found. Dummy information means
> > > that this page is allocated for page_owner feature itself
> > > (such as stackdepot) and it's understandable behavior for user.
> > >
> > > Signed-off-by: Joonsoo Kim <[hidden email]>
> >
> > I like the idea in general I just wish this would be less subtle. Few
> > more comments below.
> >
> > [...]
> > > -void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask)
> > > +static inline bool check_recursive_alloc(struct stack_trace *trace,
> > > + unsigned long ip)
> > >  {
> > > - struct page_ext *page_ext = lookup_page_ext(page);
> > > + int i, count;
> > > +
> > > + if (!trace->nr_entries)
> > > + return false;
> > > +
> > > + for (i = 0, count = 0; i < trace->nr_entries; i++) {
> > > + if (trace->entries[i] == ip && ++count == 2)
> > > + return true;
> > > + }
> >
> > This would deserve a comment I guess. Btw, don't we have a better and
> > more robust way to detect the recursion? Per task_struct flag or
> > something like that?
>
> Okay. I will add a comment.
>
> I already considered task_struct flag and I know that it is a better
> solution. But, I don't think that this debugging feature deserve to
> use such precious flag. This implementation isn't efficient but I
> think that it is at least robust.

I guess there are many holes in task_structs where a single bool would
comfortably fit in. But I do not consider this to be a large issue. It
is just the above looks quite ugly.

> > [...]
> > > +static noinline depot_stack_handle_t save_stack(gfp_t flags)
> > > +{
> > > + unsigned long entries[PAGE_OWNER_STACK_DEPTH];
> > >   struct stack_trace trace = {
> > >   .nr_entries = 0,
> > > - .max_entries = ARRAY_SIZE(page_ext->trace_entries),
> > > - .entries = &page_ext->trace_entries[0],
> > > - .skip = 3,
> > > + .entries = entries,
> > > + .max_entries = PAGE_OWNER_STACK_DEPTH,
> > > + .skip = 0
> > >   };
> > [...]
> > >  void __dump_page_owner(struct page *page)
> > >  {
> > >   struct page_ext *page_ext = lookup_page_ext(page);
> > > + unsigned long entries[PAGE_OWNER_STACK_DEPTH];
> >
> > This is worrying because of the excessive stack consumption while we
> > might be in a deep call chain already. Can we preallocate a hash table
> > for few buffers when the feature is enabled? This would require locking
> > of course but chances are that contention wouldn't be that large.
>
> Make sense but I'm not sure that excessive stack consumption would
> cause real problem. For example, if direct reclaim is triggered during
> allocation, it may go more deeper than this path.

Do we really consume 512B of stack during reclaim? That sounds more than
worrying to me.

> I'd like to postpone to handle this issue until stack breakage is
> reported due to this feature.

I dunno, but I would expect that a debugging feature wouldn't cause
problems like that. It is more than sad when you cannot debug your
issue just because of the stack consumption...

--
Michal Hocko
SUSE Labs
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH 6/6] mm/page_owner: use stackdepot to store stacktrace

Michal Hocko-4
In reply to this post by Joonsoo Kim-2
On Wed 04-05-16 11:35:00, Joonsoo Kim wrote:
[...]
> Oops... I think more deeply and change my mind. In recursion case,
> stack is consumed more than 1KB and it would be a problem. I think
> that best approach is using preallocated per cpu entry. It will also
> close recursion detection issue by paying interrupt on/off overhead.

I was thinking about a per-cpu solution as well but the thing is that the
stackdepot will allocate, and until you drop __GFP_DIRECT_RECLAIM,
per-cpu is not safe. I haven't checked the implementation of
depot_save_stack but I assume it will not schedule in other places.
--
Michal Hocko
SUSE Labs
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH 6/6] mm/page_owner: use stackdepot to store stacktrace

js1304
In reply to this post by Michal Hocko-4
2016-05-04 18:21 GMT+09:00 Michal Hocko <[hidden email]>:

> On Wed 04-05-16 11:14:50, Joonsoo Kim wrote:
>> On Tue, May 03, 2016 at 10:53:56AM +0200, Michal Hocko wrote:
>> > On Tue 03-05-16 14:23:04, Joonsoo Kim wrote:
> [...]
>> > > Memory saving looks as following. (Boot 4GB memory system with page_owner)
>> > >
>> > > 92274688 bytes -> 25165824 bytes
>> >
>> > It is not clear to me whether this is after a fresh boot or some workload
>> > which would grow the stack depot as well. What is a usual cap for the
>> > memory consumption.
>>
>> It is static allocation size after a fresh boot. I didn't add size of
>> dynamic allocation memory so it could be larger a little. See below line.
>> >
>> > > 72% reduction in static allocation size. Even if we should add up size of
>> > > dynamic allocation memory, it would not that big because stacktrace is
>> > > mostly duplicated.
>
> This would be true only if most of the allocation stacks are basically
> same after the boot which I am not really convinced is true. But you are
> right that the number of sublicates will grow only a little. I was
> interested about how much is that little ;)

After a fresh boot, it just uses 14 order-2 pages.

>> > > Note that implementation looks complex than someone would imagine because
>> > > there is recursion issue. stackdepot uses page allocator and page_owner
>> > > is called at page allocation. Using stackdepot in page_owner could re-call
>> > > page allcator and then page_owner. That is a recursion.
>> >
>> > This is rather fragile. How do we check there is no lock dependency
>> > introduced later on - e.g. split_page called from a different
>> > locking/reclaim context than alloc_pages? Would it be safer to
>>
>> There is no callsite that calls set_page_owner() with
>> __GFP_DIRECT_RECLAIM. So, there would be no lock/context dependency
>> now.
>
> I am confused now. prep_new_page is called with the gfp_mask of the
> original request, no?

Yes. I assume that set_page_owner() in prep_new_page() is okay. Sorry
for confusion.

>> split_page() doesn't call set_page_owner(). Instead, it calls
>> split_page_owner() and just copies previous entry. Since it doesn't
>> require any new stackdepot entry, it is safe in any context.
>
> Ohh, you are right. I have missed patch 4
> (http://lkml.kernel.org/r/1462252984-8524-5-git-send-email-iamjoonsoo.kim@...)
>
>> > use ~__GFP_DIRECT_RECLAIM for those stack allocations? Or do you think
>> > there would be too many failed allocations? This alone wouldn't remove a
>> > need for the recursion detection but it sounds less tricky.
>> >
>> > > To detect and
>> > > avoid it, whenever we obtain stacktrace, recursion is checked and
>> > > page_owner is set to dummy information if found. Dummy information means
>> > > that this page is allocated for page_owner feature itself
>> > > (such as stackdepot) and it's understandable behavior for user.
>> > >
>> > > Signed-off-by: Joonsoo Kim <[hidden email]>
>> >
>> > I like the idea in general I just wish this would be less subtle. Few
>> > more comments below.
>> >
>> > [...]
>> > > -void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask)
>> > > +static inline bool check_recursive_alloc(struct stack_trace *trace,
>> > > +                                 unsigned long ip)
>> > >  {
>> > > - struct page_ext *page_ext = lookup_page_ext(page);
>> > > + int i, count;
>> > > +
>> > > + if (!trace->nr_entries)
>> > > +         return false;
>> > > +
>> > > + for (i = 0, count = 0; i < trace->nr_entries; i++) {
>> > > +         if (trace->entries[i] == ip && ++count == 2)
>> > > +                 return true;
>> > > + }
>> >
>> > This would deserve a comment I guess. Btw, don't we have a better and
>> > more robust way to detect the recursion? Per task_struct flag or
>> > something like that?
>>
>> Okay. I will add a comment.
>>
>> I already considered task_struct flag and I know that it is a better
>> solution. But, I don't think that this debugging feature deserve to
>> use such precious flag. This implementation isn't efficient but I
>> think that it is at least robust.
>
> I guess there are many holes in task_structs where a single bool would
> comfortably fit in. But I do not consider this to be a large issue. It
> is just the above looks quite ugly.
>
>> > [...]
>> > > +static noinline depot_stack_handle_t save_stack(gfp_t flags)
>> > > +{
>> > > + unsigned long entries[PAGE_OWNER_STACK_DEPTH];
>> > >   struct stack_trace trace = {
>> > >           .nr_entries = 0,
>> > > -         .max_entries = ARRAY_SIZE(page_ext->trace_entries),
>> > > -         .entries = &page_ext->trace_entries[0],
>> > > -         .skip = 3,
>> > > +         .entries = entries,
>> > > +         .max_entries = PAGE_OWNER_STACK_DEPTH,
>> > > +         .skip = 0
>> > >   };
>> > [...]
>> > >  void __dump_page_owner(struct page *page)
>> > >  {
>> > >   struct page_ext *page_ext = lookup_page_ext(page);
>> > > + unsigned long entries[PAGE_OWNER_STACK_DEPTH];
>> >
>> > This is worrying because of the excessive stack consumption while we
>> > might be in a deep call chain already. Can we preallocate a hash table
>> > for few buffers when the feature is enabled? This would require locking
>> > of course but chances are that contention wouldn't be that large.
>>
>> Make sense but I'm not sure that excessive stack consumption would
>> cause real problem. For example, if direct reclaim is triggered during
>> allocation, it may go more deeper than this path.
>
> Do we really consume 512B of stack during reclaim. That sounds more than
> worrying to me.

Hmm...I checked it by ./script/stackusage and result is as below.

shrink_zone() 128
shrink_zone_memcg() 248
shrink_active_list() 176

We have a call path that shrink_zone() -> shrink_zone_memcg() ->
shrink_active_list().
I'm not sure whether it is the deepest path or not.

>> I'd like to postpone to handle this issue until stack breakage is
>> reported due to this feature.
>
> I dunno, but I would expect that a debugging feature wouldn't cause
> problems like that. It is more than sad when you cannot debug your
> issue just because of the stack consumption...

Okay. I will think more.

Thanks.
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH 6/6] mm/page_owner: use stackdepot to store stacktrace

js1304
In reply to this post by Michal Hocko-4
2016-05-04 18:23 GMT+09:00 Michal Hocko <[hidden email]>:

> On Wed 04-05-16 11:35:00, Joonsoo Kim wrote:
> [...]
>> Oops... I think more deeply and change my mind. In recursion case,
>> stack is consumed more than 1KB and it would be a problem. I think
>> that best approach is using preallocated per cpu entry. It will also
>> close recursion detection issue by paying interrupt on/off overhead.
>
> I was thinking about per-cpu solution as well but the thing is that the
> stackdepot will allocate and until you drop __GFP_DIRECT_RECLAIM then
> per-cpu is not safe. I haven't checked the implamentation of
> depot_save_stack but I assume it will not schedule in other places.

I will think more.

Thanks for review!

Thanks.
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH 6/6] mm/page_owner: use stackdepot to store stacktrace

js1304
In reply to this post by js1304
2016-05-05 0:30 GMT+09:00 Joonsoo Kim <[hidden email]>:

> 2016-05-04 18:21 GMT+09:00 Michal Hocko <[hidden email]>:
>> On Wed 04-05-16 11:14:50, Joonsoo Kim wrote:
>>> On Tue, May 03, 2016 at 10:53:56AM +0200, Michal Hocko wrote:
>>> > On Tue 03-05-16 14:23:04, Joonsoo Kim wrote:
>> [...]
>>> > > Memory saving looks as following. (Boot 4GB memory system with page_owner)
>>> > >
>>> > > 92274688 bytes -> 25165824 bytes
>>> >
>>> > It is not clear to me whether this is after a fresh boot or some workload
>>> > which would grow the stack depot as well. What is a usual cap for the
>>> > memory consumption.
>>>
>>> It is static allocation size after a fresh boot. I didn't add size of
>>> dynamic allocation memory so it could be larger a little. See below line.
>>> >
>>> > > 72% reduction in static allocation size. Even if we should add up size of
>>> > > dynamic allocation memory, it would not that big because stacktrace is
>>> > > mostly duplicated.
>>
>> This would be true only if most of the allocation stacks are basically
>> same after the boot which I am not really convinced is true. But you are
>> right that the number of sublicates will grow only a little. I was
>> interested about how much is that little ;)
>
> After a fresh boot, it just uses 14 order-2 pages.

I forgot to add other information. Even after building the kernel,
it takes 20 order-2 pages. 20 * 4 * 4KB = 320 KB.

Thanks.
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH 6/6] mm/page_owner: use stackdepot to store stacktrace

Michal Hocko-4
In reply to this post by js1304
On Thu 05-05-16 00:30:35, Joonsoo Kim wrote:
> 2016-05-04 18:21 GMT+09:00 Michal Hocko <[hidden email]>:
[...]

> > Do we really consume 512B of stack during reclaim. That sounds more than
> > worrying to me.
>
> Hmm...I checked it by ./script/stackusage and result is as below.
>
> shrink_zone() 128
> shrink_zone_memcg() 248
> shrink_active_list() 176
>
> We have a call path that shrink_zone() -> shrink_zone_memcg() ->
> shrink_active_list().
> I'm not sure whether it is the deepest path or not.

This is definitely not the deepest path. Slab shrinkers can take more
but 512B is still a lot. Some call paths are already too deep when
calling into the allocator and some of them already use GFP_NOFS to
prevent from potentially deep callchain slab shrinkers. Anyway worth
exploring for better solutions.

And I believe it would be better to solve this in the stackdepot
directly so other users do not have to invent their own ways around the
same issue. I have just checked the code and set_track uses save_stack
which does the same thing and it seems to be called from the slab
allocator. I have missed this usage before so the problem already does
exist. It would be unfair to request you to fix that in order to add a
new user. It would be great if this got addressed though.
--
Michal Hocko
SUSE Labs
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH 6/6] mm/page_owner: use stackdepot to store stacktrace

Michal Hocko-4
In reply to this post by js1304
On Thu 05-05-16 00:45:45, Joonsoo Kim wrote:

> 2016-05-05 0:30 GMT+09:00 Joonsoo Kim <[hidden email]>:
> > 2016-05-04 18:21 GMT+09:00 Michal Hocko <[hidden email]>:
> >> On Wed 04-05-16 11:14:50, Joonsoo Kim wrote:
> >>> On Tue, May 03, 2016 at 10:53:56AM +0200, Michal Hocko wrote:
> >>> > On Tue 03-05-16 14:23:04, Joonsoo Kim wrote:
> >> [...]
> >>> > > Memory saving looks as following. (Boot 4GB memory system with page_owner)
> >>> > >
> >>> > > 92274688 bytes -> 25165824 bytes
> >>> >
> >>> > It is not clear to me whether this is after a fresh boot or some workload
> >>> > which would grow the stack depot as well. What is a usual cap for the
> >>> > memory consumption.
> >>>
> >>> It is static allocation size after a fresh boot. I didn't add size of
> >>> dynamic allocation memory so it could be larger a little. See below line.
> >>> >
> >>> > > 72% reduction in static allocation size. Even if we should add up size of
> >>> > > dynamic allocation memory, it would not that big because stacktrace is
> >>> > > mostly duplicated.
> >>
> >> This would be true only if most of the allocation stacks are basically
> >> same after the boot which I am not really convinced is true. But you are
> >> right that the number of sublicates will grow only a little. I was
> >> interested about how much is that little ;)
> >
> > After a fresh boot, it just uses 14 order-2 pages.
>
> I missed to add other information. Even after building the kernel,
> it takes 20 order-2 pages. 20 * 4 * 4KB = 320 KB.

Something like that would be useful to mention in the changelog because
measuring right after the fresh boot without any reasonable workload
sounds suspicious.
--
Michal Hocko
SUSE Labs
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH 6/6] mm/page_owner: use stackdepot to store stacktrace

js1304
In reply to this post by Michal Hocko-4
2016-05-05 4:40 GMT+09:00 Michal Hocko <[hidden email]>:

> On Thu 05-05-16 00:30:35, Joonsoo Kim wrote:
>> 2016-05-04 18:21 GMT+09:00 Michal Hocko <[hidden email]>:
> [...]
>> > Do we really consume 512B of stack during reclaim. That sounds more than
>> > worrying to me.
>>
>> Hmm...I checked it by ./script/stackusage and result is as below.
>>
>> shrink_zone() 128
>> shrink_zone_memcg() 248
>> shrink_active_list() 176
>>
>> We have a call path that shrink_zone() -> shrink_zone_memcg() ->
>> shrink_active_list().
>> I'm not sure whether it is the deepest path or not.
>
> This is definitely not the deepest path. Slab shrinkers can take more
> but 512B is still a lot. Some call paths are already too deep when
> calling into the allocator and some of them already use GFP_NOFS to
> prevent from potentially deep callchain slab shrinkers. Anyway worth
> exploring for better solutions.
>
> And I believe it would be better to solve this in the stackdepot
> directly so other users do not have to invent their own ways around the
> same issue. I have just checked the code and set_track uses save_stack
> which does the same thing and it seems to be called from the slab
> allocator. I have missed this usage before so the problem already does
> exist. It would be unfair to request you to fix that in order to add a
> new user. It would be great if this got addressed though.

Yes, fixing it in stackdepot looks more reasonable.
Then, I will just change PAGE_OWNER_STACK_DEPTH from 64 to 16 and
leave the code as is for now. With this change, we will consume just 128B
of stack and it would not cause a stack problem. If anyone has an objection,
please let me know.

Thanks.
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH 6/6] mm/page_owner: use stackdepot to store stacktrace

Michal Hocko-4
On Tue 10-05-16 16:07:14, Joonsoo Kim wrote:

> 2016-05-05 4:40 GMT+09:00 Michal Hocko <[hidden email]>:
> > On Thu 05-05-16 00:30:35, Joonsoo Kim wrote:
> >> 2016-05-04 18:21 GMT+09:00 Michal Hocko <[hidden email]>:
> > [...]
> >> > Do we really consume 512B of stack during reclaim. That sounds more than
> >> > worrying to me.
> >>
> >> Hmm...I checked it by ./script/stackusage and result is as below.
> >>
> >> shrink_zone() 128
> >> shrink_zone_memcg() 248
> >> shrink_active_list() 176
> >>
> >> We have a call path that shrink_zone() -> shrink_zone_memcg() ->
> >> shrink_active_list().
> >> I'm not sure whether it is the deepest path or not.
> >
> > This is definitely not the deepest path. Slab shrinkers can take more
> > but 512B is still a lot. Some call paths are already too deep when
> > calling into the allocator and some of them already use GFP_NOFS to
> > prevent from potentially deep callchain slab shrinkers. Anyway worth
> > exploring for better solutions.
> >
> > And I believe it would be better to solve this in the stackdepot
> > directly so other users do not have to invent their own ways around the
> > same issue. I have just checked the code and set_track uses save_stack
> > which does the same thing and it seems to be called from the slab
> > allocator. I have missed this usage before so the problem already does
> > exist. It would be unfair to request you to fix that in order to add a
> > new user. It would be great if this got addressed though.
>
> Yes, fixing it in stackdepot looks more reasonable.
> Then, I will just change PAGE_OWNER_STACK_DEPTH from 64 to 16 and
> leave the code as is for now. With this change, we will just consume 128B stack
> and would not cause stack problem. If anyone has an objection,
> please let me know.

128B is still quite a lot but considering there is a plan to make it
more robust I can live with it as a temporary workaround.

--
Michal Hocko
SUSE Labs
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH 1/6] mm/compaction: split freepages without holding the zone lock

Vlastimil Babka
In reply to this post by js1304
On 05/03/2016 07:22 AM, [hidden email] wrote:
> From: Joonsoo Kim <[hidden email]>
>
> We don't need to split freepages with holding the zone lock. It will cause
> more contention on zone lock so not desirable.

Fair enough, I just worry about the same thing as Hugh pointed out
recently [1] in that it increases the amount of tricky stuff in
compaction.c doing similar but not quite the same stuff as page/alloc.c,
and which will be forgotten to be updated once somebody updates
prep_new_page with e.g. a new debugging check. Can you perhaps think of
a more robust solution here?

Thanks

[1] http://marc.info/?i=alpine.LSU.2.11.1604270029350.7066%40eggly.anvils%3E

> Signed-off-by: Joonsoo Kim <[hidden email]>
> ---
>   include/linux/mm.h |  1 -
>   mm/compaction.c    | 42 ++++++++++++++++++++++++++++++------------
>   mm/page_alloc.c    | 27 ---------------------------
>   3 files changed, 30 insertions(+), 40 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 7b52750..9608f33 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -523,7 +523,6 @@ void __put_page(struct page *page);
>   void put_pages_list(struct list_head *pages);
>  
>   void split_page(struct page *page, unsigned int order);
> -int split_free_page(struct page *page);
>  
>   /*
>    * Compound pages have a destructor function.  Provide a
> diff --git a/mm/compaction.c b/mm/compaction.c
> index c9a95c1..ecf0252 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -65,13 +65,31 @@ static unsigned long release_freepages(struct list_head *freelist)
>  
>   static void map_pages(struct list_head *list)
>   {
> - struct page *page;
> + unsigned int i, order, nr_pages;
> + struct page *page, *next;
> + LIST_HEAD(tmp_list);
> +
> + list_for_each_entry_safe(page, next, list, lru) {
> + list_del(&page->lru);
> +
> + order = page_private(page);
> + nr_pages = 1 << order;
> + set_page_private(page, 0);
> + set_page_refcounted(page);
> +
> + arch_alloc_page(page, order);
> + kernel_map_pages(page, nr_pages, 1);
> + kasan_alloc_pages(page, order);
> + if (order)
> + split_page(page, order);
>  
> - list_for_each_entry(page, list, lru) {
> - arch_alloc_page(page, 0);
> - kernel_map_pages(page, 1, 1);
> - kasan_alloc_pages(page, 0);
> + for (i = 0; i < nr_pages; i++) {
> + list_add(&page->lru, &tmp_list);
> + page++;
> + }
>   }
> +
> + list_splice(&tmp_list, list);
>   }
>  
>   static inline bool migrate_async_suitable(int migratetype)
> @@ -368,12 +386,13 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
>   unsigned long flags = 0;
>   bool locked = false;
>   unsigned long blockpfn = *start_pfn;
> + unsigned int order;
>  
>   cursor = pfn_to_page(blockpfn);
>  
>   /* Isolate free pages. */
>   for (; blockpfn < end_pfn; blockpfn++, cursor++) {
> - int isolated, i;
> + int isolated;
>   struct page *page = cursor;
>  
>   /*
> @@ -439,13 +458,12 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
>   goto isolate_fail;
>   }
>  
> - /* Found a free page, break it into order-0 pages */
> - isolated = split_free_page(page);
> + /* Found a free page, will break it into order-0 pages */
> + order = page_order(page);
> + isolated = __isolate_free_page(page, page_order(page));
> + set_page_private(page, order);
>   total_isolated += isolated;
> - for (i = 0; i < isolated; i++) {
> - list_add(&page->lru, freelist);
> - page++;
> - }
> + list_add_tail(&page->lru, freelist);
>  
>   /* If a page was split, advance to the end of it */
>   if (isolated) {
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 5dd65d9..60d7f10 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -2532,33 +2532,6 @@ int __isolate_free_page(struct page *page, unsigned int order)
>   }
>  
>   /*
> - * Similar to split_page except the page is already free. As this is only
> - * being used for migration, the migratetype of the block also changes.
> - * As this is called with interrupts disabled, the caller is responsible
> - * for calling arch_alloc_page() and kernel_map_page() after interrupts
> - * are enabled.
> - *
> - * Note: this is probably too low level an operation for use in drivers.
> - * Please consult with lkml before using this in your driver.
> - */
> -int split_free_page(struct page *page)
> -{
> - unsigned int order;
> - int nr_pages;
> -
> - order = page_order(page);
> -
> - nr_pages = __isolate_free_page(page, order);
> - if (!nr_pages)
> - return 0;
> -
> - /* Split into individual pages */
> - set_page_refcounted(page);
> - split_page(page, order);
> - return nr_pages;
> -}
> -
> -/*
>    * Update NUMA hit/miss statistics
>    *
>    * Must be called with interrupts disabled.
>

12