1 From 5cc7fdec54e87e32b4fb0f07d84b21769d5f8d92 Mon Sep 17 00:00:00 2001
2 From: Yu Zhao <yuzhao@google.com>
3 Date: Mon, 25 Jan 2021 21:38:02 -0700
4 Subject: [PATCH 08/10] mm: multigenerational lru: user interface
6 Add /sys/kernel/mm/lru_gen/enabled to enable and disable the
7 multigenerational lru at runtime.
9 Add /sys/kernel/mm/lru_gen/min_ttl_ms to protect the working set for a
10 given number of milliseconds. The OOM killer is invoked if this
11 working set cannot be kept in memory.
13 Add /sys/kernel/debug/lru_gen to monitor the multigenerational lru and
14 invoke the aging and the eviction. This file has the following output:
15 memcg memcg_id memcg_path
17 min_gen birth_time anon_size file_size
19 max_gen birth_time anon_size file_size
21 min_gen is the oldest generation number and max_gen is the youngest
22 generation number. birth_time is in milliseconds. anon_size and
23 file_size are in pages.
25 This file takes the following input:
26 + memcg_id node_id max_gen [swappiness] [use_bloom_filter]
27 - memcg_id node_id min_gen [swappiness] [nr_to_reclaim]
29 The first command line invokes the aging, which scans PTEs for
30 accessed pages and then creates the next generation max_gen+1. A swap
31 file and a non-zero swappiness, which overrides vm.swappiness, are
32 required to scan PTEs mapping anon pages. The second command line
33 invokes the eviction, which evicts generations less than or equal to
34 min_gen. min_gen should be less than max_gen-1 as max_gen and
35 max_gen-1 are not fully aged and therefore cannot be evicted.
36 Setting nr_to_reclaim to N limits the number of pages to evict.
37 Setting use_bloom_filter to 0 overrides the default behavior, which
38 only scans PTE tables that have been found populated. Multiple command
39 lines are supported, as is concatenation with delimiters "," and ";".
41 Signed-off-by: Yu Zhao <yuzhao@google.com>
42 Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
43 Change-Id: I4448e60029badbe347aa3b624f429b280cc3a3d3
45 include/linux/nodemask.h | 1 +
46 mm/vmscan.c | 415 +++++++++++++++++++++++++++++++++++++++
47 2 files changed, 416 insertions(+)
49 --- a/include/linux/nodemask.h
50 +++ b/include/linux/nodemask.h
51 @@ -485,6 +485,7 @@ static inline int num_node_state(enum no
52 #define first_online_node 0
53 #define first_memory_node 0
54 #define next_online_node(nid) (MAX_NUMNODES)
55 +#define next_memory_node(nid) (MAX_NUMNODES)
56 #define nr_node_ids 1U
57 #define nr_online_nodes 1U
62 #include <linux/memory.h>
63 #include <linux/pagewalk.h>
64 #include <linux/shmem_fs.h>
65 +#include <linux/ctype.h>
66 +#include <linux/debugfs.h>
68 #include <asm/tlbflush.h>
69 #include <asm/div64.h>
70 @@ -4882,6 +4884,413 @@ unlock:
73 /******************************************************************************
75 + ******************************************************************************/
77 +static ssize_t show_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
79 + return sprintf(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl)));
82 +static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr,
83 + const char *buf, size_t len)
87 + if (kstrtouint(buf, 10, &msecs))
90 + WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs));
95 +static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR(
96 + min_ttl_ms, 0644, show_min_ttl, store_min_ttl
99 +static ssize_t show_enable(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
101 + return snprintf(buf, PAGE_SIZE, "%d\n", lru_gen_enabled());
104 +static ssize_t store_enable(struct kobject *kobj, struct kobj_attribute *attr,
105 + const char *buf, size_t len)
109 + if (kstrtobool(buf, &enable))
112 + lru_gen_change_state(enable, true, false);
117 +static struct kobj_attribute lru_gen_enabled_attr = __ATTR(
118 + enabled, 0644, show_enable, store_enable
121 +static struct attribute *lru_gen_attrs[] = {
122 + &lru_gen_min_ttl_attr.attr,
123 + &lru_gen_enabled_attr.attr,
127 +static struct attribute_group lru_gen_attr_group = {
129 + .attrs = lru_gen_attrs,
132 +/******************************************************************************
133 + * debugfs interface
134 + ******************************************************************************/
136 +static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos)
138 + struct mem_cgroup *memcg;
139 + loff_t nr_to_skip = *pos;
141 + m->private = kvmalloc(PATH_MAX, GFP_KERNEL);
143 + return ERR_PTR(-ENOMEM);
145 + memcg = mem_cgroup_iter(NULL, NULL, NULL);
149 + for_each_node_state(nid, N_MEMORY) {
151 + return get_lruvec(nid, memcg);
153 + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
158 +static void lru_gen_seq_stop(struct seq_file *m, void *v)
160 + if (!IS_ERR_OR_NULL(v))
161 + mem_cgroup_iter_break(NULL, lruvec_memcg(v));
163 + kvfree(m->private);
167 +static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos)
169 + int nid = lruvec_pgdat(v)->node_id;
170 + struct mem_cgroup *memcg = lruvec_memcg(v);
174 + nid = next_memory_node(nid);
175 + if (nid == MAX_NUMNODES) {
176 + memcg = mem_cgroup_iter(NULL, memcg, NULL);
180 + nid = first_memory_node;
183 + return get_lruvec(nid, memcg);
186 +static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
187 + unsigned long max_seq, unsigned long *min_seq,
192 + int hist = lru_hist_from_seq(seq);
193 + struct lrugen *lrugen = &lruvec->evictable;
195 + for (tier = 0; tier < MAX_NR_TIERS; tier++) {
196 + seq_printf(m, " %10d", tier);
197 + for (type = 0; type < ANON_AND_FILE; type++) {
198 + unsigned long n[3] = {};
200 + if (seq == max_seq) {
201 + n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]);
202 + n[1] = READ_ONCE(lrugen->avg_total[type][tier]);
204 + seq_printf(m, " %10luR %10luT %10lu ", n[0], n[1], n[2]);
205 + } else if (seq == min_seq[type] || NR_HIST_GENS > 1) {
206 + n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]);
207 + n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]);
209 + n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]);
211 + seq_printf(m, " %10lur %10lue %10lup", n[0], n[1], n[2]);
213 + seq_puts(m, " 0 0 0 ");
219 + for (i = 0; i < NR_MM_STATS; i++) {
220 + if (seq == max_seq && NR_HIST_GENS == 1)
221 + seq_printf(m, " %10lu%c", READ_ONCE(lruvec->mm_walk.stats[hist][i]),
222 + toupper(MM_STAT_CODES[i]));
223 + else if (seq != max_seq && NR_HIST_GENS > 1)
224 + seq_printf(m, " %10lu%c", READ_ONCE(lruvec->mm_walk.stats[hist][i]),
227 + seq_puts(m, " 0 ");
232 +static int lru_gen_seq_show(struct seq_file *m, void *v)
235 + bool full = !debugfs_real_fops(m->file)->write;
236 + struct lruvec *lruvec = v;
237 + struct lrugen *lrugen = &lruvec->evictable;
238 + int nid = lruvec_pgdat(lruvec)->node_id;
239 + struct mem_cgroup *memcg = lruvec_memcg(lruvec);
240 + DEFINE_MAX_SEQ(lruvec);
241 + DEFINE_MIN_SEQ(lruvec);
243 + if (nid == first_memory_node) {
244 + const char *path = memcg ? m->private : "";
248 + cgroup_path(memcg->css.cgroup, m->private, PATH_MAX);
250 + seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path);
253 + seq_printf(m, " node %5d\n", nid);
257 + else if (max_seq >= MAX_NR_GENS)
258 + seq = max_seq - MAX_NR_GENS + 1;
262 + for (; seq <= max_seq; seq++) {
263 + int gen, type, zone;
264 + unsigned int msecs;
266 + gen = lru_gen_from_seq(seq);
267 + msecs = jiffies_to_msecs(jiffies - READ_ONCE(lrugen->timestamps[gen]));
269 + seq_printf(m, " %10lu %10u", seq, msecs);
271 + for (type = 0; type < ANON_AND_FILE; type++) {
274 + if (seq < min_seq[type]) {
275 + seq_puts(m, " -0 ");
279 + for (zone = 0; zone < MAX_NR_ZONES; zone++)
280 + size += READ_ONCE(lrugen->sizes[gen][type][zone]);
282 + seq_printf(m, " %10lu ", max(size, 0L));
288 + lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq);
294 +static const struct seq_operations lru_gen_seq_ops = {
295 + .start = lru_gen_seq_start,
296 + .stop = lru_gen_seq_stop,
297 + .next = lru_gen_seq_next,
298 + .show = lru_gen_seq_show,
301 +static int run_aging(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
302 + unsigned long seq, bool use_filter)
304 + DEFINE_MAX_SEQ(lruvec);
306 + if (seq == max_seq)
307 + try_to_inc_max_seq(lruvec, sc, swappiness, max_seq, use_filter);
309 + return seq > max_seq ? -EINVAL : 0;
312 +static int run_eviction(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
313 + unsigned long seq, unsigned long nr_to_reclaim)
315 + struct blk_plug plug;
317 + DEFINE_MAX_SEQ(lruvec);
319 + if (seq >= max_seq - 1)
322 + sc->nr_reclaimed = 0;
324 + blk_start_plug(&plug);
326 + while (!signal_pending(current)) {
327 + DEFINE_MIN_SEQ(lruvec);
329 + if (seq < min_seq[!swappiness] || sc->nr_reclaimed >= nr_to_reclaim ||
330 + !evict_pages(lruvec, sc, swappiness)) {
338 + blk_finish_plug(&plug);
343 +static int run_cmd(char cmd, int memcg_id, int nid, struct scan_control *sc,
344 + int swappiness, unsigned long seq, unsigned long opt)
346 + struct lruvec *lruvec;
348 + struct mem_cgroup *memcg = NULL;
350 + if (!mem_cgroup_disabled()) {
352 + memcg = mem_cgroup_from_id(memcg_id);
354 + if (memcg && !css_tryget(&memcg->css))
362 + if (memcg_id != mem_cgroup_id(memcg))
365 + if (nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY))
368 + lruvec = get_lruvec(nid, memcg);
370 + if (swappiness < 0)
371 + swappiness = get_swappiness(memcg);
372 + else if (swappiness > 200)
377 + err = run_aging(lruvec, sc, swappiness, seq, opt);
380 + err = run_eviction(lruvec, sc, swappiness, seq, opt);
384 + mem_cgroup_put(memcg);
389 +static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
390 + size_t len, loff_t *pos)
394 + unsigned int flags;
396 + struct scan_control sc = {
397 + .may_writepage = 1,
400 + .reclaim_idx = MAX_NR_ZONES - 1,
401 + .gfp_mask = GFP_KERNEL,
404 + buf = kvmalloc(len + 1, GFP_KERNEL);
408 + if (copy_from_user(buf, src, len)) {
416 + sc.reclaim_state.mm_walk_args = alloc_mm_walk_args();
417 + if (!sc.reclaim_state.mm_walk_args) {
422 + flags = memalloc_noreclaim_save();
423 + set_task_reclaim_state(current, &sc.reclaim_state);
425 + while ((cur = strsep(&next, ",;\n"))) {
429 + unsigned int memcg_id;
432 + unsigned int swappiness = -1;
433 + unsigned long opt = -1;
435 + cur = skip_spaces(cur);
439 + n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid,
440 + &seq, &end, &swappiness, &end, &opt, &end);
441 + if (n < 4 || cur[end]) {
446 + err = run_cmd(cmd, memcg_id, nid, &sc, swappiness, seq, opt);
451 + set_task_reclaim_state(current, NULL);
452 + memalloc_noreclaim_restore(flags);
454 + free_mm_walk_args(sc.reclaim_state.mm_walk_args);
457 + return err ? : len;
460 +static int lru_gen_seq_open(struct inode *inode, struct file *file)
462 + return seq_open(file, &lru_gen_seq_ops);
465 +static const struct file_operations lru_gen_rw_fops = {
466 + .open = lru_gen_seq_open,
468 + .write = lru_gen_seq_write,
469 + .llseek = seq_lseek,
470 + .release = seq_release,
473 +static const struct file_operations lru_gen_ro_fops = {
474 + .open = lru_gen_seq_open,
476 + .llseek = seq_lseek,
477 + .release = seq_release,
480 +/******************************************************************************
482 ******************************************************************************/
484 @@ -4951,6 +5360,12 @@ static int __init init_lru_gen(void)
485 BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
486 BUILD_BUG_ON(sizeof(MM_STAT_CODES) != NR_MM_STATS + 1);
488 + if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
489 + pr_err("lru_gen: failed to create sysfs group\n");
491 + debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops);
492 + debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops);
496 late_initcall(init_lru_gen);