kernel: bump 5.15 to 5.15.100
[openwrt/openwrt.git] target/linux/generic/backport-5.15/020-v6.1-05-mm-multigenerational-lru-mm_struct-list.patch
1 From 534bcc4a0bb5b24600891ce793f0295a142e9dae Mon Sep 17 00:00:00 2001
2 From: Yu Zhao <yuzhao@google.com>
3 Date: Mon, 5 Apr 2021 04:17:41 -0600
4 Subject: [PATCH 05/10] mm: multigenerational lru: mm_struct list
5
6 To scan PTEs for accessed pages, a mm_struct list is maintained for
7 each memcg. When multiple threads traverse the same memcg->mm_list,
8 each of them gets a unique mm_struct, so they can run
9 walk_page_range() concurrently to reach the page tables of all
10 processes in this memcg.
11
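At its core this is a shared cursor over a list. Below is a minimal
userspace sketch of that idea, assuming pthreads; struct mm_node and
mm_list_next() are illustrative names, not part of this patch, and the
real get_next_mm() further down also handles generations and skipping.

#include <pthread.h>
#include <stddef.h>

struct mm_node {
	struct mm_node *next;	/* singly linked FIFO of processes */
	/* per-process state a walker would scan */
};

struct mm_list {
	pthread_mutex_t lock;	/* protects cursor */
	struct mm_node *head;	/* first process on the list */
	struct mm_node *cursor;	/* next process to hand out */
};

/*
 * Each walker calls this in a loop: the lock is held only while the
 * cursor advances, so no two walkers receive the same node and the
 * expensive page table walk itself runs outside the lock.
 */
static struct mm_node *mm_list_next(struct mm_list *list)
{
	struct mm_node *node;

	pthread_mutex_lock(&list->lock);
	node = list->cursor;
	if (node)
		list->cursor = node->next;
	pthread_mutex_unlock(&list->lock);

	return node;
}
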
12 This infrastructure also provides the following optimizations:
13 1) it allows walkers to skip processes that have been sleeping since
14 the last walk, by tracking mm_struct usage across context switches;
15 see the usage-flag sketch after this list.
16 2) it allows walkers to add interesting items found during a walk to
17 a Bloom filter, and then to skip uninteresting items during the
18 next walk by testing whether an item is in that filter; see the
19 Bloom filter sketch after this list.
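
The first optimization comes down to a per-process "has it run since
the last walk" flag, set from the context switch path and
test-and-cleared by walkers. A hedged C11 sketch of that usage flag,
with a single atomic flag standing in for the per-node nodemask the
patch actually keeps; proc_activate() and proc_should_skip() are
made-up names:

#include <stdatomic.h>
#include <stdbool.h>

struct proc {
	atomic_bool used;	/* has this process run since the last walk? */
};

/*
 * Context switch path: mark the process as recently run. Testing before
 * setting avoids dirtying a shared cacheline on every switch, much like
 * the nodes_full() check in lru_gen_activate_mm() below.
 */
static inline void proc_activate(struct proc *p)
{
	if (!atomic_load_explicit(&p->used, memory_order_relaxed))
		atomic_store_explicit(&p->used, true, memory_order_relaxed);
}

/*
 * Walker: a process that has not run since the last walk cannot have
 * set new accessed bits, so it can be skipped; otherwise clear the flag
 * so the next walk starts from a clean slate.
 */
static inline bool proc_should_skip(struct proc *p)
{
	return !atomic_exchange_explicit(&p->used, false, memory_order_relaxed);
}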
20
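The Bloom filter itself is ordinary: two keys derived from one hash,
where "no" is definite and "yes" may be a false positive that only
costs a wasted visit. A standalone userspace sketch under the same
caveats (the 15-bit size mirrors BLOOM_FILTER_SHIFT, hashing is left to
the caller, which in the patch is hash_ptr(), and the per-lruvec double
buffering of two filters is omitted):

#include <stdbool.h>
#include <stdint.h>

#define FILTER_SHIFT	15
#define FILTER_BITS	(1U << FILTER_SHIFT)
#define BITS_PER_LONG	(8 * sizeof(unsigned long))

static unsigned long filter[FILTER_BITS / BITS_PER_LONG];

static void set_filter_bit(unsigned int nr, unsigned long *map)
{
	map[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
}

static bool test_filter_bit(unsigned int nr, const unsigned long *map)
{
	return map[nr / BITS_PER_LONG] & (1UL << (nr % BITS_PER_LONG));
}

/* Derive two 15-bit keys from one hash of the item. */
static void item_keys(uint32_t hash, unsigned int key[2])
{
	key[0] = hash & (FILTER_BITS - 1);
	key[1] = (hash >> FILTER_SHIFT) & (FILTER_BITS - 1);
}

/* Remember an interesting item found during this walk. */
static void filter_add(uint32_t hash)
{
	unsigned int key[2];

	item_keys(hash, key);
	set_filter_bit(key[0], filter);
	set_filter_bit(key[1], filter);
}

/* Test during the next walk before visiting an item. */
static bool filter_contains(uint32_t hash)
{
	unsigned int key[2];

	item_keys(hash, key);
	return test_filter_bit(key[0], filter) && test_filter_bit(key[1], filter);
}
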
21 Signed-off-by: Yu Zhao <yuzhao@google.com>
22 Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
23 Change-Id: I25d9eda8c6bdc7c3653b9f210a159d6c247c81e8
24 ---
25 fs/exec.c | 2 +
26 include/linux/memcontrol.h | 4 +
27 include/linux/mm_inline.h | 6 +
28 include/linux/mm_types.h | 75 +++++++++
29 include/linux/mmzone.h | 63 +++++++
30 kernel/exit.c | 1 +
31 kernel/fork.c | 9 +
32 kernel/sched/core.c | 1 +
33 mm/memcontrol.c | 25 +++
34 mm/vmscan.c | 331 +++++++++++++++++++++++++++++++++++++
35 10 files changed, 517 insertions(+)
36
37 --- a/fs/exec.c
38 +++ b/fs/exec.c
39 @@ -1013,6 +1013,7 @@ static int exec_mmap(struct mm_struct *m
40 active_mm = tsk->active_mm;
41 tsk->active_mm = mm;
42 tsk->mm = mm;
43 + lru_gen_add_mm(mm);
44 /*
45 * This prevents preemption while active_mm is being loaded and
46 * it and mm are being updated, which could cause problems for
47 @@ -1023,6 +1024,7 @@ static int exec_mmap(struct mm_struct *m
48 if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
49 local_irq_enable();
50 activate_mm(active_mm, mm);
51 + lru_gen_activate_mm(mm);
52 if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
53 local_irq_enable();
54 tsk->mm->vmacache_seqnum = 0;
55 --- a/include/linux/memcontrol.h
56 +++ b/include/linux/memcontrol.h
57 @@ -348,6 +348,10 @@ struct mem_cgroup {
58 struct deferred_split deferred_split_queue;
59 #endif
60
61 +#ifdef CONFIG_LRU_GEN
62 + struct lru_gen_mm_list mm_list;
63 +#endif
64 +
65 struct mem_cgroup_per_node *nodeinfo[];
66 };
67
68 --- a/include/linux/mm_inline.h
69 +++ b/include/linux/mm_inline.h
70 @@ -100,6 +100,12 @@ static inline int lru_gen_from_seq(unsig
71 return seq % MAX_NR_GENS;
72 }
73
74 +/* Return a proper index regardless whether we keep stats for historical generations. */
75 +static inline int lru_hist_from_seq(unsigned long seq)
76 +{
77 + return seq % NR_HIST_GENS;
78 +}
79 +
80 /* The youngest and the second youngest generations are counted as active. */
81 static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
82 {
83 --- a/include/linux/mm_types.h
84 +++ b/include/linux/mm_types.h
85 @@ -3,6 +3,7 @@
86 #define _LINUX_MM_TYPES_H
87
88 #include <linux/mm_types_task.h>
89 +#include <linux/sched.h>
90
91 #include <linux/auxvec.h>
92 #include <linux/list.h>
93 @@ -15,6 +16,8 @@
94 #include <linux/page-flags-layout.h>
95 #include <linux/workqueue.h>
96 #include <linux/seqlock.h>
97 +#include <linux/nodemask.h>
98 +#include <linux/mmdebug.h>
99
100 #include <asm/mmu.h>
101
102 @@ -580,6 +583,18 @@ struct mm_struct {
103 #ifdef CONFIG_IOMMU_SUPPORT
104 u32 pasid;
105 #endif
106 +#ifdef CONFIG_LRU_GEN
107 + struct {
108 + /* the node of a global or per-memcg mm_struct list */
109 + struct list_head list;
110 +#ifdef CONFIG_MEMCG
111 + /* points to the memcg of the owner task above */
112 + struct mem_cgroup *memcg;
113 +#endif
114 + /* whether this mm_struct has been used since the last walk */
115 + nodemask_t nodes;
116 + } lrugen;
117 +#endif /* CONFIG_LRU_GEN */
118 } __randomize_layout;
119
120 /*
121 @@ -606,6 +621,66 @@ static inline cpumask_t *mm_cpumask(stru
122 return (struct cpumask *)&mm->cpu_bitmap;
123 }
124
125 +#ifdef CONFIG_LRU_GEN
126 +
127 +struct lru_gen_mm_list {
128 + /* a global or per-memcg mm_struct list */
129 + struct list_head fifo;
130 + /* protects the list above */
131 + spinlock_t lock;
132 +};
133 +
134 +void lru_gen_add_mm(struct mm_struct *mm);
135 +void lru_gen_del_mm(struct mm_struct *mm);
136 +#ifdef CONFIG_MEMCG
137 +void lru_gen_migrate_mm(struct mm_struct *mm);
138 +#endif
139 +
140 +static inline void lru_gen_init_mm(struct mm_struct *mm)
141 +{
142 + INIT_LIST_HEAD(&mm->lrugen.list);
143 +#ifdef CONFIG_MEMCG
144 + mm->lrugen.memcg = NULL;
145 +#endif
146 + nodes_clear(mm->lrugen.nodes);
147 +}
148 +
149 +/* Track the usage of each mm_struct so that we can skip inactive ones. */
150 +static inline void lru_gen_activate_mm(struct mm_struct *mm)
151 +{
152 + /* unlikely but not a bug when racing with lru_gen_migrate_mm() */
153 + VM_WARN_ON(list_empty(&mm->lrugen.list));
154 +
155 + if (!(current->flags & PF_KTHREAD) && !nodes_full(mm->lrugen.nodes))
156 + nodes_setall(mm->lrugen.nodes);
157 +}
158 +
159 +#else /* !CONFIG_LRU_GEN */
160 +
161 +static inline void lru_gen_add_mm(struct mm_struct *mm)
162 +{
163 +}
164 +
165 +static inline void lru_gen_del_mm(struct mm_struct *mm)
166 +{
167 +}
168 +
169 +#ifdef CONFIG_MEMCG
170 +static inline void lru_gen_migrate_mm(struct mm_struct *mm)
171 +{
172 +}
173 +#endif
174 +
175 +static inline void lru_gen_init_mm(struct mm_struct *mm)
176 +{
177 +}
178 +
179 +static inline void lru_gen_activate_mm(struct mm_struct *mm)
180 +{
181 +}
182 +
183 +#endif /* CONFIG_LRU_GEN */
184 +
185 struct mmu_gather;
186 extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
187 extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
188 --- a/include/linux/mmzone.h
189 +++ b/include/linux/mmzone.h
190 @@ -318,6 +318,13 @@ struct lruvec;
191 #define MIN_NR_GENS 2
192 #define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS)
193
194 +/* Whether to keep stats for historical generations. */
195 +#ifdef CONFIG_LRU_GEN_STATS
196 +#define NR_HIST_GENS ((unsigned int)CONFIG_NR_LRU_GENS)
197 +#else
198 +#define NR_HIST_GENS 1U
199 +#endif
200 +
201 struct lrugen {
202 /* the aging increments the max generation number */
203 unsigned long max_seq;
204 @@ -333,13 +340,63 @@ struct lrugen {
205 bool enabled[ANON_AND_FILE];
206 };
207
208 +enum {
209 + MM_LEAF_TOTAL, /* total leaf entries */
210 + MM_LEAF_OLD, /* old leaf entries */
211 + MM_LEAF_YOUNG, /* young leaf entries */
212 + MM_NONLEAF_TOTAL, /* total non-leaf entries */
213 + MM_NONLEAF_PREV, /* previously worthy non-leaf entries */
214 + MM_NONLEAF_CUR, /* currently worthy non-leaf entries */
215 + NR_MM_STATS
216 +};
217 +
218 +/* mnemonic codes for the stats above */
219 +#define MM_STAT_CODES "toydpc"
220 +
221 +/* double buffering bloom filters */
222 +#define NR_BLOOM_FILTERS 2
223 +
224 +struct lru_gen_mm_walk {
225 + /* set to max_seq after each round of walk */
226 + unsigned long seq;
227 + /* the next mm_struct on the list to walk */
228 + struct list_head *head;
229 + /* the first mm_struct never walked before */
230 + struct list_head *tail;
231 + /* to wait for the last walker to finish */
232 + struct wait_queue_head wait;
233 + /* bloom filters flip after each round of walk */
234 + unsigned long *filters[NR_BLOOM_FILTERS];
235 + /* page table stats for debugging */
236 + unsigned long stats[NR_HIST_GENS][NR_MM_STATS];
237 + /* the number of concurrent walkers */
238 + int nr_walkers;
239 +};
240 +
241 +#define MIN_BATCH_SIZE 64
242 #define MAX_BATCH_SIZE 8192
243
244 +struct mm_walk_args {
245 + struct mem_cgroup *memcg;
246 + unsigned long max_seq;
247 + unsigned long start_pfn;
248 + unsigned long end_pfn;
249 + unsigned long next_addr;
250 + unsigned long bitmap[BITS_TO_LONGS(MIN_BATCH_SIZE)];
251 + int node_id;
252 + int swappiness;
253 + int batch_size;
254 + int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
255 + int mm_stats[NR_MM_STATS];
256 + bool use_filter;
257 +};
258 +
259 void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec);
260 void lru_gen_change_state(bool enable, bool main, bool swap);
261
262 #ifdef CONFIG_MEMCG
263 void lru_gen_init_memcg(struct mem_cgroup *memcg);
264 +void lru_gen_free_memcg(struct mem_cgroup *memcg);
265 #endif
266
267 #else /* !CONFIG_LRU_GEN */
268 @@ -356,6 +413,10 @@ static inline void lru_gen_change_state(
269 static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
270 {
271 }
272 +
273 +static inline void lru_gen_free_memcg(struct mem_cgroup *memcg)
274 +{
275 +}
276 #endif
277
278 #endif /* CONFIG_LRU_GEN */
279 @@ -380,6 +441,8 @@ struct lruvec {
280 #ifdef CONFIG_LRU_GEN
281 /* unevictable pages are on LRU_UNEVICTABLE */
282 struct lrugen evictable;
283 + /* state for mm list and page table walks */
284 + struct lru_gen_mm_walk mm_walk;
285 #endif
286 #ifdef CONFIG_MEMCG
287 struct pglist_data *pgdat;
288 --- a/kernel/exit.c
289 +++ b/kernel/exit.c
290 @@ -469,6 +469,7 @@ assign_new_owner:
291 goto retry;
292 }
293 WRITE_ONCE(mm->owner, c);
294 + lru_gen_migrate_mm(mm);
295 task_unlock(c);
296 put_task_struct(c);
297 }
298 --- a/kernel/fork.c
299 +++ b/kernel/fork.c
300 @@ -1083,6 +1083,7 @@ static struct mm_struct *mm_init(struct
301 goto fail_nocontext;
302
303 mm->user_ns = get_user_ns(user_ns);
304 + lru_gen_init_mm(mm);
305 return mm;
306
307 fail_nocontext:
308 @@ -1125,6 +1126,7 @@ static inline void __mmput(struct mm_str
309 }
310 if (mm->binfmt)
311 module_put(mm->binfmt->module);
312 + lru_gen_del_mm(mm);
313 mmdrop(mm);
314 }
315
316 @@ -2622,6 +2624,13 @@ pid_t kernel_clone(struct kernel_clone_a
317 get_task_struct(p);
318 }
319
320 + if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) {
321 + /* lock the task to synchronize with memcg migration */
322 + task_lock(p);
323 + lru_gen_add_mm(p->mm);
324 + task_unlock(p);
325 + }
326 +
327 wake_up_new_task(p);
328
329 /* forking complete and child started to run, tell ptracer */
330 --- a/kernel/sched/core.c
331 +++ b/kernel/sched/core.c
332 @@ -5007,6 +5007,7 @@ context_switch(struct rq *rq, struct tas
333 * finish_task_switch()'s mmdrop().
334 */
335 switch_mm_irqs_off(prev->active_mm, next->mm, next);
336 + lru_gen_activate_mm(next->mm);
337
338 if (!prev->mm) { // from kernel
339 /* will mmdrop() in finish_task_switch(). */
340 --- a/mm/memcontrol.c
341 +++ b/mm/memcontrol.c
342 @@ -5178,6 +5178,7 @@ static void __mem_cgroup_free(struct mem
343
344 static void mem_cgroup_free(struct mem_cgroup *memcg)
345 {
346 + lru_gen_free_memcg(memcg);
347 memcg_wb_domain_exit(memcg);
348 __mem_cgroup_free(memcg);
349 }
350 @@ -6210,6 +6211,29 @@ static void mem_cgroup_move_task(void)
351 }
352 #endif
353
354 +#ifdef CONFIG_LRU_GEN
355 +static void mem_cgroup_attach(struct cgroup_taskset *tset)
356 +{
357 + struct cgroup_subsys_state *css;
358 + struct task_struct *task = NULL;
359 +
360 + cgroup_taskset_for_each_leader(task, css, tset)
361 + break;
362 +
363 + if (!task)
364 + return;
365 +
366 + task_lock(task);
367 + if (task->mm && task->mm->owner == task)
368 + lru_gen_migrate_mm(task->mm);
369 + task_unlock(task);
370 +}
371 +#else
372 +static void mem_cgroup_attach(struct cgroup_taskset *tset)
373 +{
374 +}
375 +#endif /* CONFIG_LRU_GEN */
376 +
377 static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
378 {
379 if (value == PAGE_COUNTER_MAX)
380 @@ -6553,6 +6577,7 @@ struct cgroup_subsys memory_cgrp_subsys
381 .css_reset = mem_cgroup_css_reset,
382 .css_rstat_flush = mem_cgroup_css_rstat_flush,
383 .can_attach = mem_cgroup_can_attach,
384 + .attach = mem_cgroup_attach,
385 .cancel_attach = mem_cgroup_cancel_attach,
386 .post_attach = mem_cgroup_move_task,
387 .dfl_cftypes = memory_files,
388 --- a/mm/vmscan.c
389 +++ b/mm/vmscan.c
390 @@ -2864,6 +2864,306 @@ static bool __maybe_unused seq_is_valid(
391 }
392
393 /******************************************************************************
394 + * mm_struct list
395 + ******************************************************************************/
396 +
397 +static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg)
398 +{
399 + static struct lru_gen_mm_list mm_list = {
400 + .fifo = LIST_HEAD_INIT(mm_list.fifo),
401 + .lock = __SPIN_LOCK_UNLOCKED(mm_list.lock),
402 + };
403 +
404 +#ifdef CONFIG_MEMCG
405 + if (memcg)
406 + return &memcg->mm_list;
407 +#endif
408 + return &mm_list;
409 +}
410 +
411 +void lru_gen_add_mm(struct mm_struct *mm)
412 +{
413 + int nid;
414 + struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
415 + struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
416 +
417 + VM_BUG_ON_MM(!list_empty(&mm->lrugen.list), mm);
418 +#ifdef CONFIG_MEMCG
419 + VM_BUG_ON_MM(mm->lrugen.memcg, mm);
420 + mm->lrugen.memcg = memcg;
421 +#endif
422 + spin_lock(&mm_list->lock);
423 +
424 + list_add_tail(&mm->lrugen.list, &mm_list->fifo);
425 +
426 + for_each_node(nid) {
427 + struct lruvec *lruvec = get_lruvec(nid, memcg);
428 +
429 + if (!lruvec)
430 + continue;
431 +
432 + if (lruvec->mm_walk.tail == &mm_list->fifo)
433 + lruvec->mm_walk.tail = lruvec->mm_walk.tail->prev;
434 + }
435 +
436 + spin_unlock(&mm_list->lock);
437 +}
438 +
439 +void lru_gen_del_mm(struct mm_struct *mm)
440 +{
441 + int nid;
442 + struct lru_gen_mm_list *mm_list;
443 + struct mem_cgroup *memcg = NULL;
444 +
445 + if (list_empty(&mm->lrugen.list))
446 + return;
447 +
448 +#ifdef CONFIG_MEMCG
449 + memcg = mm->lrugen.memcg;
450 +#endif
451 + mm_list = get_mm_list(memcg);
452 +
453 + spin_lock(&mm_list->lock);
454 +
455 + for_each_node(nid) {
456 + struct lruvec *lruvec = get_lruvec(nid, memcg);
457 +
458 + if (!lruvec)
459 + continue;
460 +
461 + if (lruvec->mm_walk.tail == &mm->lrugen.list)
462 + lruvec->mm_walk.tail = lruvec->mm_walk.tail->next;
463 +
464 + if (lruvec->mm_walk.head != &mm->lrugen.list)
465 + continue;
466 +
467 + lruvec->mm_walk.head = lruvec->mm_walk.head->next;
468 + if (lruvec->mm_walk.head == &mm_list->fifo)
469 + WRITE_ONCE(lruvec->mm_walk.seq, lruvec->mm_walk.seq + 1);
470 + }
471 +
472 + list_del_init(&mm->lrugen.list);
473 +
474 + spin_unlock(&mm_list->lock);
475 +
476 +#ifdef CONFIG_MEMCG
477 + mem_cgroup_put(mm->lrugen.memcg);
478 + mm->lrugen.memcg = NULL;
479 +#endif
480 +}
481 +
482 +#ifdef CONFIG_MEMCG
483 +void lru_gen_migrate_mm(struct mm_struct *mm)
484 +{
485 + struct mem_cgroup *memcg;
486 +
487 + lockdep_assert_held(&mm->owner->alloc_lock);
488 +
489 + if (mem_cgroup_disabled())
490 + return;
491 +
492 + rcu_read_lock();
493 + memcg = mem_cgroup_from_task(mm->owner);
494 + rcu_read_unlock();
495 + if (memcg == mm->lrugen.memcg)
496 + return;
497 +
498 + VM_BUG_ON_MM(!mm->lrugen.memcg, mm);
499 + VM_BUG_ON_MM(list_empty(&mm->lrugen.list), mm);
500 +
501 + lru_gen_del_mm(mm);
502 + lru_gen_add_mm(mm);
503 +}
504 +#endif
505 +
506 +#define BLOOM_FILTER_SHIFT 15
507 +
508 +static inline int filter_gen_from_seq(unsigned long seq)
509 +{
510 + return seq % NR_BLOOM_FILTERS;
511 +}
512 +
513 +static void get_item_key(void *item, int *key)
514 +{
515 + u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2);
516 +
517 + BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32));
518 +
519 + key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1);
520 + key[1] = hash >> BLOOM_FILTER_SHIFT;
521 +}
522 +
523 +static void clear_bloom_filter(struct lruvec *lruvec, unsigned long seq)
524 +{
525 + unsigned long *filter;
526 + int gen = filter_gen_from_seq(seq);
527 +
528 + lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock);
529 +
530 + filter = lruvec->mm_walk.filters[gen];
531 + if (filter) {
532 + bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT));
533 + return;
534 + }
535 +
536 + filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), GFP_ATOMIC);
537 + WRITE_ONCE(lruvec->mm_walk.filters[gen], filter);
538 +}
539 +
540 +static void set_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
541 +{
542 + int key[2];
543 + unsigned long *filter;
544 + int gen = filter_gen_from_seq(seq);
545 +
546 + filter = READ_ONCE(lruvec->mm_walk.filters[gen]);
547 + if (!filter)
548 + return;
549 +
550 + get_item_key(item, key);
551 +
552 + if (!test_bit(key[0], filter))
553 + set_bit(key[0], filter);
554 + if (!test_bit(key[1], filter))
555 + set_bit(key[1], filter);
556 +}
557 +
558 +static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
559 +{
560 + int key[2];
561 + unsigned long *filter;
562 + int gen = filter_gen_from_seq(seq);
563 +
564 + filter = READ_ONCE(lruvec->mm_walk.filters[gen]);
565 + if (!filter)
566 + return false;
567 +
568 + get_item_key(item, key);
569 +
570 + return test_bit(key[0], filter) && test_bit(key[1], filter);
571 +}
572 +
573 +static void reset_mm_stats(struct lruvec *lruvec, bool last, struct mm_walk_args *args)
574 +{
575 + int i;
576 + int hist = lru_hist_from_seq(args->max_seq);
577 +
578 + lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock);
579 +
580 + for (i = 0; i < NR_MM_STATS; i++) {
581 + WRITE_ONCE(lruvec->mm_walk.stats[hist][i],
582 + lruvec->mm_walk.stats[hist][i] + args->mm_stats[i]);
583 + args->mm_stats[i] = 0;
584 + }
585 +
586 + if (!last || NR_HIST_GENS == 1)
587 + return;
588 +
589 + hist = lru_hist_from_seq(args->max_seq + 1);
590 + for (i = 0; i < NR_MM_STATS; i++)
591 + WRITE_ONCE(lruvec->mm_walk.stats[hist][i], 0);
592 +}
593 +
594 +static bool should_skip_mm(struct mm_struct *mm, struct mm_walk_args *args)
595 +{
596 + int type;
597 + unsigned long size = 0;
598 +
599 + if (cpumask_empty(mm_cpumask(mm)) && !node_isset(args->node_id, mm->lrugen.nodes))
600 + return true;
601 +
602 + if (mm_is_oom_victim(mm))
603 + return true;
604 +
605 + for (type = !args->swappiness; type < ANON_AND_FILE; type++) {
606 + size += type ? get_mm_counter(mm, MM_FILEPAGES) :
607 + get_mm_counter(mm, MM_ANONPAGES) +
608 + get_mm_counter(mm, MM_SHMEMPAGES);
609 + }
610 +
611 + if (size < MIN_BATCH_SIZE)
612 + return true;
613 +
614 + if (!mmget_not_zero(mm))
615 + return true;
616 +
617 + node_clear(args->node_id, mm->lrugen.nodes);
618 +
619 + return false;
620 +}
621 +
622 +/* To support multiple walkers that concurrently walk an mm_struct list. */
623 +static bool get_next_mm(struct lruvec *lruvec, struct mm_walk_args *args,
624 + struct mm_struct **iter)
625 +{
626 + bool first = false;
627 + bool last = true;
628 + struct mm_struct *mm = NULL;
629 + struct lru_gen_mm_walk *mm_walk = &lruvec->mm_walk;
630 + struct lru_gen_mm_list *mm_list = get_mm_list(args->memcg);
631 +
632 + if (*iter)
633 + mmput_async(*iter);
634 + else if (args->max_seq <= READ_ONCE(mm_walk->seq))
635 + return false;
636 +
637 + spin_lock(&mm_list->lock);
638 +
639 + VM_BUG_ON(args->max_seq > mm_walk->seq + 1);
640 + VM_BUG_ON(*iter && args->max_seq < mm_walk->seq);
641 + VM_BUG_ON(*iter && !mm_walk->nr_walkers);
642 +
643 + if (args->max_seq <= mm_walk->seq) {
644 + if (!*iter)
645 + last = false;
646 + goto done;
647 + }
648 +
649 + if (mm_walk->head == &mm_list->fifo) {
650 + VM_BUG_ON(mm_walk->nr_walkers);
651 + mm_walk->head = mm_walk->head->next;
652 + first = true;
653 + }
654 +
655 + while (!mm && mm_walk->head != &mm_list->fifo) {
656 + mm = list_entry(mm_walk->head, struct mm_struct, lrugen.list);
657 +
658 + mm_walk->head = mm_walk->head->next;
659 +
660 + if (mm_walk->tail == &mm->lrugen.list) {
661 + mm_walk->tail = mm_walk->tail->next;
662 + args->use_filter = false;
663 + }
664 +
665 + if (should_skip_mm(mm, args))
666 + mm = NULL;
667 + }
668 +
669 + if (mm_walk->head == &mm_list->fifo)
670 + WRITE_ONCE(mm_walk->seq, mm_walk->seq + 1);
671 +done:
672 + if (*iter && !mm)
673 + mm_walk->nr_walkers--;
674 + if (!*iter && mm)
675 + mm_walk->nr_walkers++;
676 +
677 + if (mm_walk->nr_walkers)
678 + last = false;
679 +
680 + if (mm && first)
681 + clear_bloom_filter(lruvec, args->max_seq + 1);
682 +
683 + if (*iter || last)
684 + reset_mm_stats(lruvec, last, args);
685 +
686 + spin_unlock(&mm_list->lock);
687 +
688 + *iter = mm;
689 +
690 + return last;
691 +}
692 +
693 +/******************************************************************************
694 * state change
695 ******************************************************************************/
696
697 @@ -3047,6 +3347,7 @@ void lru_gen_init_state(struct mem_cgrou
698 int i;
699 int gen, type, zone;
700 struct lrugen *lrugen = &lruvec->evictable;
701 + struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
702
703 lrugen->max_seq = MIN_NR_GENS + 1;
704 lrugen->enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles;
705 @@ -3057,6 +3358,17 @@ void lru_gen_init_state(struct mem_cgrou
706
707 for_each_gen_type_zone(gen, type, zone)
708 INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
709 +
710 + if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && !memcg)
711 + spin_lock(&mm_list->lock);
712 +
713 + lruvec->mm_walk.seq = MIN_NR_GENS;
714 + lruvec->mm_walk.head = &mm_list->fifo;
715 + lruvec->mm_walk.tail = &mm_list->fifo;
716 + init_waitqueue_head(&lruvec->mm_walk.wait);
717 +
718 + if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && !memcg)
719 + spin_unlock(&mm_list->lock);
720 }
721
722 #ifdef CONFIG_MEMCG
723 @@ -3064,18 +3376,37 @@ void lru_gen_init_memcg(struct mem_cgrou
724 {
725 int nid;
726
727 + INIT_LIST_HEAD(&memcg->mm_list.fifo);
728 + spin_lock_init(&memcg->mm_list.lock);
729 +
730 for_each_node(nid) {
731 struct lruvec *lruvec = get_lruvec(nid, memcg);
732
733 lru_gen_init_state(memcg, lruvec);
734 }
735 }
736 +
737 +void lru_gen_free_memcg(struct mem_cgroup *memcg)
738 +{
739 + int nid;
740 +
741 + for_each_node(nid) {
742 + int i;
743 + struct lruvec *lruvec = get_lruvec(nid, memcg);
744 +
745 + for (i = 0; i < NR_BLOOM_FILTERS; i++) {
746 + bitmap_free(lruvec->mm_walk.filters[i]);
747 + lruvec->mm_walk.filters[i] = NULL;
748 + }
749 + }
750 +}
751 #endif
752
753 static int __init init_lru_gen(void)
754 {
755 BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
756 BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
757 + BUILD_BUG_ON(sizeof(MM_STAT_CODES) != NR_MM_STATS + 1);
758
759 return 0;
760 };