kernel: Update MGLRU patchset
[openwrt/openwrt.git] target/linux/generic/backport-5.15/020-v6.3-26-mm-multi-gen-LRU-per-node-lru_gen_page-lists.patch
1 From fa6363828d314e837c5f79e97ea5e8c0d2f7f062 Mon Sep 17 00:00:00 2001
2 From: Yu Zhao <yuzhao@google.com>
3 Date: Wed, 21 Dec 2022 21:19:04 -0700
4 Subject: [PATCH 26/29] mm: multi-gen LRU: per-node lru_gen_page lists
5
6 For each node, memcgs are divided into two generations: the old and
7 the young. For each generation, memcgs are randomly sharded into
8 multiple bins to improve scalability. For each bin, an RCU hlist_nulls
9 is virtually divided into three segments: the head, the tail and the
10 default.
11
12 An onlining memcg is added to the tail of a random bin in the old
13 generation. The eviction starts at the head of a random bin in the old
14 generation. The per-node memcg generation counter, whose remainder (mod
15 2) indexes the old generation, is incremented when all its bins become
16 empty.
17
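A condensed sketch of the per-node structure and the generation/bin indexing,
pulled together from the mmzone.h and vmscan.c hunks below (the authoritative
definitions are in those hunks; locking and bookkeeping are only noted in
comments here):

#include <linux/list_nulls.h>   /* struct hlist_nulls_head */
#include <linux/spinlock.h>

#define MEMCG_NR_GENS   2       /* the old and the young generation */
#define MEMCG_NR_BINS   8       /* random shards per generation */

struct lru_gen_memcg {
        /* the per-node memcg generation counter; seq % MEMCG_NR_GENS indexes the old gen */
        unsigned long seq;
        /* the number of memcgs in each generation */
        unsigned long nr_memcgs[MEMCG_NR_GENS];
        /* per-node lru_gen_page lists for global reclaim, sharded into bins */
        struct hlist_nulls_head fifo[MEMCG_NR_GENS][MEMCG_NR_BINS];
        /* protects the above */
        spinlock_t lock;
};

#define get_memcg_gen(seq)      ((seq) % MEMCG_NR_GENS)
#define get_memcg_bin(bin)      ((bin) % MEMCG_NR_BINS)

/*
 * Onlining (lru_gen_online_memcg() below) adds the memcg's lru_gen_page to the
 * tail of fifo[get_memcg_gen(seq)][prandom_u32_max(MEMCG_NR_BINS)], i.e. a
 * random bin in the old generation; eviction starts from the head of a random
 * bin in the old generation, and seq is advanced once nr_memcgs for the old
 * generation drops to zero (all its bins are empty).
 */
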
18 There are four operations:
19 1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in
20 its current generation (old or young) and updates its "seg" to
21 "head";
22 2. MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in
23 its current generation (old or young) and updates its "seg" to
24 "tail";
25 3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in
26 the old generation, updates its "gen" to "old" and resets its "seg"
27 to "default";
28 4. MEMCG_LRU_YOUNG, which moves an memcg to the tail of a random bin
29 in the young generation, updates its "gen" to "young" and resets
30 its "seg" to "default".
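
A condensed sketch of how the four operations translate into list moves,
following lru_gen_rotate_memcg() in the vmscan.c hunk below; the function name
here is made up for illustration, and the spinlock, the nr_memcgs bookkeeping
and the generation-counter advance are omitted:

#include <linux/mmzone.h>
#include <linux/random.h>        /* prandom_u32_max() */
#include <linux/rculist_nulls.h>

static void memcg_lru_rotate_sketch(struct lruvec *lruvec, int op)
{
        struct pglist_data *pgdat = lruvec_pgdat(lruvec);
        int bin = prandom_u32_max(MEMCG_NR_BINS);       /* pick a random shard */
        int new = lruvec->lrugen.gen;                   /* stay in the current gen by default */
        int seg = 0;                                    /* "default" segment */

        if (op == MEMCG_LRU_HEAD)
                seg = MEMCG_LRU_HEAD;
        else if (op == MEMCG_LRU_TAIL)
                seg = MEMCG_LRU_TAIL;
        else if (op == MEMCG_LRU_OLD)
                new = get_memcg_gen(pgdat->memcg_lru.seq);      /* the old generation */
        else if (op == MEMCG_LRU_YOUNG)
                new = get_memcg_gen(pgdat->memcg_lru.seq + 1);  /* the young generation */

        hlist_nulls_del_rcu(&lruvec->lrugen.list);

        if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD)
                hlist_nulls_add_head_rcu(&lruvec->lrugen.list,
                                         &pgdat->memcg_lru.fifo[new][bin]);
        else
                hlist_nulls_add_tail_rcu(&lruvec->lrugen.list,
                                         &pgdat->memcg_lru.fifo[new][bin]);

        lruvec->lrugen.gen = new;
        WRITE_ONCE(lruvec->lrugen.seg, seg);
}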
31
32 The events that trigger the above operations are:
33 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD;
34 2. The first attempt to reclaim an memcg below low, which triggers
35 MEMCG_LRU_TAIL;
36 3. The first attempt to reclaim an memcg below reclaimable size
37 threshold, which triggers MEMCG_LRU_TAIL;
38 4. The second attempt to reclaim an memcg below reclaimable size
39 threshold, which triggers MEMCG_LRU_YOUNG;
40 5. Attempting to reclaim an memcg below min, which triggers
41 MEMCG_LRU_YOUNG;
42 6. Finishing the aging on the eviction path, which triggers
43 MEMCG_LRU_YOUNG;
44 7. Offlining an memcg, which triggers MEMCG_LRU_OLD.
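
A condensed sketch of how the reclaim-side events map onto these operations,
following mem_cgroup_update_tree(), shrink_one() and lru_gen_offline_memcg() in
the hunks below; the function name is made up for illustration, it assumes the
vmscan.c-internal helpers used by shrink_one(), and it omits
mem_cgroup_calculate_protection() and the MEMCG_LOW event:

/*
 * events 2-6; event 1 (soft limit) and event 7 (offlining) are handled in
 * mem_cgroup_update_tree() and lru_gen_offline_memcg() respectively
 */
static int memcg_lru_pick_op_sketch(struct lruvec *lruvec, struct scan_control *sc)
{
        int seg = lru_gen_memcg_seg(lruvec);
        struct mem_cgroup *memcg = lruvec_memcg(lruvec);

        /* events 3 and 4: below the reclaimable size threshold */
        if (!lruvec_is_sizable(lruvec, sc))
                return seg != MEMCG_LRU_TAIL ? MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG;

        /* event 5: below min */
        if (mem_cgroup_below_min(memcg))
                return MEMCG_LRU_YOUNG;

        /* event 2: the first attempt below low */
        if (mem_cgroup_below_low(memcg) && seg != MEMCG_LRU_TAIL)
                return MEMCG_LRU_TAIL;

        /* event 6: the aging finished on the eviction path */
        return try_to_shrink_lruvec(lruvec, sc) ? MEMCG_LRU_YOUNG : MEMCG_LRU_NOP;
}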
45
46 Note that the memcg LRU only applies to global reclaim, and the
47 round-robin incrementing of memcg max_seq counters ensures
48 eventual fairness to all eligible memcgs. Memcg reclaim still
49 relies on mem_cgroup_iter().
50
51 Link: https://lkml.kernel.org/r/20221222041905.2431096-7-yuzhao@google.com
52 Signed-off-by: Yu Zhao <yuzhao@google.com>
53 Cc: Johannes Weiner <hannes@cmpxchg.org>
54 Cc: Jonathan Corbet <corbet@lwn.net>
55 Cc: Michael Larabel <Michael@MichaelLarabel.com>
56 Cc: Michal Hocko <mhocko@kernel.org>
57 Cc: Mike Rapoport <rppt@kernel.org>
58 Cc: Roman Gushchin <roman.gushchin@linux.dev>
59 Cc: Suren Baghdasaryan <surenb@google.com>
60 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
61 ---
62 include/linux/memcontrol.h | 10 +
63 include/linux/mm_inline.h | 17 ++
64 include/linux/mmzone.h | 117 +++++++++++-
65 mm/memcontrol.c | 16 ++
66 mm/page_alloc.c | 1 +
67 mm/vmscan.c | 373 +++++++++++++++++++++++++++++++++----
68 6 files changed, 499 insertions(+), 35 deletions(-)
69
70 diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
71 index 3736405cbcf6..2e405fd88846 100644
72 --- a/include/linux/memcontrol.h
73 +++ b/include/linux/memcontrol.h
74 @@ -818,6 +818,11 @@ static inline void obj_cgroup_put(struct obj_cgroup *objcg)
75 percpu_ref_put(&objcg->refcnt);
76 }
77
78 +static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
79 +{
80 + return !memcg || css_tryget(&memcg->css);
81 +}
82 +
83 static inline void mem_cgroup_put(struct mem_cgroup *memcg)
84 {
85 if (memcg)
86 @@ -1283,6 +1288,11 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css)
87 return NULL;
88 }
89
90 +static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
91 +{
92 + return true;
93 +}
94 +
95 static inline void mem_cgroup_put(struct mem_cgroup *memcg)
96 {
97 }
98 diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
99 index 4adc9ba59569..9138c2e638ce 100644
100 --- a/include/linux/mm_inline.h
101 +++ b/include/linux/mm_inline.h
102 @@ -112,6 +112,18 @@ static inline bool lru_gen_in_fault(void)
103 return current->in_lru_fault;
104 }
105
106 +#ifdef CONFIG_MEMCG
107 +static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
108 +{
109 + return READ_ONCE(lruvec->lrugen.seg);
110 +}
111 +#else
112 +static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
113 +{
114 + return 0;
115 +}
116 +#endif
117 +
118 static inline int lru_gen_from_seq(unsigned long seq)
119 {
120 return seq % MAX_NR_GENS;
121 @@ -287,6 +299,11 @@ static inline bool lru_gen_in_fault(void)
122 return false;
123 }
124
125 +static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
126 +{
127 + return 0;
128 +}
129 +
130 static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
131 {
132 return false;
133 diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
134 index 7b8a26aaf381..4bbf191517e2 100644
135 --- a/include/linux/mmzone.h
136 +++ b/include/linux/mmzone.h
137 @@ -7,6 +7,7 @@
138
139 #include <linux/spinlock.h>
140 #include <linux/list.h>
141 +#include <linux/list_nulls.h>
142 #include <linux/wait.h>
143 #include <linux/bitops.h>
144 #include <linux/cache.h>
145 @@ -357,6 +358,15 @@ struct page_vma_mapped_walk;
146 #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
147 #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
148
149 +/* see the comment on MEMCG_NR_GENS */
150 +enum {
151 + MEMCG_LRU_NOP,
152 + MEMCG_LRU_HEAD,
153 + MEMCG_LRU_TAIL,
154 + MEMCG_LRU_OLD,
155 + MEMCG_LRU_YOUNG,
156 +};
157 +
158 #ifdef CONFIG_LRU_GEN
159
160 enum {
161 @@ -416,6 +426,14 @@ struct lru_gen_page {
162 atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
163 /* whether the multi-gen LRU is enabled */
164 bool enabled;
165 +#ifdef CONFIG_MEMCG
166 + /* the memcg generation this lru_gen_page belongs to */
167 + u8 gen;
168 + /* the list segment this lru_gen_page belongs to */
169 + u8 seg;
170 + /* per-node lru_gen_page list for global reclaim */
171 + struct hlist_nulls_node list;
172 +#endif
173 };
174
175 enum {
176 @@ -469,12 +487,87 @@ void lru_gen_init_lruvec(struct lruvec *lruvec);
177 void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
178
179 #ifdef CONFIG_MEMCG
180 +
181 +/*
182 + * For each node, memcgs are divided into two generations: the old and the
183 + * young. For each generation, memcgs are randomly sharded into multiple bins
184 + * to improve scalability. For each bin, the hlist_nulls is virtually divided
185 + * into three segments: the head, the tail and the default.
186 + *
187 + * An onlining memcg is added to the tail of a random bin in the old generation.
188 + * The eviction starts at the head of a random bin in the old generation. The
189 + * per-node memcg generation counter, whose remainder (mod MEMCG_NR_GENS) indexes
190 + * the old generation, is incremented when all its bins become empty.
191 + *
192 + * There are four operations:
193 + * 1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in its
194 + * current generation (old or young) and updates its "seg" to "head";
195 + * 2. MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in its
196 + * current generation (old or young) and updates its "seg" to "tail";
197 + * 3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in the old
198 + * generation, updates its "gen" to "old" and resets its "seg" to "default";
199 + * 4. MEMCG_LRU_YOUNG, which moves an memcg to the tail of a random bin in the
200 + * young generation, updates its "gen" to "young" and resets its "seg" to
201 + * "default".
202 + *
203 + * The events that trigger the above operations are:
204 + * 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD;
205 + * 2. The first attempt to reclaim an memcg below low, which triggers
206 + * MEMCG_LRU_TAIL;
207 + * 3. The first attempt to reclaim an memcg below reclaimable size threshold,
208 + * which triggers MEMCG_LRU_TAIL;
209 + * 4. The second attempt to reclaim an memcg below reclaimable size threshold,
210 + * which triggers MEMCG_LRU_YOUNG;
211 + * 5. Attempting to reclaim an memcg below min, which triggers MEMCG_LRU_YOUNG;
212 + * 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG;
213 + * 7. Offlining an memcg, which triggers MEMCG_LRU_OLD.
214 + *
215 + * Note that memcg LRU only applies to global reclaim, and the round-robin
216 + * incrementing of their max_seq counters ensures the eventual fairness to all
217 + * eligible memcgs. For memcg reclaim, it still relies on mem_cgroup_iter().
218 + */
219 +#define MEMCG_NR_GENS 2
220 +#define MEMCG_NR_BINS 8
221 +
222 +struct lru_gen_memcg {
223 + /* the per-node memcg generation counter */
224 + unsigned long seq;
225 + /* each memcg has one lru_gen_page per node */
226 + unsigned long nr_memcgs[MEMCG_NR_GENS];
227 + /* per-node lru_gen_page list for global reclaim */
228 + struct hlist_nulls_head fifo[MEMCG_NR_GENS][MEMCG_NR_BINS];
229 + /* protects the above */
230 + spinlock_t lock;
231 +};
232 +
233 +void lru_gen_init_pgdat(struct pglist_data *pgdat);
234 +
235 void lru_gen_init_memcg(struct mem_cgroup *memcg);
236 void lru_gen_exit_memcg(struct mem_cgroup *memcg);
237 -#endif
238 +void lru_gen_online_memcg(struct mem_cgroup *memcg);
239 +void lru_gen_offline_memcg(struct mem_cgroup *memcg);
240 +void lru_gen_release_memcg(struct mem_cgroup *memcg);
241 +void lru_gen_rotate_memcg(struct lruvec *lruvec, int op);
242 +
243 +#else /* !CONFIG_MEMCG */
244 +
245 +#define MEMCG_NR_GENS 1
246 +
247 +struct lru_gen_memcg {
248 +};
249 +
250 +static inline void lru_gen_init_pgdat(struct pglist_data *pgdat)
251 +{
252 +}
253 +
254 +#endif /* CONFIG_MEMCG */
255
256 #else /* !CONFIG_LRU_GEN */
257
258 +static inline void lru_gen_init_pgdat(struct pglist_data *pgdat)
259 +{
260 +}
261 +
262 static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
263 {
264 }
265 @@ -484,6 +577,7 @@ static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
266 }
267
268 #ifdef CONFIG_MEMCG
269 +
270 static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
271 {
272 }
273 @@ -491,7 +585,24 @@ static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
274 static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg)
275 {
276 }
277 -#endif
278 +
279 +static inline void lru_gen_online_memcg(struct mem_cgroup *memcg)
280 +{
281 +}
282 +
283 +static inline void lru_gen_offline_memcg(struct mem_cgroup *memcg)
284 +{
285 +}
286 +
287 +static inline void lru_gen_release_memcg(struct mem_cgroup *memcg)
288 +{
289 +}
290 +
291 +static inline void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
292 +{
293 +}
294 +
295 +#endif /* CONFIG_MEMCG */
296
297 #endif /* CONFIG_LRU_GEN */
298
299 @@ -1105,6 +1216,8 @@ typedef struct pglist_data {
300 #ifdef CONFIG_LRU_GEN
301 /* kswap mm walk data */
302 struct lru_gen_mm_walk mm_walk;
303 + /* lru_gen_page list */
304 + struct lru_gen_memcg memcg_lru;
305 #endif
306
307 ZONE_PADDING(_pad2_)
308 diff --git a/mm/memcontrol.c b/mm/memcontrol.c
309 index ed87d1256f0e..172adfbee06e 100644
310 --- a/mm/memcontrol.c
311 +++ b/mm/memcontrol.c
312 @@ -549,6 +549,16 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
313 struct mem_cgroup_per_node *mz;
314 struct mem_cgroup_tree_per_node *mctz;
315
316 + if (lru_gen_enabled()) {
317 + struct lruvec *lruvec = &mem_cgroup_page_nodeinfo(memcg, page)->lruvec;
318 +
319 + /* see the comment on MEMCG_NR_GENS */
320 + if (soft_limit_excess(memcg) && lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD)
321 + lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
322 +
323 + return;
324 + }
325 +
326 mctz = soft_limit_tree_from_page(page);
327 if (!mctz)
328 return;
329 @@ -3433,6 +3443,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
330 unsigned long excess;
331 unsigned long nr_scanned;
332
333 + if (lru_gen_enabled())
334 + return 0;
335 +
336 if (order > 0)
337 return 0;
338
339 @@ -5321,6 +5334,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
340 if (unlikely(mem_cgroup_is_root(memcg)))
341 queue_delayed_work(system_unbound_wq, &stats_flush_dwork,
342 2UL*HZ);
343 + lru_gen_online_memcg(memcg);
344 return 0;
345 }
346
347 @@ -5347,6 +5361,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
348 memcg_offline_kmem(memcg);
349 reparent_shrinker_deferred(memcg);
350 wb_memcg_offline(memcg);
351 + lru_gen_offline_memcg(memcg);
352
353 drain_all_stock(memcg);
354
355 @@ -5358,6 +5373,7 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
356 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
357
358 invalidate_reclaim_iterators(memcg);
359 + lru_gen_release_memcg(memcg);
360 }
361
362 static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
363 diff --git a/mm/page_alloc.c b/mm/page_alloc.c
364 index c929357fbefe..6459d9c018be 100644
365 --- a/mm/page_alloc.c
366 +++ b/mm/page_alloc.c
367 @@ -7645,6 +7645,7 @@ static void __init free_area_init_node(int nid)
368 pgdat_set_deferred_range(pgdat);
369
370 free_area_init_core(pgdat);
371 + lru_gen_init_pgdat(pgdat);
372 }
373
374 void __init free_area_init_memoryless_node(int nid)
375 diff --git a/mm/vmscan.c b/mm/vmscan.c
376 index cb026e2714d7..3d8e0665186c 100644
377 --- a/mm/vmscan.c
378 +++ b/mm/vmscan.c
379 @@ -54,6 +54,8 @@
380 #include <linux/shmem_fs.h>
381 #include <linux/ctype.h>
382 #include <linux/debugfs.h>
383 +#include <linux/rculist_nulls.h>
384 +#include <linux/random.h>
385
386 #include <asm/tlbflush.h>
387 #include <asm/div64.h>
388 @@ -129,11 +131,6 @@ struct scan_control {
389 /* Always discard instead of demoting to lower tier memory */
390 unsigned int no_demotion:1;
391
392 -#ifdef CONFIG_LRU_GEN
393 - /* help kswapd make better choices among multiple memcgs */
394 - unsigned long last_reclaimed;
395 -#endif
396 -
397 /* Allocation order */
398 s8 order;
399
400 @@ -2880,6 +2877,9 @@ DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS);
401 for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
402 for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
403
404 +#define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS)
405 +#define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS)
406 +
407 static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid)
408 {
409 struct pglist_data *pgdat = NODE_DATA(nid);
410 @@ -4169,8 +4169,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
411 if (sc->priority <= DEF_PRIORITY - 2)
412 wait_event_killable(lruvec->mm_state.wait,
413 max_seq < READ_ONCE(lrugen->max_seq));
414 -
415 - return max_seq < READ_ONCE(lrugen->max_seq);
416 + return false;
417 }
418
419 VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq));
420 @@ -4243,8 +4242,6 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
421
422 VM_WARN_ON_ONCE(!current_is_kswapd());
423
424 - sc->last_reclaimed = sc->nr_reclaimed;
425 -
426 /* check the order to exclude compaction-induced reclaim */
427 if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY)
428 return;
429 @@ -4833,8 +4830,7 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
430 * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg
431 * reclaim.
432 */
433 -static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
434 - bool can_swap)
435 +static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap)
436 {
437 unsigned long nr_to_scan;
438 struct mem_cgroup *memcg = lruvec_memcg(lruvec);
439 @@ -4851,10 +4847,8 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *
440 if (sc->priority == DEF_PRIORITY)
441 return nr_to_scan;
442
443 - try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false);
444 -
445 /* skip this lruvec as it's low on cold pages */
446 - return 0;
447 + return try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false) ? -1 : 0;
448 }
449
450 static unsigned long get_nr_to_reclaim(struct scan_control *sc)
451 @@ -4863,29 +4857,18 @@ static unsigned long get_nr_to_reclaim(struct scan_control *sc)
452 if (!global_reclaim(sc))
453 return -1;
454
455 - /* discount the previous progress for kswapd */
456 - if (current_is_kswapd())
457 - return sc->nr_to_reclaim + sc->last_reclaimed;
458 -
459 return max(sc->nr_to_reclaim, compact_gap(sc->order));
460 }
461
462 -static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
463 +static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
464 {
465 - struct blk_plug plug;
466 + long nr_to_scan;
467 unsigned long scanned = 0;
468 unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
469
470 - lru_add_drain();
471 -
472 - blk_start_plug(&plug);
473 -
474 - set_mm_walk(lruvec_pgdat(lruvec));
475 -
476 while (true) {
477 int delta;
478 int swappiness;
479 - unsigned long nr_to_scan;
480
481 if (sc->may_swap)
482 swappiness = get_swappiness(lruvec, sc);
483 @@ -4895,7 +4878,7 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
484 swappiness = 0;
485
486 nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
487 - if (!nr_to_scan)
488 + if (nr_to_scan <= 0)
489 break;
490
491 delta = evict_pages(lruvec, sc, swappiness);
492 @@ -4912,11 +4895,251 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
493 cond_resched();
494 }
495
496 + /* whether try_to_inc_max_seq() was successful */
497 + return nr_to_scan < 0;
498 +}
499 +
500 +static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
501 +{
502 + bool success;
503 + unsigned long scanned = sc->nr_scanned;
504 + unsigned long reclaimed = sc->nr_reclaimed;
505 + int seg = lru_gen_memcg_seg(lruvec);
506 + struct mem_cgroup *memcg = lruvec_memcg(lruvec);
507 + struct pglist_data *pgdat = lruvec_pgdat(lruvec);
508 +
509 + /* see the comment on MEMCG_NR_GENS */
510 + if (!lruvec_is_sizable(lruvec, sc))
511 + return seg != MEMCG_LRU_TAIL ? MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG;
512 +
513 + mem_cgroup_calculate_protection(NULL, memcg);
514 +
515 + if (mem_cgroup_below_min(memcg))
516 + return MEMCG_LRU_YOUNG;
517 +
518 + if (mem_cgroup_below_low(memcg)) {
519 + /* see the comment on MEMCG_NR_GENS */
520 + if (seg != MEMCG_LRU_TAIL)
521 + return MEMCG_LRU_TAIL;
522 +
523 + memcg_memory_event(memcg, MEMCG_LOW);
524 + }
525 +
526 + success = try_to_shrink_lruvec(lruvec, sc);
527 +
528 + shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority);
529 +
530 + vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned,
531 + sc->nr_reclaimed - reclaimed);
532 +
533 + sc->nr_reclaimed += current->reclaim_state->reclaimed_slab;
534 + current->reclaim_state->reclaimed_slab = 0;
535 +
536 + return success ? MEMCG_LRU_YOUNG : 0;
537 +}
538 +
539 +#ifdef CONFIG_MEMCG
540 +
541 +static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
542 +{
543 + int gen;
544 + int bin;
545 + int first_bin;
546 + struct lruvec *lruvec;
547 + struct lru_gen_page *lrugen;
548 + const struct hlist_nulls_node *pos;
549 + int op = 0;
550 + struct mem_cgroup *memcg = NULL;
551 + unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
552 +
553 + bin = first_bin = prandom_u32_max(MEMCG_NR_BINS);
554 +restart:
555 + gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq));
556 +
557 + rcu_read_lock();
558 +
559 + hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) {
560 + if (op)
561 + lru_gen_rotate_memcg(lruvec, op);
562 +
563 + mem_cgroup_put(memcg);
564 +
565 + lruvec = container_of(lrugen, struct lruvec, lrugen);
566 + memcg = lruvec_memcg(lruvec);
567 +
568 + if (!mem_cgroup_tryget(memcg)) {
569 + op = 0;
570 + memcg = NULL;
571 + continue;
572 + }
573 +
574 + rcu_read_unlock();
575 +
576 + op = shrink_one(lruvec, sc);
577 +
578 + if (sc->nr_reclaimed >= nr_to_reclaim)
579 + goto success;
580 +
581 + rcu_read_lock();
582 + }
583 +
584 + rcu_read_unlock();
585 +
586 + /* restart if raced with lru_gen_rotate_memcg() */
587 + if (gen != get_nulls_value(pos))
588 + goto restart;
589 +
590 + /* try the rest of the bins of the current generation */
591 + bin = get_memcg_bin(bin + 1);
592 + if (bin != first_bin)
593 + goto restart;
594 +success:
595 + if (op)
596 + lru_gen_rotate_memcg(lruvec, op);
597 +
598 + mem_cgroup_put(memcg);
599 +}
600 +
601 +static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
602 +{
603 + struct blk_plug plug;
604 +
605 + VM_WARN_ON_ONCE(global_reclaim(sc));
606 +
607 + lru_add_drain();
608 +
609 + blk_start_plug(&plug);
610 +
611 + set_mm_walk(lruvec_pgdat(lruvec));
612 +
613 + if (try_to_shrink_lruvec(lruvec, sc))
614 + lru_gen_rotate_memcg(lruvec, MEMCG_LRU_YOUNG);
615 +
616 + clear_mm_walk();
617 +
618 + blk_finish_plug(&plug);
619 +}
620 +
621 +#else /* !CONFIG_MEMCG */
622 +
623 +static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
624 +{
625 + BUILD_BUG();
626 +}
627 +
628 +static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
629 +{
630 + BUILD_BUG();
631 +}
632 +
633 +#endif
634 +
635 +static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
636 +{
637 + int priority;
638 + unsigned long reclaimable;
639 + struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
640 +
641 + if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
642 + return;
643 + /*
644 + * Determine the initial priority based on ((total / MEMCG_NR_GENS) >>
645 + * priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, where the
646 + * estimated reclaimed_to_scanned_ratio = inactive / total.
647 + */
648 + reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
649 + if (get_swappiness(lruvec, sc))
650 + reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
651 +
652 + reclaimable /= MEMCG_NR_GENS;
653 +
654 + /* round down reclaimable and round up sc->nr_to_reclaim */
655 + priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
656 +
657 + sc->priority = clamp(priority, 0, DEF_PRIORITY);
658 +}
659 +
660 +static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
661 +{
662 + struct blk_plug plug;
663 + unsigned long reclaimed = sc->nr_reclaimed;
664 +
665 + VM_WARN_ON_ONCE(!global_reclaim(sc));
666 +
667 + lru_add_drain();
668 +
669 + blk_start_plug(&plug);
670 +
671 + set_mm_walk(pgdat);
672 +
673 + set_initial_priority(pgdat, sc);
674 +
675 + if (current_is_kswapd())
676 + sc->nr_reclaimed = 0;
677 +
678 + if (mem_cgroup_disabled())
679 + shrink_one(&pgdat->__lruvec, sc);
680 + else
681 + shrink_many(pgdat, sc);
682 +
683 + if (current_is_kswapd())
684 + sc->nr_reclaimed += reclaimed;
685 +
686 clear_mm_walk();
687
688 blk_finish_plug(&plug);
689 +
690 + /* kswapd should never fail */
691 + pgdat->kswapd_failures = 0;
692 }
693
694 +#ifdef CONFIG_MEMCG
695 +void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
696 +{
697 + int seg;
698 + int old, new;
699 + int bin = prandom_u32_max(MEMCG_NR_BINS);
700 + struct pglist_data *pgdat = lruvec_pgdat(lruvec);
701 +
702 + spin_lock(&pgdat->memcg_lru.lock);
703 +
704 + VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
705 +
706 + seg = 0;
707 + new = old = lruvec->lrugen.gen;
708 +
709 + /* see the comment on MEMCG_NR_GENS */
710 + if (op == MEMCG_LRU_HEAD)
711 + seg = MEMCG_LRU_HEAD;
712 + else if (op == MEMCG_LRU_TAIL)
713 + seg = MEMCG_LRU_TAIL;
714 + else if (op == MEMCG_LRU_OLD)
715 + new = get_memcg_gen(pgdat->memcg_lru.seq);
716 + else if (op == MEMCG_LRU_YOUNG)
717 + new = get_memcg_gen(pgdat->memcg_lru.seq + 1);
718 + else
719 + VM_WARN_ON_ONCE(true);
720 +
721 + hlist_nulls_del_rcu(&lruvec->lrugen.list);
722 +
723 + if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD)
724 + hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
725 + else
726 + hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
727 +
728 + pgdat->memcg_lru.nr_memcgs[old]--;
729 + pgdat->memcg_lru.nr_memcgs[new]++;
730 +
731 + lruvec->lrugen.gen = new;
732 + WRITE_ONCE(lruvec->lrugen.seg, seg);
733 +
734 + if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq))
735 + WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
736 +
737 + spin_unlock(&pgdat->memcg_lru.lock);
738 +}
739 +#endif
740 +
741 /******************************************************************************
742 * state change
743 ******************************************************************************/
744 @@ -5370,11 +5593,11 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,
745
746 if (!mem_cgroup_disabled()) {
747 rcu_read_lock();
748 +
749 memcg = mem_cgroup_from_id(memcg_id);
750 -#ifdef CONFIG_MEMCG
751 - if (memcg && !css_tryget(&memcg->css))
752 + if (!mem_cgroup_tryget(memcg))
753 memcg = NULL;
754 -#endif
755 +
756 rcu_read_unlock();
757
758 if (!memcg)
759 @@ -5521,6 +5744,19 @@ void lru_gen_init_lruvec(struct lruvec *lruvec)
760 }
761
762 #ifdef CONFIG_MEMCG
763 +
764 +void lru_gen_init_pgdat(struct pglist_data *pgdat)
765 +{
766 + int i, j;
767 +
768 + spin_lock_init(&pgdat->memcg_lru.lock);
769 +
770 + for (i = 0; i < MEMCG_NR_GENS; i++) {
771 + for (j = 0; j < MEMCG_NR_BINS; j++)
772 + INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i);
773 + }
774 +}
775 +
776 void lru_gen_init_memcg(struct mem_cgroup *memcg)
777 {
778 INIT_LIST_HEAD(&memcg->mm_list.fifo);
779 @@ -5544,7 +5780,69 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg)
780 }
781 }
782 }
783 -#endif
784 +
785 +void lru_gen_online_memcg(struct mem_cgroup *memcg)
786 +{
787 + int gen;
788 + int nid;
789 + int bin = prandom_u32_max(MEMCG_NR_BINS);
790 +
791 + for_each_node(nid) {
792 + struct pglist_data *pgdat = NODE_DATA(nid);
793 + struct lruvec *lruvec = get_lruvec(memcg, nid);
794 +
795 + spin_lock(&pgdat->memcg_lru.lock);
796 +
797 + VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list));
798 +
799 + gen = get_memcg_gen(pgdat->memcg_lru.seq);
800 +
801 + hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]);
802 + pgdat->memcg_lru.nr_memcgs[gen]++;
803 +
804 + lruvec->lrugen.gen = gen;
805 +
806 + spin_unlock(&pgdat->memcg_lru.lock);
807 + }
808 +}
809 +
810 +void lru_gen_offline_memcg(struct mem_cgroup *memcg)
811 +{
812 + int nid;
813 +
814 + for_each_node(nid) {
815 + struct lruvec *lruvec = get_lruvec(memcg, nid);
816 +
817 + lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD);
818 + }
819 +}
820 +
821 +void lru_gen_release_memcg(struct mem_cgroup *memcg)
822 +{
823 + int gen;
824 + int nid;
825 +
826 + for_each_node(nid) {
827 + struct pglist_data *pgdat = NODE_DATA(nid);
828 + struct lruvec *lruvec = get_lruvec(memcg, nid);
829 +
830 + spin_lock(&pgdat->memcg_lru.lock);
831 +
832 + VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
833 +
834 + gen = lruvec->lrugen.gen;
835 +
836 + hlist_nulls_del_rcu(&lruvec->lrugen.list);
837 + pgdat->memcg_lru.nr_memcgs[gen]--;
838 +
839 + if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq))
840 + WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
841 +
842 + spin_unlock(&pgdat->memcg_lru.lock);
843 + }
844 +}
845 +
846 +#endif /* CONFIG_MEMCG */
847
848 static int __init init_lru_gen(void)
849 {
850 @@ -5571,6 +5869,10 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
851 {
852 }
853
854 +static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
855 +{
856 +}
857 +
858 #endif /* CONFIG_LRU_GEN */
859
860 static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
861 @@ -5584,7 +5886,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
862 bool proportional_reclaim;
863 struct blk_plug plug;
864
865 - if (lru_gen_enabled()) {
866 + if (lru_gen_enabled() && !global_reclaim(sc)) {
867 lru_gen_shrink_lruvec(lruvec, sc);
868 return;
869 }
870 @@ -5826,6 +6128,11 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
871 struct lruvec *target_lruvec;
872 bool reclaimable = false;
873
874 + if (lru_gen_enabled() && global_reclaim(sc)) {
875 + lru_gen_shrink_node(pgdat, sc);
876 + return;
877 + }
878 +
879 target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
880
881 again:
882 --
883 2.40.0
884