From 48c916b812652f9453be5bd45a703728926d41ca Mon Sep 17 00:00:00 2001
From: "T.J. Alumbaugh" <talumbau@google.com>
Date: Wed, 18 Jan 2023 00:18:24 +0000
Subject: [PATCH 15/19] UPSTREAM: mm: multi-gen LRU: section for memcg LRU

Move memcg LRU code into a dedicated section. Improve the design doc to
outline its architecture.

Link: https://lkml.kernel.org/r/20230118001827.1040870-5-talumbau@google.com
Change-Id: Id252e420cff7a858acb098cf2b3642da5c40f602
Signed-off-by: T.J. Alumbaugh <talumbau@google.com>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
(cherry picked from commit 36c7b4db7c942ae9e1b111f0c6b468c8b2e33842)
Bug: 274865848
Signed-off-by: T.J. Mercier <tjmercier@google.com>
---
 Documentation/mm/multigen_lru.rst |  33 +++-
 include/linux/mm_inline.h         |  17 --
 include/linux/mmzone.h            |  13 +-
 mm/memcontrol.c                   |   8 +-
 mm/vmscan.c                       | 250 +++++++++++++++++-------------
 5 files changed, 178 insertions(+), 143 deletions(-)

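The documentation hunk below describes the memcg LRU as an LRU of LRUs: per node, memcgs sit in a young and an old generation, the generations are sharded into bins so concurrent reclaimers can each start at a random memcg, and promotion/demotion mirror activation/deactivation. The minimal userspace C sketch that follows illustrates only that shape; the toy_* names, the flat singly linked bins and the rand()-based sharding are invented for the illustration, and it deliberately omits the locking, RCU and nulls-list machinery used by the real lru_gen_* functions in the mm/vmscan.c hunk further down.

/*
 * Toy model of an "LRU of LRUs": per-node, per-generation, per-bin lists
 * of memcgs. Not kernel code; illustrative only.
 */
#include <stdio.h>
#include <stdlib.h>

#define NR_GENS 2	/* the young and the old */
#define NR_BINS 4	/* sharding: concurrent reclaimers start at different bins */

struct toy_memcg {
	int id;
	int gen, bin;			/* where this memcg currently sits */
	struct toy_memcg *next;
};

struct toy_node_lru {
	unsigned long seq;				/* gen_of(seq) is the old generation */
	struct toy_memcg *fifo[NR_GENS][NR_BINS];	/* singly linked bins per generation */
};

static int gen_of(unsigned long seq)
{
	return seq % NR_GENS;
}

static void toy_link(struct toy_node_lru *lru, struct toy_memcg *m, int gen, int bin)
{
	m->gen = gen;
	m->bin = bin;
	m->next = lru->fifo[gen][bin];
	lru->fifo[gen][bin] = m;
}

static void toy_unlink(struct toy_node_lru *lru, struct toy_memcg *m)
{
	struct toy_memcg **p = &lru->fifo[m->gen][m->bin];

	while (*p && *p != m)
		p = &(*p)->next;
	if (*p)
		*p = m->next;
}

/* onlining: a new memcg joins the old generation in a random bin */
static void toy_online(struct toy_node_lru *lru, struct toy_memcg *m)
{
	toy_link(lru, m, gen_of(lru->seq), rand() % NR_BINS);
}

/* promotion (the counterpart to activation): move to the young generation */
static void toy_promote(struct toy_node_lru *lru, struct toy_memcg *m)
{
	toy_unlink(lru, m);
	toy_link(lru, m, gen_of(lru->seq + 1), rand() % NR_BINS);
}

/* a reclaimer scans the old generation, starting at a random bin */
static void toy_reclaim(struct toy_node_lru *lru)
{
	int gen = gen_of(lru->seq);
	int start = rand() % NR_BINS;

	for (int i = 0; i < NR_BINS; i++) {
		int bin = (start + i) % NR_BINS;

		for (struct toy_memcg *m = lru->fifo[gen][bin]; m; m = m->next)
			printf("reclaim folios of memcg %d (gen %d, bin %d)\n",
			       m->id, gen, bin);
	}
}

int main(void)
{
	struct toy_node_lru lru = { 0 };
	struct toy_memcg a = { .id = 1 }, b = { .id = 2 };

	toy_online(&lru, &a);
	toy_online(&lru, &b);
	toy_promote(&lru, &b);	/* b is now "young" and skipped by toy_reclaim() */
	toy_reclaim(&lru);
	return 0;
}

In the sketch, gen_of(seq) plays the role of get_memcg_gen(): advancing the per-node sequence is what eventually turns the young generation into the old one, which is the counterpart to aging drawn in the documentation.
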
--- a/Documentation/mm/multigen_lru.rst
+++ b/Documentation/mm/multigen_lru.rst
@@ -186,9 +186,40 @@ is false positive, the cost is an additi
 which may yield hot pages anyway. Parameters of the filter itself can
 control the false positive rate in the limit.
 
+Memcg LRU
+---------
+A memcg LRU is a per-node LRU of memcgs. It is also an LRU of LRUs,
+since each node and memcg combination has an LRU of folios (see
+``mem_cgroup_lruvec()``). Its goal is to improve the scalability of
+global reclaim, which is critical to system-wide memory overcommit in
+data centers. Note that the memcg LRU only applies to global reclaim.
+
+The basic structure of a memcg LRU can be understood by an analogy to
+the active/inactive LRU (of folios):
+
+1. It has the young and the old (generations), i.e., the counterparts
+   to the active and the inactive;
+2. The increment of ``max_seq`` triggers promotion, i.e., the
+   counterpart to activation;
+3. Other events trigger similar operations, e.g., offlining a memcg
+   triggers demotion, i.e., the counterpart to deactivation.
+
+In terms of global reclaim, it has two distinct features:
+
+1. Sharding, which allows each thread to start at a random memcg (in
+   the old generation) and improves parallelism;
+2. Eventual fairness, which allows direct reclaim to bail out at will
+   and reduces latency without affecting fairness over some time.
+
+In terms of traversing memcgs during global reclaim, it improves the
+best-case complexity from O(n) to O(1) and does not affect the
+worst-case complexity O(n). Therefore, on average, it has a sublinear
+complexity.
+
 Summary
 -------
-The multi-gen LRU can be disassembled into the following parts:
+The multi-gen LRU (of folios) can be disassembled into the following
+parts:
 
 * Generations
 * Rmap walks
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -122,18 +122,6 @@ static inline bool lru_gen_in_fault(void
 	return current->in_lru_fault;
 }
 
-#ifdef CONFIG_MEMCG
-static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
-{
-	return READ_ONCE(lruvec->lrugen.seg);
-}
-#else
-static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
-{
-	return 0;
-}
-#endif
-
 static inline int lru_gen_from_seq(unsigned long seq)
 {
 	return seq % MAX_NR_GENS;
@@ -309,11 +297,6 @@ static inline bool lru_gen_in_fault(void
 	return false;
 }
 
-static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
-{
-	return 0;
-}
-
 static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
 {
 	return false;
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -368,15 +368,6 @@ struct page_vma_mapped_walk;
 #define LRU_GEN_MASK	((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
 #define LRU_REFS_MASK	((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
 
-/* see the comment on MEMCG_NR_GENS */
-enum {
-	MEMCG_LRU_NOP,
-	MEMCG_LRU_HEAD,
-	MEMCG_LRU_TAIL,
-	MEMCG_LRU_OLD,
-	MEMCG_LRU_YOUNG,
-};
-
 #ifdef CONFIG_LRU_GEN
 
 enum {
@@ -557,7 +548,7 @@ void lru_gen_exit_memcg(struct mem_cgrou
 void lru_gen_online_memcg(struct mem_cgroup *memcg);
 void lru_gen_offline_memcg(struct mem_cgroup *memcg);
 void lru_gen_release_memcg(struct mem_cgroup *memcg);
-void lru_gen_rotate_memcg(struct lruvec *lruvec, int op);
+void lru_gen_soft_reclaim(struct lruvec *lruvec);
 
 #else /* !CONFIG_MEMCG */
 
@@ -608,7 +599,7 @@ static inline void lru_gen_release_memcg
 {
 }
 
-static inline void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
+static inline void lru_gen_soft_reclaim(struct lruvec *lruvec)
 {
 }
 
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -478,12 +478,8 @@ static void mem_cgroup_update_tree(struc
 	struct mem_cgroup_tree_per_node *mctz;
 
 	if (lru_gen_enabled()) {
-		struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
-
-		/* see the comment on MEMCG_NR_GENS */
-		if (soft_limit_excess(memcg) && lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD)
-			lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
-
+		if (soft_limit_excess(memcg))
+			lru_gen_soft_reclaim(&memcg->nodeinfo[nid]->lruvec);
 		return;
 	}
 
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4690,6 +4690,148 @@ void lru_gen_look_around(struct page_vma
 }
 
 /******************************************************************************
+ * memcg LRU
+ ******************************************************************************/
+
+/* see the comment on MEMCG_NR_GENS */
+enum {
+	MEMCG_LRU_NOP,
+	MEMCG_LRU_HEAD,
+	MEMCG_LRU_TAIL,
+	MEMCG_LRU_OLD,
+	MEMCG_LRU_YOUNG,
+};
+
+#ifdef CONFIG_MEMCG
+
+static int lru_gen_memcg_seg(struct lruvec *lruvec)
+{
+	return READ_ONCE(lruvec->lrugen.seg);
+}
+
+static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
+{
+	int seg;
+	int old, new;
+	int bin = get_random_u32_below(MEMCG_NR_BINS);
+	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+
+	spin_lock(&pgdat->memcg_lru.lock);
+
+	VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
+
+	seg = 0;
+	new = old = lruvec->lrugen.gen;
+
+	/* see the comment on MEMCG_NR_GENS */
+	if (op == MEMCG_LRU_HEAD)
+		seg = MEMCG_LRU_HEAD;
+	else if (op == MEMCG_LRU_TAIL)
+		seg = MEMCG_LRU_TAIL;
+	else if (op == MEMCG_LRU_OLD)
+		new = get_memcg_gen(pgdat->memcg_lru.seq);
+	else if (op == MEMCG_LRU_YOUNG)
+		new = get_memcg_gen(pgdat->memcg_lru.seq + 1);
+	else
+		VM_WARN_ON_ONCE(true);
+
+	hlist_nulls_del_rcu(&lruvec->lrugen.list);
+
+	if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD)
+		hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
+	else
+		hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
+
+	pgdat->memcg_lru.nr_memcgs[old]--;
+	pgdat->memcg_lru.nr_memcgs[new]++;
+
+	lruvec->lrugen.gen = new;
+	WRITE_ONCE(lruvec->lrugen.seg, seg);
+
+	if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq))
+		WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
+
+	spin_unlock(&pgdat->memcg_lru.lock);
+}
+
+void lru_gen_online_memcg(struct mem_cgroup *memcg)
+{
+	int gen;
+	int nid;
+	int bin = get_random_u32_below(MEMCG_NR_BINS);
+
+	for_each_node(nid) {
+		struct pglist_data *pgdat = NODE_DATA(nid);
+		struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+		spin_lock(&pgdat->memcg_lru.lock);
+
+		VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list));
+
+		gen = get_memcg_gen(pgdat->memcg_lru.seq);
+
+		hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]);
+		pgdat->memcg_lru.nr_memcgs[gen]++;
+
+		lruvec->lrugen.gen = gen;
+
+		spin_unlock(&pgdat->memcg_lru.lock);
+	}
+}
+
+void lru_gen_offline_memcg(struct mem_cgroup *memcg)
+{
+	int nid;
+
+	for_each_node(nid) {
+		struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+		lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD);
+	}
+}
+
+void lru_gen_release_memcg(struct mem_cgroup *memcg)
+{
+	int gen;
+	int nid;
+
+	for_each_node(nid) {
+		struct pglist_data *pgdat = NODE_DATA(nid);
+		struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+		spin_lock(&pgdat->memcg_lru.lock);
+
+		VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
+
+		gen = lruvec->lrugen.gen;
+
+		hlist_nulls_del_rcu(&lruvec->lrugen.list);
+		pgdat->memcg_lru.nr_memcgs[gen]--;
+
+		if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq))
+			WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
+
+		spin_unlock(&pgdat->memcg_lru.lock);
+	}
+}
+
+void lru_gen_soft_reclaim(struct lruvec *lruvec)
+{
+	/* see the comment on MEMCG_NR_GENS */
+	if (lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD)
+		lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
+}
+
+#else /* !CONFIG_MEMCG */
+
+static int lru_gen_memcg_seg(struct lruvec *lruvec)
+{
+	return 0;
+}
+
+#endif
+
+/******************************************************************************
  * the eviction
  ******************************************************************************/
 
@@ -5386,53 +5528,6 @@ done:
 	pgdat->kswapd_failures = 0;
 }
 
-#ifdef CONFIG_MEMCG
-void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
-{
-	int seg;
-	int old, new;
-	int bin = get_random_u32_below(MEMCG_NR_BINS);
-	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
-
-	spin_lock(&pgdat->memcg_lru.lock);
-
-	VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
-
-	seg = 0;
-	new = old = lruvec->lrugen.gen;
-
-	/* see the comment on MEMCG_NR_GENS */
-	if (op == MEMCG_LRU_HEAD)
-		seg = MEMCG_LRU_HEAD;
-	else if (op == MEMCG_LRU_TAIL)
-		seg = MEMCG_LRU_TAIL;
-	else if (op == MEMCG_LRU_OLD)
-		new = get_memcg_gen(pgdat->memcg_lru.seq);
-	else if (op == MEMCG_LRU_YOUNG)
-		new = get_memcg_gen(pgdat->memcg_lru.seq + 1);
-	else
-		VM_WARN_ON_ONCE(true);
-
-	hlist_nulls_del_rcu(&lruvec->lrugen.list);
-
-	if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD)
-		hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
-	else
-		hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
-
-	pgdat->memcg_lru.nr_memcgs[old]--;
-	pgdat->memcg_lru.nr_memcgs[new]++;
-
-	lruvec->lrugen.gen = new;
-	WRITE_ONCE(lruvec->lrugen.seg, seg);
-
-	if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq))
-		WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
-
-	spin_unlock(&pgdat->memcg_lru.lock);
-}
-#endif
-
 /******************************************************************************
  * state change
  ******************************************************************************/
@@ -6078,67 +6173,6 @@ void lru_gen_exit_memcg(struct mem_cgrou
 	}
 }
 
-void lru_gen_online_memcg(struct mem_cgroup *memcg)
-{
-	int gen;
-	int nid;
-	int bin = get_random_u32_below(MEMCG_NR_BINS);
-
-	for_each_node(nid) {
-		struct pglist_data *pgdat = NODE_DATA(nid);
-		struct lruvec *lruvec = get_lruvec(memcg, nid);
-
-		spin_lock(&pgdat->memcg_lru.lock);
-
-		VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list));
-
-		gen = get_memcg_gen(pgdat->memcg_lru.seq);
-
-		hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]);
-		pgdat->memcg_lru.nr_memcgs[gen]++;
-
-		lruvec->lrugen.gen = gen;
-
-		spin_unlock(&pgdat->memcg_lru.lock);
-	}
-}
-
-void lru_gen_offline_memcg(struct mem_cgroup *memcg)
-{
-	int nid;
-
-	for_each_node(nid) {
-		struct lruvec *lruvec = get_lruvec(memcg, nid);
-
-		lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD);
-	}
-}
-
-void lru_gen_release_memcg(struct mem_cgroup *memcg)
-{
-	int gen;
-	int nid;
-
-	for_each_node(nid) {
-		struct pglist_data *pgdat = NODE_DATA(nid);
-		struct lruvec *lruvec = get_lruvec(memcg, nid);
-
-		spin_lock(&pgdat->memcg_lru.lock);
-
-		VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
-
-		gen = lruvec->lrugen.gen;
-
-		hlist_nulls_del_rcu(&lruvec->lrugen.list);
-		pgdat->memcg_lru.nr_memcgs[gen]--;
-
-		if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq))
-			WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
-
-		spin_unlock(&pgdat->memcg_lru.lock);
-	}
-}
-
 #endif /* CONFIG_MEMCG */
 
 static int __init init_lru_gen(void)