From 48c916b812652f9453be5bd45a703728926d41ca Mon Sep 17 00:00:00 2001
From: "T.J. Alumbaugh" <talumbau@google.com>
Date: Wed, 18 Jan 2023 00:18:24 +0000
Subject: [PATCH 15/19] UPSTREAM: mm: multi-gen LRU: section for memcg LRU

Move memcg LRU code into a dedicated section. Improve the design doc to
outline its architecture.

Link: https://lkml.kernel.org/r/20230118001827.1040870-5-talumbau@google.com
Change-Id: Id252e420cff7a858acb098cf2b3642da5c40f602
Signed-off-by: T.J. Alumbaugh <talumbau@google.com>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
(cherry picked from commit 36c7b4db7c942ae9e1b111f0c6b468c8b2e33842)

Signed-off-by: T.J. Mercier <tjmercier@google.com>
---
 Documentation/mm/multigen_lru.rst |  33 +++-
 include/linux/mm_inline.h         |  17 --
 include/linux/mmzone.h            |  13 +-
 mm/memcontrol.c                   |   8 +-
 mm/vmscan.c                       | 250 +++++++++++++++++-------------
 5 files changed, 178 insertions(+), 143 deletions(-)
--- a/Documentation/mm/multigen_lru.rst
+++ b/Documentation/mm/multigen_lru.rst
@@ -186,9 +186,40 @@ is false positive, the cost is an additi
 which may yield hot pages anyway. Parameters of the filter itself can
 control the false positive rate in the limit.
 
+Memcg LRU
+---------
+An memcg LRU is a per-node LRU of memcgs. It is also an LRU of LRUs,
+since each node and memcg combination has an LRU of folios (see
+``mem_cgroup_lruvec()``). Its goal is to improve the scalability of
+global reclaim, which is critical to system-wide memory overcommit in
+data centers. Note that memcg LRU only applies to global reclaim.
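
For orientation while reading this patch: the per-node structure behind the
memcg LRU was added earlier in this series, and a simplified sketch of it is
shown below. The exact definition lives in include/linux/mmzone.h and is an
assumption here, not something this patch introduces; each node keeps
MEMCG_NR_GENS generations, each split into MEMCG_NR_BINS bins of hlist_nulls
lists, with one lru_gen_folio per memcg hanging off them.

/*
 * Simplified sketch (assumed from earlier patches in this series, not part
 * of this patch): the per-node LRU of memcgs that global reclaim walks.
 */
struct lru_gen_memcg {
	/* the per-node memcg generation counter */
	unsigned long seq;
	/* each memcg has one lru_gen_folio per node */
	unsigned long nr_memcgs[MEMCG_NR_GENS];
	/* per-node lru_gen_folio list for global reclaim */
	struct hlist_nulls_head fifo[MEMCG_NR_GENS][MEMCG_NR_BINS];
	/* protects the above */
	spinlock_t lock;
};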
+
+The basic structure of an memcg LRU can be understood by an analogy to
+the active/inactive LRU (of folios):
+
+1. It has the young and the old (generations), i.e., the counterparts
+   to the active and the inactive;
+2. The increment of ``max_seq`` triggers promotion, i.e., the
+   counterpart to activation;
+3. Other events trigger similar operations, e.g., offlining an memcg
+   triggers demotion, i.e., the counterpart to deactivation.
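
The promotion and demotion events in the analogy above map onto a small set
of list operations, the MEMCG_LRU_* values, which this patch moves from
include/linux/mmzone.h into mm/vmscan.c (see the hunks below). The per-value
comments here are an editorial gloss based on how lru_gen_rotate_memcg() uses
them; they are not part of the patch.

/* see the comment on MEMCG_NR_GENS */
enum {
	MEMCG_LRU_NOP,		/* leave the memcg where it is */
	MEMCG_LRU_HEAD,		/* move it to the head of its generation (soft reclaim) */
	MEMCG_LRU_TAIL,		/* move it to the tail of its generation */
	MEMCG_LRU_OLD,		/* demote it to the old generation (e.g. memcg offlining) */
	MEMCG_LRU_YOUNG,	/* promote it to the young generation */
};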
+
+In terms of global reclaim, it has two distinct features:
+
+1. Sharding, which allows each thread to start at a random memcg (in
+   the old generation) and improves parallelism;
+2. Eventual fairness, which allows direct reclaim to bail out at will
+   and reduces latency without affecting fairness over some time.
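
As a rough illustration of how these two properties are meant to be used, the
sketch below shows a reclaimer that starts at a random bin of the old
generation and bails out once it has made enough progress. This is editorial
pseudocode, not code from this patch or series: walk_one_lruvec() and
nr_to_reclaim are hypothetical stand-ins for the real eviction path, and the
real walker pins each memcg and drops RCU before reclaiming from it.

/* Editorial sketch only; not part of this patch. */
static unsigned long shrink_memcg_lru(struct pglist_data *pgdat,
				      unsigned long nr_to_reclaim)
{
	unsigned long reclaimed = 0;
	/* the old generation is the one memcg_lru.seq currently points at */
	int gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq));
	/* sharding: each reclaiming thread starts at a random bin */
	int first = get_random_u32_below(MEMCG_NR_BINS);
	int bin = first;

	rcu_read_lock();

	do {
		struct lru_gen_folio *lrugen;
		struct hlist_nulls_node *pos;

		hlist_nulls_for_each_entry_rcu(lrugen, pos,
					       &pgdat->memcg_lru.fifo[gen][bin], list) {
			/* hypothetical: evict folios from this memcg + node LRU */
			reclaimed += walk_one_lruvec(lrugen);

			/* eventual fairness: direct reclaim may bail out at will */
			if (reclaimed >= nr_to_reclaim)
				goto out;
		}

		bin = (bin + 1) % MEMCG_NR_BINS;
	} while (bin != first);
out:
	rcu_read_unlock();

	return reclaimed;
}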
+
+In terms of traversing memcgs during global reclaim, it improves the
+best-case complexity from O(n) to O(1) and does not affect the
+worst-case complexity O(n). Therefore, on average, it has a sublinear
+complexity.
+
 Summary
 -------
-The multi-gen LRU can be disassembled into the following parts:
+The multi-gen LRU (of folios) can be disassembled into the following
+parts:
 
 * Generations
 * Rmap walks
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -122,18 +122,6 @@ static inline bool lru_gen_in_fault(void
 	return current->in_lru_fault;
 }
 
-#ifdef CONFIG_MEMCG
-static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
-{
-	return READ_ONCE(lruvec->lrugen.seg);
-}
-#else
-static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
-{
-	return 0;
-}
-#endif
-
 static inline int lru_gen_from_seq(unsigned long seq)
 {
 	return seq % MAX_NR_GENS;
@@ -309,11 +297,6 @@ static inline bool lru_gen_in_fault(void
 	return false;
 }
 
-static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
-{
-	return 0;
-}
-
 static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
 {
 	return false;
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -368,15 +368,6 @@ struct page_vma_mapped_walk;
 #define LRU_GEN_MASK		((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
 #define LRU_REFS_MASK		((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
 
-/* see the comment on MEMCG_NR_GENS */
-enum {
-	MEMCG_LRU_NOP,
-	MEMCG_LRU_HEAD,
-	MEMCG_LRU_TAIL,
-	MEMCG_LRU_OLD,
-	MEMCG_LRU_YOUNG,
-};
-
 #ifdef CONFIG_LRU_GEN
 
 enum {
@@ -557,7 +548,7 @@ void lru_gen_exit_memcg(struct mem_cgrou
 void lru_gen_online_memcg(struct mem_cgroup *memcg);
 void lru_gen_offline_memcg(struct mem_cgroup *memcg);
 void lru_gen_release_memcg(struct mem_cgroup *memcg);
-void lru_gen_rotate_memcg(struct lruvec *lruvec, int op);
+void lru_gen_soft_reclaim(struct lruvec *lruvec);
 
 #else /* !CONFIG_MEMCG */
 
@@ -608,7 +599,7 @@ static inline void lru_gen_release_memcg
 {
 }
 
-static inline void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
+static inline void lru_gen_soft_reclaim(struct lruvec *lruvec)
 {
 }
 
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -478,12 +478,8 @@ static void mem_cgroup_update_tree(struc
 	struct mem_cgroup_tree_per_node *mctz;
 
 	if (lru_gen_enabled()) {
-		struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
-
-		/* see the comment on MEMCG_NR_GENS */
-		if (soft_limit_excess(memcg) && lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD)
-			lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
-
+		if (soft_limit_excess(memcg))
+			lru_gen_soft_reclaim(&memcg->nodeinfo[nid]->lruvec);
 		return;
 	}
 
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4692,6 +4692,148 @@ void lru_gen_look_around(struct page_vma
 }
 
 /******************************************************************************
+ *                          memcg LRU
+ ******************************************************************************/
+
+/* see the comment on MEMCG_NR_GENS */
+enum {
+	MEMCG_LRU_NOP,
+	MEMCG_LRU_HEAD,
+	MEMCG_LRU_TAIL,
+	MEMCG_LRU_OLD,
+	MEMCG_LRU_YOUNG,
+};
+
+#ifdef CONFIG_MEMCG
+
+static int lru_gen_memcg_seg(struct lruvec *lruvec)
+{
+	return READ_ONCE(lruvec->lrugen.seg);
+}
+
+static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
+{
+	int seg;
+	int old, new;
+	int bin = get_random_u32_below(MEMCG_NR_BINS);
+	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+
+	spin_lock(&pgdat->memcg_lru.lock);
+
+	VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
+
+	seg = 0;
+	new = old = lruvec->lrugen.gen;
+
+	/* see the comment on MEMCG_NR_GENS */
+	if (op == MEMCG_LRU_HEAD)
+		seg = MEMCG_LRU_HEAD;
+	else if (op == MEMCG_LRU_TAIL)
+		seg = MEMCG_LRU_TAIL;
+	else if (op == MEMCG_LRU_OLD)
+		new = get_memcg_gen(pgdat->memcg_lru.seq);
+	else if (op == MEMCG_LRU_YOUNG)
+		new = get_memcg_gen(pgdat->memcg_lru.seq + 1);
+	else
+		VM_WARN_ON_ONCE(true);
+
+	hlist_nulls_del_rcu(&lruvec->lrugen.list);
+
+	if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD)
+		hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
+	else
+		hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
+
+	pgdat->memcg_lru.nr_memcgs[old]--;
+	pgdat->memcg_lru.nr_memcgs[new]++;
+
+	lruvec->lrugen.gen = new;
+	WRITE_ONCE(lruvec->lrugen.seg, seg);
+
+	if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq))
+		WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
+
+	spin_unlock(&pgdat->memcg_lru.lock);
+}
+
+void lru_gen_online_memcg(struct mem_cgroup *memcg)
+{
+	int gen;
+	int nid;
+	int bin = get_random_u32_below(MEMCG_NR_BINS);
+
+	for_each_node(nid) {
+		struct pglist_data *pgdat = NODE_DATA(nid);
+		struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+		spin_lock(&pgdat->memcg_lru.lock);
+
+		VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list));
+
+		gen = get_memcg_gen(pgdat->memcg_lru.seq);
+
+		hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]);
+		pgdat->memcg_lru.nr_memcgs[gen]++;
+
+		lruvec->lrugen.gen = gen;
+
+		spin_unlock(&pgdat->memcg_lru.lock);
+	}
+}
+
+void lru_gen_offline_memcg(struct mem_cgroup *memcg)
+{
+	int nid;
+
+	for_each_node(nid) {
+		struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+		lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD);
+	}
+}
+
+void lru_gen_release_memcg(struct mem_cgroup *memcg)
+{
+	int gen;
+	int nid;
+
+	for_each_node(nid) {
+		struct pglist_data *pgdat = NODE_DATA(nid);
+		struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+		spin_lock(&pgdat->memcg_lru.lock);
+
+		VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
+
+		gen = lruvec->lrugen.gen;
+
+		hlist_nulls_del_rcu(&lruvec->lrugen.list);
+		pgdat->memcg_lru.nr_memcgs[gen]--;
+
+		if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq))
+			WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
+
+		spin_unlock(&pgdat->memcg_lru.lock);
+	}
+}
+
+void lru_gen_soft_reclaim(struct lruvec *lruvec)
+{
+	/* see the comment on MEMCG_NR_GENS */
+	if (lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD)
+		lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
+}
+
+#else /* !CONFIG_MEMCG */
+
+static int lru_gen_memcg_seg(struct lruvec *lruvec)
+{
+	return 0;
+}
+
+#endif
+
+/******************************************************************************
 *                          the eviction
 ******************************************************************************/
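
The new section above relies on get_memcg_gen() to map the per-node
memcg_lru.seq counter onto a generation slot; that helper is defined elsewhere
in mm/vmscan.c by an earlier patch in this series. A hedged sketch of what it
amounts to, for readers without the full tree at hand (treat the exact body as
an assumption, not part of this patch):

/* Sketch of the helper assumed above; the real definition is in mm/vmscan.c. */
static int get_memcg_gen(unsigned long seq)
{
	/* seq only ever increments; fold it onto the fixed set of generations */
	return seq % MEMCG_NR_GENS;
}

This is also why the code above bumps memcg_lru.seq once the generation it
points at drains: incrementing seq makes get_memcg_gen() wrap onto the next
slot, which then becomes the new old generation.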
@@ -5398,53 +5540,6 @@ done:
 	pgdat->kswapd_failures = 0;
 }
 
-#ifdef CONFIG_MEMCG
-void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
-{
-	int seg;
-	int old, new;
-	int bin = get_random_u32_below(MEMCG_NR_BINS);
-	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
-
-	spin_lock(&pgdat->memcg_lru.lock);
-
-	VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
-
-	seg = 0;
-	new = old = lruvec->lrugen.gen;
-
-	/* see the comment on MEMCG_NR_GENS */
-	if (op == MEMCG_LRU_HEAD)
-		seg = MEMCG_LRU_HEAD;
-	else if (op == MEMCG_LRU_TAIL)
-		seg = MEMCG_LRU_TAIL;
-	else if (op == MEMCG_LRU_OLD)
-		new = get_memcg_gen(pgdat->memcg_lru.seq);
-	else if (op == MEMCG_LRU_YOUNG)
-		new = get_memcg_gen(pgdat->memcg_lru.seq + 1);
-	else
-		VM_WARN_ON_ONCE(true);
-
-	hlist_nulls_del_rcu(&lruvec->lrugen.list);
-
-	if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD)
-		hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
-	else
-		hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
-
-	pgdat->memcg_lru.nr_memcgs[old]--;
-	pgdat->memcg_lru.nr_memcgs[new]++;
-
-	lruvec->lrugen.gen = new;
-	WRITE_ONCE(lruvec->lrugen.seg, seg);
-
-	if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq))
-		WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
-
-	spin_unlock(&pgdat->memcg_lru.lock);
-}
-#endif
-
 /******************************************************************************
 ******************************************************************************/
@@ -6090,67 +6185,6 @@ void lru_gen_exit_memcg(struct mem_cgrou
 	}
 }
 
-void lru_gen_online_memcg(struct mem_cgroup *memcg)
-{
-	int gen;
-	int nid;
-	int bin = get_random_u32_below(MEMCG_NR_BINS);
-
-	for_each_node(nid) {
-		struct pglist_data *pgdat = NODE_DATA(nid);
-		struct lruvec *lruvec = get_lruvec(memcg, nid);
-
-		spin_lock(&pgdat->memcg_lru.lock);
-
-		VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list));
-
-		gen = get_memcg_gen(pgdat->memcg_lru.seq);
-
-		hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]);
-		pgdat->memcg_lru.nr_memcgs[gen]++;
-
-		lruvec->lrugen.gen = gen;
-
-		spin_unlock(&pgdat->memcg_lru.lock);
-	}
-}
-
-void lru_gen_offline_memcg(struct mem_cgroup *memcg)
-{
-	int nid;
-
-	for_each_node(nid) {
-		struct lruvec *lruvec = get_lruvec(memcg, nid);
-
-		lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD);
-	}
-}
-
-void lru_gen_release_memcg(struct mem_cgroup *memcg)
-{
-	int gen;
-	int nid;
-
-	for_each_node(nid) {
-		struct pglist_data *pgdat = NODE_DATA(nid);
-		struct lruvec *lruvec = get_lruvec(memcg, nid);
-
-		spin_lock(&pgdat->memcg_lru.lock);
-
-		VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
-
-		gen = lruvec->lrugen.gen;
-
-		hlist_nulls_del_rcu(&lruvec->lrugen.list);
-		pgdat->memcg_lru.nr_memcgs[gen]--;
-
-		if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq))
-			WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
-
-		spin_unlock(&pgdat->memcg_lru.lock);
-	}
-}
-
 #endif /* CONFIG_MEMCG */
 
 static int __init init_lru_gen(void)