1 From 36a18a68ea458e8f4db2ca86b00091daf32c6c74 Mon Sep 17 00:00:00 2001
2 From: Yu Zhao <yuzhao@google.com>
3 Date: Sun, 18 Sep 2022 02:00:06 -0600
4 Subject: [PATCH 09/29] mm: multi-gen LRU: optimize multiple memcgs
6 Content-Type: text/plain; charset=UTF-8
7 Content-Transfer-Encoding: 8bit
9 When multiple memcgs are available, it is possible to use generations as a
10 frame of reference to make better choices and improve overall performance
11 under global memory pressure. This patch adds a basic optimization to
12 select memcgs that can drop single-use unmapped clean pages first. Doing
13 so reduces the chance of going into the aging path or swapping, which can be costly.
16 A typical example that benefits from this optimization is a server running
17 mixed types of workloads, e.g., heavy anon workload in one memcg and heavy
18 buffered I/O workload in the other.
20 Though this optimization can be applied to both kswapd and direct reclaim,
21 it is only added to kswapd to keep the patchset manageable. Later
22 improvements may cover the direct reclaim path.
24 While ensuring certain fairness to all eligible memcgs, proportional scans
25 of individual memcgs also require proper backoff to avoid overshooting
26 their aggregate reclaim target by too much. Otherwise it can cause high
27 direct reclaim latency. The conditions for backoff are:
29 1. At low priorities, for direct reclaim, if aging fairness or direct
30 reclaim latency is at risk, i.e., aging one memcg multiple times or
31 swapping after the target is met.
32 2. At high priorities, for global reclaim, if per-zone free pages are
33 above respective watermarks.
35 Server benchmark results:
37 fio (buffered I/O): +[19, 21]%
39 patch1-8: 1880k 7343MiB/s
40 patch1-9: 2252k 8796MiB/s
42 memcached (anon): +[119, 123]%
44 patch1-8: 862768.65 33514.68
45 patch1-9: 1911022.12 74234.54
48 fio (buffered I/O): +[75, 77]%
50 5.19-rc1: 1279k 4996MiB/s
51 patch1-9: 2252k 8796MiB/s
53 memcached (anon): +[13, 15]%
55 5.19-rc1: 1673524.04 65008.87
56 patch1-9: 1911022.12 74234.54
59 (changes since patch 6)
62 modprobe brd rd_nr=2 rd_size=56623104
69 mount -t ext4 /dev/ram1 /mnt
71 memtier_benchmark -S /var/run/memcached/memcached.sock \
72 -P memcache_binary -n allkeys --key-minimum=1 \
73 --key-maximum=50000000 --key-pattern=P:P -c 1 -t 36 \
74 --ratio 1:0 --pipeline 8 -d 2000
76 fio -name=mglru --numjobs=36 --directory=/mnt --size=1408m \
77 --buffered=1 --ioengine=io_uring --iodepth=128 \
78 --iodepth_batch_submit=32 --iodepth_batch_complete=32 \
79 --rw=randread --random_distribution=random --norandommap \
80 --time_based --ramp_time=10m --runtime=90m --group_reporting &
85 memtier_benchmark -S /var/run/memcached/memcached.sock \
86 -P memcache_binary -n allkeys --key-minimum=1 \
87 --key-maximum=50000000 --key-pattern=R:R -c 1 -t 36 \
88 --ratio 0:1 --pipeline 8 --randomize --distinct-client-seed
93 Client benchmark results:
94 no change (CONFIG_MEMCG=n)
96 Link: https://lkml.kernel.org/r/20220918080010.2920238-10-yuzhao@google.com
97 Signed-off-by: Yu Zhao <yuzhao@google.com>
98 Acked-by: Brian Geffon <bgeffon@google.com>
99 Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
100 Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
101 Acked-by: Steven Barrett <steven@liquorix.net>
102 Acked-by: Suleiman Souhlal <suleiman@google.com>
103 Tested-by: Daniel Byrne <djbyrne@mtu.edu>
104 Tested-by: Donald Carr <d@chaos-reins.com>
105 Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
106 Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
107 Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
108 Tested-by: Sofia Trinh <sofia.trinh@edi.works>
109 Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
110 Cc: Andi Kleen <ak@linux.intel.com>
111 Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
112 Cc: Barry Song <baohua@kernel.org>
113 Cc: Catalin Marinas <catalin.marinas@arm.com>
114 Cc: Dave Hansen <dave.hansen@linux.intel.com>
115 Cc: Hillf Danton <hdanton@sina.com>
116 Cc: Jens Axboe <axboe@kernel.dk>
117 Cc: Johannes Weiner <hannes@cmpxchg.org>
118 Cc: Jonathan Corbet <corbet@lwn.net>
119 Cc: Linus Torvalds <torvalds@linux-foundation.org>
120 Cc: Matthew Wilcox <willy@infradead.org>
121 Cc: Mel Gorman <mgorman@suse.de>
122 Cc: Miaohe Lin <linmiaohe@huawei.com>
123 Cc: Michael Larabel <Michael@MichaelLarabel.com>
124 Cc: Michal Hocko <mhocko@kernel.org>
125 Cc: Mike Rapoport <rppt@kernel.org>
126 Cc: Mike Rapoport <rppt@linux.ibm.com>
127 Cc: Peter Zijlstra <peterz@infradead.org>
128 Cc: Qi Zheng <zhengqi.arch@bytedance.com>
129 Cc: Tejun Heo <tj@kernel.org>
130 Cc: Vlastimil Babka <vbabka@suse.cz>
131 Cc: Will Deacon <will@kernel.org>
132 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
134 mm/vmscan.c | 105 +++++++++++++++++++++++++++++++++++++++++++++++-----
135 1 file changed, 96 insertions(+), 9 deletions(-)
139 @@ -127,6 +127,12 @@ struct scan_control {
140 /* Always discard instead of demoting to lower tier memory */
141 unsigned int no_demotion:1;
143 +#ifdef CONFIG_LRU_GEN
144 + /* help kswapd make better choices among multiple memcgs */
145 + unsigned int memcgs_need_aging:1;
146 + unsigned long last_reclaimed;
149 /* Allocation order */
152 @@ -4202,6 +4208,19 @@ static void lru_gen_age_node(struct pgli
154 VM_WARN_ON_ONCE(!current_is_kswapd());
156 + sc->last_reclaimed = sc->nr_reclaimed;
159 + * To reduce the chance of going into the aging path, which can be
160 + * costly, optimistically skip it if the flag below was cleared in the
161 + * eviction path. This improves the overall performance when multiple
162 + * memcgs are available.
164 + if (!sc->memcgs_need_aging) {
165 + sc->memcgs_need_aging = true;
171 memcg = mem_cgroup_iter(NULL, NULL, NULL);
172 @@ -4613,7 +4632,8 @@ static int isolate_pages(struct lruvec *
176 -static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
177 +static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
178 + bool *need_swapping)
182 @@ -4676,6 +4696,9 @@ static int evict_pages(struct lruvec *lr
184 sc->nr_reclaimed += reclaimed;
186 + if (need_swapping && type == LRU_GEN_ANON)
187 + *need_swapping = true;
192 @@ -4685,9 +4708,8 @@ static int evict_pages(struct lruvec *lr
195 static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
197 + bool can_swap, bool *need_aging)
200 unsigned long nr_to_scan;
201 struct mem_cgroup *memcg = lruvec_memcg(lruvec);
202 DEFINE_MAX_SEQ(lruvec);
203 @@ -4697,8 +4719,8 @@ static unsigned long get_nr_to_scan(stru
204 (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
207 - need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
209 + *need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
213 /* skip the aging path at the default priority */
214 @@ -4715,10 +4737,68 @@ done:
215 return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
218 +static bool should_abort_scan(struct lruvec *lruvec, unsigned long seq,
219 + struct scan_control *sc, bool need_swapping)
222 + DEFINE_MAX_SEQ(lruvec);
224 + if (!current_is_kswapd()) {
225 + /* age each memcg once to ensure fairness */
226 + if (max_seq - seq > 1)
229 + /* over-swapping can increase allocation latency */
230 + if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping)
233 + /* give this thread a chance to exit and free its memory */
234 + if (fatal_signal_pending(current)) {
235 + sc->nr_reclaimed += MIN_LRU_BATCH;
239 + if (cgroup_reclaim(sc))
241 + } else if (sc->nr_reclaimed - sc->last_reclaimed < sc->nr_to_reclaim)
244 + /* keep scanning at low priorities to ensure fairness */
245 + if (sc->priority > DEF_PRIORITY - 2)
249 + * A minimum amount of work was done under global memory pressure. For
250 + * kswapd, it may be overshooting. For direct reclaim, the target isn't
251 + * met, and yet the allocation may still succeed, since kswapd may have
252 + * caught up. In either case, it's better to stop now, and restart if
255 + for (i = 0; i <= sc->reclaim_idx; i++) {
256 + unsigned long wmark;
257 + struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i;
259 + if (!managed_zone(zone))
262 + wmark = current_is_kswapd() ? high_wmark_pages(zone) : low_wmark_pages(zone);
263 + if (wmark > zone_page_state(zone, NR_FREE_PAGES))
267 + sc->nr_reclaimed += MIN_LRU_BATCH;
272 static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
274 struct blk_plug plug;
275 + bool need_aging = false;
276 + bool need_swapping = false;
277 unsigned long scanned = 0;
278 + unsigned long reclaimed = sc->nr_reclaimed;
279 + DEFINE_MAX_SEQ(lruvec);
283 @@ -4738,21 +4818,28 @@ static void lru_gen_shrink_lruvec(struct
287 - nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
288 + nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, &need_aging);
293 - delta = evict_pages(lruvec, sc, swappiness);
294 + delta = evict_pages(lruvec, sc, swappiness, &need_swapping);
300 if (scanned >= nr_to_scan)
303 + if (should_abort_scan(lruvec, max_seq, sc, need_swapping))
309 + /* see the comment in lru_gen_age_node() */
310 + if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging)
311 + sc->memcgs_need_aging = false;
315 blk_finish_plug(&plug);