kernel: bump 5.15 to 5.15.100
[openwrt/openwrt.git] target/linux/generic/backport-5.15/020-v6.1-04-mm-multigenerational-lru-groundwork.patch
1 From 05f366c941ae2bb8ba21c79fafcb747a5a6b967b Mon Sep 17 00:00:00 2001
2 From: Yu Zhao <yuzhao@google.com>
3 Date: Mon, 25 Jan 2021 21:12:33 -0700
4 Subject: [PATCH 04/10] mm: multigenerational lru: groundwork
5
6 For each lruvec, evictable pages are divided into multiple
7 generations. The youngest generation number is stored in
8 lrugen->max_seq for both anon and file types as they are aged on an
9 equal footing. The oldest generation numbers are stored in
10 lrugen->min_seq[] separately for anon and file types as clean file
11 pages can be evicted regardless of swap constraints. These three
12 variables are monotonically increasing. Generation numbers are
13 truncated into order_base_2(MAX_NR_GENS+1) bits in order to fit into
14 page->flags. The sliding window technique is used to prevent truncated
15 generation numbers from overlapping. Each truncated generation number
16 is an index into
17 lrugen->lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES].
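
As an illustration of the arithmetic above (not part of this patch), the
userspace sketch below maps a sequence number to its truncated generation
index and packs it into a flags word. MAX_NR_GENS, LRU_GEN_WIDTH and
LRU_GEN_PGOFF use assumed values picked for the example; the kernel
derives the real ones from CONFIG_NR_LRU_GENS and the page->flags layout.

  /* Illustrative only: simplified model of the seq -> gen -> flags mapping. */
  #include <stdio.h>

  #define MAX_NR_GENS   4UL     /* assumed CONFIG_NR_LRU_GENS */
  #define LRU_GEN_WIDTH 3       /* order_base_2(MAX_NR_GENS + 1) */
  #define LRU_GEN_PGOFF 56      /* assumed position; example assumes 64-bit flags */
  #define LRU_GEN_MASK  (((1UL << LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)

  static unsigned long gen_from_seq(unsigned long seq)
  {
          return seq % MAX_NR_GENS;     /* index within the sliding window */
  }

  int main(void)
  {
          unsigned long flags = 0;
          unsigned long max_seq = 9;    /* the youngest generation number */
          unsigned long gen = gen_from_seq(max_seq);

          /* the counter stores gen + 1; 0 means "not on a multigen list" */
          flags = (flags & ~LRU_GEN_MASK) | ((gen + 1) << LRU_GEN_PGOFF);

          printf("seq %lu -> gen %lu -> stored counter %lu\n", max_seq, gen,
                 (flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF);
          return 0;
  }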
18
19 The framework comprises two conceptually independent components: the
20 aging, which produces young generations, and the eviction, which
21 consumes old generations. Both can be invoked independently from user
22 space for the purpose of working set estimation and proactive reclaim.
23
24 The protection of hot pages and the selection of cold pages are based
25 on page access types and patterns. There are two access types: one via
26 page tables and the other via file descriptors. The protection of the
27 former type is by design stronger because:
28 1) The uncertainty in determining the access patterns of the former
29 type is higher due to the coalesced nature of the accessed bit.
30 2) The cost of evicting the former type is higher due to the TLB
31 flushes required and the likelihood of involving I/O.
32 3) The penalty of under-protecting the former type is higher because
33 applications usually do not prepare themselves for major faults like
34 they do for blocked I/O. For example, client applications commonly
35 dedicate blocked I/O to separate threads to avoid UI janks that
36 negatively affect user experience.
37
38 There are also two access patterns: one with temporal locality and the
39 other without. The latter pattern, e.g., random or sequential access, needs
40 to be explicitly excluded to avoid weakening the protection of the
41 former pattern. Generally the former type follows the former pattern
42 unless MADV_SEQUENTIAL is specified and the latter type follows the
43 latter pattern unless outlying refaults have been observed.
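
As a concrete example of the MADV_SEQUENTIAL case (not part of this
patch), a process can mark a mapping as sequentially read with madvise();
this sets VM_SEQ_READ on the VMA, which the change to handle_mm_fault()
below uses to skip the non-sequential fault accounting. A minimal sketch
with most error handling omitted:

  #include <stdio.h>
  #include <fcntl.h>
  #include <sys/mman.h>
  #include <sys/stat.h>
  #include <unistd.h>

  int main(int argc, char **argv)
  {
          struct stat st;
          long sum = 0;
          char *p;
          int fd;

          if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0 || fstat(fd, &st))
                  return 1;

          p = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
          if (p == MAP_FAILED)
                  return 1;

          /* declare a single front-to-back pass over the mapping */
          madvise(p, st.st_size, MADV_SEQUENTIAL);

          for (off_t i = 0; i < st.st_size; i++)
                  sum += p[i];
          printf("checksum %ld\n", sum);

          munmap(p, st.st_size);
          close(fd);
          return 0;
  }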
44
45 Upon faulting, a page is added to the youngest generation, which
46 provides the strongest protection as the eviction will not consider
47 this page before the aging has scanned it at least twice. The first
48 scan clears the accessed bit set during the initial fault, and the
49 second scan makes sure this page has not been used since the first
50 scan. A page from any other generation is brought back to the
51 youngest generation whenever the aging finds the accessed bit set on
52 any of the PTEs mapping this page.
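
A toy model of this rule (not kernel code; it deliberately ignores
min_seq, page types and zones, and assumes CONFIG_NR_LRU_GENS=4) may
help: a faulted page lands in the youngest generation and only becomes an
eviction candidate once two aging passes have gone by without its
accessed bit being set again.

  #include <stdbool.h>
  #include <stdio.h>

  #define MAX_NR_GENS 4                 /* assumed CONFIG_NR_LRU_GENS */

  struct toy_page {
          unsigned long gen;            /* generation the page belongs to */
          bool accessed;                /* stands in for the PTE accessed bit */
  };

  static unsigned long max_seq = MAX_NR_GENS;   /* youngest generation */

  static void fault_in(struct toy_page *p)
  {
          p->gen = max_seq;             /* youngest: strongest protection */
          p->accessed = true;           /* the bit is set by the fault */
  }

  static void age(struct toy_page *p)
  {
          if (p->accessed) {
                  p->accessed = false;  /* the scan clears the accessed bit */
                  p->gen = max_seq;     /* and keeps the page young */
          }
          max_seq++;                    /* a new youngest generation exists */
  }

  static bool evictable(const struct toy_page *p)
  {
          /* only pages older than the two youngest generations qualify */
          return p->gen + 2 <= max_seq;
  }

  int main(void)
  {
          struct toy_page p;

          fault_in(&p);
          printf("after fault:     evictable=%d\n", evictable(&p));
          age(&p);                      /* first scan clears the bit */
          printf("after 1st aging: evictable=%d\n", evictable(&p));
          age(&p);                      /* bit stayed clear since then */
          printf("after 2nd aging: evictable=%d\n", evictable(&p));
          return 0;
  }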
53
54 Unmapped pages are initially added to the oldest generation and then
55 conditionally protected by tiers. This is done later in [PATCH 07/10].
56
57 Signed-off-by: Yu Zhao <yuzhao@google.com>
58 Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
59 Change-Id: I71de7cd15b8dfa6f9fdd838023474693c4fee0a7
60 ---
61 fs/fuse/dev.c | 3 +-
62 include/linux/cgroup.h | 15 +-
63 include/linux/mm.h | 36 ++++
64 include/linux/mm_inline.h | 182 ++++++++++++++++++++
65 include/linux/mmzone.h | 70 ++++++++
66 include/linux/page-flags-layout.h | 19 ++-
67 include/linux/page-flags.h | 4 +-
68 include/linux/sched.h | 3 +
69 kernel/bounds.c | 3 +
70 kernel/cgroup/cgroup-internal.h | 1 -
71 mm/huge_memory.c | 3 +-
72 mm/memcontrol.c | 1 +
73 mm/memory.c | 7 +
74 mm/mm_init.c | 6 +-
75 mm/page_alloc.c | 1 +
76 mm/swap.c | 9 +-
77 mm/swapfile.c | 2 +
78 mm/vmscan.c | 268 ++++++++++++++++++++++++++++++
79 18 files changed, 618 insertions(+), 15 deletions(-)
80
81 --- a/fs/fuse/dev.c
82 +++ b/fs/fuse/dev.c
83 @@ -785,7 +785,8 @@ static int fuse_check_page(struct page *
84 1 << PG_active |
85 1 << PG_workingset |
86 1 << PG_reclaim |
87 - 1 << PG_waiters))) {
88 + 1 << PG_waiters |
89 + LRU_GEN_MASK | LRU_REFS_MASK))) {
90 dump_page(page, "fuse: trying to steal weird page");
91 return 1;
92 }
93 --- a/include/linux/cgroup.h
94 +++ b/include/linux/cgroup.h
95 @@ -433,6 +433,18 @@ static inline void cgroup_put(struct cgr
96 css_put(&cgrp->self);
97 }
98
99 +extern struct mutex cgroup_mutex;
100 +
101 +static inline void cgroup_lock(void)
102 +{
103 + mutex_lock(&cgroup_mutex);
104 +}
105 +
106 +static inline void cgroup_unlock(void)
107 +{
108 + mutex_unlock(&cgroup_mutex);
109 +}
110 +
111 /**
112 * task_css_set_check - obtain a task's css_set with extra access conditions
113 * @task: the task to obtain css_set for
114 @@ -447,7 +459,6 @@ static inline void cgroup_put(struct cgr
115 * as locks used during the cgroup_subsys::attach() methods.
116 */
117 #ifdef CONFIG_PROVE_RCU
118 -extern struct mutex cgroup_mutex;
119 extern spinlock_t css_set_lock;
120 #define task_css_set_check(task, __c) \
121 rcu_dereference_check((task)->cgroups, \
122 @@ -708,6 +719,8 @@ struct cgroup;
123 static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; }
124 static inline void css_get(struct cgroup_subsys_state *css) {}
125 static inline void css_put(struct cgroup_subsys_state *css) {}
126 +static inline void cgroup_lock(void) {}
127 +static inline void cgroup_unlock(void) {}
128 static inline int cgroup_attach_task_all(struct task_struct *from,
129 struct task_struct *t) { return 0; }
130 static inline int cgroupstats_build(struct cgroupstats *stats,
131 --- a/include/linux/mm.h
132 +++ b/include/linux/mm.h
133 @@ -1093,6 +1093,8 @@ vm_fault_t finish_mkwrite_fault(struct v
134 #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH)
135 #define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH)
136 #define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH)
137 +#define LRU_GEN_PGOFF (KASAN_TAG_PGOFF - LRU_GEN_WIDTH)
138 +#define LRU_REFS_PGOFF (LRU_GEN_PGOFF - LRU_REFS_WIDTH)
139
140 /*
141 * Define the bit shifts to access each section. For non-existent
142 @@ -1807,6 +1809,40 @@ static inline void unmap_mapping_range(s
143 loff_t const holebegin, loff_t const holelen, int even_cows) { }
144 #endif
145
146 +#ifdef CONFIG_LRU_GEN
147 +static inline void task_enter_nonseq_fault(void)
148 +{
149 + WARN_ON(current->in_nonseq_fault);
150 +
151 + current->in_nonseq_fault = 1;
152 +}
153 +
154 +static inline void task_exit_nonseq_fault(void)
155 +{
156 + WARN_ON(!current->in_nonseq_fault);
157 +
158 + current->in_nonseq_fault = 0;
159 +}
160 +
161 +static inline bool task_in_nonseq_fault(void)
162 +{
163 + return current->in_nonseq_fault;
164 +}
165 +#else
166 +static inline void task_enter_nonseq_fault(void)
167 +{
168 +}
169 +
170 +static inline void task_exit_nonseq_fault(void)
171 +{
172 +}
173 +
174 +static inline bool task_in_nonseq_fault(void)
175 +{
176 + return false;
177 +}
178 +#endif /* CONFIG_LRU_GEN */
179 +
180 static inline void unmap_shared_mapping_range(struct address_space *mapping,
181 loff_t const holebegin, loff_t const holelen)
182 {
183 --- a/include/linux/mm_inline.h
184 +++ b/include/linux/mm_inline.h
185 @@ -79,11 +79,187 @@ static __always_inline enum lru_list pag
186 return lru;
187 }
188
189 +#ifdef CONFIG_LRU_GEN
190 +
191 +static inline bool lru_gen_enabled(void)
192 +{
193 +#ifdef CONFIG_LRU_GEN_ENABLED
194 + DECLARE_STATIC_KEY_TRUE(lru_gen_static_key);
195 +
196 + return static_branch_likely(&lru_gen_static_key);
197 +#else
198 + DECLARE_STATIC_KEY_FALSE(lru_gen_static_key);
199 +
200 + return static_branch_unlikely(&lru_gen_static_key);
201 +#endif
202 +}
203 +
204 +/* Return an index within the sliding window that tracks MAX_NR_GENS generations. */
205 +static inline int lru_gen_from_seq(unsigned long seq)
206 +{
207 + return seq % MAX_NR_GENS;
208 +}
209 +
210 +/* The youngest and the second youngest generations are counted as active. */
211 +static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
212 +{
213 + unsigned long max_seq = lruvec->evictable.max_seq;
214 +
215 + VM_BUG_ON(gen >= MAX_NR_GENS);
216 +
217 + return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1);
218 +}
219 +
220 +/* Update the sizes of the multigenerational lru lists. */
221 +static inline void lru_gen_update_size(struct page *page, struct lruvec *lruvec,
222 + int old_gen, int new_gen)
223 +{
224 + int type = page_is_file_lru(page);
225 + int zone = page_zonenum(page);
226 + int delta = thp_nr_pages(page);
227 + enum lru_list lru = type * LRU_FILE;
228 + struct lrugen *lrugen = &lruvec->evictable;
229 +
230 + lockdep_assert_held(&lruvec->lru_lock);
231 + VM_BUG_ON(old_gen != -1 && old_gen >= MAX_NR_GENS);
232 + VM_BUG_ON(new_gen != -1 && new_gen >= MAX_NR_GENS);
233 + VM_BUG_ON(old_gen == -1 && new_gen == -1);
234 +
235 + if (old_gen >= 0)
236 + WRITE_ONCE(lrugen->sizes[old_gen][type][zone],
237 + lrugen->sizes[old_gen][type][zone] - delta);
238 + if (new_gen >= 0)
239 + WRITE_ONCE(lrugen->sizes[new_gen][type][zone],
240 + lrugen->sizes[new_gen][type][zone] + delta);
241 +
242 + if (old_gen < 0) {
243 + if (lru_gen_is_active(lruvec, new_gen))
244 + lru += LRU_ACTIVE;
245 + update_lru_size(lruvec, lru, zone, delta);
246 + return;
247 + }
248 +
249 + if (new_gen < 0) {
250 + if (lru_gen_is_active(lruvec, old_gen))
251 + lru += LRU_ACTIVE;
252 + update_lru_size(lruvec, lru, zone, -delta);
253 + return;
254 + }
255 +
256 + if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) {
257 + update_lru_size(lruvec, lru, zone, -delta);
258 + update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta);
259 + }
260 +
261 + VM_BUG_ON(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen));
262 +}
263 +
264 +/* Add a page to one of the multigenerational lru lists. Return true on success. */
265 +static inline bool lru_gen_add_page(struct page *page, struct lruvec *lruvec, bool reclaiming)
266 +{
267 + int gen;
268 + unsigned long old_flags, new_flags;
269 + int type = page_is_file_lru(page);
270 + int zone = page_zonenum(page);
271 + struct lrugen *lrugen = &lruvec->evictable;
272 +
273 + if (PageUnevictable(page) || !lrugen->enabled[type])
274 + return false;
275 + /*
276 + * If a page shouldn't be considered for eviction, i.e., a page mapped
277 + * upon fault during which the accessed bit is set, add it to the
278 + * youngest generation.
279 + *
280 + * If a page can't be evicted immediately, i.e., an anon page not in
281 + * swap cache or a dirty page pending writeback, add it to the second
282 + * oldest generation.
283 + *
284 + * If a page could be evicted immediately, e.g., a clean page, add it to
285 + * the oldest generation.
286 + */
287 + if (PageActive(page))
288 + gen = lru_gen_from_seq(lrugen->max_seq);
289 + else if ((!type && !PageSwapCache(page)) ||
290 + (PageReclaim(page) && (PageDirty(page) || PageWriteback(page))))
291 + gen = lru_gen_from_seq(lrugen->min_seq[type] + 1);
292 + else
293 + gen = lru_gen_from_seq(lrugen->min_seq[type]);
294 +
295 + do {
296 + new_flags = old_flags = READ_ONCE(page->flags);
297 + VM_BUG_ON_PAGE(new_flags & LRU_GEN_MASK, page);
298 +
299 + new_flags &= ~(LRU_GEN_MASK | BIT(PG_active));
300 + new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
301 + } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
302 +
303 + lru_gen_update_size(page, lruvec, -1, gen);
304 + /* for rotate_reclaimable_page() */
305 + if (reclaiming)
306 + list_add_tail(&page->lru, &lrugen->lists[gen][type][zone]);
307 + else
308 + list_add(&page->lru, &lrugen->lists[gen][type][zone]);
309 +
310 + return true;
311 +}
312 +
313 +/* Delete a page from one of the multigenerational lru lists. Return true on success. */
314 +static inline bool lru_gen_del_page(struct page *page, struct lruvec *lruvec, bool reclaiming)
315 +{
316 + int gen;
317 + unsigned long old_flags, new_flags;
318 +
319 + do {
320 + new_flags = old_flags = READ_ONCE(page->flags);
321 + if (!(new_flags & LRU_GEN_MASK))
322 + return false;
323 +
324 + VM_BUG_ON_PAGE(PageActive(page), page);
325 + VM_BUG_ON_PAGE(PageUnevictable(page), page);
326 +
327 + gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
328 +
329 + new_flags &= ~LRU_GEN_MASK;
330 + /* for shrink_page_list() */
331 + if (reclaiming)
332 + new_flags &= ~(BIT(PG_referenced) | BIT(PG_reclaim));
333 + else if (lru_gen_is_active(lruvec, gen))
334 + new_flags |= BIT(PG_active);
335 + } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
336 +
337 + lru_gen_update_size(page, lruvec, gen, -1);
338 + list_del(&page->lru);
339 +
340 + return true;
341 +}
342 +
343 +#else
344 +
345 +static inline bool lru_gen_enabled(void)
346 +{
347 + return false;
348 +}
349 +
350 +static inline bool lru_gen_add_page(struct page *page, struct lruvec *lruvec, bool reclaiming)
351 +{
352 + return false;
353 +}
354 +
355 +static inline bool lru_gen_del_page(struct page *page, struct lruvec *lruvec, bool reclaiming)
356 +{
357 + return false;
358 +}
359 +
360 +#endif /* CONFIG_LRU_GEN */
361 +
362 static __always_inline void add_page_to_lru_list(struct page *page,
363 struct lruvec *lruvec)
364 {
365 enum lru_list lru = page_lru(page);
366
367 + if (lru_gen_add_page(page, lruvec, false))
368 + return;
369 +
370 update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
371 list_add(&page->lru, &lruvec->lists[lru]);
372 }
373 @@ -93,6 +269,9 @@ static __always_inline void add_page_to_
374 {
375 enum lru_list lru = page_lru(page);
376
377 + if (lru_gen_add_page(page, lruvec, true))
378 + return;
379 +
380 update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
381 list_add_tail(&page->lru, &lruvec->lists[lru]);
382 }
383 @@ -100,6 +279,9 @@ static __always_inline void add_page_to_
384 static __always_inline void del_page_from_lru_list(struct page *page,
385 struct lruvec *lruvec)
386 {
387 + if (lru_gen_del_page(page, lruvec, false))
388 + return;
389 +
390 list_del(&page->lru);
391 update_lru_size(lruvec, page_lru(page), page_zonenum(page),
392 -thp_nr_pages(page));
393 --- a/include/linux/mmzone.h
394 +++ b/include/linux/mmzone.h
395 @@ -294,6 +294,72 @@ enum lruvec_flags {
396 */
397 };
398
399 +struct lruvec;
400 +
401 +#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
402 +#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
403 +
404 +#ifdef CONFIG_LRU_GEN
405 +
406 +/*
407 + * For each lruvec, evictable pages are divided into multiple generations. The
408 + * youngest and the oldest generation numbers, AKA max_seq and min_seq, are
409 + * monotonically increasing. The sliding window technique is used to track at
410 + * least MIN_NR_GENS and at most MAX_NR_GENS generations. An offset within the
411 + * window, AKA gen, indexes an array of per-type and per-zone lists for the
412 + * corresponding generation. The counter in page->flags stores gen+1 while a
413 + * page is on one of the multigenerational lru lists. Otherwise, it stores 0.
414 + *
415 + * After a page is faulted in, the aging must check the accessed bit at least
416 + * twice before the eviction would consider it. The first check clears the
417 + * accessed bit set during the initial fault. The second check makes sure this
418 + * page hasn't been used since then.
419 + */
420 +#define MIN_NR_GENS 2
421 +#define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS)
422 +
423 +struct lrugen {
424 + /* the aging increments the max generation number */
425 + unsigned long max_seq;
426 + /* the eviction increments the min generation numbers */
427 + unsigned long min_seq[ANON_AND_FILE];
428 + /* the birth time of each generation in jiffies */
429 + unsigned long timestamps[MAX_NR_GENS];
430 + /* the multigenerational lru lists */
431 + struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
432 + /* the sizes of the multigenerational lru lists in pages */
433 + unsigned long sizes[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
434 + /* whether the multigenerational lru is enabled */
435 + bool enabled[ANON_AND_FILE];
436 +};
437 +
438 +#define MAX_BATCH_SIZE 8192
439 +
440 +void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec);
441 +void lru_gen_change_state(bool enable, bool main, bool swap);
442 +
443 +#ifdef CONFIG_MEMCG
444 +void lru_gen_init_memcg(struct mem_cgroup *memcg);
445 +#endif
446 +
447 +#else /* !CONFIG_LRU_GEN */
448 +
449 +static inline void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec)
450 +{
451 +}
452 +
453 +static inline void lru_gen_change_state(bool enable, bool main, bool swap)
454 +{
455 +}
456 +
457 +#ifdef CONFIG_MEMCG
458 +static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
459 +{
460 +}
461 +#endif
462 +
463 +#endif /* CONFIG_LRU_GEN */
464 +
465 struct lruvec {
466 struct list_head lists[NR_LRU_LISTS];
467 /* per lruvec lru_lock for memcg */
468 @@ -311,6 +377,10 @@ struct lruvec {
469 unsigned long refaults[ANON_AND_FILE];
470 /* Various lruvec state flags (enum lruvec_flags) */
471 unsigned long flags;
472 +#ifdef CONFIG_LRU_GEN
473 + /* unevictable pages are on LRU_UNEVICTABLE */
474 + struct lrugen evictable;
475 +#endif
476 #ifdef CONFIG_MEMCG
477 struct pglist_data *pgdat;
478 #endif
479 --- a/include/linux/page-flags-layout.h
480 +++ b/include/linux/page-flags-layout.h
481 @@ -26,6 +26,14 @@
482
483 #define ZONES_WIDTH ZONES_SHIFT
484
485 +#ifdef CONFIG_LRU_GEN
486 +/* LRU_GEN_WIDTH is generated from order_base_2(CONFIG_NR_LRU_GENS + 1). */
487 +#define LRU_REFS_WIDTH (CONFIG_TIERS_PER_GEN - 2)
488 +#else
489 +#define LRU_GEN_WIDTH 0
490 +#define LRU_REFS_WIDTH 0
491 +#endif /* CONFIG_LRU_GEN */
492 +
493 #ifdef CONFIG_SPARSEMEM
494 #include <asm/sparsemem.h>
495 #define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS)
496 @@ -55,7 +63,8 @@
497 #define SECTIONS_WIDTH 0
498 #endif
499
500 -#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
501 +#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_SHIFT \
502 + <= BITS_PER_LONG - NR_PAGEFLAGS
503 #define NODES_WIDTH NODES_SHIFT
504 #elif defined(CONFIG_SPARSEMEM_VMEMMAP)
505 #error "Vmemmap: No space for nodes field in page flags"
506 @@ -89,8 +98,8 @@
507 #define LAST_CPUPID_SHIFT 0
508 #endif
509
510 -#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \
511 - <= BITS_PER_LONG - NR_PAGEFLAGS
512 +#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
513 + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
514 #define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT
515 #else
516 #define LAST_CPUPID_WIDTH 0
517 @@ -100,8 +109,8 @@
518 #define LAST_CPUPID_NOT_IN_PAGE_FLAGS
519 #endif
520
521 -#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \
522 - > BITS_PER_LONG - NR_PAGEFLAGS
523 +#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
524 + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS
525 #error "Not enough bits in page flags"
526 #endif
527
528 --- a/include/linux/page-flags.h
529 +++ b/include/linux/page-flags.h
530 @@ -845,7 +845,7 @@ static inline void ClearPageSlabPfmemall
531 1UL << PG_private | 1UL << PG_private_2 | \
532 1UL << PG_writeback | 1UL << PG_reserved | \
533 1UL << PG_slab | 1UL << PG_active | \
534 - 1UL << PG_unevictable | __PG_MLOCKED)
535 + 1UL << PG_unevictable | __PG_MLOCKED | LRU_GEN_MASK)
536
537 /*
538 * Flags checked when a page is prepped for return by the page allocator.
539 @@ -856,7 +856,7 @@ static inline void ClearPageSlabPfmemall
540 * alloc-free cycle to prevent from reusing the page.
541 */
542 #define PAGE_FLAGS_CHECK_AT_PREP \
543 - (PAGEFLAGS_MASK & ~__PG_HWPOISON)
544 + ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK)
545
546 #define PAGE_FLAGS_PRIVATE \
547 (1UL << PG_private | 1UL << PG_private_2)
548 --- a/include/linux/sched.h
549 +++ b/include/linux/sched.h
550 @@ -911,6 +911,9 @@ struct task_struct {
551 #ifdef CONFIG_MEMCG
552 unsigned in_user_fault:1;
553 #endif
554 +#ifdef CONFIG_LRU_GEN
555 + unsigned in_nonseq_fault:1;
556 +#endif
557 #ifdef CONFIG_COMPAT_BRK
558 unsigned brk_randomized:1;
559 #endif
560 --- a/kernel/bounds.c
561 +++ b/kernel/bounds.c
562 @@ -22,6 +22,9 @@ int main(void)
563 DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
564 #endif
565 DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
566 +#ifdef CONFIG_LRU_GEN
567 + DEFINE(LRU_GEN_WIDTH, order_base_2(CONFIG_NR_LRU_GENS + 1));
568 +#endif
569 /* End of constants */
570
571 return 0;
572 --- a/kernel/cgroup/cgroup-internal.h
573 +++ b/kernel/cgroup/cgroup-internal.h
574 @@ -165,7 +165,6 @@ struct cgroup_mgctx {
575 #define DEFINE_CGROUP_MGCTX(name) \
576 struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
577
578 -extern struct mutex cgroup_mutex;
579 extern spinlock_t css_set_lock;
580 extern struct cgroup_subsys *cgroup_subsys[];
581 extern struct list_head cgroup_roots;
582 --- a/mm/huge_memory.c
583 +++ b/mm/huge_memory.c
584 @@ -2364,7 +2364,8 @@ static void __split_huge_page_tail(struc
585 #ifdef CONFIG_64BIT
586 (1L << PG_arch_2) |
587 #endif
588 - (1L << PG_dirty)));
589 + (1L << PG_dirty) |
590 + LRU_GEN_MASK | LRU_REFS_MASK));
591
592 /* ->mapping in first tail page is compound_mapcount */
593 VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
594 --- a/mm/memcontrol.c
595 +++ b/mm/memcontrol.c
596 @@ -5241,6 +5241,7 @@ static struct mem_cgroup *mem_cgroup_all
597 memcg->deferred_split_queue.split_queue_len = 0;
598 #endif
599 idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
600 + lru_gen_init_memcg(memcg);
601 return memcg;
602 fail:
603 mem_cgroup_id_remove(memcg);
604 --- a/mm/memory.c
605 +++ b/mm/memory.c
606 @@ -4788,6 +4788,7 @@ vm_fault_t handle_mm_fault(struct vm_are
607 unsigned int flags, struct pt_regs *regs)
608 {
609 vm_fault_t ret;
610 + bool nonseq_fault = !(vma->vm_flags & VM_SEQ_READ);
611
612 __set_current_state(TASK_RUNNING);
613
614 @@ -4809,11 +4810,17 @@ vm_fault_t handle_mm_fault(struct vm_are
615 if (flags & FAULT_FLAG_USER)
616 mem_cgroup_enter_user_fault();
617
618 + if (nonseq_fault)
619 + task_enter_nonseq_fault();
620 +
621 if (unlikely(is_vm_hugetlb_page(vma)))
622 ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
623 else
624 ret = __handle_mm_fault(vma, address, flags);
625
626 + if (nonseq_fault)
627 + task_exit_nonseq_fault();
628 +
629 if (flags & FAULT_FLAG_USER) {
630 mem_cgroup_exit_user_fault();
631 /*
632 --- a/mm/mm_init.c
633 +++ b/mm/mm_init.c
634 @@ -65,14 +65,16 @@ void __init mminit_verify_pageflags_layo
635
636 shift = 8 * sizeof(unsigned long);
637 width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH
638 - - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH;
639 + - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH;
640 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
641 - "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n",
642 + "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n",
643 SECTIONS_WIDTH,
644 NODES_WIDTH,
645 ZONES_WIDTH,
646 LAST_CPUPID_WIDTH,
647 KASAN_TAG_WIDTH,
648 + LRU_GEN_WIDTH,
649 + LRU_REFS_WIDTH,
650 NR_PAGEFLAGS);
651 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
652 "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n",
653 --- a/mm/page_alloc.c
654 +++ b/mm/page_alloc.c
655 @@ -7459,6 +7459,7 @@ static void __meminit pgdat_init_interna
656
657 pgdat_page_ext_init(pgdat);
658 lruvec_init(&pgdat->__lruvec);
659 + lru_gen_init_state(NULL, &pgdat->__lruvec);
660 }
661
662 static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
663 --- a/mm/swap.c
664 +++ b/mm/swap.c
665 @@ -446,6 +446,11 @@ void lru_cache_add(struct page *page)
666 VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
667 VM_BUG_ON_PAGE(PageLRU(page), page);
668
669 + /* see the comment in lru_gen_add_page() */
670 + if (lru_gen_enabled() && !PageUnevictable(page) &&
671 + task_in_nonseq_fault() && !(current->flags & PF_MEMALLOC))
672 + SetPageActive(page);
673 +
674 get_page(page);
675 local_lock(&lru_pvecs.lock);
676 pvec = this_cpu_ptr(&lru_pvecs.lru_add);
677 @@ -547,7 +552,7 @@ static void lru_deactivate_file_fn(struc
678
679 static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec)
680 {
681 - if (PageActive(page) && !PageUnevictable(page)) {
682 + if (!PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) {
683 int nr_pages = thp_nr_pages(page);
684
685 del_page_from_lru_list(page, lruvec);
686 @@ -661,7 +666,7 @@ void deactivate_file_page(struct page *p
687 */
688 void deactivate_page(struct page *page)
689 {
690 - if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
691 + if (PageLRU(page) && !PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) {
692 struct pagevec *pvec;
693
694 local_lock(&lru_pvecs.lock);
695 --- a/mm/swapfile.c
696 +++ b/mm/swapfile.c
697 @@ -2689,6 +2689,7 @@ SYSCALL_DEFINE1(swapoff, const char __us
698 err = 0;
699 atomic_inc(&proc_poll_event);
700 wake_up_interruptible(&proc_poll_wait);
701 + lru_gen_change_state(false, false, true);
702
703 out_dput:
704 filp_close(victim, NULL);
705 @@ -3350,6 +3351,7 @@ SYSCALL_DEFINE2(swapon, const char __use
706 mutex_unlock(&swapon_mutex);
707 atomic_inc(&proc_poll_event);
708 wake_up_interruptible(&proc_poll_wait);
709 + lru_gen_change_state(true, false, true);
710
711 error = 0;
712 goto out;
713 --- a/mm/vmscan.c
714 +++ b/mm/vmscan.c
715 @@ -50,6 +50,7 @@
716 #include <linux/printk.h>
717 #include <linux/dax.h>
718 #include <linux/psi.h>
719 +#include <linux/memory.h>
720
721 #include <asm/tlbflush.h>
722 #include <asm/div64.h>
723 @@ -2815,6 +2816,273 @@ static bool can_age_anon_pages(struct pg
724 return can_demote(pgdat->node_id, sc);
725 }
726
727 +#ifdef CONFIG_LRU_GEN
728 +
729 +/******************************************************************************
730 + * shorthand helpers
731 + ******************************************************************************/
732 +
733 +#define for_each_gen_type_zone(gen, type, zone) \
734 + for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \
735 + for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
736 + for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
737 +
738 +static int page_lru_gen(struct page *page)
739 +{
740 + unsigned long flags = READ_ONCE(page->flags);
741 +
742 + return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
743 +}
744 +
745 +static struct lruvec *get_lruvec(int nid, struct mem_cgroup *memcg)
746 +{
747 + struct pglist_data *pgdat = NODE_DATA(nid);
748 +
749 +#ifdef CONFIG_MEMCG
750 + if (memcg) {
751 + struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
752 +
753 + if (lruvec->pgdat != pgdat)
754 + lruvec->pgdat = pgdat;
755 +
756 + return lruvec;
757 + }
758 +#endif
759 + return pgdat ? &pgdat->__lruvec : NULL;
760 +}
761 +
762 +static int get_nr_gens(struct lruvec *lruvec, int type)
763 +{
764 + return lruvec->evictable.max_seq - lruvec->evictable.min_seq[type] + 1;
765 +}
766 +
767 +static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
768 +{
769 + return get_nr_gens(lruvec, 1) >= MIN_NR_GENS &&
770 + get_nr_gens(lruvec, 1) <= get_nr_gens(lruvec, 0) &&
771 + get_nr_gens(lruvec, 0) <= MAX_NR_GENS;
772 +}
773 +
774 +/******************************************************************************
775 + * state change
776 + ******************************************************************************/
777 +
778 +#ifdef CONFIG_LRU_GEN_ENABLED
779 +DEFINE_STATIC_KEY_TRUE(lru_gen_static_key);
780 +#else
781 +DEFINE_STATIC_KEY_FALSE(lru_gen_static_key);
782 +#endif
783 +
784 +static int lru_gen_nr_swapfiles;
785 +
786 +static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
787 +{
788 + int gen, type, zone;
789 + enum lru_list lru;
790 + struct lrugen *lrugen = &lruvec->evictable;
791 +
792 + for_each_evictable_lru(lru) {
793 + type = is_file_lru(lru);
794 +
795 + if (lrugen->enabled[type] && !list_empty(&lruvec->lists[lru]))
796 + return false;
797 + }
798 +
799 + for_each_gen_type_zone(gen, type, zone) {
800 + if (!lrugen->enabled[type] && !list_empty(&lrugen->lists[gen][type][zone]))
801 + return false;
802 +
803 + /* unlikely but not a bug when reset_batch_size() is pending */
804 + VM_WARN_ON(!lrugen->enabled[type] && lrugen->sizes[gen][type][zone]);
805 + }
806 +
807 + return true;
808 +}
809 +
810 +static bool fill_lists(struct lruvec *lruvec)
811 +{
812 + enum lru_list lru;
813 + int remaining = MAX_BATCH_SIZE;
814 +
815 + for_each_evictable_lru(lru) {
816 + int type = is_file_lru(lru);
817 + bool active = is_active_lru(lru);
818 + struct list_head *head = &lruvec->lists[lru];
819 +
820 + if (!lruvec->evictable.enabled[type])
821 + continue;
822 +
823 + while (!list_empty(head)) {
824 + bool success;
825 + struct page *page = lru_to_page(head);
826 +
827 + VM_BUG_ON_PAGE(PageTail(page), page);
828 + VM_BUG_ON_PAGE(PageUnevictable(page), page);
829 + VM_BUG_ON_PAGE(PageActive(page) != active, page);
830 + VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page);
831 + VM_BUG_ON_PAGE(page_lru_gen(page) < MAX_NR_GENS, page);
832 +
833 + prefetchw_prev_lru_page(page, head, flags);
834 +
835 + del_page_from_lru_list(page, lruvec);
836 + success = lru_gen_add_page(page, lruvec, false);
837 + VM_BUG_ON(!success);
838 +
839 + if (!--remaining)
840 + return false;
841 + }
842 + }
843 +
844 + return true;
845 +}
846 +
847 +static bool drain_lists(struct lruvec *lruvec)
848 +{
849 + int gen, type, zone;
850 + int remaining = MAX_BATCH_SIZE;
851 +
852 + for_each_gen_type_zone(gen, type, zone) {
853 + struct list_head *head = &lruvec->evictable.lists[gen][type][zone];
854 +
855 + if (lruvec->evictable.enabled[type])
856 + continue;
857 +
858 + while (!list_empty(head)) {
859 + bool success;
860 + struct page *page = lru_to_page(head);
861 +
862 + VM_BUG_ON_PAGE(PageTail(page), page);
863 + VM_BUG_ON_PAGE(PageUnevictable(page), page);
864 + VM_BUG_ON_PAGE(PageActive(page), page);
865 + VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page);
866 + VM_BUG_ON_PAGE(page_zonenum(page) != zone, page);
867 +
868 + prefetchw_prev_lru_page(page, head, flags);
869 +
870 + success = lru_gen_del_page(page, lruvec, false);
871 + VM_BUG_ON(!success);
872 + add_page_to_lru_list(page, lruvec);
873 +
874 + if (!--remaining)
875 + return false;
876 + }
877 + }
878 +
879 + return true;
880 +}
881 +
882 +/*
883 + * For file page tracking, we enable/disable it according to the main switch.
884 + * For anon page tracking, we only enabled it when the main switch is on and
885 + * there is at least one swapfile; we disable it when there are no swapfiles
886 + * regardless of the value of the main switch. Otherwise, we will eventually
887 + * reach the max size of the sliding window and have to call inc_min_seq().
888 + */
889 +void lru_gen_change_state(bool enable, bool main, bool swap)
890 +{
891 + static DEFINE_MUTEX(state_mutex);
892 +
893 + struct mem_cgroup *memcg;
894 +
895 + mem_hotplug_begin();
896 + cgroup_lock();
897 + mutex_lock(&state_mutex);
898 +
899 + if (swap) {
900 + if (enable)
901 + swap = !lru_gen_nr_swapfiles++;
902 + else
903 + swap = !--lru_gen_nr_swapfiles;
904 + }
905 +
906 + if (main && enable != lru_gen_enabled()) {
907 + if (enable)
908 + static_branch_enable(&lru_gen_static_key);
909 + else
910 + static_branch_disable(&lru_gen_static_key);
911 + } else if (!swap || !lru_gen_enabled())
912 + goto unlock;
913 +
914 + memcg = mem_cgroup_iter(NULL, NULL, NULL);
915 + do {
916 + int nid;
917 +
918 + for_each_node(nid) {
919 + struct lruvec *lruvec = get_lruvec(nid, memcg);
920 +
921 + if (!lruvec)
922 + continue;
923 +
924 + spin_lock_irq(&lruvec->lru_lock);
925 +
926 + VM_BUG_ON(!seq_is_valid(lruvec));
927 + VM_BUG_ON(!state_is_valid(lruvec));
928 +
929 + lruvec->evictable.enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles;
930 + lruvec->evictable.enabled[1] = lru_gen_enabled();
931 +
932 + while (!(enable ? fill_lists(lruvec) : drain_lists(lruvec))) {
933 + spin_unlock_irq(&lruvec->lru_lock);
934 + cond_resched();
935 + spin_lock_irq(&lruvec->lru_lock);
936 + }
937 +
938 + spin_unlock_irq(&lruvec->lru_lock);
939 + }
940 +
941 + cond_resched();
942 + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
943 +unlock:
944 + mutex_unlock(&state_mutex);
945 + cgroup_unlock();
946 + mem_hotplug_done();
947 +}
948 +
949 +/******************************************************************************
950 + * initialization
951 + ******************************************************************************/
952 +
953 +void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec)
954 +{
955 + int i;
956 + int gen, type, zone;
957 + struct lrugen *lrugen = &lruvec->evictable;
958 +
959 + lrugen->max_seq = MIN_NR_GENS + 1;
960 + lrugen->enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles;
961 + lrugen->enabled[1] = lru_gen_enabled();
962 +
963 + for (i = 0; i <= MIN_NR_GENS + 1; i++)
964 + lrugen->timestamps[i] = jiffies;
965 +
966 + for_each_gen_type_zone(gen, type, zone)
967 + INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
968 +}
969 +
970 +#ifdef CONFIG_MEMCG
971 +void lru_gen_init_memcg(struct mem_cgroup *memcg)
972 +{
973 + int nid;
974 +
975 + for_each_node(nid) {
976 + struct lruvec *lruvec = get_lruvec(nid, memcg);
977 +
978 + lru_gen_init_state(memcg, lruvec);
979 + }
980 +}
981 +#endif
982 +
983 +static int __init init_lru_gen(void)
984 +{
985 + BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
986 + BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
987 +
988 + return 0;
989 +};
990 +late_initcall(init_lru_gen);
991 +
992 +#endif /* CONFIG_LRU_GEN */
993 +
994 static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
995 {
996 unsigned long nr[NR_LRU_LISTS];