target/linux/generic/backport-6.1/020-v6.4-19-mm-Multi-gen-LRU-remove-wait_event_killable.patch

   1 From 418038c22452df38cde519cc8c662bb15139764a Mon Sep 17 00:00:00 2001
   2 From: Kalesh Singh <kaleshsingh@google.com>
   3 Date: Thu, 13 Apr 2023 14:43:26 -0700
   4 Subject: [PATCH 19/19] mm: Multi-gen LRU: remove wait_event_killable()
   5
   6 Android 14 and later default to MGLRU [1] and field telemetry showed
   7 occasional long tail latency (>100ms) in the reclaim path.
   8
   9 Tracing revealed priority inversion in the reclaim path.  In
  10 try_to_inc_max_seq(), when high priority tasks were blocked on
  11 wait_event_killable(), preemption of the low priority task that was due to
  12 call wake_up_all() caused those high priority tasks to wait longer than
  13 necessary.  In general, this problem is not different from others of its
  14 kind, e.g., one caused by mutex_lock().  However, it is specific to MGLRU
  15 because it introduced the new wait queue lruvec->mm_state.wait.
  16
  17 The purpose of this new wait queue is to avoid the thundering herd
  18 problem.  If many direct reclaimers rush into try_to_inc_max_seq(), only
  19 one can succeed, i.e., the one to wake up the rest, and the rest who
  20 failed might cause premature OOM kills if they do not wait.  So far there
  21 is no evidence supporting this scenario, based on how often the wait has
  22 been hit.  And this raises the question of how useful the wait queue is
  23 in practice.
  24
  25 Based on Minchan's recommendation, which is in line with his commit
  26 6d4675e60135 ("mm: don't be stuck to rmap lock on reclaim path") and the
  27 rest of the MGLRU code which also uses trylock when possible, remove the
  28 wait queue.
  29
  30 [1] https://android-review.googlesource.com/q/I7ed7fbfd6ef9ce10053347528125dd98c39e50bf
  31
  32 Link: https://lkml.kernel.org/r/20230413214326.2147568-1-kaleshsingh@google.com
  33 Fixes: bd74fdaea146 ("mm: multi-gen LRU: support page table walks")
  34 Signed-off-by: Kalesh Singh <kaleshsingh@google.com>
  35 Suggested-by: Minchan Kim <minchan@kernel.org>
  36 Reported-by: Wei Wang <wvw@google.com>
  37 Acked-by: Yu Zhao <yuzhao@google.com>
  38 Cc: Minchan Kim <minchan@kernel.org>
  39 Cc: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
  40 Cc: Oleksandr Natalenko <oleksandr@natalenko.name>
  41 Cc: Suleiman Souhlal <suleiman@google.com>
  42 Cc: Suren Baghdasaryan <surenb@google.com>
  43 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
  44 ---
  45  include/linux/mmzone.h |   8 +--
  46  mm/vmscan.c            | 112 +++++++++++++++--------------------------
  47  2 files changed, 42 insertions(+), 78 deletions(-)
  48
  49 diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
  50 index 403c7461e7a70..d62a5accf1be4 100644
  51 --- a/include/linux/mmzone.h
  52 +++ b/include/linux/mmzone.h
  53 @@ -453,18 +453,14 @@ enum {
  54  struct lru_gen_mm_state {
  55         /* set to max_seq after each iteration */
  56         unsigned long seq;
  57 -       /* where the current iteration continues (inclusive) */
  58 +       /* where the current iteration continues after */
  59         struct list_head *head;
  60 -       /* where the last iteration ended (exclusive) */
  61 +       /* where the last iteration ended before */
  62         struct list_head *tail;
  63 -       /* to wait for the last page table walker to finish */
  64 -       struct wait_queue_head wait;
  65         /* Bloom filters flip after each iteration */
  66         unsigned long *filters[NR_BLOOM_FILTERS];
  67         /* the mm stats for debugging */
  68         unsigned long stats[NR_HIST_GENS][NR_MM_STATS];
  69 -       /* the number of concurrent page table walkers */
  70 -       int nr_walkers;
  71  };
  72
  73  struct lru_gen_mm_walk {
  74 diff --git a/mm/vmscan.c b/mm/vmscan.c
  75 index f6ce7a1fd78a3..851758303dbf4 100644
  76 --- a/mm/vmscan.c
  77 +++ b/mm/vmscan.c
  78 @@ -3371,18 +3371,13 @@ void lru_gen_del_mm(struct mm_struct *mm)
  79                 if (!lruvec)
  80                         continue;
  81
  82 -               /* where the last iteration ended (exclusive) */
  83 +               /* where the current iteration continues after */
  84 +               if (lruvec->mm_state.head == &mm->lru_gen.list)
  85 +                       lruvec->mm_state.head = lruvec->mm_state.head->prev;
  86 +
  87 +               /* where the last iteration ended before */
  88                 if (lruvec->mm_state.tail == &mm->lru_gen.list)
  89                         lruvec->mm_state.tail = lruvec->mm_state.tail->next;
  90 -
  91 -               /* where the current iteration continues (inclusive) */
  92 -               if (lruvec->mm_state.head != &mm->lru_gen.list)
  93 -                       continue;
  94 -
  95 -               lruvec->mm_state.head = lruvec->mm_state.head->next;
  96 -               /* the deletion ends the current iteration */
  97 -               if (lruvec->mm_state.head == &mm_list->fifo)
  98 -                       WRITE_ONCE(lruvec->mm_state.seq, lruvec->mm_state.seq + 1);
  99         }
 100
 101         list_del_init(&mm->lru_gen.list);
 102 @@ -3478,68 +3473,54 @@ static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk,
 103                             struct mm_struct **iter)
 104  {
 105         bool first = false;
 106 -       bool last = true;
 107 +       bool last = false;
 108         struct mm_struct *mm = NULL;
 109         struct mem_cgroup *memcg = lruvec_memcg(lruvec);
 110         struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
 111         struct lru_gen_mm_state *mm_state = &lruvec->mm_state;
 112
 113         /*
 114 -        * There are four interesting cases for this page table walker:
 115 -        * 1. It tries to start a new iteration of mm_list with a stale max_seq;
 116 -        *    there is nothing left to do.
 117 -        * 2. It's the first of the current generation, and it needs to reset
 118 -        *    the Bloom filter for the next generation.
 119 -        * 3. It reaches the end of mm_list, and it needs to increment
 120 -        *    mm_state->seq; the iteration is done.
 121 -        * 4. It's the last of the current generation, and it needs to reset the
 122 -        *    mm stats counters for the next generation.
 123 +        * mm_state->seq is incremented after each iteration of mm_list. There
 124 +        * are three interesting cases for this page table walker:
 125 +        * 1. It tries to start a new iteration with a stale max_seq: there is
 126 +        *    nothing left to do.
 127 +        * 2. It started the next iteration: it needs to reset the Bloom filter
 128 +        *    so that a fresh set of PTE tables can be recorded.
 129 +        * 3. It ended the current iteration: it needs to reset the mm stats
 130 +        *    counters and tell its caller to increment max_seq.
 131          */
 132         spin_lock(&mm_list->lock);
 133
 134         VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq);
 135 -       VM_WARN_ON_ONCE(*iter && mm_state->seq > walk->max_seq);
 136 -       VM_WARN_ON_ONCE(*iter && !mm_state->nr_walkers);
 137
 138 -       if (walk->max_seq <= mm_state->seq) {
 139 -               if (!*iter)
 140 -                       last = false;
 141 +       if (walk->max_seq <= mm_state->seq)
 142                 goto done;
 143 -       }
 144
 145 -       if (!mm_state->nr_walkers) {
 146 -               VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo);
 147 +       if (!mm_state->head)
 148 +               mm_state->head = &mm_list->fifo;
 149
 150 -               mm_state->head = mm_list->fifo.next;
 151 +       if (mm_state->head == &mm_list->fifo)
 152                 first = true;
 153 -       }
 154 -
 155 -       while (!mm && mm_state->head != &mm_list->fifo) {
 156 -               mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list);
 157
 158 +       do {
 159                 mm_state->head = mm_state->head->next;
 160 +               if (mm_state->head == &mm_list->fifo) {
 161 +                       WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
 162 +                       last = true;
 163 +                       break;
 164 +               }
 165
 166                 /* force scan for those added after the last iteration */
 167 -               if (!mm_state->tail || mm_state->tail == &mm->lru_gen.list) {
 168 -                       mm_state->tail = mm_state->head;
 169 +               if (!mm_state->tail || mm_state->tail == mm_state->head) {
 170 +                       mm_state->tail = mm_state->head->next;
 171                         walk->force_scan = true;
 172                 }
 173
 174 +               mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list);
 175                 if (should_skip_mm(mm, walk))
 176                         mm = NULL;
 177 -       }
 178 -
 179 -       if (mm_state->head == &mm_list->fifo)
 180 -               WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
 181 +       } while (!mm);
 182  done:
 183 -       if (*iter && !mm)
 184 -               mm_state->nr_walkers--;
 185 -       if (!*iter && mm)
 186 -               mm_state->nr_walkers++;
 187 -
 188 -       if (mm_state->nr_walkers)
 189 -               last = false;
 190 -
 191         if (*iter || last)
 192                 reset_mm_stats(lruvec, walk, last);
 193
 194 @@ -3567,9 +3548,9 @@ static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq)
 195
 196         VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq);
 197
 198 -       if (max_seq > mm_state->seq && !mm_state->nr_walkers) {
 199 -               VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo);
 200 -
 201 +       if (max_seq > mm_state->seq) {
 202 +               mm_state->head = NULL;
 203 +               mm_state->tail = NULL;
 204                 WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
 205                 reset_mm_stats(lruvec, NULL, true);
 206                 success = true;
 207 @@ -4172,10 +4153,6 @@ static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end,
 208
 209                 walk_pmd_range(&val, addr, next, args);
 210
 211 -               /* a racy check to curtail the waiting time */
 212 -               if (wq_has_sleeper(&walk->lruvec->mm_state.wait))
 213 -                       return 1;
 214 -
 215                 if (need_resched() || walk->batched >= MAX_LRU_BATCH) {
 216                         end = (addr | ~PUD_MASK) + 1;
 217                         goto done;
 218 @@ -4208,8 +4185,14 @@ static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_
 219         walk->next_addr = FIRST_USER_ADDRESS;
 220
 221         do {
 222 +               DEFINE_MAX_SEQ(lruvec);
 223 +
 224                 err = -EBUSY;
 225
 226 +               /* another thread might have called inc_max_seq() */
 227 +               if (walk->max_seq != max_seq)
 228 +                       break;
 229 +
 230                 /* folio_update_gen() requires stable folio_memcg() */
 231                 if (!mem_cgroup_trylock_pages(memcg))
 232                         break;
 233 @@ -4442,25 +4425,12 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
 234                 success = iterate_mm_list(lruvec, walk, &mm);
 235                 if (mm)
 236                         walk_mm(lruvec, mm, walk);
 237 -
 238 -               cond_resched();
 239         } while (mm);
 240  done:
 241 -       if (!success) {
 242 -               if (sc->priority <= DEF_PRIORITY - 2)
 243 -                       wait_event_killable(lruvec->mm_state.wait,
 244 -                                           max_seq < READ_ONCE(lrugen->max_seq));
 245 -               return false;
 246 -       }
 247 +       if (success)
 248 +               inc_max_seq(lruvec, can_swap, force_scan);
 249
 250 -       VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq));
 251 -
 252 -       inc_max_seq(lruvec, can_swap, force_scan);
 253 -       /* either this sees any waiters or they will see updated max_seq */
 254 -       if (wq_has_sleeper(&lruvec->mm_state.wait))
 255 -               wake_up_all(&lruvec->mm_state.wait);
 256 -
 257 -       return true;
 258 +       return success;
 259  }
 260
 261  /******************************************************************************
 262 @@ -6105,7 +6075,6 @@ void lru_gen_init_lruvec(struct lruvec *lruvec)
 263                 INIT_LIST_HEAD(&lrugen->folios[gen][type][zone]);
 264
 265         lruvec->mm_state.seq = MIN_NR_GENS;
 266 -       init_waitqueue_head(&lruvec->mm_state.wait);
 267  }
 268
 269  #ifdef CONFIG_MEMCG
 270 @@ -6138,7 +6107,6 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg)
 271         for_each_node(nid) {
 272                 struct lruvec *lruvec = get_lruvec(memcg, nid);
 273
 274 -               VM_WARN_ON_ONCE(lruvec->mm_state.nr_walkers);
 275                 VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
 276                                            sizeof(lruvec->lrugen.nr_pages)));
 277
 278 --
 279 2.40.1
 280