8f699626402b68bdb61bd4eaad39f57c78192bf3
[openwrt/openwrt.git] / target / linux / bcm27xx / patches-5.15 / 950-0742-drm-vc4-hvs-Defer-dlist-slots-deallocation.patch
1 From a09f1129301f85dfb2f800c6dab2a5f8acac8f27 Mon Sep 17 00:00:00 2001
2 From: Maxime Ripard <maxime@cerno.tech>
3 Date: Thu, 16 Dec 2021 14:54:54 +0100
4 Subject: [PATCH] drm/vc4: hvs: Defer dlist slots deallocation
5
6 During normal operations, the cursor position update is done through an
7 asynchronous plane update, which on the vc4 driver basically just
8 modifies the right dlist word to move the plane to the new coordinates.
9
10 However, when we have the overscan margins setup, we fall back to a
11 regular commit when we are next to the edges. And since that commit
12 happens to be on a cursor plane, it's considered a legacy cursor update
13 by KMS.
14
15 The main difference it makes is that it won't wait for its completion
16 (ie, next vblank) before returning. This means if we have multiple
17 commits happening in rapid succession, we can have several of them
18 happening before the next vblank.
19
20 In parallel, our dlist allocation is tied to a CRTC state, and each time
21 we do a commit we end up with a new CRTC state, with the previous one
22 being freed. This means that we free our previous dlist entry (but don't
23 clear it though) every time a new one is being committed.
24
25 Now, if we were to have two commits happening before the next vblank, we
26 could end up freeing and reusing the same dlist entries before the next
27 vblank.
28
29 Indeed, we would start from an initial state taking, for example, the
30 dlist entries 10 to 20, then start a commit taking the entries 20 to 30
31 and setting the dlist pointer to 20, and freeing the dlist entries 10 to
32 20. However, since we haven't reached vblank yet, the HVS is still using
33 the entries 10 to 20.
34
35 If we were to make a new commit now, chances are the allocator is going
36 to give the 10 to 20 entries back, and we would change their content to
37 match the new state. If vblank hasn't happened yet, we just corrupted
38 the active dlist entries.
39
40 A first attempt to solve this was made by creating an intermediate dlist
41 buffer to store the current (ie, as of the last commit) dlist content,
42 that we would update each time the HVS is done with a frame. However, if
43 the interrupt handler missed the vblank window, we would end up copying
44 our intermediate dlist to the hardware one during the composition,
45 essentially creating the same issue.
46
47 Since making sure that our interrupt handler runs within a fixed,
48 constrained, time window would require making Linux a real-time kernel,
49 this seems a bit out of scope.
50
51 Instead, we can work around our original issue by keeping the dlist
52 slots allocation longer. That way, we won't reuse a dlist slot while
53 it's still in flight. In order to achieve this, instead of freeing the
54 dlist slot when its associated CRTC state is destroyed, we'll queue it
55 in a list.
56
57 A naive implementation would free the buffers in that queue when we get
58 our end of frame interrupt. However, there's still a race since, just
59 like in the shadow dlist case, we don't control when the handler for
60 that interrupt is going to run. Thus, we can end up with a commit adding
61 an old dlist allocation to our queue during the window between our
62 actual interrupt and when our handler will run. And since that buffer is
63 still being used for the composition of the current frame, we can't free
64 it right away, exposing us to the original bug.
65
66 Fortunately for us, the hardware provides a frame counter that is
67 increased each time the first line of a frame is being generated.
68 Associating the frame count at which the image is supposed to go away
69 with the allocation, and then only deallocating buffers whose count is
70 below or equal to the one we see when the deallocation code runs, should
71 prevent the above race from occurring.
72
73 Signed-off-by: Maxime Ripard <maxime@cerno.tech>
74 ---
75 drivers/gpu/drm/vc4/vc4_crtc.c | 10 +-
76 drivers/gpu/drm/vc4/vc4_drv.h | 15 ++-
77 drivers/gpu/drm/vc4/vc4_hvs.c | 181 ++++++++++++++++++++++++++++++---
78 drivers/gpu/drm/vc4/vc4_regs.h | 1 +
79 4 files changed, 184 insertions(+), 23 deletions(-)
80
81 --- a/drivers/gpu/drm/vc4/vc4_crtc.c
82 +++ b/drivers/gpu/drm/vc4/vc4_crtc.c
83 @@ -943,14 +943,8 @@ void vc4_crtc_destroy_state(struct drm_c
84 struct vc4_dev *vc4 = to_vc4_dev(crtc->dev);
85 struct vc4_crtc_state *vc4_state = to_vc4_crtc_state(state);
86
87 - if (drm_mm_node_allocated(&vc4_state->mm)) {
88 - unsigned long flags;
89 -
90 - spin_lock_irqsave(&vc4->hvs->mm_lock, flags);
91 - drm_mm_remove_node(&vc4_state->mm);
92 - spin_unlock_irqrestore(&vc4->hvs->mm_lock, flags);
93 -
94 - }
95 + vc4_hvs_mark_dlist_entry_stale(vc4->hvs, vc4_state->mm);
96 + vc4_state->mm = NULL;
97
98 drm_atomic_helper_crtc_destroy_state(crtc, state);
99 }
100 --- a/drivers/gpu/drm/vc4/vc4_drv.h
101 +++ b/drivers/gpu/drm/vc4/vc4_drv.h
102 @@ -335,6 +335,9 @@ struct vc4_hvs {
103 struct drm_mm lbm_mm;
104 spinlock_t mm_lock;
105
106 + struct list_head stale_dlist_entries;
107 + struct work_struct free_dlist_work;
108 +
109 struct drm_mm_node mitchell_netravali_filter;
110
111 struct debugfs_regset32 regset;
112 @@ -573,10 +576,16 @@ struct drm_connector *vc4_get_crtc_conne
113 struct drm_encoder *vc4_get_crtc_encoder(struct drm_crtc *crtc,
114 struct drm_crtc_state *state);
115
116 +struct vc4_hvs_dlist_allocation {
117 + struct list_head node;
118 + struct drm_mm_node mm_node;
119 + unsigned int channel;
120 + u8 target_frame_count;
121 +};
122 +
123 struct vc4_crtc_state {
124 struct drm_crtc_state base;
125 - /* Dlist area for this CRTC configuration. */
126 - struct drm_mm_node mm;
127 + struct vc4_hvs_dlist_allocation *mm;
128 bool txp_armed;
129 unsigned int assigned_channel;
130
131 @@ -968,6 +977,8 @@ extern struct platform_driver vc4_hvs_dr
132 void vc4_hvs_stop_channel(struct vc4_hvs *hvs, unsigned int output);
133 int vc4_hvs_get_fifo_from_output(struct vc4_hvs *hvs, unsigned int output);
134 u8 vc4_hvs_get_fifo_frame_count(struct vc4_hvs *hvs, unsigned int fifo);
135 +void vc4_hvs_mark_dlist_entry_stale(struct vc4_hvs *hvs,
136 + struct vc4_hvs_dlist_allocation *alloc);
137 int vc4_hvs_atomic_check(struct drm_crtc *crtc, struct drm_atomic_state *state);
138 void vc4_hvs_atomic_begin(struct drm_crtc *crtc, struct drm_atomic_state *state);
139 void vc4_hvs_atomic_enable(struct drm_crtc *crtc, struct drm_atomic_state *state);
140 --- a/drivers/gpu/drm/vc4/vc4_hvs.c
141 +++ b/drivers/gpu/drm/vc4/vc4_hvs.c
142 @@ -315,6 +315,150 @@ static void vc4_hvs_update_gamma_lut(str
143 vc4_hvs_lut_load(hvs, vc4_crtc);
144 }
145
146 +static void vc4_hvs_irq_enable_eof(const struct vc4_hvs *hvs,
147 + unsigned int channel)
148 +{
149 + u32 irq_mask = hvs->hvs5 ?
150 + SCALER5_DISPCTRL_DSPEIEOF(channel) :
151 + SCALER_DISPCTRL_DSPEIEOF(channel);
152 +
153 + HVS_WRITE(SCALER_DISPCTRL,
154 + HVS_READ(SCALER_DISPCTRL) | irq_mask);
155 +}
156 +
157 +static void vc4_hvs_irq_clear_eof(const struct vc4_hvs *hvs,
158 + unsigned int channel)
159 +{
160 + u32 irq_mask = hvs->hvs5 ?
161 + SCALER5_DISPCTRL_DSPEIEOF(channel) :
162 + SCALER_DISPCTRL_DSPEIEOF(channel);
163 +
164 + HVS_WRITE(SCALER_DISPCTRL,
165 + HVS_READ(SCALER_DISPCTRL) & ~irq_mask);
166 +}
167 +
168 +static struct vc4_hvs_dlist_allocation *
169 +vc4_hvs_alloc_dlist_entry(struct vc4_hvs *hvs,
170 + unsigned int channel,
171 + size_t dlist_count)
172 +{
173 + struct vc4_hvs_dlist_allocation *alloc;
174 + unsigned long flags;
175 + int ret;
176 +
177 + if (channel == VC4_HVS_CHANNEL_DISABLED)
178 + return NULL; /* no channel assigned: nothing to reserve, not an error */
179 +
180 + alloc = kzalloc(sizeof(*alloc), GFP_KERNEL);
181 + if (!alloc)
182 + return ERR_PTR(-ENOMEM);
183 +
184 + spin_lock_irqsave(&hvs->mm_lock, flags);
185 + ret = drm_mm_insert_node(&hvs->dlist_mm, &alloc->mm_node, dlist_count);
186 + spin_unlock_irqrestore(&hvs->mm_lock, flags);
187 + if (ret) {
188 + kfree(alloc); /* never handed to a caller or queued: free it here or it leaks */
189 + return ERR_PTR(ret);
190 + }
191 +
192 + alloc->channel = channel; /* read back when the entry is marked stale / swept */
193 + return alloc;
194 +}
195 +
196 +void vc4_hvs_mark_dlist_entry_stale(struct vc4_hvs *hvs,
197 + struct vc4_hvs_dlist_allocation *alloc)
198 +{
199 + unsigned long flags;
200 + u8 frcnt;
201 +
202 + if (!alloc)
203 + return;
204 +
205 + if (!drm_mm_node_allocated(&alloc->mm_node))
206 + return;
207 +
208 + frcnt = vc4_hvs_get_fifo_frame_count(hvs, alloc->channel);
209 + alloc->target_frame_count = (frcnt + 1) & ((1 << 6) - 1);
210 +
211 + spin_lock_irqsave(&hvs->mm_lock, flags);
212 +
213 + list_add_tail(&alloc->node, &hvs->stale_dlist_entries);
214 +
215 + HVS_WRITE(SCALER_DISPSTAT, SCALER_DISPSTAT_EOF(alloc->channel));
216 + vc4_hvs_irq_enable_eof(hvs, alloc->channel);
217 +
218 + spin_unlock_irqrestore(&hvs->mm_lock, flags);
219 +}
220 +
221 +static void vc4_hvs_schedule_dlist_sweep(struct vc4_hvs *hvs,
222 + unsigned int channel)
223 +{
224 + unsigned long flags;
225 +
226 + spin_lock_irqsave(&hvs->mm_lock, flags);
227 +
228 + if (!list_empty(&hvs->stale_dlist_entries))
229 + queue_work(system_unbound_wq, &hvs->free_dlist_work);
230 +
231 + vc4_hvs_irq_clear_eof(hvs, channel);
232 +
233 + spin_unlock_irqrestore(&hvs->mm_lock, flags);
234 +}
235 +
236 +/*
237 + * Frame counts are essentially sequence numbers over 6 bits, and we
238 + * can thus use sequence number arithmetic and follow RFC 1982 to
239 + * implement proper comparison between them.
240 + */
241 +static bool vc4_hvs_frcnt_lte(u8 cnt1, u8 cnt2)
242 +{
243 + return (s8)((cnt1 << 2) - (cnt2 << 2)) <= 0;
244 +}
245 +
246 +/*
247 + * Some atomic commits (legacy cursor updates, mostly) will not wait for
248 + * the next vblank and will just return once the commit has been pushed
249 + * to the hardware.
250 + *
251 + * On the hardware side, our HVS stores the planes parameters in its
252 + * context RAM, and will use part of the RAM to store data during the
253 + * frame rendering.
254 + *
255 + * This interacts badly if we get multiple commits before the next
256 + * vblank since we could end up overwriting the DLIST entries used by
257 + * previous commits if our dlist allocation reuses that entry. In such a
258 + * case, we would overwrite the data currently being used by the
259 + * hardware, resulting in a corrupted frame.
260 + *
261 + * In order to work around this, we'll queue the dlist entries in a list
262 + * once the associated CRTC state is destroyed. The HVS only allows us
263 + * to know which entry is currently active, but not which ones are no longer
264 + * being used, so in order to avoid freeing entries that are still used
265 + * by the hardware we add a guesstimate of the frame count where our
266 + * entry will no longer be used, and thus will only free those entries
267 + * when we will have reached that frame count.
268 + */
269 +static void vc4_hvs_dlist_free_work(struct work_struct *work)
270 +{
271 + struct vc4_hvs *hvs = container_of(work, struct vc4_hvs, free_dlist_work);
272 + struct vc4_hvs_dlist_allocation *cur, *next;
273 + unsigned long flags;
274 +
275 + spin_lock_irqsave(&hvs->mm_lock, flags);
276 + list_for_each_entry_safe(cur, next, &hvs->stale_dlist_entries, node) {
277 + u8 frcnt;
278 +
279 + frcnt = vc4_hvs_get_fifo_frame_count(hvs, cur->channel);
280 + if (!vc4_hvs_frcnt_lte(cur->target_frame_count, frcnt))
281 + continue;
282 +
283 + list_del(&cur->node);
284 + drm_mm_remove_node(&cur->mm_node);
285 + kfree(cur);
286 + }
287 + spin_unlock_irqrestore(&hvs->mm_lock, flags);
288 +}
289 +
290 u8 vc4_hvs_get_fifo_frame_count(struct vc4_hvs *hvs, unsigned int fifo)
291 {
292 u8 field = 0;
293 @@ -588,13 +732,12 @@ int vc4_hvs_atomic_check(struct drm_crtc
294 {
295 struct drm_crtc_state *crtc_state = drm_atomic_get_new_crtc_state(state, crtc);
296 struct vc4_crtc_state *vc4_state = to_vc4_crtc_state(crtc_state);
297 + struct vc4_hvs_dlist_allocation *alloc;
298 struct drm_device *dev = crtc->dev;
299 struct vc4_dev *vc4 = to_vc4_dev(dev);
300 struct drm_plane *plane;
301 - unsigned long flags;
302 const struct drm_plane_state *plane_state;
303 u32 dlist_count = 0;
304 - int ret;
305
306 /* The pixelvalve can only feed one encoder (and encoders are
307 * 1:1 with connectors.)
308 @@ -607,12 +750,11 @@ int vc4_hvs_atomic_check(struct drm_crtc
309
310 dlist_count++; /* Account for SCALER_CTL0_END. */
311
312 - spin_lock_irqsave(&vc4->hvs->mm_lock, flags);
313 - ret = drm_mm_insert_node(&vc4->hvs->dlist_mm, &vc4_state->mm,
314 - dlist_count);
315 - spin_unlock_irqrestore(&vc4->hvs->mm_lock, flags);
316 - if (ret)
317 - return ret;
318 + alloc = vc4_hvs_alloc_dlist_entry(vc4->hvs, vc4_state->assigned_channel, dlist_count);
319 + if (IS_ERR(alloc))
320 + return PTR_ERR(alloc);
321 +
322 + vc4_state->mm = alloc;
323
324 return vc4_hvs_gamma_check(crtc, state);
325 }
326 @@ -624,8 +766,9 @@ static void vc4_hvs_install_dlist(struct
327 struct vc4_hvs *hvs = vc4->hvs;
328 struct vc4_crtc_state *vc4_state = to_vc4_crtc_state(crtc->state);
329
330 + WARN_ON(!vc4_state->mm);
331 HVS_WRITE(SCALER_DISPLISTX(vc4_state->assigned_channel),
332 - vc4_state->mm.start);
333 + vc4_state->mm->mm_node.start);
334 }
335
336 static void vc4_hvs_update_dlist(struct drm_crtc *crtc)
337 @@ -650,8 +793,10 @@ static void vc4_hvs_update_dlist(struct
338 spin_unlock_irqrestore(&dev->event_lock, flags);
339 }
340
341 + WARN_ON(!vc4_state->mm);
342 +
343 spin_lock_irqsave(&vc4_crtc->irq_lock, flags);
344 - vc4_crtc->current_dlist = vc4_state->mm.start;
345 + vc4_crtc->current_dlist = vc4_state->mm->mm_node.start;
346 spin_unlock_irqrestore(&vc4_crtc->irq_lock, flags);
347 }
348
349 @@ -708,8 +853,7 @@ void vc4_hvs_atomic_flush(struct drm_crt
350 struct vc4_plane_state *vc4_plane_state;
351 bool debug_dump_regs = false;
352 bool enable_bg_fill = false;
353 - u32 __iomem *dlist_start = vc4->hvs->dlist + vc4_state->mm.start;
354 - u32 __iomem *dlist_next = dlist_start;
355 + u32 __iomem *dlist_start, *dlist_next;
356
357 if (vc4_state->assigned_channel == VC4_HVS_CHANNEL_DISABLED)
358 return;
359 @@ -719,6 +863,9 @@ void vc4_hvs_atomic_flush(struct drm_crt
360 vc4_hvs_dump_state(hvs);
361 }
362
363 + dlist_start = vc4->hvs->dlist + vc4_state->mm->mm_node.start;
364 + dlist_next = dlist_start;
365 +
366 /* Copy all the active planes' dlist contents to the hardware dlist. */
367 drm_atomic_crtc_for_each_plane(plane, crtc) {
368 /* Is this the first active plane? */
369 @@ -741,7 +888,8 @@ void vc4_hvs_atomic_flush(struct drm_crt
370 writel(SCALER_CTL0_END, dlist_next);
371 dlist_next++;
372
373 - WARN_ON_ONCE(dlist_next - dlist_start != vc4_state->mm.size);
374 + WARN_ON(!vc4_state->mm);
375 + WARN_ON_ONCE(dlist_next - dlist_start != vc4_state->mm->mm_node.size);
376
377 if (enable_bg_fill)
378 /* This sets a black background color fill, as is the case
379 @@ -846,6 +994,11 @@ static irqreturn_t vc4_hvs_irq_handler(i
380
381 irqret = IRQ_HANDLED;
382 }
383 +
384 + if (status & SCALER_DISPSTAT_EOF(channel)) {
385 + vc4_hvs_schedule_dlist_sweep(hvs, channel);
386 + irqret = IRQ_HANDLED;
387 + }
388 }
389
390 /* Clear every per-channel interrupt flag. */
391 @@ -903,6 +1056,8 @@ static int vc4_hvs_bind(struct device *d
392 hvs->dlist = hvs->regs + SCALER5_DLIST_START;
393
394 spin_lock_init(&hvs->mm_lock);
395 + INIT_LIST_HEAD(&hvs->stale_dlist_entries);
396 + INIT_WORK(&hvs->free_dlist_work, vc4_hvs_dlist_free_work);
397
398 /* Set up the HVS display list memory manager. We never
399 * overwrite the setup from the bootloader (just 128b out of
400 --- a/drivers/gpu/drm/vc4/vc4_regs.h
401 +++ b/drivers/gpu/drm/vc4/vc4_regs.h
402 @@ -234,6 +234,7 @@
403 # define SCALER_DISPCTRL_DSPEIEOLN(x) BIT(8 + ((x) * 2))
404 /* Enables Display 0 EOF contribution to SCALER_DISPSTAT_IRQDISP0 */
405 # define SCALER_DISPCTRL_DSPEIEOF(x) BIT(7 + ((x) * 2))
406 +# define SCALER5_DISPCTRL_DSPEIEOF(x) BIT(7 + ((x) * 4))
407
408 # define SCALER_DISPCTRL_SLVRDEIRQ BIT(6)
409 # define SCALER_DISPCTRL_SLVWREIRQ BIT(5)