brcm2708: update linux 4.4 patches to latest version
From 346367864363fa323ee502d9e8fb36b964cbbdb0 Mon Sep 17 00:00:00 2001
From: Varad Gautam <varadgautam@gmail.com>
Date: Wed, 17 Feb 2016 19:08:21 +0530
Subject: [PATCH 290/423] drm/vc4: improve throughput by pipelining binning and
 rendering jobs

The hardware provides us with separate threads for binning and
rendering, and the existing model waits for them both to complete
before submitting the next job.

Splitting the binning and rendering submissions reduces idle time and
gives us approx 20-30% speedup with some x11perf tests such as -line10
and -tilerect1. Improves openarena performance by 1.01897% +/-
0.247857% (n=16).

Thanks to anholt for suggesting this.

v2: Rebase on the spurious resets fix (change by anholt).

Signed-off-by: Varad Gautam <varadgautam@gmail.com>
Reviewed-by: Eric Anholt <eric@anholt.net>
Signed-off-by: Eric Anholt <eric@anholt.net>
(cherry picked from commit ca26d28bbaa39f31d5e7e4812603b015c8d54207)
---
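Note: the sketch below is not part of the upstream patch. The commit
message above describes the pipelining model; this is a minimal,
self-contained rendition of the queue handoff for readers who want the
shape of it without the driver plumbing. Every name in it (job, pipe,
pop, push, submit_bin, submit_render, on_flush_done, on_frame_done) is
an illustrative stand-in, not the driver's API; the real code uses
list_head queues and keys "uses the binner" off ct0ca != ct0ea, as the
hunks below show.

	#include <stdbool.h>
	#include <stdlib.h>

	struct job { struct job *next; };
	struct pipe { struct job *bin_q, *render_q; };

	/* Stand-ins for submit_cl() on V3D thread 0 (binner) and
	 * thread 1 (renderer). */
	static void submit_bin(struct job *j)    { (void)j; }
	static void submit_render(struct job *j) { (void)j; }

	static struct job *pop(struct job **q)
	{
		struct job *j = *q;

		if (j) {
			*q = j->next;
			j->next = NULL;
		}
		return j;
	}

	static void push(struct job **q, struct job *j)
	{
		while (*q)
			q = &(*q)->next;
		*q = j;
	}

	/* FLDONE: the binner finished its job. Hand it to the render
	 * queue (kicking the renderer if it was idle) and start
	 * binning the next queued job, so both hardware threads work
	 * on different frames at once. */
	static void on_flush_done(struct pipe *p)
	{
		struct job *done = pop(&p->bin_q);
		bool render_was_idle = !p->render_q;

		if (!done)
			return;
		push(&p->render_q, done);
		if (render_was_idle)
			submit_render(p->render_q);
		if (p->bin_q)
			submit_bin(p->bin_q);
	}

	/* FRDONE: the renderer finished. Retire its job and start
	 * rendering the next already-binned one, if any. */
	static void on_frame_done(struct pipe *p)
	{
		free(pop(&p->render_q));
		if (p->render_q)
			submit_render(p->render_q);
	}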
 drivers/gpu/drm/vc4/vc4_drv.h |  37 +++++++++----
 drivers/gpu/drm/vc4/vc4_gem.c | 123 ++++++++++++++++++++++++++++++------------
 drivers/gpu/drm/vc4/vc4_irq.c |  58 ++++++++++++++----
 3 files changed, 166 insertions(+), 52 deletions(-)

--- a/drivers/gpu/drm/vc4/vc4_drv.h
+++ b/drivers/gpu/drm/vc4/vc4_drv.h
@@ -53,7 +53,7 @@ struct vc4_dev {
 	/* Protects bo_cache and the BO stats. */
 	struct mutex bo_lock;
 
-	/* Sequence number for the last job queued in job_list.
+	/* Sequence number for the last job queued in bin_job_list.
 	 * Starts at 0 (no jobs emitted).
 	 */
 	uint64_t emit_seqno;
@@ -63,11 +63,19 @@ struct vc4_dev {
 	 */
 	uint64_t finished_seqno;
 
-	/* List of all struct vc4_exec_info for jobs to be executed.
-	 * The first job in the list is the one currently programmed
-	 * into ct0ca/ct1ca for execution.
+	/* List of all struct vc4_exec_info for jobs to be executed in
+	 * the binner. The first job in the list is the one currently
+	 * programmed into ct0ca for execution.
+	 */
+	struct list_head bin_job_list;
+
+	/* List of all struct vc4_exec_info for jobs that have
+	 * completed binning and are ready for rendering. The first
+	 * job in the list is the one currently programmed into ct1ca
+	 * for execution.
 	 */
-	struct list_head job_list;
+	struct list_head render_job_list;
+
 	/* List of the finished vc4_exec_infos waiting to be freed by
 	 * job_done_work.
 	 */
@@ -291,11 +299,20 @@ struct vc4_exec_info {
 };
 
 static inline struct vc4_exec_info *
-vc4_first_job(struct vc4_dev *vc4)
+vc4_first_bin_job(struct vc4_dev *vc4)
+{
+	if (list_empty(&vc4->bin_job_list))
+		return NULL;
+	return list_first_entry(&vc4->bin_job_list, struct vc4_exec_info, head);
+}
+
+static inline struct vc4_exec_info *
+vc4_first_render_job(struct vc4_dev *vc4)
 {
-	if (list_empty(&vc4->job_list))
+	if (list_empty(&vc4->render_job_list))
 		return NULL;
-	return list_first_entry(&vc4->job_list, struct vc4_exec_info, head);
+	return list_first_entry(&vc4->render_job_list,
+				struct vc4_exec_info, head);
 }
 
 /**
@@ -410,7 +427,9 @@ int vc4_wait_seqno_ioctl(struct drm_devi
 			 struct drm_file *file_priv);
 int vc4_wait_bo_ioctl(struct drm_device *dev, void *data,
 		      struct drm_file *file_priv);
-void vc4_submit_next_job(struct drm_device *dev);
+void vc4_submit_next_bin_job(struct drm_device *dev);
+void vc4_submit_next_render_job(struct drm_device *dev);
+void vc4_move_job_to_render(struct drm_device *dev, struct vc4_exec_info *exec);
 int vc4_wait_for_seqno(struct drm_device *dev, uint64_t seqno,
 		       uint64_t timeout_ns, bool interruptible);
 void vc4_job_handle_completed(struct vc4_dev *vc4);
--- a/drivers/gpu/drm/vc4/vc4_gem.c
+++ b/drivers/gpu/drm/vc4/vc4_gem.c
@@ -154,10 +154,10 @@ vc4_save_hang_state(struct drm_device *d
 	struct vc4_dev *vc4 = to_vc4_dev(dev);
 	struct drm_vc4_get_hang_state *state;
 	struct vc4_hang_state *kernel_state;
-	struct vc4_exec_info *exec;
+	struct vc4_exec_info *exec[2];
 	struct vc4_bo *bo;
 	unsigned long irqflags;
-	unsigned int i, unref_list_count;
+	unsigned int i, j, unref_list_count, prev_idx;
 
 	kernel_state = kcalloc(1, sizeof(*kernel_state), GFP_KERNEL);
 	if (!kernel_state)
@@ -166,37 +166,55 @@ vc4_save_hang_state(struct drm_device *d
 	state = &kernel_state->user_state;
 
 	spin_lock_irqsave(&vc4->job_lock, irqflags);
-	exec = vc4_first_job(vc4);
-	if (!exec) {
+	exec[0] = vc4_first_bin_job(vc4);
+	exec[1] = vc4_first_render_job(vc4);
+	if (!exec[0] && !exec[1]) {
 		spin_unlock_irqrestore(&vc4->job_lock, irqflags);
 		return;
 	}
 
-	unref_list_count = 0;
-	list_for_each_entry(bo, &exec->unref_list, unref_head)
-		unref_list_count++;
-
-	state->bo_count = exec->bo_count + unref_list_count;
-	kernel_state->bo = kcalloc(state->bo_count, sizeof(*kernel_state->bo),
-				   GFP_ATOMIC);
+	/* Get the bos from both binner and renderer into hang state. */
+	state->bo_count = 0;
+	for (i = 0; i < 2; i++) {
+		if (!exec[i])
+			continue;
+
+		unref_list_count = 0;
+		list_for_each_entry(bo, &exec[i]->unref_list, unref_head)
+			unref_list_count++;
+		state->bo_count += exec[i]->bo_count + unref_list_count;
+	}
+
+	kernel_state->bo = kcalloc(state->bo_count,
+				   sizeof(*kernel_state->bo), GFP_ATOMIC);
+
 	if (!kernel_state->bo) {
 		spin_unlock_irqrestore(&vc4->job_lock, irqflags);
 		return;
 	}
 
-	for (i = 0; i < exec->bo_count; i++) {
-		drm_gem_object_reference(&exec->bo[i]->base);
-		kernel_state->bo[i] = &exec->bo[i]->base;
-	}
+	prev_idx = 0;
+	for (i = 0; i < 2; i++) {
+		if (!exec[i])
+			continue;
+
+		for (j = 0; j < exec[i]->bo_count; j++) {
+			drm_gem_object_reference(&exec[i]->bo[j]->base);
+			kernel_state->bo[j + prev_idx] = &exec[i]->bo[j]->base;
+		}
 
-	list_for_each_entry(bo, &exec->unref_list, unref_head) {
-		drm_gem_object_reference(&bo->base.base);
-		kernel_state->bo[i] = &bo->base.base;
-		i++;
+		list_for_each_entry(bo, &exec[i]->unref_list, unref_head) {
+			drm_gem_object_reference(&bo->base.base);
+			kernel_state->bo[j + prev_idx] = &bo->base.base;
+			j++;
+		}
+		prev_idx = j + 1;
 	}
 
-	state->start_bin = exec->ct0ca;
-	state->start_render = exec->ct1ca;
+	if (exec[0])
+		state->start_bin = exec[0]->ct0ca;
+	if (exec[1])
+		state->start_render = exec[1]->ct1ca;
 
 	spin_unlock_irqrestore(&vc4->job_lock, irqflags);
 
@@ -272,13 +290,15 @@ vc4_hangcheck_elapsed(unsigned long data
 	struct vc4_dev *vc4 = to_vc4_dev(dev);
 	uint32_t ct0ca, ct1ca;
 	unsigned long irqflags;
-	struct vc4_exec_info *exec;
+	struct vc4_exec_info *bin_exec, *render_exec;
 
 	spin_lock_irqsave(&vc4->job_lock, irqflags);
-	exec = vc4_first_job(vc4);
+
+	bin_exec = vc4_first_bin_job(vc4);
+	render_exec = vc4_first_render_job(vc4);
 
 	/* If idle, we can stop watching for hangs. */
-	if (!exec) {
+	if (!bin_exec && !render_exec) {
 		spin_unlock_irqrestore(&vc4->job_lock, irqflags);
 		return;
 	}
@@ -289,9 +309,12 @@ vc4_hangcheck_elapsed(unsigned long data
 	/* If we've made any progress in execution, rearm the timer
 	 * and wait.
 	 */
-	if (ct0ca != exec->last_ct0ca || ct1ca != exec->last_ct1ca) {
-		exec->last_ct0ca = ct0ca;
-		exec->last_ct1ca = ct1ca;
+	if ((bin_exec && ct0ca != bin_exec->last_ct0ca) ||
+	    (render_exec && ct1ca != render_exec->last_ct1ca)) {
+		if (bin_exec)
+			bin_exec->last_ct0ca = ct0ca;
+		if (render_exec)
+			render_exec->last_ct1ca = ct1ca;
 		spin_unlock_irqrestore(&vc4->job_lock, irqflags);
 		vc4_queue_hangcheck(dev);
 		return;
@@ -391,11 +414,13 @@ vc4_flush_caches(struct drm_device *dev)
 * The job_lock should be held during this.
 */
 void
-vc4_submit_next_job(struct drm_device *dev)
+vc4_submit_next_bin_job(struct drm_device *dev)
 {
 	struct vc4_dev *vc4 = to_vc4_dev(dev);
-	struct vc4_exec_info *exec = vc4_first_job(vc4);
+	struct vc4_exec_info *exec;
 
+again:
+	exec = vc4_first_bin_job(vc4);
 	if (!exec)
 		return;
 
@@ -405,11 +430,40 @@ vc4_submit_next_job(struct drm_device *d
 	V3D_WRITE(V3D_BPOA, 0);
 	V3D_WRITE(V3D_BPOS, 0);
 
-	if (exec->ct0ca != exec->ct0ea)
+	/* Either put the job in the binner if it uses the binner, or
+	 * immediately move it to the to-be-rendered queue.
+	 */
+	if (exec->ct0ca != exec->ct0ea) {
 		submit_cl(dev, 0, exec->ct0ca, exec->ct0ea);
+	} else {
+		vc4_move_job_to_render(dev, exec);
+		goto again;
+	}
+}
+
+void
+vc4_submit_next_render_job(struct drm_device *dev)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+	struct vc4_exec_info *exec = vc4_first_render_job(vc4);
+
+	if (!exec)
+		return;
+
 	submit_cl(dev, 1, exec->ct1ca, exec->ct1ea);
 }
 
+void
+vc4_move_job_to_render(struct drm_device *dev, struct vc4_exec_info *exec)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+	bool was_empty = list_empty(&vc4->render_job_list);
+
+	list_move_tail(&exec->head, &vc4->render_job_list);
+	if (was_empty)
+		vc4_submit_next_render_job(dev);
+}
+
 static void
 vc4_update_bo_seqnos(struct vc4_exec_info *exec, uint64_t seqno)
 {
@@ -448,14 +502,14 @@ vc4_queue_submit(struct drm_device *dev,
 	exec->seqno = seqno;
 	vc4_update_bo_seqnos(exec, seqno);
 
-	list_add_tail(&exec->head, &vc4->job_list);
+	list_add_tail(&exec->head, &vc4->bin_job_list);
 
 	/* If no job was executing, kick ours off. Otherwise, it'll
-	 * get started when the previous job's frame done interrupt
+	 * get started when the previous job's flush done interrupt
	 * occurs.
	 */
-	if (vc4_first_job(vc4) == exec) {
-		vc4_submit_next_job(dev);
+	if (vc4_first_bin_job(vc4) == exec) {
+		vc4_submit_next_bin_job(dev);
 		vc4_queue_hangcheck(dev);
 	}
 
@@ -849,7 +903,8 @@ vc4_gem_init(struct drm_device *dev)
 {
 	struct vc4_dev *vc4 = to_vc4_dev(dev);
 
-	INIT_LIST_HEAD(&vc4->job_list);
+	INIT_LIST_HEAD(&vc4->bin_job_list);
+	INIT_LIST_HEAD(&vc4->render_job_list);
 	INIT_LIST_HEAD(&vc4->job_done_list);
 	INIT_LIST_HEAD(&vc4->seqno_cb_list);
 	spin_lock_init(&vc4->job_lock);
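
Note: the sketch below is not part of the upstream patch. The
goto-based loop in vc4_submit_next_bin_job() above exists because a
job with an empty binner control list (ct0ca == ct0ea) is never
submitted to thread 0 and therefore never raises a flush-done
interrupt, so it has to be moved straight to the render queue and the
next candidate tried immediately. Its control flow, minus the cache
flush and binner-overflow register setup the real function also
performs on each pass, is equivalent to:

	static void submit_next_bin_job(struct drm_device *dev)
	{
		struct vc4_dev *vc4 = to_vc4_dev(dev);
		struct vc4_exec_info *exec;

		while ((exec = vc4_first_bin_job(vc4)) != NULL) {
			if (exec->ct0ca != exec->ct0ea) {
				/* Real binner work: program V3D thread 0. */
				submit_cl(dev, 0, exec->ct0ca, exec->ct0ea);
				break;
			}
			/* Render-only job: bypass the binner. */
			vc4_move_job_to_render(dev, exec);
		}
	}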
--- a/drivers/gpu/drm/vc4/vc4_irq.c
+++ b/drivers/gpu/drm/vc4/vc4_irq.c
@@ -30,6 +30,10 @@
 * disables that specific interrupt, and 0s written are ignored
 * (reading either one returns the set of enabled interrupts).
 *
+ * When we take a binning flush done interrupt, we need to submit the
+ * next frame for binning and move the finished frame to the render
+ * thread.
+ *
 * When we take a render frame interrupt, we need to wake the
 * processes waiting for some frame to be done, and get the next frame
 * submitted ASAP (so the hardware doesn't sit idle when there's work
@@ -44,6 +48,7 @@
 #include "vc4_regs.h"
 
 #define V3D_DRIVER_IRQS (V3D_INT_OUTOMEM | \
+			 V3D_INT_FLDONE | \
 			 V3D_INT_FRDONE)
 
 DECLARE_WAIT_QUEUE_HEAD(render_wait);
@@ -77,7 +82,7 @@ vc4_overflow_mem_work(struct work_struct
 	unsigned long irqflags;
 
 	spin_lock_irqsave(&vc4->job_lock, irqflags);
-	current_exec = vc4_first_job(vc4);
+	current_exec = vc4_first_bin_job(vc4);
 	if (current_exec) {
 		vc4->overflow_mem->seqno = vc4->finished_seqno + 1;
 		list_add_tail(&vc4->overflow_mem->unref_head,
@@ -98,17 +103,43 @@ vc4_overflow_mem_work(struct work_struct
 }
 
 static void
-vc4_irq_finish_job(struct drm_device *dev)
+vc4_irq_finish_bin_job(struct drm_device *dev)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+	struct vc4_exec_info *exec = vc4_first_bin_job(vc4);
+
+	if (!exec)
+		return;
+
+	vc4_move_job_to_render(dev, exec);
+	vc4_submit_next_bin_job(dev);
+}
+
+static void
+vc4_cancel_bin_job(struct drm_device *dev)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+	struct vc4_exec_info *exec = vc4_first_bin_job(vc4);
+
+	if (!exec)
+		return;
+
+	list_move_tail(&exec->head, &vc4->bin_job_list);
+	vc4_submit_next_bin_job(dev);
+}
+
+static void
+vc4_irq_finish_render_job(struct drm_device *dev)
 {
 	struct vc4_dev *vc4 = to_vc4_dev(dev);
-	struct vc4_exec_info *exec = vc4_first_job(vc4);
+	struct vc4_exec_info *exec = vc4_first_render_job(vc4);
 
 	if (!exec)
 		return;
 
 	vc4->finished_seqno++;
 	list_move_tail(&exec->head, &vc4->job_done_list);
-	vc4_submit_next_job(dev);
+	vc4_submit_next_render_job(dev);
 
 	wake_up_all(&vc4->job_wait_queue);
 	schedule_work(&vc4->job_done_work);
@@ -125,9 +156,10 @@ vc4_irq(int irq, void *arg)
 	barrier();
 	intctl = V3D_READ(V3D_INTCTL);
 
-	/* Acknowledge the interrupts we're handling here. The render
-	 * frame done interrupt will be cleared, while OUTOMEM will
-	 * stay high until the underlying cause is cleared.
+	/* Acknowledge the interrupts we're handling here. The binner
+	 * last flush / render frame done interrupt will be cleared,
+	 * while OUTOMEM will stay high until the underlying cause is
+	 * cleared.
 	 */
 	V3D_WRITE(V3D_INTCTL, intctl);
 
@@ -138,9 +170,16 @@ vc4_irq(int irq, void *arg)
 		status = IRQ_HANDLED;
 	}
 
+	if (intctl & V3D_INT_FLDONE) {
+		spin_lock(&vc4->job_lock);
+		vc4_irq_finish_bin_job(dev);
+		spin_unlock(&vc4->job_lock);
+		status = IRQ_HANDLED;
+	}
+
 	if (intctl & V3D_INT_FRDONE) {
 		spin_lock(&vc4->job_lock);
-		vc4_irq_finish_job(dev);
+		vc4_irq_finish_render_job(dev);
 		spin_unlock(&vc4->job_lock);
 		status = IRQ_HANDLED;
 	}
@@ -205,6 +244,7 @@ void vc4_irq_reset(struct drm_device *de
 	V3D_WRITE(V3D_INTENA, V3D_DRIVER_IRQS);
 
 	spin_lock_irqsave(&vc4->job_lock, irqflags);
-	vc4_irq_finish_job(dev);
+	vc4_cancel_bin_job(dev);
+	vc4_irq_finish_render_job(dev);
 	spin_unlock_irqrestore(&vc4->job_lock, irqflags);
 }
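
Note: the trace below is not part of the upstream patch. Putting the
three files together, two queued jobs A and B (both with binner work)
now overlap like this:

	queue A    ->  bin A starts                    (renderer idle)
	queue B    ->  B waits in bin_job_list
	FLDONE(A)  ->  A moves to render_job_list, render A starts,
	               bin B starts                    (both threads busy)
	FLDONE(B)  ->  B waits in render_job_list (renderer still on A)
	FRDONE(A)  ->  A retires to job_done_list, render B starts
	FRDONE(B)  ->  B retires

Under the old single job_list, "bin B" could not start before
FRDONE(A); that serialization is the idle time whose removal the
commit message measures.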