1 From 22dbf1420a552d1952d22b92d8c30f8162b026b5 Mon Sep 17 00:00:00 2001
2 From: Eric Anholt <eric@anholt.net>
3 Date: Tue, 16 Apr 2019 15:58:54 -0700
4 Subject: [PATCH] drm/v3d: Add support for compute shader dispatch.
6 The compute shader dispatch interface is pretty simple -- just pass in
7 the regs that userspace has passed us, with no CLs to run. However,
8 with no CL to run it means that we need to do manual cache flushing of
9 the L2 after the HW execution completes (for SSBO, atomic, and
10 image_load_store writes that are the output of compute shaders).
12 This doesn't yet expose the L2 cache's ability to have a region of the
13 address space not write back to memory (which could be used for
16 So far, the Mesa side has been tested on V3D v4.2 simpenrose (passing
17 the ES31 tests), and on the kernel side on 7278 (failing atomic
18 compswap tests in a way that doesn't reproduce on simpenrose).
20 v2: Fix excessive allocation for the clean_job (reported by Dan
21 Carpenter). Keep refs on jobs until clean_job is finished, to
22 avoid spurious MMU errors if the output BOs are freed by userspace
23 before L2 cleaning is finished.
25 Signed-off-by: Eric Anholt <eric@anholt.net>
26 Link: https://patchwork.freedesktop.org/patch/msgid/20190416225856.20264-4-eric@anholt.net
27 Acked-by: Rob Clark <robdclark@gmail.com>
29 drivers/gpu/drm/v3d/v3d_debugfs.c | 22 +++++
30 drivers/gpu/drm/v3d/v3d_drv.c | 10 +-
31 drivers/gpu/drm/v3d/v3d_drv.h | 28 +++++-
32 drivers/gpu/drm/v3d/v3d_fence.c | 2 +
33 drivers/gpu/drm/v3d/v3d_gem.c | 156 +++++++++++++++++++++++++++++-
34 drivers/gpu/drm/v3d/v3d_irq.c | 16 ++-
35 drivers/gpu/drm/v3d/v3d_regs.h | 73 ++++++++++++++
36 drivers/gpu/drm/v3d/v3d_sched.c | 121 +++++++++++++++++++++--
37 drivers/gpu/drm/v3d/v3d_trace.h | 94 ++++++++++++++++++
38 include/uapi/drm/v3d_drm.h | 28 ++++++
39 10 files changed, 531 insertions(+), 19 deletions(-)
41 --- a/drivers/gpu/drm/v3d/v3d_debugfs.c
42 +++ b/drivers/gpu/drm/v3d/v3d_debugfs.c
43 @@ -57,6 +57,17 @@ static const struct v3d_reg_def v3d_core
44 REGDEF(V3D_GMP_VIO_ADDR),
47 +static const struct v3d_reg_def v3d_csd_reg_defs[] = {
48 + REGDEF(V3D_CSD_STATUS),
49 + REGDEF(V3D_CSD_CURRENT_CFG0),
50 + REGDEF(V3D_CSD_CURRENT_CFG1),
51 + REGDEF(V3D_CSD_CURRENT_CFG2),
52 + REGDEF(V3D_CSD_CURRENT_CFG3),
53 + REGDEF(V3D_CSD_CURRENT_CFG4),
54 + REGDEF(V3D_CSD_CURRENT_CFG5),
55 + REGDEF(V3D_CSD_CURRENT_CFG6),
58 static int v3d_v3d_debugfs_regs(struct seq_file *m, void *unused)
60 struct drm_info_node *node = (struct drm_info_node *)m->private;
61 @@ -88,6 +99,17 @@ static int v3d_v3d_debugfs_regs(struct s
63 v3d_core_reg_defs[i].reg));
66 + if (v3d_has_csd(v3d)) {
67 + for (i = 0; i < ARRAY_SIZE(v3d_csd_reg_defs); i++) {
68 + seq_printf(m, "core %d %s (0x%04x): 0x%08x\n",
70 + v3d_csd_reg_defs[i].name,
71 + v3d_csd_reg_defs[i].reg,
73 + v3d_csd_reg_defs[i].reg));
79 --- a/drivers/gpu/drm/v3d/v3d_drv.c
80 +++ b/drivers/gpu/drm/v3d/v3d_drv.c
82 * This driver supports the Broadcom V3D 3.3 and 4.1 OpenGL ES GPUs.
83 * For V3D 2.x support, see the VC4 driver.
85 - * Currently only single-core rendering using the binner and renderer,
86 - * along with TFU (texture formatting unit) rendering is supported.
87 - * V3D 4.x's CSD (compute shader dispatch) is not yet supported.
88 + * The V3D GPU includes a tiled renderer (composed of bin and render
89 + * pipelines), the TFU (texture formatting unit), and the CSD (compute
93 #include <linux/clk.h>
94 @@ -114,6 +114,9 @@ static int v3d_get_param_ioctl(struct dr
95 case DRM_V3D_PARAM_SUPPORTS_TFU:
98 + case DRM_V3D_PARAM_SUPPORTS_CSD:
99 + args->value = v3d_has_csd(v3d);
102 DRM_DEBUG("Unknown parameter %d\n", args->param);
104 @@ -183,6 +186,7 @@ static const struct drm_ioctl_desc v3d_d
105 DRM_IOCTL_DEF_DRV(V3D_GET_PARAM, v3d_get_param_ioctl, DRM_RENDER_ALLOW),
106 DRM_IOCTL_DEF_DRV(V3D_GET_BO_OFFSET, v3d_get_bo_offset_ioctl, DRM_RENDER_ALLOW),
107 DRM_IOCTL_DEF_DRV(V3D_SUBMIT_TFU, v3d_submit_tfu_ioctl, DRM_RENDER_ALLOW | DRM_AUTH),
108 + DRM_IOCTL_DEF_DRV(V3D_SUBMIT_CSD, v3d_submit_csd_ioctl, DRM_RENDER_ALLOW | DRM_AUTH),
111 static const struct vm_operations_struct v3d_vm_ops = {
112 --- a/drivers/gpu/drm/v3d/v3d_drv.h
113 +++ b/drivers/gpu/drm/v3d/v3d_drv.h
114 @@ -16,9 +16,11 @@ enum v3d_queue {
122 -#define V3D_MAX_QUEUES (V3D_TFU + 1)
123 +#define V3D_MAX_QUEUES (V3D_CACHE_CLEAN + 1)
125 struct v3d_queue_state {
126 struct drm_gpu_scheduler sched;
127 @@ -70,6 +72,7 @@ struct v3d_dev {
128 struct v3d_bin_job *bin_job;
129 struct v3d_render_job *render_job;
130 struct v3d_tfu_job *tfu_job;
131 + struct v3d_csd_job *csd_job;
133 struct v3d_queue_state queue[V3D_MAX_QUEUES];
135 @@ -92,6 +95,12 @@ struct v3d_dev {
137 struct mutex sched_lock;
139 + /* Lock taken during a cache clean and when initiating an L2
140 + * flush, to keep L2 flushes from interfering with the
141 + * synchronous L2 cleans.
143 + struct mutex cache_clean_lock;
148 @@ -104,6 +113,12 @@ to_v3d_dev(struct drm_device *dev)
149 return (struct v3d_dev *)dev->dev_private;
153 +v3d_has_csd(struct v3d_dev *v3d)
155 + return v3d->ver >= 41;
158 /* The per-fd struct, which tracks the MMU mappings. */
159 struct v3d_file_priv {
161 @@ -237,6 +252,14 @@ struct v3d_tfu_job {
162 struct drm_v3d_submit_tfu args;
165 +struct v3d_csd_job {
166 + struct v3d_job base;
168 + u32 timedout_batches;
170 + struct drm_v3d_submit_csd args;
174 * _wait_for - magic (register) wait macro
176 @@ -302,11 +325,14 @@ int v3d_submit_cl_ioctl(struct drm_devic
177 struct drm_file *file_priv);
178 int v3d_submit_tfu_ioctl(struct drm_device *dev, void *data,
179 struct drm_file *file_priv);
180 +int v3d_submit_csd_ioctl(struct drm_device *dev, void *data,
181 + struct drm_file *file_priv);
182 int v3d_wait_bo_ioctl(struct drm_device *dev, void *data,
183 struct drm_file *file_priv);
184 void v3d_job_put(struct v3d_job *job);
185 void v3d_reset(struct v3d_dev *v3d);
186 void v3d_invalidate_caches(struct v3d_dev *v3d);
187 +void v3d_clean_caches(struct v3d_dev *v3d);
190 int v3d_irq_init(struct v3d_dev *v3d);
191 --- a/drivers/gpu/drm/v3d/v3d_fence.c
192 +++ b/drivers/gpu/drm/v3d/v3d_fence.c
193 @@ -36,6 +36,8 @@ static const char *v3d_fence_get_timelin
202 --- a/drivers/gpu/drm/v3d/v3d_gem.c
203 +++ b/drivers/gpu/drm/v3d/v3d_gem.c
204 @@ -162,10 +162,52 @@ v3d_flush_l2t(struct v3d_dev *v3d, int c
205 /* While there is a busy bit (V3D_L2TCACTL_L2TFLS), we don't
206 * need to wait for completion before dispatching the job --
207 * L2T accesses will be stalled until the flush has completed.
208 + * However, we do need to make sure we don't try to trigger a
209 + * new flush while the CACHE_CLEAN queue is trying to
210 + * synchronously clean after a job.
212 + mutex_lock(&v3d->cache_clean_lock);
213 V3D_CORE_WRITE(core, V3D_CTL_L2TCACTL,
214 V3D_L2TCACTL_L2TFLS |
215 V3D_SET_FIELD(V3D_L2TCACTL_FLM_FLUSH, V3D_L2TCACTL_FLM));
216 + mutex_unlock(&v3d->cache_clean_lock);
219 +/* Cleans texture L1 and L2 cachelines (writing back dirty data).
221 + * For cleaning, which happens from the CACHE_CLEAN queue after CSD has
222 + * executed, we need to make sure that the clean is done before
223 + * signaling job completion. So, we synchronously wait before
224 + * returning, and we make sure that L2 invalidates don't happen in the
225 + * meantime to confuse our are-we-done checks.
228 +v3d_clean_caches(struct v3d_dev *v3d)
230 + struct drm_device *dev = &v3d->drm;
233 + trace_v3d_cache_clean_begin(dev);
235 + V3D_CORE_WRITE(core, V3D_CTL_L2TCACTL, V3D_L2TCACTL_TMUWCF);
236 + if (wait_for(!(V3D_CORE_READ(core, V3D_CTL_L2TCACTL) &
237 + V3D_L2TCACTL_L2TFLS), 100)) {
238 + DRM_ERROR("Timeout waiting for L1T write combiner flush\n");
241 + mutex_lock(&v3d->cache_clean_lock);
242 + V3D_CORE_WRITE(core, V3D_CTL_L2TCACTL,
243 + V3D_L2TCACTL_L2TFLS |
244 + V3D_SET_FIELD(V3D_L2TCACTL_FLM_CLEAN, V3D_L2TCACTL_FLM));
246 + if (wait_for(!(V3D_CORE_READ(core, V3D_CTL_L2TCACTL) &
247 + V3D_L2TCACTL_L2TFLS), 100)) {
248 + DRM_ERROR("Timeout waiting for L2T clean\n");
251 + mutex_unlock(&v3d->cache_clean_lock);
253 + trace_v3d_cache_clean_end(dev);
256 /* Invalidates the slice caches. These are read-only caches. */
257 @@ -584,7 +626,8 @@ static void
258 v3d_attach_fences_and_unlock_reservation(struct drm_file *file_priv,
260 struct ww_acquire_ctx *acquire_ctx,
263 + struct dma_fence *done_fence)
265 struct drm_syncobj *sync_out;
267 @@ -594,7 +637,7 @@ v3d_attach_fences_and_unlock_reservation
268 /* Update the return sync object for the job */
269 sync_out = drm_syncobj_find(file_priv, out_sync);
271 - drm_syncobj_replace_fence(sync_out, job->done_fence);
272 + drm_syncobj_replace_fence(sync_out, done_fence);
273 drm_syncobj_put(sync_out);
276 @@ -691,8 +734,10 @@ v3d_submit_cl_ioctl(struct drm_device *d
277 mutex_unlock(&v3d->sched_lock);
279 v3d_attach_fences_and_unlock_reservation(file_priv,
280 - &render->base, &acquire_ctx,
285 + render->base.done_fence);
288 v3d_job_put(&bin->base);
289 @@ -785,7 +830,8 @@ v3d_submit_tfu_ioctl(struct drm_device *
291 v3d_attach_fences_and_unlock_reservation(file_priv,
292 &job->base, &acquire_ctx,
295 + job->base.done_fence);
297 v3d_job_put(&job->base);
299 @@ -801,6 +847,105 @@ fail:
304 + * v3d_submit_csd_ioctl() - Submits a CSD (compute shader dispatch) job to the V3D.
306 + * @data: ioctl argument
307 + * @file_priv: DRM file for this fd
309 + * Userspace provides the register setup for the CSD, which we don't
310 + * need to validate since the CSD is behind the MMU.
313 +v3d_submit_csd_ioctl(struct drm_device *dev, void *data,
314 + struct drm_file *file_priv)
316 + struct v3d_dev *v3d = to_v3d_dev(dev);
317 + struct v3d_file_priv *v3d_priv = file_priv->driver_priv;
318 + struct drm_v3d_submit_csd *args = data;
319 + struct v3d_csd_job *job;
320 + struct v3d_job *clean_job;
321 + struct ww_acquire_ctx acquire_ctx;
324 + trace_v3d_submit_csd_ioctl(&v3d->drm, args->cfg[5], args->cfg[6]);
326 + if (!v3d_has_csd(v3d)) {
327 + DRM_DEBUG("Attempting CSD submit on non-CSD hardware\n");
331 + job = kcalloc(1, sizeof(*job), GFP_KERNEL);
335 + ret = v3d_job_init(v3d, file_priv, &job->base,
336 + v3d_job_free, args->in_sync);
342 + clean_job = kcalloc(1, sizeof(*clean_job), GFP_KERNEL);
344 + v3d_job_put(&job->base);
349 + ret = v3d_job_init(v3d, file_priv, clean_job, v3d_job_free, 0);
351 + v3d_job_put(&job->base);
358 + ret = v3d_lookup_bos(dev, file_priv, clean_job,
359 + args->bo_handles, args->bo_handle_count);
363 + ret = v3d_lock_bo_reservations(clean_job, &acquire_ctx);
367 + mutex_lock(&v3d->sched_lock);
368 + ret = v3d_push_job(v3d_priv, &job->base, V3D_CSD);
370 + goto fail_unreserve;
372 + ret = v3d_add_dep(clean_job, dma_fence_get(job->base.done_fence));
374 + goto fail_unreserve;
375 + ret = v3d_push_job(v3d_priv, clean_job, V3D_CACHE_CLEAN);
377 + goto fail_unreserve;
378 + mutex_unlock(&v3d->sched_lock);
380 + v3d_attach_fences_and_unlock_reservation(file_priv,
384 + clean_job->done_fence);
386 + v3d_job_put(&job->base);
387 + v3d_job_put(clean_job);
392 + mutex_unlock(&v3d->sched_lock);
393 + v3d_unlock_bo_reservations(clean_job->bo, clean_job->bo_count,
396 + v3d_job_put(&job->base);
397 + v3d_job_put(clean_job);
403 v3d_gem_init(struct drm_device *dev)
405 @@ -816,6 +961,7 @@ v3d_gem_init(struct drm_device *dev)
406 mutex_init(&v3d->bo_lock);
407 mutex_init(&v3d->reset_lock);
408 mutex_init(&v3d->sched_lock);
409 + mutex_init(&v3d->cache_clean_lock);
411 /* Note: We don't allocate address 0. Various bits of HW
412 * treat 0 as special, such as the occlusion query counters
413 --- a/drivers/gpu/drm/v3d/v3d_irq.c
414 +++ b/drivers/gpu/drm/v3d/v3d_irq.c
417 * DOC: Interrupt management for the V3D engine
419 - * When we take a bin, render, or TFU done interrupt, we need to
420 - * signal the fence for that job so that the scheduler can queue up
421 - * the next one and unblock any waiters.
422 + * When we take a bin, render, TFU done, or CSD done interrupt, we
423 + * need to signal the fence for that job so that the scheduler can
424 + * queue up the next one and unblock any waiters.
426 * When we take the binner out of memory interrupt, we need to
427 * allocate some new memory and pass it to the binner so that the
429 #define V3D_CORE_IRQS ((u32)(V3D_INT_OUTOMEM | \
432 + V3D_INT_CSDDONE | \
435 #define V3D_HUB_IRQS ((u32)(V3D_HUB_INT_MMU_WRV | \
436 @@ -108,6 +109,15 @@ v3d_irq(int irq, void *arg)
437 dma_fence_signal(&fence->base);
438 status = IRQ_HANDLED;
441 + if (intsts & V3D_INT_CSDDONE) {
442 + struct v3d_fence *fence =
443 + to_v3d_fence(v3d->csd_job->base.irq_fence);
445 + trace_v3d_csd_irq(&v3d->drm, fence->seqno);
446 + dma_fence_signal(&fence->base);
447 + status = IRQ_HANDLED;
450 /* We shouldn't be triggering these if we have GMP in
451 * always-allowed mode.
452 --- a/drivers/gpu/drm/v3d/v3d_regs.h
453 +++ b/drivers/gpu/drm/v3d/v3d_regs.h
455 #define V3D_CTL_L2TCACTL 0x00030
456 # define V3D_L2TCACTL_TMUWCF BIT(8)
457 # define V3D_L2TCACTL_L2T_NO_WM BIT(4)
458 +/* Invalidates cache lines. */
459 # define V3D_L2TCACTL_FLM_FLUSH 0
460 +/* Removes cachelines without writing dirty lines back. */
461 # define V3D_L2TCACTL_FLM_CLEAR 1
462 +/* Writes out dirty cachelines and marks them clean, but doesn't invalidate. */
463 # define V3D_L2TCACTL_FLM_CLEAN 2
464 # define V3D_L2TCACTL_FLM_MASK V3D_MASK(2, 1)
465 # define V3D_L2TCACTL_FLM_SHIFT 1
467 #define V3D_CTL_INT_MSK_CLR 0x00064
468 # define V3D_INT_QPU_MASK V3D_MASK(27, 16)
469 # define V3D_INT_QPU_SHIFT 16
470 +# define V3D_INT_CSDDONE BIT(7)
471 +# define V3D_INT_PCTR BIT(6)
472 # define V3D_INT_GMPV BIT(5)
473 # define V3D_INT_TRFB BIT(4)
474 # define V3D_INT_SPILLUSE BIT(3)
476 #define V3D_GMP_PRESERVE_LOAD 0x00818
477 #define V3D_GMP_VALID_LINES 0x00820
479 +#define V3D_CSD_STATUS 0x00900
480 +# define V3D_CSD_STATUS_NUM_COMPLETED_MASK V3D_MASK(11, 4)
481 +# define V3D_CSD_STATUS_NUM_COMPLETED_SHIFT 4
482 +# define V3D_CSD_STATUS_NUM_ACTIVE_MASK V3D_MASK(3, 2)
483 +# define V3D_CSD_STATUS_NUM_ACTIVE_SHIFT 2
484 +# define V3D_CSD_STATUS_HAVE_CURRENT_DISPATCH BIT(1)
485 +# define V3D_CSD_STATUS_HAVE_QUEUED_DISPATCH BIT(0)
487 +#define V3D_CSD_QUEUED_CFG0 0x00904
488 +# define V3D_CSD_QUEUED_CFG0_NUM_WGS_X_MASK V3D_MASK(31, 16)
489 +# define V3D_CSD_QUEUED_CFG0_NUM_WGS_X_SHIFT 16
490 +# define V3D_CSD_QUEUED_CFG0_WG_X_OFFSET_MASK V3D_MASK(15, 0)
491 +# define V3D_CSD_QUEUED_CFG0_WG_X_OFFSET_SHIFT 0
493 +#define V3D_CSD_QUEUED_CFG1 0x00908
494 +# define V3D_CSD_QUEUED_CFG1_NUM_WGS_Y_MASK V3D_MASK(31, 16)
495 +# define V3D_CSD_QUEUED_CFG1_NUM_WGS_Y_SHIFT 16
496 +# define V3D_CSD_QUEUED_CFG1_WG_Y_OFFSET_MASK V3D_MASK(15, 0)
497 +# define V3D_CSD_QUEUED_CFG1_WG_Y_OFFSET_SHIFT 0
499 +#define V3D_CSD_QUEUED_CFG2 0x0090c
500 +# define V3D_CSD_QUEUED_CFG2_NUM_WGS_Z_MASK V3D_MASK(31, 16)
501 +# define V3D_CSD_QUEUED_CFG2_NUM_WGS_Z_SHIFT 16
502 +# define V3D_CSD_QUEUED_CFG2_WG_Z_OFFSET_MASK V3D_MASK(15, 0)
503 +# define V3D_CSD_QUEUED_CFG2_WG_Z_OFFSET_SHIFT 0
505 +#define V3D_CSD_QUEUED_CFG3 0x00910
506 +# define V3D_CSD_QUEUED_CFG3_OVERLAP_WITH_PREV BIT(26)
507 +# define V3D_CSD_QUEUED_CFG3_MAX_SG_ID_MASK V3D_MASK(25, 20)
508 +# define V3D_CSD_QUEUED_CFG3_MAX_SG_ID_SHIFT 20
509 +# define V3D_CSD_QUEUED_CFG3_BATCHES_PER_SG_M1_MASK V3D_MASK(19, 12)
510 +# define V3D_CSD_QUEUED_CFG3_BATCHES_PER_SG_M1_SHIFT 12
511 +# define V3D_CSD_QUEUED_CFG3_WGS_PER_SG_MASK V3D_MASK(11, 8)
512 +# define V3D_CSD_QUEUED_CFG3_WGS_PER_SG_SHIFT 8
513 +# define V3D_CSD_QUEUED_CFG3_WG_SIZE_MASK V3D_MASK(7, 0)
514 +# define V3D_CSD_QUEUED_CFG3_WG_SIZE_SHIFT 0
516 +/* Number of batches, minus 1 */
517 +#define V3D_CSD_QUEUED_CFG4 0x00914
519 +/* Shader address, pnan, singleseg, threading, like a shader record. */
520 +#define V3D_CSD_QUEUED_CFG5 0x00918
522 +/* Uniforms address (4 byte aligned) */
523 +#define V3D_CSD_QUEUED_CFG6 0x0091c
525 +#define V3D_CSD_CURRENT_CFG0 0x00920
526 +#define V3D_CSD_CURRENT_CFG1 0x00924
527 +#define V3D_CSD_CURRENT_CFG2 0x00928
528 +#define V3D_CSD_CURRENT_CFG3 0x0092c
529 +#define V3D_CSD_CURRENT_CFG4 0x00930
530 +#define V3D_CSD_CURRENT_CFG5 0x00934
531 +#define V3D_CSD_CURRENT_CFG6 0x00938
533 +#define V3D_CSD_CURRENT_ID0 0x0093c
534 +# define V3D_CSD_CURRENT_ID0_WG_X_MASK V3D_MASK(31, 16)
535 +# define V3D_CSD_CURRENT_ID0_WG_X_SHIFT 16
536 +# define V3D_CSD_CURRENT_ID0_WG_IN_SG_MASK V3D_MASK(11, 8)
537 +# define V3D_CSD_CURRENT_ID0_WG_IN_SG_SHIFT 8
538 +# define V3D_CSD_CURRENT_ID0_L_IDX_MASK V3D_MASK(7, 0)
539 +# define V3D_CSD_CURRENT_ID0_L_IDX_SHIFT 0
541 +#define V3D_CSD_CURRENT_ID1 0x00940
542 +# define V3D_CSD_CURRENT_ID0_WG_Z_MASK V3D_MASK(31, 16)
543 +# define V3D_CSD_CURRENT_ID0_WG_Z_SHIFT 16
544 +# define V3D_CSD_CURRENT_ID0_WG_Y_MASK V3D_MASK(15, 0)
545 +# define V3D_CSD_CURRENT_ID0_WG_Y_SHIFT 0
547 #endif /* V3D_REGS_H */
548 --- a/drivers/gpu/drm/v3d/v3d_sched.c
549 +++ b/drivers/gpu/drm/v3d/v3d_sched.c
550 @@ -48,6 +48,12 @@ to_tfu_job(struct drm_sched_job *sched_j
551 return container_of(sched_job, struct v3d_tfu_job, base.base);
554 +static struct v3d_csd_job *
555 +to_csd_job(struct drm_sched_job *sched_job)
557 + return container_of(sched_job, struct v3d_csd_job, base.base);
561 v3d_job_free(struct drm_sched_job *sched_job)
563 @@ -205,6 +211,48 @@ v3d_tfu_job_run(struct drm_sched_job *sc
567 +static struct dma_fence *
568 +v3d_csd_job_run(struct drm_sched_job *sched_job)
570 + struct v3d_csd_job *job = to_csd_job(sched_job);
571 + struct v3d_dev *v3d = job->base.v3d;
572 + struct drm_device *dev = &v3d->drm;
573 + struct dma_fence *fence;
576 + v3d->csd_job = job;
578 + v3d_invalidate_caches(v3d);
580 + fence = v3d_fence_create(v3d, V3D_CSD);
584 + if (job->base.irq_fence)
585 + dma_fence_put(job->base.irq_fence);
586 + job->base.irq_fence = dma_fence_get(fence);
588 + trace_v3d_submit_csd(dev, to_v3d_fence(fence)->seqno);
590 + for (i = 1; i <= 6; i++)
591 + V3D_CORE_WRITE(0, V3D_CSD_QUEUED_CFG0 + 4 * i, job->args.cfg[i]);
592 + /* CFG0 write kicks off the job. */
593 + V3D_CORE_WRITE(0, V3D_CSD_QUEUED_CFG0, job->args.cfg[0]);
598 +static struct dma_fence *
599 +v3d_cache_clean_job_run(struct drm_sched_job *sched_job)
601 + struct v3d_job *job = to_v3d_job(sched_job);
602 + struct v3d_dev *v3d = job->v3d;
604 + v3d_clean_caches(v3d);
610 v3d_gpu_reset_for_timeout(struct v3d_dev *v3d, struct drm_sched_job *sched_job)
612 @@ -277,13 +325,31 @@ v3d_render_job_timedout(struct drm_sched
616 -v3d_tfu_job_timedout(struct drm_sched_job *sched_job)
617 +v3d_generic_job_timedout(struct drm_sched_job *sched_job)
619 struct v3d_job *job = to_v3d_job(sched_job);
621 v3d_gpu_reset_for_timeout(job->v3d, sched_job);
625 +v3d_csd_job_timedout(struct drm_sched_job *sched_job)
627 + struct v3d_csd_job *job = to_csd_job(sched_job);
628 + struct v3d_dev *v3d = job->base.v3d;
629 + u32 batches = V3D_CORE_READ(0, V3D_CSD_CURRENT_CFG4);
631 + /* If we've made progress, skip reset and let the timer get
634 + if (job->timedout_batches != batches) {
635 + job->timedout_batches = batches;
639 + v3d_gpu_reset_for_timeout(v3d, sched_job);
642 static const struct drm_sched_backend_ops v3d_bin_sched_ops = {
643 .dependency = v3d_job_dependency,
644 .run_job = v3d_bin_job_run,
645 @@ -301,10 +367,24 @@ static const struct drm_sched_backend_op
646 static const struct drm_sched_backend_ops v3d_tfu_sched_ops = {
647 .dependency = v3d_job_dependency,
648 .run_job = v3d_tfu_job_run,
649 - .timedout_job = v3d_tfu_job_timedout,
650 + .timedout_job = v3d_generic_job_timedout,
651 .free_job = v3d_job_free,
654 +static const struct drm_sched_backend_ops v3d_csd_sched_ops = {
655 + .dependency = v3d_job_dependency,
656 + .run_job = v3d_csd_job_run,
657 + .timedout_job = v3d_csd_job_timedout,
658 + .free_job = v3d_job_free
661 +static const struct drm_sched_backend_ops v3d_cache_clean_sched_ops = {
662 + .dependency = v3d_job_dependency,
663 + .run_job = v3d_cache_clean_job_run,
664 + .timedout_job = v3d_generic_job_timedout,
665 + .free_job = v3d_job_free
669 v3d_sched_init(struct v3d_dev *v3d)
671 @@ -331,7 +411,7 @@ v3d_sched_init(struct v3d_dev *v3d)
673 dev_err(v3d->dev, "Failed to create render scheduler: %d.",
675 - drm_sched_fini(&v3d->queue[V3D_BIN].sched);
676 + v3d_sched_fini(v3d);
680 @@ -343,11 +423,36 @@ v3d_sched_init(struct v3d_dev *v3d)
682 dev_err(v3d->dev, "Failed to create TFU scheduler: %d.",
684 - drm_sched_fini(&v3d->queue[V3D_RENDER].sched);
685 - drm_sched_fini(&v3d->queue[V3D_BIN].sched);
686 + v3d_sched_fini(v3d);
690 + if (v3d_has_csd(v3d)) {
691 + ret = drm_sched_init(&v3d->queue[V3D_CSD].sched,
692 + &v3d_csd_sched_ops,
693 + hw_jobs_limit, job_hang_limit,
694 + msecs_to_jiffies(hang_limit_ms),
697 + dev_err(v3d->dev, "Failed to create CSD scheduler: %d.",
699 + v3d_sched_fini(v3d);
703 + ret = drm_sched_init(&v3d->queue[V3D_CACHE_CLEAN].sched,
704 + &v3d_cache_clean_sched_ops,
705 + hw_jobs_limit, job_hang_limit,
706 + msecs_to_jiffies(hang_limit_ms),
707 + "v3d_cache_clean");
709 + dev_err(v3d->dev, "Failed to create CACHE_CLEAN scheduler: %d.",
711 + v3d_sched_fini(v3d);
719 @@ -356,6 +461,8 @@ v3d_sched_fini(struct v3d_dev *v3d)
723 - for (q = 0; q < V3D_MAX_QUEUES; q++)
724 - drm_sched_fini(&v3d->queue[q].sched);
725 + for (q = 0; q < V3D_MAX_QUEUES; q++) {
726 + if (v3d->queue[q].sched.ops)
727 + drm_sched_fini(&v3d->queue[q].sched);
730 --- a/drivers/gpu/drm/v3d/v3d_trace.h
731 +++ b/drivers/gpu/drm/v3d/v3d_trace.h
732 @@ -124,6 +124,26 @@ TRACE_EVENT(v3d_tfu_irq,
736 +TRACE_EVENT(v3d_csd_irq,
737 + TP_PROTO(struct drm_device *dev,
739 + TP_ARGS(dev, seqno),
743 + __field(u64, seqno)
747 + __entry->dev = dev->primary->index;
748 + __entry->seqno = seqno;
751 + TP_printk("dev=%u, seqno=%llu",
756 TRACE_EVENT(v3d_submit_tfu_ioctl,
757 TP_PROTO(struct drm_device *dev, u32 iia),
759 @@ -163,6 +183,80 @@ TRACE_EVENT(v3d_submit_tfu,
763 +TRACE_EVENT(v3d_submit_csd_ioctl,
764 + TP_PROTO(struct drm_device *dev, u32 cfg5, u32 cfg6),
765 + TP_ARGS(dev, cfg5, cfg6),
774 + __entry->dev = dev->primary->index;
775 + __entry->cfg5 = cfg5;
776 + __entry->cfg6 = cfg6;
779 + TP_printk("dev=%u, CFG5 0x%08x, CFG6 0x%08x",
785 +TRACE_EVENT(v3d_submit_csd,
786 + TP_PROTO(struct drm_device *dev,
788 + TP_ARGS(dev, seqno),
792 + __field(u64, seqno)
796 + __entry->dev = dev->primary->index;
797 + __entry->seqno = seqno;
800 + TP_printk("dev=%u, seqno=%llu",
805 +TRACE_EVENT(v3d_cache_clean_begin,
806 + TP_PROTO(struct drm_device *dev),
814 + __entry->dev = dev->primary->index;
817 + TP_printk("dev=%u",
821 +TRACE_EVENT(v3d_cache_clean_end,
822 + TP_PROTO(struct drm_device *dev),
830 + __entry->dev = dev->primary->index;
833 + TP_printk("dev=%u",
837 TRACE_EVENT(v3d_reset_begin,
838 TP_PROTO(struct drm_device *dev),
840 --- a/include/uapi/drm/v3d_drm.h
841 +++ b/include/uapi/drm/v3d_drm.h
842 @@ -37,6 +37,7 @@ extern "C" {
843 #define DRM_V3D_GET_PARAM 0x04
844 #define DRM_V3D_GET_BO_OFFSET 0x05
845 #define DRM_V3D_SUBMIT_TFU 0x06
846 +#define DRM_V3D_SUBMIT_CSD 0x07
848 #define DRM_IOCTL_V3D_SUBMIT_CL DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_SUBMIT_CL, struct drm_v3d_submit_cl)
849 #define DRM_IOCTL_V3D_WAIT_BO DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_WAIT_BO, struct drm_v3d_wait_bo)
850 @@ -45,6 +46,7 @@ extern "C" {
851 #define DRM_IOCTL_V3D_GET_PARAM DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_GET_PARAM, struct drm_v3d_get_param)
852 #define DRM_IOCTL_V3D_GET_BO_OFFSET DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_GET_BO_OFFSET, struct drm_v3d_get_bo_offset)
853 #define DRM_IOCTL_V3D_SUBMIT_TFU DRM_IOW(DRM_COMMAND_BASE + DRM_V3D_SUBMIT_TFU, struct drm_v3d_submit_tfu)
854 +#define DRM_IOCTL_V3D_SUBMIT_CSD DRM_IOW(DRM_COMMAND_BASE + DRM_V3D_SUBMIT_CSD, struct drm_v3d_submit_csd)
857 * struct drm_v3d_submit_cl - ioctl argument for submitting commands to the 3D
858 @@ -172,6 +174,7 @@ enum drm_v3d_param {
859 DRM_V3D_PARAM_V3D_CORE0_IDENT1,
860 DRM_V3D_PARAM_V3D_CORE0_IDENT2,
861 DRM_V3D_PARAM_SUPPORTS_TFU,
862 + DRM_V3D_PARAM_SUPPORTS_CSD,
865 struct drm_v3d_get_param {
866 @@ -212,6 +215,31 @@ struct drm_v3d_submit_tfu {
870 +/* Submits a compute shader for dispatch. This job will block on any
871 + * previous compute shaders submitted on this fd, and any other
872 + * synchronization must be performed with in_sync/out_sync.
874 +struct drm_v3d_submit_csd {
878 + /* Pointer to a u32 array of the BOs that are referenced by the job.
882 + /* Number of BO handles passed in (size is that times 4). */
883 + __u32 bo_handle_count;
885 + /* sync object to block on before running the CSD job. Each
886 + * CSD job will execute in the order submitted to its FD.
887 + * Synchronization against rendering/TFU jobs or CSD from
888 + * other fds requires using sync objects.
891 + /* Sync object to signal when the CSD job is done. */
895 #if defined(__cplusplus)