brcm2708: update to latest patches from RPi Foundation
[openwrt/staging/dedeckeh.git] / target / linux / brcm2708 / patches-4.19 / 950-0516-drm-v3d-Add-support-for-submitting-jobs-to-the-TFU.patch
diff --git a/target/linux/brcm2708/patches-4.19/950-0516-drm-v3d-Add-support-for-submitting-jobs-to-the-TFU.patch b/target/linux/brcm2708/patches-4.19/950-0516-drm-v3d-Add-support-for-submitting-jobs-to-the-TFU.patch
deleted file mode 100644 (file)
index 99b1a69..0000000
+++ /dev/null
@@ -1,802 +0,0 @@
-From ba1e90b6c3b3bf0e88ab01c824c4f8fde582e878 Mon Sep 17 00:00:00 2001
-From: Eric Anholt <eric@anholt.net>
-Date: Wed, 28 Nov 2018 15:09:25 -0800
-Subject: [PATCH] drm/v3d: Add support for submitting jobs to the TFU.
-
-The TFU can copy from raster, UIF, and SAND input images to UIF output
-images, with optional mipmap generation.  This will certainly be
-useful for media EGL image input, but is also useful immediately for
-mipmap generation without bogging the V3D core down.
-
-For now we only run the queue 1 job deep, and don't have any hang
-recovery (though I don't think we should need it, with TFU).  Queuing
-multiple jobs in the HW will require synchronizing the YUV coefficient
-regs updates since they don't get FIFOed with the job.
-
-v2: Change the ioctl to IOW instead of IOWR, always set COEF0, explain
-    why TFU is AUTH, clarify the syncing docs, drop the unused TFU
-    interrupt regs (you're expected to use the hub's), don't take
-    &bo->base for NULL bos.
-v3: Fix a little whitespace alignment (noticed by checkpatch), rebase
-    on drm_sched_job_cleanup() changes.
-
-Signed-off-by: Eric Anholt <eric@anholt.net>
-Reviewed-by: Dave Emett <david.emett@broadcom.com> (v2)
-Link: https://patchwork.freedesktop.org/patch/264607/
-(cherry picked from commit 1584f16ca96ef124aad79efa3303cff5f3530e2c)
----
- drivers/gpu/drm/v3d/v3d_drv.c   |  15 ++-
- drivers/gpu/drm/v3d/v3d_drv.h   |  32 +++++-
- drivers/gpu/drm/v3d/v3d_gem.c   | 178 ++++++++++++++++++++++++++++----
- drivers/gpu/drm/v3d/v3d_irq.c   |  12 ++-
- drivers/gpu/drm/v3d/v3d_regs.h  |  49 +++++++++
- drivers/gpu/drm/v3d/v3d_sched.c | 148 ++++++++++++++++++++++----
- drivers/gpu/drm/v3d/v3d_trace.h |  20 ++++
- include/uapi/drm/v3d_drm.h      |  25 +++++
- 8 files changed, 426 insertions(+), 53 deletions(-)
-
---- a/drivers/gpu/drm/v3d/v3d_drv.c
-+++ b/drivers/gpu/drm/v3d/v3d_drv.c
-@@ -112,10 +112,15 @@ static int v3d_get_param_ioctl(struct dr
-               return 0;
-       }
--      /* Any params that aren't just register reads would go here. */
--      DRM_DEBUG("Unknown parameter %d\n", args->param);
--      return -EINVAL;
-+      switch (args->param) {
-+      case DRM_V3D_PARAM_SUPPORTS_TFU:
-+              args->value = 1;
-+              return 0;
-+      default:
-+              DRM_DEBUG("Unknown parameter %d\n", args->param);
-+              return -EINVAL;
-+      }
- }
- static int
-@@ -170,7 +175,8 @@ static const struct file_operations v3d_
- /* DRM_AUTH is required on SUBMIT_CL for now, while we don't have GMP
-  * protection between clients.  Note that render nodes would be be
-  * able to submit CLs that could access BOs from clients authenticated
-- * with the master node.
-+ * with the master node.  The TFU doesn't use the GMP, so it would
-+ * need to stay DRM_AUTH until we do buffer size/offset validation.
-  */
- static const struct drm_ioctl_desc v3d_drm_ioctls[] = {
-       DRM_IOCTL_DEF_DRV(V3D_SUBMIT_CL, v3d_submit_cl_ioctl, DRM_RENDER_ALLOW | DRM_AUTH),
-@@ -179,6 +185,7 @@ static const struct drm_ioctl_desc v3d_d
-       DRM_IOCTL_DEF_DRV(V3D_MMAP_BO, v3d_mmap_bo_ioctl, DRM_RENDER_ALLOW),
-       DRM_IOCTL_DEF_DRV(V3D_GET_PARAM, v3d_get_param_ioctl, DRM_RENDER_ALLOW),
-       DRM_IOCTL_DEF_DRV(V3D_GET_BO_OFFSET, v3d_get_bo_offset_ioctl, DRM_RENDER_ALLOW),
-+      DRM_IOCTL_DEF_DRV(V3D_SUBMIT_TFU, v3d_submit_tfu_ioctl, DRM_RENDER_ALLOW | DRM_AUTH),
- };
- static const struct vm_operations_struct v3d_vm_ops = {
---- a/drivers/gpu/drm/v3d/v3d_drv.h
-+++ b/drivers/gpu/drm/v3d/v3d_drv.h
-@@ -7,19 +7,18 @@
- #include <drm/drm_encoder.h>
- #include <drm/drm_gem.h>
- #include <drm/gpu_scheduler.h>
-+#include "uapi/drm/v3d_drm.h"
- #define GMP_GRANULARITY (128 * 1024)
--/* Enum for each of the V3D queues.  We maintain various queue
-- * tracking as an array because at some point we'll want to support
-- * the TFU (texture formatting unit) as another queue.
-- */
-+/* Enum for each of the V3D queues. */
- enum v3d_queue {
-       V3D_BIN,
-       V3D_RENDER,
-+      V3D_TFU,
- };
--#define V3D_MAX_QUEUES (V3D_RENDER + 1)
-+#define V3D_MAX_QUEUES (V3D_TFU + 1)
- struct v3d_queue_state {
-       struct drm_gpu_scheduler sched;
-@@ -68,6 +67,7 @@ struct v3d_dev {
-       struct v3d_exec_info *bin_job;
-       struct v3d_exec_info *render_job;
-+      struct v3d_tfu_job *tfu_job;
-       struct v3d_queue_state queue[V3D_MAX_QUEUES];
-@@ -218,6 +218,25 @@ struct v3d_exec_info {
-       u32 qma, qms, qts;
- };
-+struct v3d_tfu_job {
-+      struct drm_sched_job base;
-+
-+      struct drm_v3d_submit_tfu args;
-+
-+      /* An optional fence userspace can pass in for the job to depend on. */
-+      struct dma_fence *in_fence;
-+
-+      /* v3d fence to be signaled by IRQ handler when the job is complete. */
-+      struct dma_fence *done_fence;
-+
-+      struct v3d_dev *v3d;
-+
-+      struct kref refcount;
-+
-+      /* This is the array of BOs that were looked up at the start of exec. */
-+      struct v3d_bo *bo[4];
-+};
-+
- /**
-  * _wait_for - magic (register) wait macro
-  *
-@@ -281,9 +300,12 @@ int v3d_gem_init(struct drm_device *dev)
- void v3d_gem_destroy(struct drm_device *dev);
- int v3d_submit_cl_ioctl(struct drm_device *dev, void *data,
-                       struct drm_file *file_priv);
-+int v3d_submit_tfu_ioctl(struct drm_device *dev, void *data,
-+                       struct drm_file *file_priv);
- int v3d_wait_bo_ioctl(struct drm_device *dev, void *data,
-                     struct drm_file *file_priv);
- void v3d_exec_put(struct v3d_exec_info *exec);
-+void v3d_tfu_job_put(struct v3d_tfu_job *exec);
- void v3d_reset(struct v3d_dev *v3d);
- void v3d_invalidate_caches(struct v3d_dev *v3d);
- void v3d_flush_caches(struct v3d_dev *v3d);
---- a/drivers/gpu/drm/v3d/v3d_gem.c
-+++ b/drivers/gpu/drm/v3d/v3d_gem.c
-@@ -207,26 +207,27 @@ v3d_flush_caches(struct v3d_dev *v3d)
- }
- static void
--v3d_attach_object_fences(struct v3d_exec_info *exec)
-+v3d_attach_object_fences(struct v3d_bo **bos, int bo_count,
-+                       struct dma_fence *fence)
- {
--      struct dma_fence *out_fence = exec->render_done_fence;
-       int i;
--      for (i = 0; i < exec->bo_count; i++) {
-+      for (i = 0; i < bo_count; i++) {
-               /* XXX: Use shared fences for read-only objects. */
--              reservation_object_add_excl_fence(exec->bo[i]->resv, out_fence);
-+              reservation_object_add_excl_fence(bos[i]->resv, fence);
-       }
- }
- static void
- v3d_unlock_bo_reservations(struct drm_device *dev,
--                         struct v3d_exec_info *exec,
-+                         struct v3d_bo **bos,
-+                         int bo_count,
-                          struct ww_acquire_ctx *acquire_ctx)
- {
-       int i;
--      for (i = 0; i < exec->bo_count; i++)
--              ww_mutex_unlock(&exec->bo[i]->resv->lock);
-+      for (i = 0; i < bo_count; i++)
-+              ww_mutex_unlock(&bos[i]->resv->lock);
-       ww_acquire_fini(acquire_ctx);
- }
-@@ -240,7 +241,8 @@ v3d_unlock_bo_reservations(struct drm_de
-  */
- static int
- v3d_lock_bo_reservations(struct drm_device *dev,
--                       struct v3d_exec_info *exec,
-+                       struct v3d_bo **bos,
-+                       int bo_count,
-                        struct ww_acquire_ctx *acquire_ctx)
- {
-       int contended_lock = -1;
-@@ -250,7 +252,7 @@ v3d_lock_bo_reservations(struct drm_devi
- retry:
-       if (contended_lock != -1) {
--              struct v3d_bo *bo = exec->bo[contended_lock];
-+              struct v3d_bo *bo = bos[contended_lock];
-               ret = ww_mutex_lock_slow_interruptible(&bo->resv->lock,
-                                                      acquire_ctx);
-@@ -260,20 +262,20 @@ retry:
-               }
-       }
--      for (i = 0; i < exec->bo_count; i++) {
-+      for (i = 0; i < bo_count; i++) {
-               if (i == contended_lock)
-                       continue;
--              ret = ww_mutex_lock_interruptible(&exec->bo[i]->resv->lock,
-+              ret = ww_mutex_lock_interruptible(&bos[i]->resv->lock,
-                                                 acquire_ctx);
-               if (ret) {
-                       int j;
-                       for (j = 0; j < i; j++)
--                              ww_mutex_unlock(&exec->bo[j]->resv->lock);
-+                              ww_mutex_unlock(&bos[j]->resv->lock);
-                       if (contended_lock != -1 && contended_lock >= i) {
--                              struct v3d_bo *bo = exec->bo[contended_lock];
-+                              struct v3d_bo *bo = bos[contended_lock];
-                               ww_mutex_unlock(&bo->resv->lock);
-                       }
-@@ -293,10 +295,11 @@ retry:
-       /* Reserve space for our shared (read-only) fence references,
-        * before we commit the CL to the hardware.
-        */
--      for (i = 0; i < exec->bo_count; i++) {
--              ret = reservation_object_reserve_shared(exec->bo[i]->resv);
-+      for (i = 0; i < bo_count; i++) {
-+              ret = reservation_object_reserve_shared(bos[i]->resv);
-               if (ret) {
--                      v3d_unlock_bo_reservations(dev, exec, acquire_ctx);
-+                      v3d_unlock_bo_reservations(dev, bos, bo_count,
-+                                                 acquire_ctx);
-                       return ret;
-               }
-       }
-@@ -419,6 +422,33 @@ void v3d_exec_put(struct v3d_exec_info *
-       kref_put(&exec->refcount, v3d_exec_cleanup);
- }
-+static void
-+v3d_tfu_job_cleanup(struct kref *ref)
-+{
-+      struct v3d_tfu_job *job = container_of(ref, struct v3d_tfu_job,
-+                                             refcount);
-+      struct v3d_dev *v3d = job->v3d;
-+      unsigned int i;
-+
-+      dma_fence_put(job->in_fence);
-+      dma_fence_put(job->done_fence);
-+
-+      for (i = 0; i < ARRAY_SIZE(job->bo); i++) {
-+              if (job->bo[i])
-+                      drm_gem_object_put_unlocked(&job->bo[i]->base);
-+      }
-+
-+      pm_runtime_mark_last_busy(v3d->dev);
-+      pm_runtime_put_autosuspend(v3d->dev);
-+
-+      kfree(job);
-+}
-+
-+void v3d_tfu_job_put(struct v3d_tfu_job *job)
-+{
-+      kref_put(&job->refcount, v3d_tfu_job_cleanup);
-+}
-+
- int
- v3d_wait_bo_ioctl(struct drm_device *dev, void *data,
-                 struct drm_file *file_priv)
-@@ -536,7 +566,8 @@ v3d_submit_cl_ioctl(struct drm_device *d
-       if (ret)
-               goto fail;
--      ret = v3d_lock_bo_reservations(dev, exec, &acquire_ctx);
-+      ret = v3d_lock_bo_reservations(dev, exec->bo, exec->bo_count,
-+                                     &acquire_ctx);
-       if (ret)
-               goto fail;
-@@ -570,9 +601,10 @@ v3d_submit_cl_ioctl(struct drm_device *d
-                                 &v3d_priv->sched_entity[V3D_RENDER]);
-       mutex_unlock(&v3d->sched_lock);
--      v3d_attach_object_fences(exec);
-+      v3d_attach_object_fences(exec->bo, exec->bo_count,
-+                               exec->render_done_fence);
--      v3d_unlock_bo_reservations(dev, exec, &acquire_ctx);
-+      v3d_unlock_bo_reservations(dev, exec->bo, exec->bo_count, &acquire_ctx);
-       /* Update the return sync object for the */
-       sync_out = drm_syncobj_find(file_priv, args->out_sync);
-@@ -588,12 +620,118 @@ v3d_submit_cl_ioctl(struct drm_device *d
- fail_unreserve:
-       mutex_unlock(&v3d->sched_lock);
--      v3d_unlock_bo_reservations(dev, exec, &acquire_ctx);
-+      v3d_unlock_bo_reservations(dev, exec->bo, exec->bo_count, &acquire_ctx);
- fail:
-       v3d_exec_put(exec);
-       return ret;
- }
-+
-+/**
-+ * v3d_submit_tfu_ioctl() - Submits a TFU (texture formatting) job to the V3D.
-+ * @dev: DRM device
-+ * @data: ioctl argument
-+ * @file_priv: DRM file for this fd
-+ *
-+ * Userspace provides the register setup for the TFU, which we don't
-+ * need to validate since the TFU is behind the MMU.
-+ *
-+ * Return: 0 once the job has been pushed to the TFU scheduler entity,
-+ * -ENOMEM if the job cannot be allocated, -EINVAL if the in_sync lookup
-+ * reports it, -ENOENT if a BO handle cannot be resolved, or the error
-+ * returned by runtime PM, reservation locking or scheduler job init.
-+ */
-+int
-+v3d_submit_tfu_ioctl(struct drm_device *dev, void *data,
-+                   struct drm_file *file_priv)
-+{
-+      struct v3d_dev *v3d = to_v3d_dev(dev);
-+      struct v3d_file_priv *v3d_priv = file_priv->driver_priv;
-+      struct drm_v3d_submit_tfu *args = data;
-+      struct v3d_tfu_job *job;
-+      struct ww_acquire_ctx acquire_ctx;
-+      struct drm_syncobj *sync_out;
-+      struct dma_fence *sched_done_fence;
-+      int ret = 0;
-+      int bo_count;
-+
-+      job = kcalloc(1, sizeof(*job), GFP_KERNEL);
-+      if (!job)
-+              return -ENOMEM;
-+
-+      ret = pm_runtime_get_sync(v3d->dev);
-+      if (ret < 0) {
-+              kfree(job);
-+              return ret;
-+      }
-+
-+      kref_init(&job->refcount);
-+
-+      /* Fill in the job before anything can jump to the fail label:
-+       * v3d_tfu_job_cleanup() dereferences job->v3d to drop the
-+       * runtime PM reference taken above.
-+       */
-+      job->args = *args;
-+      job->v3d = v3d;
-+
-+      ret = drm_syncobj_find_fence(file_priv, args->in_sync,
-+                                   0, &job->in_fence);
-+      if (ret == -EINVAL)
-+              goto fail;
-+
-+      /* Look up the BOs under the handle table lock; the list ends at
-+       * the first zero handle.  References are dropped by job cleanup.
-+       */
-+      spin_lock(&file_priv->table_lock);
-+      for (bo_count = 0; bo_count < ARRAY_SIZE(job->bo); bo_count++) {
-+              struct drm_gem_object *bo;
-+
-+              if (!args->bo_handles[bo_count])
-+                      break;
-+
-+              bo = idr_find(&file_priv->object_idr,
-+                            args->bo_handles[bo_count]);
-+              if (!bo) {
-+                      DRM_DEBUG("Failed to look up GEM BO %d: %d\n",
-+                                bo_count, args->bo_handles[bo_count]);
-+                      ret = -ENOENT;
-+                      spin_unlock(&file_priv->table_lock);
-+                      goto fail;
-+              }
-+              drm_gem_object_get(bo);
-+              job->bo[bo_count] = to_v3d_bo(bo);
-+      }
-+      spin_unlock(&file_priv->table_lock);
-+
-+      ret = v3d_lock_bo_reservations(dev, job->bo, bo_count, &acquire_ctx);
-+      if (ret)
-+              goto fail;
-+
-+      mutex_lock(&v3d->sched_lock);
-+      ret = drm_sched_job_init(&job->base,
-+                               &v3d_priv->sched_entity[V3D_TFU],
-+                               v3d_priv);
-+      if (ret)
-+              goto fail_unreserve;
-+
-+      sched_done_fence = dma_fence_get(&job->base.s_fence->finished);
-+
-+      kref_get(&job->refcount); /* put by scheduler job completion */
-+      drm_sched_entity_push_job(&job->base, &v3d_priv->sched_entity[V3D_TFU]);
-+      mutex_unlock(&v3d->sched_lock);
-+
-+      v3d_attach_object_fences(job->bo, bo_count, sched_done_fence);
-+
-+      v3d_unlock_bo_reservations(dev, job->bo, bo_count, &acquire_ctx);
-+
-+      /* Update the return sync object */
-+      sync_out = drm_syncobj_find(file_priv, args->out_sync);
-+      if (sync_out) {
-+              drm_syncobj_replace_fence(sync_out, sched_done_fence);
-+              drm_syncobj_put(sync_out);
-+      }
-+      dma_fence_put(sched_done_fence);
-+
-+      /* Drop the ioctl's own reference; the scheduler holds the other. */
-+      v3d_tfu_job_put(job);
-+
-+      return 0;
-+
-+fail_unreserve:
-+      mutex_unlock(&v3d->sched_lock);
-+      v3d_unlock_bo_reservations(dev, job->bo, bo_count, &acquire_ctx);
-+fail:
-+      v3d_tfu_job_put(job);
-+
-+      return ret;
-+}
- int
- v3d_gem_init(struct drm_device *dev)
---- a/drivers/gpu/drm/v3d/v3d_irq.c
-+++ b/drivers/gpu/drm/v3d/v3d_irq.c
-@@ -4,8 +4,8 @@
- /**
-  * DOC: Interrupt management for the V3D engine
-  *
-- * When we take a binning or rendering flush done interrupt, we need
-- * to signal the fence for that job so that the scheduler can queue up
-+ * When we take a bin, render, or TFU done interrupt, we need to
-+ * signal the fence for that job so that the scheduler can queue up
-  * the next one and unblock any waiters.
-  *
-  * When we take the binner out of memory interrupt, we need to
-@@ -23,7 +23,8 @@
- #define V3D_HUB_IRQS ((u32)(V3D_HUB_INT_MMU_WRV |     \
-                           V3D_HUB_INT_MMU_PTI |       \
--                          V3D_HUB_INT_MMU_CAP))
-+                          V3D_HUB_INT_MMU_CAP |       \
-+                          V3D_HUB_INT_TFUC))
- static void
- v3d_overflow_mem_work(struct work_struct *work)
-@@ -117,6 +118,11 @@ v3d_hub_irq(int irq, void *arg)
-       /* Acknowledge the interrupts we're handling here. */
-       V3D_WRITE(V3D_HUB_INT_CLR, intsts);
-+      if (intsts & V3D_HUB_INT_TFUC) {
-+              dma_fence_signal(v3d->tfu_job->done_fence);
-+              status = IRQ_HANDLED;
-+      }
-+
-       if (intsts & (V3D_HUB_INT_MMU_WRV |
-                     V3D_HUB_INT_MMU_PTI |
-                     V3D_HUB_INT_MMU_CAP)) {
---- a/drivers/gpu/drm/v3d/v3d_regs.h
-+++ b/drivers/gpu/drm/v3d/v3d_regs.h
-@@ -86,6 +86,55 @@
- # define V3D_TOP_GR_BRIDGE_SW_INIT_1                   0x0000c
- # define V3D_TOP_GR_BRIDGE_SW_INIT_1_V3D_CLK_108_SW_INIT BIT(0)
-+#define V3D_TFU_CS                                     0x00400
-+/* Stops current job, empties input fifo. */
-+# define V3D_TFU_CS_TFURST                             BIT(31)
-+# define V3D_TFU_CS_CVTCT_MASK                         V3D_MASK(23, 16)
-+# define V3D_TFU_CS_CVTCT_SHIFT                        16
-+# define V3D_TFU_CS_NFREE_MASK                         V3D_MASK(13, 8)
-+# define V3D_TFU_CS_NFREE_SHIFT                        8
-+# define V3D_TFU_CS_BUSY                               BIT(0)
-+
-+#define V3D_TFU_SU                                     0x00404
-+/* Interrupt when FINTTHR input slots are free (0 = disabled) */
-+# define V3D_TFU_SU_FINTTHR_MASK                       V3D_MASK(13, 8)
-+# define V3D_TFU_SU_FINTTHR_SHIFT                      8
-+/* Skips resetting the CRC at the start of CRC generation. */
-+# define V3D_TFU_SU_CRCCHAIN                           BIT(4)
-+/* Skips writes and computes the CRC of the image.  miplevels must be 0. */
-+# define V3D_TFU_SU_CRC                                BIT(3)
-+# define V3D_TFU_SU_THROTTLE_MASK                      V3D_MASK(1, 0)
-+# define V3D_TFU_SU_THROTTLE_SHIFT                     0
-+
-+#define V3D_TFU_ICFG                                   0x00408
-+/* Interrupt when the conversion is complete. */
-+# define V3D_TFU_ICFG_IOC                              BIT(0)
-+
-+/* Input Image Address */
-+#define V3D_TFU_IIA                                    0x0040c
-+/* Input Chroma Address */
-+#define V3D_TFU_ICA                                    0x00410
-+/* Input Image Stride */
-+#define V3D_TFU_IIS                                    0x00414
-+/* Input Image U-Plane Address */
-+#define V3D_TFU_IUA                                    0x00418
-+/* Output Image Address */
-+#define V3D_TFU_IOA                                    0x0041c
-+/* Image Output Size */
-+#define V3D_TFU_IOS                                    0x00420
-+/* TFU YUV Coefficient 0 */
-+#define V3D_TFU_COEF0                                  0x00424
-+/* Use these regs instead of the defaults. */
-+# define V3D_TFU_COEF0_USECOEF                         BIT(31)
-+/* TFU YUV Coefficient 1 */
-+#define V3D_TFU_COEF1                                  0x00428
-+/* TFU YUV Coefficient 2 */
-+#define V3D_TFU_COEF2                                  0x0042c
-+/* TFU YUV Coefficient 3 */
-+#define V3D_TFU_COEF3                                  0x00430
-+
-+#define V3D_TFU_CRC                                    0x00434
-+
- /* Per-MMU registers. */
- #define V3D_MMUC_CONTROL                               0x01000
---- a/drivers/gpu/drm/v3d/v3d_sched.c
-+++ b/drivers/gpu/drm/v3d/v3d_sched.c
-@@ -30,6 +30,12 @@ to_v3d_job(struct drm_sched_job *sched_j
-       return container_of(sched_job, struct v3d_job, base);
- }
-+static struct v3d_tfu_job *
-+to_tfu_job(struct drm_sched_job *sched_job)
-+{
-+      return container_of(sched_job, struct v3d_tfu_job, base);
-+}
-+
- static void
- v3d_job_free(struct drm_sched_job *sched_job)
- {
-@@ -38,6 +44,14 @@ v3d_job_free(struct drm_sched_job *sched
-       v3d_exec_put(job->exec);
- }
-+static void
-+v3d_tfu_job_free(struct drm_sched_job *sched_job)
-+{
-+      struct v3d_tfu_job *job = to_tfu_job(sched_job);
-+
-+      v3d_tfu_job_put(job);
-+}
-+
- /**
-  * Returns the fences that the bin or render job depends on, one by one.
-  * v3d_job_run() won't be called until all of them have been signaled.
-@@ -76,6 +90,27 @@ v3d_job_dependency(struct drm_sched_job
-       return fence;
- }
-+/**
-+ * Returns the fences that the TFU job depends on, one by one.
-+ * v3d_tfu_job_run() won't be called until all of them have been
-+ * signaled.
-+ */
-+static struct dma_fence *
-+v3d_tfu_job_dependency(struct drm_sched_job *sched_job,
-+                     struct drm_sched_entity *s_entity)
-+{
-+      struct v3d_tfu_job *job = to_tfu_job(sched_job);
-+      struct dma_fence *fence;
-+
-+      fence = job->in_fence;
-+      if (fence) {
-+              job->in_fence = NULL;
-+              return fence;
-+      }
-+
-+      return NULL;
-+}
-+
- static struct dma_fence *v3d_job_run(struct drm_sched_job *sched_job)
- {
-       struct v3d_job *job = to_v3d_job(sched_job);
-@@ -147,31 +182,47 @@ static struct dma_fence *v3d_job_run(str
-       return fence;
- }
--static void
--v3d_job_timedout(struct drm_sched_job *sched_job)
-+static struct dma_fence *
-+v3d_tfu_job_run(struct drm_sched_job *sched_job)
- {
--      struct v3d_job *job = to_v3d_job(sched_job);
--      struct v3d_exec_info *exec = job->exec;
--      struct v3d_dev *v3d = exec->v3d;
--      enum v3d_queue job_q = job == &exec->bin ? V3D_BIN : V3D_RENDER;
--      enum v3d_queue q;
--      u32 ctca = V3D_CORE_READ(0, V3D_CLE_CTNCA(job_q));
--      u32 ctra = V3D_CORE_READ(0, V3D_CLE_CTNRA(job_q));
-+      struct v3d_tfu_job *job = to_tfu_job(sched_job);
-+      struct v3d_dev *v3d = job->v3d;
-+      struct drm_device *dev = &v3d->drm;
-+      struct dma_fence *fence;
--      /* If the current address or return address have changed, then
--       * the GPU has probably made progress and we should delay the
--       * reset.  This could fail if the GPU got in an infinite loop
--       * in the CL, but that is pretty unlikely outside of an i-g-t
--       * testcase.
--       */
--      if (job->timedout_ctca != ctca || job->timedout_ctra != ctra) {
--              job->timedout_ctca = ctca;
--              job->timedout_ctra = ctra;
-+      fence = v3d_fence_create(v3d, V3D_TFU);
-+      if (IS_ERR(fence))
-+              return NULL;
--              schedule_delayed_work(&job->base.work_tdr,
--                                    job->base.sched->timeout);
--              return;
-+      v3d->tfu_job = job;
-+      if (job->done_fence)
-+              dma_fence_put(job->done_fence);
-+      job->done_fence = dma_fence_get(fence);
-+
-+      trace_v3d_submit_tfu(dev, to_v3d_fence(fence)->seqno);
-+
-+      V3D_WRITE(V3D_TFU_IIA, job->args.iia);
-+      V3D_WRITE(V3D_TFU_IIS, job->args.iis);
-+      V3D_WRITE(V3D_TFU_ICA, job->args.ica);
-+      V3D_WRITE(V3D_TFU_IUA, job->args.iua);
-+      V3D_WRITE(V3D_TFU_IOA, job->args.ioa);
-+      V3D_WRITE(V3D_TFU_IOS, job->args.ios);
-+      V3D_WRITE(V3D_TFU_COEF0, job->args.coef[0]);
-+      if (job->args.coef[0] & V3D_TFU_COEF0_USECOEF) {
-+              V3D_WRITE(V3D_TFU_COEF1, job->args.coef[1]);
-+              V3D_WRITE(V3D_TFU_COEF2, job->args.coef[2]);
-+              V3D_WRITE(V3D_TFU_COEF3, job->args.coef[3]);
-       }
-+      /* ICFG kicks off the job. */
-+      V3D_WRITE(V3D_TFU_ICFG, job->args.icfg | V3D_TFU_ICFG_IOC);
-+
-+      return fence;
-+}
-+
-+static void
-+v3d_gpu_reset_for_timeout(struct v3d_dev *v3d, struct drm_sched_job *sched_job)
-+{
-+      enum v3d_queue q;
-       mutex_lock(&v3d->reset_lock);
-@@ -196,6 +247,41 @@ v3d_job_timedout(struct drm_sched_job *s
-       mutex_unlock(&v3d->reset_lock);
- }
-+static void
-+v3d_job_timedout(struct drm_sched_job *sched_job)
-+{
-+      struct v3d_job *job = to_v3d_job(sched_job);
-+      struct v3d_exec_info *exec = job->exec;
-+      struct v3d_dev *v3d = exec->v3d;
-+      enum v3d_queue job_q = job == &exec->bin ? V3D_BIN : V3D_RENDER;
-+      u32 ctca = V3D_CORE_READ(0, V3D_CLE_CTNCA(job_q));
-+      u32 ctra = V3D_CORE_READ(0, V3D_CLE_CTNRA(job_q));
-+
-+      /* If the current address or return address have changed, then
-+       * the GPU has probably made progress and we should delay the
-+       * reset.  This could fail if the GPU got in an infinite loop
-+       * in the CL, but that is pretty unlikely outside of an i-g-t
-+       * testcase.
-+       */
-+      if (job->timedout_ctca != ctca || job->timedout_ctra != ctra) {
-+              job->timedout_ctca = ctca;
-+              job->timedout_ctra = ctra;
-+              schedule_delayed_work(&job->base.work_tdr,
-+                                    job->base.sched->timeout);
-+              return;
-+      }
-+
-+      v3d_gpu_reset_for_timeout(v3d, sched_job);
-+}
-+
-+static void
-+v3d_tfu_job_timedout(struct drm_sched_job *sched_job)
-+{
-+      struct v3d_tfu_job *job = to_tfu_job(sched_job);
-+
-+      v3d_gpu_reset_for_timeout(job->v3d, sched_job);
-+}
-+
- static const struct drm_sched_backend_ops v3d_sched_ops = {
-       .dependency = v3d_job_dependency,
-       .run_job = v3d_job_run,
-@@ -203,6 +289,13 @@ static const struct drm_sched_backend_op
-       .free_job = v3d_job_free
- };
-+static const struct drm_sched_backend_ops v3d_tfu_sched_ops = {
-+      .dependency = v3d_tfu_job_dependency,
-+      .run_job = v3d_tfu_job_run,
-+      .timedout_job = v3d_tfu_job_timedout,
-+      .free_job = v3d_tfu_job_free
-+};
-+
- int
- v3d_sched_init(struct v3d_dev *v3d)
- {
-@@ -232,6 +325,19 @@ v3d_sched_init(struct v3d_dev *v3d)
-               drm_sched_fini(&v3d->queue[V3D_BIN].sched);
-               return ret;
-       }
-+
-+      ret = drm_sched_init(&v3d->queue[V3D_TFU].sched,
-+                           &v3d_tfu_sched_ops,
-+                           hw_jobs_limit, job_hang_limit,
-+                           msecs_to_jiffies(hang_limit_ms),
-+                           "v3d_tfu");
-+      if (ret) {
-+              dev_err(v3d->dev, "Failed to create TFU scheduler: %d.",
-+                      ret);
-+              drm_sched_fini(&v3d->queue[V3D_RENDER].sched);
-+              drm_sched_fini(&v3d->queue[V3D_BIN].sched);
-+              return ret;
-+      }
-       return 0;
- }
---- a/drivers/gpu/drm/v3d/v3d_trace.h
-+++ b/drivers/gpu/drm/v3d/v3d_trace.h
-@@ -42,6 +42,26 @@ TRACE_EVENT(v3d_submit_cl,
-                     __entry->ctnqea)
- );
-+TRACE_EVENT(v3d_submit_tfu,
-+          TP_PROTO(struct drm_device *dev,
-+                   uint64_t seqno),
-+          TP_ARGS(dev, seqno),
-+
-+          TP_STRUCT__entry(
-+                           __field(u32, dev)
-+                           __field(u64, seqno)
-+                           ),
-+
-+          TP_fast_assign(
-+                         __entry->dev = dev->primary->index;
-+                         __entry->seqno = seqno;
-+                         ),
-+
-+          TP_printk("dev=%u, seqno=%llu",
-+                    __entry->dev,
-+                    __entry->seqno)
-+);
-+
- TRACE_EVENT(v3d_reset_begin,
-           TP_PROTO(struct drm_device *dev),
-           TP_ARGS(dev),
---- a/include/uapi/drm/v3d_drm.h
-+++ b/include/uapi/drm/v3d_drm.h
-@@ -36,6 +36,7 @@ extern "C" {
- #define DRM_V3D_MMAP_BO                           0x03
- #define DRM_V3D_GET_PARAM                         0x04
- #define DRM_V3D_GET_BO_OFFSET                     0x05
-+#define DRM_V3D_SUBMIT_TFU                        0x06
- #define DRM_IOCTL_V3D_SUBMIT_CL           DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_SUBMIT_CL, struct drm_v3d_submit_cl)
- #define DRM_IOCTL_V3D_WAIT_BO             DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_WAIT_BO, struct drm_v3d_wait_bo)
-@@ -43,6 +44,7 @@ extern "C" {
- #define DRM_IOCTL_V3D_MMAP_BO             DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_MMAP_BO, struct drm_v3d_mmap_bo)
- #define DRM_IOCTL_V3D_GET_PARAM           DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_GET_PARAM, struct drm_v3d_get_param)
- #define DRM_IOCTL_V3D_GET_BO_OFFSET       DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_GET_BO_OFFSET, struct drm_v3d_get_bo_offset)
-+#define DRM_IOCTL_V3D_SUBMIT_TFU          DRM_IOW(DRM_COMMAND_BASE + DRM_V3D_SUBMIT_TFU, struct drm_v3d_submit_tfu)
- /**
-  * struct drm_v3d_submit_cl - ioctl argument for submitting commands to the 3D
-@@ -169,6 +171,7 @@ enum drm_v3d_param {
-       DRM_V3D_PARAM_V3D_CORE0_IDENT0,
-       DRM_V3D_PARAM_V3D_CORE0_IDENT1,
-       DRM_V3D_PARAM_V3D_CORE0_IDENT2,
-+      DRM_V3D_PARAM_SUPPORTS_TFU,
- };
- struct drm_v3d_get_param {
-@@ -187,6 +190,28 @@ struct drm_v3d_get_bo_offset {
-       __u32 offset;
- };
-+struct drm_v3d_submit_tfu {
-+      __u32 icfg;
-+      __u32 iia;
-+      __u32 iis;
-+      __u32 ica;
-+      __u32 iua;
-+      __u32 ioa;
-+      __u32 ios;
-+      __u32 coef[4];
-+      /* First handle is the output BO, following are other inputs.
-+       * 0 for unused.
-+       */
-+      __u32 bo_handles[4];
-+      /* Sync object to block on before running the TFU job.  Each TFU
-+       * job will execute in the order submitted to its FD.  Synchronization
-+       * against rendering jobs requires using sync objects.
-+       */
-+      __u32 in_sync;
-+      /* Sync object to signal when the TFU job is done. */
-+      __u32 out_sync;
-+};
-+
- #if defined(__cplusplus)
- }
- #endif