target/linux/brcm2708/patches-4.19/950-0566-drm-v3d-Add-support-for-submitting-jobs-to-the-TFU.patch

   1 From 5d80273397b13617211ac6dd1e0e9759fff0470d Mon Sep 17 00:00:00 2001
   2 From: Eric Anholt <eric@anholt.net>
   3 Date: Wed, 28 Nov 2018 15:09:25 -0800
   4 Subject: [PATCH 566/725] drm/v3d: Add support for submitting jobs to the TFU.
   5
   6 The TFU can copy from raster, UIF, and SAND input images to UIF output
   7 images, with optional mipmap generation.  This will certainly be
   8 useful for media EGL image input, but is also useful immediately for
   9 mipmap generation without bogging the V3D core down.
  10
  11 For now we only run the queue 1 job deep, and don't have any hang
  12 recovery (though I don't think we should need it, with TFU).  Queuing
  13 multiple jobs in the HW will require synchronizing the YUV coefficient
  14 regs updates since they don't get FIFOed with the job.
  15
  16 v2: Change the ioctl to IOW instead of IOWR, always set COEF0, explain
  17     why TFU is AUTH, clarify the syncing docs, drop the unused TFU
  18     interrupt regs (you're expected to use the hub's), don't take
  19     &bo->base for NULL bos.
  20 v3: Fix a little whitespace alignment (noticed by checkpatch), rebase
  21     on drm_sched_job_cleanup() changes.
  22
  23 Signed-off-by: Eric Anholt <eric@anholt.net>
  24 Reviewed-by: Dave Emett <david.emett@broadcom.com> (v2)
  25 Link: https://patchwork.freedesktop.org/patch/264607/
  26 (cherry picked from commit 1584f16ca96ef124aad79efa3303cff5f3530e2c)
  27 ---
  28  drivers/gpu/drm/v3d/v3d_drv.c   |  15 ++-
  29  drivers/gpu/drm/v3d/v3d_drv.h   |  32 +++++-
  30  drivers/gpu/drm/v3d/v3d_gem.c   | 178 ++++++++++++++++++++++++++++----
  31  drivers/gpu/drm/v3d/v3d_irq.c   |  12 ++-
  32  drivers/gpu/drm/v3d/v3d_regs.h  |  49 +++++++++
  33  drivers/gpu/drm/v3d/v3d_sched.c | 148 ++++++++++++++++++++++----
  34  drivers/gpu/drm/v3d/v3d_trace.h |  20 ++++
  35  include/uapi/drm/v3d_drm.h      |  25 +++++
  36  8 files changed, 426 insertions(+), 53 deletions(-)
  37
  38 --- a/drivers/gpu/drm/v3d/v3d_drv.c
  39 +++ b/drivers/gpu/drm/v3d/v3d_drv.c
  40 @@ -112,10 +112,15 @@ static int v3d_get_param_ioctl(struct dr
  41                 return 0;
  42         }
  43
  44 -       /* Any params that aren't just register reads would go here. */
  45
  46 -       DRM_DEBUG("Unknown parameter %d\n", args->param);
  47 -       return -EINVAL;
  48 +       switch (args->param) {
  49 +       case DRM_V3D_PARAM_SUPPORTS_TFU:
  50 +               args->value = 1;
  51 +               return 0;
  52 +       default:
  53 +               DRM_DEBUG("Unknown parameter %d\n", args->param);
  54 +               return -EINVAL;
  55 +       }
  56  }
  57
  58  static int
  59 @@ -170,7 +175,8 @@ static const struct file_operations v3d_
  60  /* DRM_AUTH is required on SUBMIT_CL for now, while we don't have GMP
  61   * protection between clients.  Note that render nodes would be be
  62   * able to submit CLs that could access BOs from clients authenticated
  63 - * with the master node.
  64 + * with the master node.  The TFU doesn't use the GMP, so it would
  65 + * need to stay DRM_AUTH until we do buffer size/offset validation.
  66   */
  67  static const struct drm_ioctl_desc v3d_drm_ioctls[] = {
  68         DRM_IOCTL_DEF_DRV(V3D_SUBMIT_CL, v3d_submit_cl_ioctl, DRM_RENDER_ALLOW | DRM_AUTH),
  69 @@ -179,6 +185,7 @@ static const struct drm_ioctl_desc v3d_d
  70         DRM_IOCTL_DEF_DRV(V3D_MMAP_BO, v3d_mmap_bo_ioctl, DRM_RENDER_ALLOW),
  71         DRM_IOCTL_DEF_DRV(V3D_GET_PARAM, v3d_get_param_ioctl, DRM_RENDER_ALLOW),
  72         DRM_IOCTL_DEF_DRV(V3D_GET_BO_OFFSET, v3d_get_bo_offset_ioctl, DRM_RENDER_ALLOW),
  73 +       DRM_IOCTL_DEF_DRV(V3D_SUBMIT_TFU, v3d_submit_tfu_ioctl, DRM_RENDER_ALLOW | DRM_AUTH),
  74  };
  75
  76  static const struct vm_operations_struct v3d_vm_ops = {
  77 --- a/drivers/gpu/drm/v3d/v3d_drv.h
  78 +++ b/drivers/gpu/drm/v3d/v3d_drv.h
  79 @@ -7,19 +7,18 @@
  80  #include <drm/drm_encoder.h>
  81  #include <drm/drm_gem.h>
  82  #include <drm/gpu_scheduler.h>
  83 +#include "uapi/drm/v3d_drm.h"
  84
  85  #define GMP_GRANULARITY (128 * 1024)
  86
  87 -/* Enum for each of the V3D queues.  We maintain various queue
  88 - * tracking as an array because at some point we'll want to support
  89 - * the TFU (texture formatting unit) as another queue.
  90 - */
  91 +/* Enum for each of the V3D queues. */
  92  enum v3d_queue {
  93         V3D_BIN,
  94         V3D_RENDER,
  95 +       V3D_TFU,
  96  };
  97
  98 -#define V3D_MAX_QUEUES (V3D_RENDER + 1)
  99 +#define V3D_MAX_QUEUES (V3D_TFU + 1)
 100
 101  struct v3d_queue_state {
 102         struct drm_gpu_scheduler sched;
 103 @@ -68,6 +67,7 @@ struct v3d_dev {
 104
 105         struct v3d_exec_info *bin_job;
 106         struct v3d_exec_info *render_job;
 107 +       struct v3d_tfu_job *tfu_job;
 108
 109         struct v3d_queue_state queue[V3D_MAX_QUEUES];
 110
 111 @@ -218,6 +218,25 @@ struct v3d_exec_info {
 112         u32 qma, qms, qts;
 113  };
 114
 115 +struct v3d_tfu_job {
 116 +       struct drm_sched_job base;
 117 +
 118 +       struct drm_v3d_submit_tfu args;
 119 +
 120 +       /* An optional fence userspace can pass in for the job to depend on. */
 121 +       struct dma_fence *in_fence;
 122 +
 123 +       /* v3d fence to be signaled by IRQ handler when the job is complete. */
 124 +       struct dma_fence *done_fence;
 125 +
 126 +       struct v3d_dev *v3d;
 127 +
 128 +       struct kref refcount;
 129 +
 130 +       /* This is the array of BOs that were looked up at the start of exec. */
 131 +       struct v3d_bo *bo[4];
 132 +};
 133 +
 134  /**
 135   * _wait_for - magic (register) wait macro
 136   *
 137 @@ -281,9 +300,12 @@ int v3d_gem_init(struct drm_device *dev)
 138  void v3d_gem_destroy(struct drm_device *dev);
 139  int v3d_submit_cl_ioctl(struct drm_device *dev, void *data,
 140                         struct drm_file *file_priv);
 141 +int v3d_submit_tfu_ioctl(struct drm_device *dev, void *data,
 142 +                        struct drm_file *file_priv);
 143  int v3d_wait_bo_ioctl(struct drm_device *dev, void *data,
 144                       struct drm_file *file_priv);
 145  void v3d_exec_put(struct v3d_exec_info *exec);
 146 +void v3d_tfu_job_put(struct v3d_tfu_job *exec);
 147  void v3d_reset(struct v3d_dev *v3d);
 148  void v3d_invalidate_caches(struct v3d_dev *v3d);
 149  void v3d_flush_caches(struct v3d_dev *v3d);
 150 --- a/drivers/gpu/drm/v3d/v3d_gem.c
 151 +++ b/drivers/gpu/drm/v3d/v3d_gem.c
 152 @@ -207,26 +207,27 @@ v3d_flush_caches(struct v3d_dev *v3d)
 153  }
 154
 155  static void
 156 -v3d_attach_object_fences(struct v3d_exec_info *exec)
 157 +v3d_attach_object_fences(struct v3d_bo **bos, int bo_count,
 158 +                        struct dma_fence *fence)
 159  {
 160 -       struct dma_fence *out_fence = exec->render_done_fence;
 161         int i;
 162
 163 -       for (i = 0; i < exec->bo_count; i++) {
 164 +       for (i = 0; i < bo_count; i++) {
 165                 /* XXX: Use shared fences for read-only objects. */
 166 -               reservation_object_add_excl_fence(exec->bo[i]->resv, out_fence);
 167 +               reservation_object_add_excl_fence(bos[i]->resv, fence);
 168         }
 169  }
 170
 171  static void
 172  v3d_unlock_bo_reservations(struct drm_device *dev,
 173 -                          struct v3d_exec_info *exec,
 174 +                          struct v3d_bo **bos,
 175 +                          int bo_count,
 176                            struct ww_acquire_ctx *acquire_ctx)
 177  {
 178         int i;
 179
 180 -       for (i = 0; i < exec->bo_count; i++)
 181 -               ww_mutex_unlock(&exec->bo[i]->resv->lock);
 182 +       for (i = 0; i < bo_count; i++)
 183 +               ww_mutex_unlock(&bos[i]->resv->lock);
 184
 185         ww_acquire_fini(acquire_ctx);
 186  }
 187 @@ -240,7 +241,8 @@ v3d_unlock_bo_reservations(struct drm_de
 188   */
 189  static int
 190  v3d_lock_bo_reservations(struct drm_device *dev,
 191 -                        struct v3d_exec_info *exec,
 192 +                        struct v3d_bo **bos,
 193 +                        int bo_count,
 194                          struct ww_acquire_ctx *acquire_ctx)
 195  {
 196         int contended_lock = -1;
 197 @@ -250,7 +252,7 @@ v3d_lock_bo_reservations(struct drm_devi
 198
 199  retry:
 200         if (contended_lock != -1) {
 201 -               struct v3d_bo *bo = exec->bo[contended_lock];
 202 +               struct v3d_bo *bo = bos[contended_lock];
 203
 204                 ret = ww_mutex_lock_slow_interruptible(&bo->resv->lock,
 205                                                        acquire_ctx);
 206 @@ -260,20 +262,20 @@ retry:
 207                 }
 208         }
 209
 210 -       for (i = 0; i < exec->bo_count; i++) {
 211 +       for (i = 0; i < bo_count; i++) {
 212                 if (i == contended_lock)
 213                         continue;
 214
 215 -               ret = ww_mutex_lock_interruptible(&exec->bo[i]->resv->lock,
 216 +               ret = ww_mutex_lock_interruptible(&bos[i]->resv->lock,
 217                                                   acquire_ctx);
 218                 if (ret) {
 219                         int j;
 220
 221                         for (j = 0; j < i; j++)
 222 -                               ww_mutex_unlock(&exec->bo[j]->resv->lock);
 223 +                               ww_mutex_unlock(&bos[j]->resv->lock);
 224
 225                         if (contended_lock != -1 && contended_lock >= i) {
 226 -                               struct v3d_bo *bo = exec->bo[contended_lock];
 227 +                               struct v3d_bo *bo = bos[contended_lock];
 228
 229                                 ww_mutex_unlock(&bo->resv->lock);
 230                         }
 231 @@ -293,10 +295,11 @@ retry:
 232         /* Reserve space for our shared (read-only) fence references,
 233          * before we commit the CL to the hardware.
 234          */
 235 -       for (i = 0; i < exec->bo_count; i++) {
 236 -               ret = reservation_object_reserve_shared(exec->bo[i]->resv);
 237 +       for (i = 0; i < bo_count; i++) {
 238 +               ret = reservation_object_reserve_shared(bos[i]->resv);
 239                 if (ret) {
 240 -                       v3d_unlock_bo_reservations(dev, exec, acquire_ctx);
 241 +                       v3d_unlock_bo_reservations(dev, bos, bo_count,
 242 +                                                  acquire_ctx);
 243                         return ret;
 244                 }
 245         }
 246 @@ -419,6 +422,33 @@ void v3d_exec_put(struct v3d_exec_info *
 247         kref_put(&exec->refcount, v3d_exec_cleanup);
 248  }
 249
 250 +static void
 251 +v3d_tfu_job_cleanup(struct kref *ref)
 252 +{
 253 +       struct v3d_tfu_job *job = container_of(ref, struct v3d_tfu_job,
 254 +                                              refcount);
 255 +       struct v3d_dev *v3d = job->v3d;
 256 +       unsigned int i;
 257 +
 258 +       dma_fence_put(job->in_fence);
 259 +       dma_fence_put(job->done_fence);
 260 +
 261 +       for (i = 0; i < ARRAY_SIZE(job->bo); i++) {
 262 +               if (job->bo[i])
 263 +                       drm_gem_object_put_unlocked(&job->bo[i]->base);
 264 +       }
 265 +
 266 +       pm_runtime_mark_last_busy(v3d->dev);
 267 +       pm_runtime_put_autosuspend(v3d->dev);
 268 +
 269 +       kfree(job);
 270 +}
 271 +
 272 +void v3d_tfu_job_put(struct v3d_tfu_job *job)
 273 +{
 274 +       kref_put(&job->refcount, v3d_tfu_job_cleanup);
 275 +}
 276 +
 277  int
 278  v3d_wait_bo_ioctl(struct drm_device *dev, void *data,
 279                   struct drm_file *file_priv)
 280 @@ -536,7 +566,8 @@ v3d_submit_cl_ioctl(struct drm_device *d
 281         if (ret)
 282                 goto fail;
 283
 284 -       ret = v3d_lock_bo_reservations(dev, exec, &acquire_ctx);
 285 +       ret = v3d_lock_bo_reservations(dev, exec->bo, exec->bo_count,
 286 +                                      &acquire_ctx);
 287         if (ret)
 288                 goto fail;
 289
 290 @@ -570,9 +601,10 @@ v3d_submit_cl_ioctl(struct drm_device *d
 291                                   &v3d_priv->sched_entity[V3D_RENDER]);
 292         mutex_unlock(&v3d->sched_lock);
 293
 294 -       v3d_attach_object_fences(exec);
 295 +       v3d_attach_object_fences(exec->bo, exec->bo_count,
 296 +                                exec->render_done_fence);
 297
 298 -       v3d_unlock_bo_reservations(dev, exec, &acquire_ctx);
 299 +       v3d_unlock_bo_reservations(dev, exec->bo, exec->bo_count, &acquire_ctx);
 300
 301         /* Update the return sync object for the */
 302         sync_out = drm_syncobj_find(file_priv, args->out_sync);
 303 @@ -588,12 +620,118 @@ v3d_submit_cl_ioctl(struct drm_device *d
 304
 305  fail_unreserve:
 306         mutex_unlock(&v3d->sched_lock);
 307 -       v3d_unlock_bo_reservations(dev, exec, &acquire_ctx);
 308 +       v3d_unlock_bo_reservations(dev, exec->bo, exec->bo_count, &acquire_ctx);
 309  fail:
 310         v3d_exec_put(exec);
 311
 312         return ret;
 313  }
 314 +
 315 +/**
 316 + * v3d_submit_tfu_ioctl() - Submits a TFU (texture formatting) job to the V3D.
 317 + * @dev: DRM device
 318 + * @data: ioctl argument
 319 + * @file_priv: DRM file for this fd
 320 + *
 321 + * Userspace provides the register setup for the TFU, which we don't
 322 + * need to validate since the TFU is behind the MMU.
 323 + */
 324 +int
 325 +v3d_submit_tfu_ioctl(struct drm_device *dev, void *data,
 326 +                    struct drm_file *file_priv)
 327 +{
 328 +       struct v3d_dev *v3d = to_v3d_dev(dev);
 329 +       struct v3d_file_priv *v3d_priv = file_priv->driver_priv;
 330 +       struct drm_v3d_submit_tfu *args = data;
 331 +       struct v3d_tfu_job *job;
 332 +       struct ww_acquire_ctx acquire_ctx;
 333 +       struct drm_syncobj *sync_out;
 334 +       struct dma_fence *sched_done_fence;
 335 +       int ret = 0;
 336 +       int bo_count;
 337 +
 338 +       job = kcalloc(1, sizeof(*job), GFP_KERNEL);
 339 +       if (!job)
 340 +               return -ENOMEM;
 341 +
 342 +       ret = pm_runtime_get_sync(v3d->dev);
 343 +       if (ret < 0) {
 344 +               kfree(job);
 345 +               return ret;
 346 +       }
 347 +
 348 +       kref_init(&job->refcount);
 349 +
 350 +       ret = drm_syncobj_find_fence(file_priv, args->in_sync,
 351 +                                    0, &job->in_fence);
 352 +       if (ret == -EINVAL)
 353 +               goto fail;
 354 +
 355 +       job->args = *args;
 356 +       job->v3d = v3d;
 357 +
 358 +       spin_lock(&file_priv->table_lock);
 359 +       for (bo_count = 0; bo_count < ARRAY_SIZE(job->bo); bo_count++) {
 360 +               struct drm_gem_object *bo;
 361 +
 362 +               if (!args->bo_handles[bo_count])
 363 +                       break;
 364 +
 365 +               bo = idr_find(&file_priv->object_idr,
 366 +                             args->bo_handles[bo_count]);
 367 +               if (!bo) {
 368 +                       DRM_DEBUG("Failed to look up GEM BO %d: %d\n",
 369 +                                 bo_count, args->bo_handles[bo_count]);
 370 +                       ret = -ENOENT;
 371 +                       spin_unlock(&file_priv->table_lock);
 372 +                       goto fail;
 373 +               }
 374 +               drm_gem_object_get(bo);
 375 +               job->bo[bo_count] = to_v3d_bo(bo);
 376 +       }
 377 +       spin_unlock(&file_priv->table_lock);
 378 +
 379 +       ret = v3d_lock_bo_reservations(dev, job->bo, bo_count, &acquire_ctx);
 380 +       if (ret)
 381 +               goto fail;
 382 +
 383 +       mutex_lock(&v3d->sched_lock);
 384 +       ret = drm_sched_job_init(&job->base,
 385 +                                &v3d_priv->sched_entity[V3D_TFU],
 386 +                                v3d_priv);
 387 +       if (ret)
 388 +               goto fail_unreserve;
 389 +
 390 +       sched_done_fence = dma_fence_get(&job->base.s_fence->finished);
 391 +
 392 +       kref_get(&job->refcount); /* put by scheduler job completion */
 393 +       drm_sched_entity_push_job(&job->base, &v3d_priv->sched_entity[V3D_TFU]);
 394 +       mutex_unlock(&v3d->sched_lock);
 395 +
 396 +       v3d_attach_object_fences(job->bo, bo_count, sched_done_fence);
 397 +
 398 +       v3d_unlock_bo_reservations(dev, job->bo, bo_count, &acquire_ctx);
 399 +
 400 +       /* Update the return sync object */
 401 +       sync_out = drm_syncobj_find(file_priv, args->out_sync);
 402 +       if (sync_out) {
 403 +               drm_syncobj_replace_fence(sync_out, sched_done_fence);
 404 +               drm_syncobj_put(sync_out);
 405 +       }
 406 +       dma_fence_put(sched_done_fence);
 407 +
 408 +       v3d_tfu_job_put(job);
 409 +
 410 +       return 0;
 411 +
 412 +fail_unreserve:
 413 +       mutex_unlock(&v3d->sched_lock);
 414 +       v3d_unlock_bo_reservations(dev, job->bo, bo_count, &acquire_ctx);
 415 +fail:
 416 +       v3d_tfu_job_put(job);
 417 +
 418 +       return ret;
 419 +}
 420
 421  int
 422  v3d_gem_init(struct drm_device *dev)
 423 --- a/drivers/gpu/drm/v3d/v3d_irq.c
 424 +++ b/drivers/gpu/drm/v3d/v3d_irq.c
 425 @@ -4,8 +4,8 @@
 426  /**
 427   * DOC: Interrupt management for the V3D engine
 428   *
 429 - * When we take a binning or rendering flush done interrupt, we need
 430 - * to signal the fence for that job so that the scheduler can queue up
 431 + * When we take a bin, render, or TFU done interrupt, we need to
 432 + * signal the fence for that job so that the scheduler can queue up
 433   * the next one and unblock any waiters.
 434   *
 435   * When we take the binner out of memory interrupt, we need to
 436 @@ -23,7 +23,8 @@
 437
 438  #define V3D_HUB_IRQS ((u32)(V3D_HUB_INT_MMU_WRV |      \
 439                             V3D_HUB_INT_MMU_PTI |       \
 440 -                           V3D_HUB_INT_MMU_CAP))
 441 +                           V3D_HUB_INT_MMU_CAP |       \
 442 +                           V3D_HUB_INT_TFUC))
 443
 444  static void
 445  v3d_overflow_mem_work(struct work_struct *work)
 446 @@ -117,6 +118,11 @@ v3d_hub_irq(int irq, void *arg)
 447         /* Acknowledge the interrupts we're handling here. */
 448         V3D_WRITE(V3D_HUB_INT_CLR, intsts);
 449
 450 +       if (intsts & V3D_HUB_INT_TFUC) {
 451 +               dma_fence_signal(v3d->tfu_job->done_fence);
 452 +               status = IRQ_HANDLED;
 453 +       }
 454 +
 455         if (intsts & (V3D_HUB_INT_MMU_WRV |
 456                       V3D_HUB_INT_MMU_PTI |
 457                       V3D_HUB_INT_MMU_CAP)) {
 458 --- a/drivers/gpu/drm/v3d/v3d_regs.h
 459 +++ b/drivers/gpu/drm/v3d/v3d_regs.h
 460 @@ -86,6 +86,55 @@
 461  # define V3D_TOP_GR_BRIDGE_SW_INIT_1                   0x0000c
 462  # define V3D_TOP_GR_BRIDGE_SW_INIT_1_V3D_CLK_108_SW_INIT BIT(0)
 463
 464 +#define V3D_TFU_CS                                     0x00400
 465 +/* Stops current job, empties input fifo. */
 466 +# define V3D_TFU_CS_TFURST                             BIT(31)
 467 +# define V3D_TFU_CS_CVTCT_MASK                         V3D_MASK(23, 16)
 468 +# define V3D_TFU_CS_CVTCT_SHIFT                        16
 469 +# define V3D_TFU_CS_NFREE_MASK                         V3D_MASK(13, 8)
 470 +# define V3D_TFU_CS_NFREE_SHIFT                        8
 471 +# define V3D_TFU_CS_BUSY                               BIT(0)
 472 +
 473 +#define V3D_TFU_SU                                     0x00404
 474 +/* Interrupt when FINTTHR input slots are free (0 = disabled) */
 475 +# define V3D_TFU_SU_FINTTHR_MASK                       V3D_MASK(13, 8)
 476 +# define V3D_TFU_SU_FINTTHR_SHIFT                      8
 477 +/* Skips resetting the CRC at the start of CRC generation. */
 478 +# define V3D_TFU_SU_CRCCHAIN                           BIT(4)
 479 +/* skips writes, computes CRC of the image.  miplevels must be 0. */
 480 +# define V3D_TFU_SU_CRC                                BIT(3)
 481 +# define V3D_TFU_SU_THROTTLE_MASK                      V3D_MASK(1, 0)
 482 +# define V3D_TFU_SU_THROTTLE_SHIFT                     0
 483 +
 484 +#define V3D_TFU_ICFG                                   0x00408
 485 +/* Interrupt when the conversion is complete. */
 486 +# define V3D_TFU_ICFG_IOC                              BIT(0)
 487 +
 488 +/* Input Image Address */
 489 +#define V3D_TFU_IIA                                    0x0040c
 490 +/* Input Chroma Address */
 491 +#define V3D_TFU_ICA                                    0x00410
 492 +/* Input Image Stride */
 493 +#define V3D_TFU_IIS                                    0x00414
 494 +/* Input Image U-Plane Address */
 495 +#define V3D_TFU_IUA                                    0x00418
 496 +/* Output Image Address */
 497 +#define V3D_TFU_IOA                                    0x0041c
 498 +/* Image Output Size */
 499 +#define V3D_TFU_IOS                                    0x00420
 500 +/* TFU YUV Coefficient 0 */
 501 +#define V3D_TFU_COEF0                                  0x00424
 502 +/* Use these regs instead of the defaults. */
 503 +# define V3D_TFU_COEF0_USECOEF                         BIT(31)
 504 +/* TFU YUV Coefficient 1 */
 505 +#define V3D_TFU_COEF1                                  0x00428
 506 +/* TFU YUV Coefficient 2 */
 507 +#define V3D_TFU_COEF2                                  0x0042c
 508 +/* TFU YUV Coefficient 3 */
 509 +#define V3D_TFU_COEF3                                  0x00430
 510 +
 511 +#define V3D_TFU_CRC                                    0x00434
 512 +
 513  /* Per-MMU registers. */
 514
 515  #define V3D_MMUC_CONTROL                               0x01000
 516 --- a/drivers/gpu/drm/v3d/v3d_sched.c
 517 +++ b/drivers/gpu/drm/v3d/v3d_sched.c
 518 @@ -30,6 +30,12 @@ to_v3d_job(struct drm_sched_job *sched_j
 519         return container_of(sched_job, struct v3d_job, base);
 520  }
 521
 522 +static struct v3d_tfu_job *
 523 +to_tfu_job(struct drm_sched_job *sched_job)
 524 +{
 525 +       return container_of(sched_job, struct v3d_tfu_job, base);
 526 +}
 527 +
 528  static void
 529  v3d_job_free(struct drm_sched_job *sched_job)
 530  {
 531 @@ -38,6 +44,14 @@ v3d_job_free(struct drm_sched_job *sched
 532         v3d_exec_put(job->exec);
 533  }
 534
 535 +static void
 536 +v3d_tfu_job_free(struct drm_sched_job *sched_job)
 537 +{
 538 +       struct v3d_tfu_job *job = to_tfu_job(sched_job);
 539 +
 540 +       v3d_tfu_job_put(job);
 541 +}
 542 +
 543  /**
 544   * Returns the fences that the bin or render job depends on, one by one.
 545   * v3d_job_run() won't be called until all of them have been signaled.
 546 @@ -76,6 +90,27 @@ v3d_job_dependency(struct drm_sched_job
 547         return fence;
 548  }
 549
 550 +/**
 551 + * Returns the fences that the TFU job depends on, one by one.
 552 + * v3d_tfu_job_run() won't be called until all of them have been
 553 + * signaled.
 554 + */
 555 +static struct dma_fence *
 556 +v3d_tfu_job_dependency(struct drm_sched_job *sched_job,
 557 +                      struct drm_sched_entity *s_entity)
 558 +{
 559 +       struct v3d_tfu_job *job = to_tfu_job(sched_job);
 560 +       struct dma_fence *fence;
 561 +
 562 +       fence = job->in_fence;
 563 +       if (fence) {
 564 +               job->in_fence = NULL;
 565 +               return fence;
 566 +       }
 567 +
 568 +       return NULL;
 569 +}
 570 +
 571  static struct dma_fence *v3d_job_run(struct drm_sched_job *sched_job)
 572  {
 573         struct v3d_job *job = to_v3d_job(sched_job);
 574 @@ -147,31 +182,47 @@ static struct dma_fence *v3d_job_run(str
 575         return fence;
 576  }
 577
 578 -static void
 579 -v3d_job_timedout(struct drm_sched_job *sched_job)
 580 +static struct dma_fence *
 581 +v3d_tfu_job_run(struct drm_sched_job *sched_job)
 582  {
 583 -       struct v3d_job *job = to_v3d_job(sched_job);
 584 -       struct v3d_exec_info *exec = job->exec;
 585 -       struct v3d_dev *v3d = exec->v3d;
 586 -       enum v3d_queue job_q = job == &exec->bin ? V3D_BIN : V3D_RENDER;
 587 -       enum v3d_queue q;
 588 -       u32 ctca = V3D_CORE_READ(0, V3D_CLE_CTNCA(job_q));
 589 -       u32 ctra = V3D_CORE_READ(0, V3D_CLE_CTNRA(job_q));
 590 +       struct v3d_tfu_job *job = to_tfu_job(sched_job);
 591 +       struct v3d_dev *v3d = job->v3d;
 592 +       struct drm_device *dev = &v3d->drm;
 593 +       struct dma_fence *fence;
 594
 595 -       /* If the current address or return address have changed, then
 596 -        * the GPU has probably made progress and we should delay the
 597 -        * reset.  This could fail if the GPU got in an infinite loop
 598 -        * in the CL, but that is pretty unlikely outside of an i-g-t
 599 -        * testcase.
 600 -        */
 601 -       if (job->timedout_ctca != ctca || job->timedout_ctra != ctra) {
 602 -               job->timedout_ctca = ctca;
 603 -               job->timedout_ctra = ctra;
 604 +       fence = v3d_fence_create(v3d, V3D_TFU);
 605 +       if (IS_ERR(fence))
 606 +               return NULL;
 607
 608 -               schedule_delayed_work(&job->base.work_tdr,
 609 -                                     job->base.sched->timeout);
 610 -               return;
 611 +       v3d->tfu_job = job;
 612 +       if (job->done_fence)
 613 +               dma_fence_put(job->done_fence);
 614 +       job->done_fence = dma_fence_get(fence);
 615 +
 616 +       trace_v3d_submit_tfu(dev, to_v3d_fence(fence)->seqno);
 617 +
 618 +       V3D_WRITE(V3D_TFU_IIA, job->args.iia);
 619 +       V3D_WRITE(V3D_TFU_IIS, job->args.iis);
 620 +       V3D_WRITE(V3D_TFU_ICA, job->args.ica);
 621 +       V3D_WRITE(V3D_TFU_IUA, job->args.iua);
 622 +       V3D_WRITE(V3D_TFU_IOA, job->args.ioa);
 623 +       V3D_WRITE(V3D_TFU_IOS, job->args.ios);
 624 +       V3D_WRITE(V3D_TFU_COEF0, job->args.coef[0]);
 625 +       if (job->args.coef[0] & V3D_TFU_COEF0_USECOEF) {
 626 +               V3D_WRITE(V3D_TFU_COEF1, job->args.coef[1]);
 627 +               V3D_WRITE(V3D_TFU_COEF2, job->args.coef[2]);
 628 +               V3D_WRITE(V3D_TFU_COEF3, job->args.coef[3]);
 629         }
 630 +       /* ICFG kicks off the job. */
 631 +       V3D_WRITE(V3D_TFU_ICFG, job->args.icfg | V3D_TFU_ICFG_IOC);
 632 +
 633 +       return fence;
 634 +}
 635 +
 636 +static void
 637 +v3d_gpu_reset_for_timeout(struct v3d_dev *v3d, struct drm_sched_job *sched_job)
 638 +{
 639 +       enum v3d_queue q;
 640
 641         mutex_lock(&v3d->reset_lock);
 642
 643 @@ -196,6 +247,41 @@ v3d_job_timedout(struct drm_sched_job *s
 644         mutex_unlock(&v3d->reset_lock);
 645  }
 646
 647 +static void
 648 +v3d_job_timedout(struct drm_sched_job *sched_job)
 649 +{
 650 +       struct v3d_job *job = to_v3d_job(sched_job);
 651 +       struct v3d_exec_info *exec = job->exec;
 652 +       struct v3d_dev *v3d = exec->v3d;
 653 +       enum v3d_queue job_q = job == &exec->bin ? V3D_BIN : V3D_RENDER;
 654 +       u32 ctca = V3D_CORE_READ(0, V3D_CLE_CTNCA(job_q));
 655 +       u32 ctra = V3D_CORE_READ(0, V3D_CLE_CTNRA(job_q));
 656 +
 657 +       /* If the current address or return address have changed, then
 658 +        * the GPU has probably made progress and we should delay the
 659 +        * reset.  This could fail if the GPU got in an infinite loop
 660 +        * in the CL, but that is pretty unlikely outside of an i-g-t
 661 +        * testcase.
 662 +        */
 663 +       if (job->timedout_ctca != ctca || job->timedout_ctra != ctra) {
 664 +               job->timedout_ctca = ctca;
 665 +               job->timedout_ctra = ctra;
 666 +               schedule_delayed_work(&job->base.work_tdr,
 667 +                                     job->base.sched->timeout);
 668 +               return;
 669 +       }
 670 +
 671 +       v3d_gpu_reset_for_timeout(v3d, sched_job);
 672 +}
 673 +
 674 +static void
 675 +v3d_tfu_job_timedout(struct drm_sched_job *sched_job)
 676 +{
 677 +       struct v3d_tfu_job *job = to_tfu_job(sched_job);
 678 +
 679 +       v3d_gpu_reset_for_timeout(job->v3d, sched_job);
 680 +}
 681 +
 682  static const struct drm_sched_backend_ops v3d_sched_ops = {
 683         .dependency = v3d_job_dependency,
 684         .run_job = v3d_job_run,
 685 @@ -203,6 +289,13 @@ static const struct drm_sched_backend_op
 686         .free_job = v3d_job_free
 687  };
 688
 689 +static const struct drm_sched_backend_ops v3d_tfu_sched_ops = {
 690 +       .dependency = v3d_tfu_job_dependency,
 691 +       .run_job = v3d_tfu_job_run,
 692 +       .timedout_job = v3d_tfu_job_timedout,
 693 +       .free_job = v3d_tfu_job_free
 694 +};
 695 +
 696  int
 697  v3d_sched_init(struct v3d_dev *v3d)
 698  {
 699 @@ -232,6 +325,19 @@ v3d_sched_init(struct v3d_dev *v3d)
 700                 drm_sched_fini(&v3d->queue[V3D_BIN].sched);
 701                 return ret;
 702         }
 703 +
 704 +       ret = drm_sched_init(&v3d->queue[V3D_TFU].sched,
 705 +                            &v3d_tfu_sched_ops,
 706 +                            hw_jobs_limit, job_hang_limit,
 707 +                            msecs_to_jiffies(hang_limit_ms),
 708 +                            "v3d_tfu");
 709 +       if (ret) {
 710 +               dev_err(v3d->dev, "Failed to create TFU scheduler: %d.",
 711 +                       ret);
 712 +               drm_sched_fini(&v3d->queue[V3D_RENDER].sched);
 713 +               drm_sched_fini(&v3d->queue[V3D_BIN].sched);
 714 +               return ret;
 715 +       }
 716
 717         return 0;
 718  }
 719 --- a/drivers/gpu/drm/v3d/v3d_trace.h
 720 +++ b/drivers/gpu/drm/v3d/v3d_trace.h
 721 @@ -42,6 +42,26 @@ TRACE_EVENT(v3d_submit_cl,
 722                       __entry->ctnqea)
 723  );
 724
 725 +TRACE_EVENT(v3d_submit_tfu,
 726 +           TP_PROTO(struct drm_device *dev,
 727 +                    uint64_t seqno),
 728 +           TP_ARGS(dev, seqno),
 729 +
 730 +           TP_STRUCT__entry(
 731 +                            __field(u32, dev)
 732 +                            __field(u64, seqno)
 733 +                            ),
 734 +
 735 +           TP_fast_assign(
 736 +                          __entry->dev = dev->primary->index;
 737 +                          __entry->seqno = seqno;
 738 +                          ),
 739 +
 740 +           TP_printk("dev=%u, seqno=%llu",
 741 +                     __entry->dev,
 742 +                     __entry->seqno)
 743 +);
 744 +
 745  TRACE_EVENT(v3d_reset_begin,
 746             TP_PROTO(struct drm_device *dev),
 747             TP_ARGS(dev),
 748 --- a/include/uapi/drm/v3d_drm.h
 749 +++ b/include/uapi/drm/v3d_drm.h
 750 @@ -36,6 +36,7 @@ extern "C" {
 751  #define DRM_V3D_MMAP_BO                           0x03
 752  #define DRM_V3D_GET_PARAM                         0x04
 753  #define DRM_V3D_GET_BO_OFFSET                     0x05
 754 +#define DRM_V3D_SUBMIT_TFU                        0x06
 755
 756  #define DRM_IOCTL_V3D_SUBMIT_CL           DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_SUBMIT_CL, struct drm_v3d_submit_cl)
 757  #define DRM_IOCTL_V3D_WAIT_BO             DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_WAIT_BO, struct drm_v3d_wait_bo)
 758 @@ -43,6 +44,7 @@ extern "C" {
 759  #define DRM_IOCTL_V3D_MMAP_BO             DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_MMAP_BO, struct drm_v3d_mmap_bo)
 760  #define DRM_IOCTL_V3D_GET_PARAM           DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_GET_PARAM, struct drm_v3d_get_param)
 761  #define DRM_IOCTL_V3D_GET_BO_OFFSET       DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_GET_BO_OFFSET, struct drm_v3d_get_bo_offset)
 762 +#define DRM_IOCTL_V3D_SUBMIT_TFU          DRM_IOW(DRM_COMMAND_BASE + DRM_V3D_SUBMIT_TFU, struct drm_v3d_submit_tfu)
 763
 764  /**
 765   * struct drm_v3d_submit_cl - ioctl argument for submitting commands to the 3D
 766 @@ -169,6 +171,7 @@ enum drm_v3d_param {
 767         DRM_V3D_PARAM_V3D_CORE0_IDENT0,
 768         DRM_V3D_PARAM_V3D_CORE0_IDENT1,
 769         DRM_V3D_PARAM_V3D_CORE0_IDENT2,
 770 +       DRM_V3D_PARAM_SUPPORTS_TFU,
 771  };
 772
 773  struct drm_v3d_get_param {
 774 @@ -187,6 +190,28 @@ struct drm_v3d_get_bo_offset {
 775         __u32 offset;
 776  };
 777
 778 +struct drm_v3d_submit_tfu {
 779 +       __u32 icfg;
 780 +       __u32 iia;
 781 +       __u32 iis;
 782 +       __u32 ica;
 783 +       __u32 iua;
 784 +       __u32 ioa;
 785 +       __u32 ios;
 786 +       __u32 coef[4];
 787 +       /* First handle is the output BO, following are other inputs.
 788 +        * 0 for unused.
 789 +        */
 790 +       __u32 bo_handles[4];
 791 +       /* sync object to block on before running the TFU job.  Each TFU
 792 +        * job will execute in the order submitted to its FD.  Synchronization
 793 +        * against rendering jobs requires using sync objects.
 794 +        */
 795 +       __u32 in_sync;
 796 +       /* Sync object to signal when the TFU job is done. */
 797 +       __u32 out_sync;
 798 +};
 799 +
 800  #if defined(__cplusplus)
 801  }
 802  #endif