1 From 7713f79b0a5473eb0b8456d36b99ae00815dd8a1 Mon Sep 17 00:00:00 2001
2 From: Eric Anholt <eric@anholt.net>
3 Date: Wed, 27 Mar 2019 17:44:40 -0700
4 Subject: [PATCH 586/806] drm/v3d: Add missing implicit synchronization.
6 It is the expectation of existing userspace (X11 + Mesa, in
7 particular) that jobs submitted to the kernel against a shared BO will
8 get implicitly synchronized by their submission order. If we want to
9 allow clever userspace to disable implicit synchronization, we should
10 do that under its own submit flag (as amdgpu and lima do).
12 Note that we currently implicitly sync only for the rendering pass,
13 not binning -- if you texture-from-pixmap in the binning vertex shader
14 (vertex coordinate generation), you'll miss out on synchronization.
16 Fixes flickering when multiple clients are running in parallel,
17 particularly GL apps and compositors.
19 Signed-off-by: Eric Anholt <eric@anholt.net>
21 drivers/gpu/drm/v3d/v3d_drv.h | 10 +---
22 drivers/gpu/drm/v3d/v3d_gem.c | 98 ++++++++++++++++++++++++++++++---
23 drivers/gpu/drm/v3d/v3d_sched.c | 45 ++-------------
24 3 files changed, 96 insertions(+), 57 deletions(-)
26 --- a/drivers/gpu/drm/v3d/v3d_drv.h
27 +++ b/drivers/gpu/drm/v3d/v3d_drv.h
28 @@ -186,8 +186,9 @@ struct v3d_job {
32 - /* An optional fence userspace can pass in for the job to depend on. */
33 - struct dma_fence *in_fence;
34 + struct dma_fence **deps;
38 /* v3d fence to be signaled by IRQ handler when the job is complete. */
39 struct dma_fence *irq_fence;
40 @@ -219,11 +220,6 @@ struct v3d_bin_job {
41 struct v3d_render_job {
44 - /* Optional fence for the binner, to depend on before starting
47 - struct dma_fence *bin_done_fence;
49 /* GPU virtual addresses of the start/end of the CL job. */
52 --- a/drivers/gpu/drm/v3d/v3d_gem.c
53 +++ b/drivers/gpu/drm/v3d/v3d_gem.c
54 @@ -218,6 +218,71 @@ v3d_unlock_bo_reservations(struct v3d_bo
55 ww_acquire_fini(acquire_ctx);
59 +v3d_add_dep(struct v3d_job *job, struct dma_fence *fence)
64 + if (job->deps_size == job->deps_count) {
65 + int new_deps_size = max(job->deps_size * 2, 4);
66 + struct dma_fence **new_deps =
67 + krealloc(job->deps, new_deps_size * sizeof(*new_deps),
70 + dma_fence_put(fence);
74 + job->deps = new_deps;
75 + job->deps_size = new_deps_size;
78 + job->deps[job->deps_count++] = fence;
84 + * Adds the required implicit fences before executing the job
86 + * Userspace (X11 + Mesa) requires that a job submitted against a shared BO
87 + * from one fd will implicitly synchronize against previous jobs submitted
88 + * against that BO from other fds.
90 + * Currently we don't bother trying to track the shared BOs, and instead just
91 + * sync everything. However, our synchronization is only for the render pass
92 + * -- the binning stage (VS coordinate calculations) ignores implicit sync,
93 + * since using shared buffers for texture coordinates seems unlikely, and
94 + * implicitly syncing them would break bin/render parallelism. If we want to
95 + * fix that, we should introduce a flag when VS texturing has been used in the
96 + * binning stage, or a set of flags for which BOs are sampled during binning.
99 +v3d_add_implicit_fences(struct v3d_job *job, struct v3d_bo *bo)
101 + int i, ret, nr_fences;
102 + struct dma_fence **fences;
104 + ret = reservation_object_get_fences_rcu(bo->resv, NULL,
105 + &nr_fences, &fences);
106 + if (ret || !nr_fences)
109 + for (i = 0; i < nr_fences; i++) {
110 + ret = v3d_add_dep(job, fences[i]);
115 + /* Free any remaining fences after error. */
116 + for (; i < nr_fences; i++)
117 + dma_fence_put(fences[i]);
123 /* Takes the reservation lock on all the BOs being referenced, so that
124 * at queue submit time we can update the reservations.
126 @@ -226,10 +291,11 @@ v3d_unlock_bo_reservations(struct v3d_bo
127 * to v3d, so we don't attach dma-buf fences to them.
130 -v3d_lock_bo_reservations(struct v3d_bo **bos,
132 +v3d_lock_bo_reservations(struct v3d_job *job,
133 struct ww_acquire_ctx *acquire_ctx)
135 + struct v3d_bo **bos = job->bo;
136 + int bo_count = job->bo_count;
137 int contended_lock = -1;
140 @@ -281,6 +347,13 @@ retry:
141 * before we commit the CL to the hardware.
143 for (i = 0; i < bo_count; i++) {
144 + ret = v3d_add_implicit_fences(job, bos[i]);
146 + v3d_unlock_bo_reservations(bos, bo_count,
151 ret = reservation_object_reserve_shared(bos[i]->resv);
153 v3d_unlock_bo_reservations(bos, bo_count,
154 @@ -383,7 +456,10 @@ v3d_job_free(struct kref *ref)
158 - dma_fence_put(job->in_fence);
159 + for (i = 0; i < job->deps_count; i++)
160 + dma_fence_put(job->deps[i]);
163 dma_fence_put(job->irq_fence);
164 dma_fence_put(job->done_fence);
166 @@ -464,15 +540,20 @@ v3d_job_init(struct v3d_dev *v3d, struct
167 struct v3d_job *job, void (*free)(struct kref *ref),
170 + struct dma_fence *in_fence = NULL;
176 - ret = drm_syncobj_find_fence(file_priv, in_sync, 0, &job->in_fence);
177 + ret = drm_syncobj_find_fence(file_priv, in_sync, 0, &in_fence);
181 + ret = v3d_add_dep(job, in_fence);
185 kref_init(&job->refcount);
188 @@ -590,8 +671,7 @@ v3d_submit_cl_ioctl(struct drm_device *d
192 - ret = v3d_lock_bo_reservations(render->base.bo, render->base.bo_count,
194 + ret = v3d_lock_bo_reservations(&render->base, &acquire_ctx);
198 @@ -601,7 +681,8 @@ v3d_submit_cl_ioctl(struct drm_device *d
202 - render->bin_done_fence = dma_fence_get(bin->base.done_fence);
203 + ret = v3d_add_dep(&render->base,
204 + dma_fence_get(bin->base.done_fence));
207 ret = v3d_push_job(v3d_priv, &render->base, V3D_RENDER);
208 @@ -692,8 +773,7 @@ v3d_submit_tfu_ioctl(struct drm_device *
210 spin_unlock(&file_priv->table_lock);
212 - ret = v3d_lock_bo_reservations(job->base.bo, job->base.bo_count,
214 + ret = v3d_lock_bo_reservations(&job->base, &acquire_ctx);
218 --- a/drivers/gpu/drm/v3d/v3d_sched.c
219 +++ b/drivers/gpu/drm/v3d/v3d_sched.c
220 @@ -67,47 +67,10 @@ v3d_job_dependency(struct drm_sched_job
221 struct drm_sched_entity *s_entity)
223 struct v3d_job *job = to_v3d_job(sched_job);
224 - struct dma_fence *fence;
226 - fence = job->in_fence;
228 - job->in_fence = NULL;
236 - * Returns the fences that the render job depends on, one by one.
237 - * v3d_job_run() won't be called until all of them have been signaled.
239 -static struct dma_fence *
240 -v3d_render_job_dependency(struct drm_sched_job *sched_job,
241 - struct drm_sched_entity *s_entity)
243 - struct v3d_render_job *job = to_render_job(sched_job);
244 - struct dma_fence *fence;
246 - fence = v3d_job_dependency(sched_job, s_entity);
250 - /* If we had a bin job, the render job definitely depends on
251 - * it. We first have to wait for bin to be scheduled, so that
252 - * its done_fence is created.
254 - fence = job->bin_done_fence;
256 - job->bin_done_fence = NULL;
260 - /* XXX: Wait on a fence for switching the GMP if necessary,
265 + if (!job->deps_count)
267 + return job->deps[--job->deps_count];
270 static struct dma_fence *v3d_bin_job_run(struct drm_sched_job *sched_job)
271 @@ -329,7 +292,7 @@ static const struct drm_sched_backend_op
274 static const struct drm_sched_backend_ops v3d_render_sched_ops = {
275 - .dependency = v3d_render_job_dependency,
276 + .dependency = v3d_job_dependency,
277 .run_job = v3d_render_job_run,
278 .timedout_job = v3d_render_job_timedout,
279 .free_job = v3d_job_free,