1 From 7713f79b0a5473eb0b8456d36b99ae00815dd8a1 Mon Sep 17 00:00:00 2001
2 From: Eric Anholt <eric@anholt.net>
3 Date: Wed, 27 Mar 2019 17:44:40 -0700
4 Subject: [PATCH 586/806] drm/v3d: Add missing implicit synchronization.
6 It is the expectation of existing userspace (X11 + Mesa, in
7 particular) that jobs submitted to the kernel against a shared BO will
8 get implicitly synchronized by their submission order. If we want to
9 allow clever userspace to disable implicit synchronization, we should
10 do that under its own submit flag (as amdgpu and lima do).
12 Note that we currently implicitly sync only for the rendering pass,
13 not binning -- if you texture-from-pixmap in the binning vertex shader
14 (vertex coordinate generation), you'll miss out on synchronization.
16 Fixes flickering when multiple clients are running in parallel,
17 particularly GL apps and compositors.
19 Signed-off-by: Eric Anholt <eric@anholt.net>
21 drivers/gpu/drm/v3d/v3d_drv.h | 10 +---
22 drivers/gpu/drm/v3d/v3d_gem.c | 98 ++++++++++++++++++++++++++++++---
23 drivers/gpu/drm/v3d/v3d_sched.c | 45 ++-------------
24 3 files changed, 96 insertions(+), 57 deletions(-)
26 --- a/drivers/gpu/drm/v3d/v3d_drv.h
27 +++ b/drivers/gpu/drm/v3d/v3d_drv.h
28 @@ -186,8 +186,9 @@ struct v3d_job {
32 - /* An optional fence userspace can pass in for the job to depend on. */
33 - struct dma_fence *in_fence;
34 + struct dma_fence **deps;
38 /* v3d fence to be signaled by IRQ handler when the job is complete. */
39 struct dma_fence *irq_fence;
40 @@ -219,11 +220,6 @@ struct v3d_bin_job {
41 struct v3d_render_job {
44 - /* Optional fence for the binner, to depend on before starting
47 - struct dma_fence *bin_done_fence;
49 /* GPU virtual addresses of the start/end of the CL job. */
52 --- a/drivers/gpu/drm/v3d/v3d_gem.c
53 +++ b/drivers/gpu/drm/v3d/v3d_gem.c
54 @@ -218,6 +218,71 @@ v3d_unlock_bo_reservations(struct v3d_bo
55 ww_acquire_fini(acquire_ctx);
59 +v3d_add_dep(struct v3d_job *job, struct dma_fence *fence)
64 + if (job->deps_size == job->deps_count) {
65 + int new_deps_size = max(job->deps_size * 2, 4);
66 + struct dma_fence **new_deps =
67 + krealloc(job->deps, new_deps_size * sizeof(*new_deps),
70 + dma_fence_put(fence);
74 + job->deps = new_deps;
75 + job->deps_size = new_deps_size;
78 + job->deps[job->deps_count++] = fence;
84 + * Adds the required implicit fences before executing the job
86 + * Userspace (X11 + Mesa) requires that a job submitted against a shared BO
87 + * from one fd will implicitly synchronize against previous jobs submitted
88 + * against that BO from other fds.
90 + * Currently we don't bother trying to track the shared BOs, and instead just
91 + * sync everything. However, our synchronization is only for the render pass
92 + * -- the binning stage (VS coordinate calculations) ignores implicit sync,
93 + * since using shared buffers for texture coordinates seems unlikely, and
94 + * implicitly syncing them would break bin/render parallelism. If we want to
95 + * fix that, we should introduce a flag when VS texturing has been used in the
96 + * binning stage, or a set of flags for which BOs are sampled during binning.
99 +v3d_add_implicit_fences(struct v3d_job *job, struct v3d_bo *bo)
101 + int i, ret, nr_fences;
102 + struct dma_fence **fences;
104 + ret = reservation_object_get_fences_rcu(bo->resv, NULL,
105 + &nr_fences, &fences);
106 + if (ret || !nr_fences)
109 + for (i = 0; i < nr_fences; i++) {
110 + ret = v3d_add_dep(job, fences[i]);
115 + /* Free any remaining fences after error. */
116 + for (; i < nr_fences; i++)
117 + dma_fence_put(fences[i]);
123 /* Takes the reservation lock on all the BOs being referenced, so that
124 * at queue submit time we can update the reservations.
126 @@ -226,10 +291,11 @@ v3d_unlock_bo_reservations(struct v3d_bo
127 * to v3d, so we don't attach dma-buf fences to them.
130 -v3d_lock_bo_reservations(struct v3d_bo **bos,
132 +v3d_lock_bo_reservations(struct v3d_job *job,
133 struct ww_acquire_ctx *acquire_ctx)
135 + struct v3d_bo **bos = job->bo;
136 + int bo_count = job->bo_count;
137 int contended_lock = -1;
140 @@ -281,6 +347,13 @@ retry:
141 * before we commit the CL to the hardware.
143 for (i = 0; i < bo_count; i++) {
144 + ret = v3d_add_implicit_fences(job, bos[i]);
146 + v3d_unlock_bo_reservations(bos, bo_count,
151 ret = reservation_object_reserve_shared(bos[i]->resv);
153 v3d_unlock_bo_reservations(bos, bo_count,
154 @@ -383,7 +456,10 @@ v3d_job_free(struct kref *ref)
158 - dma_fence_put(job->in_fence);
159 + for (i = 0; i < job->deps_count; i++)
160 + dma_fence_put(job->deps[i]);
163 dma_fence_put(job->irq_fence);
164 dma_fence_put(job->done_fence);
166 @@ -464,15 +540,20 @@ v3d_job_init(struct v3d_dev *v3d, struct
167 struct v3d_job *job, void (*free)(struct kref *ref),
170 + struct dma_fence *in_fence = NULL;
176 - ret = drm_syncobj_find_fence(file_priv, in_sync, 0, &job->in_fence);
177 + ret = drm_syncobj_find_fence(file_priv, in_sync, 0, &in_fence);
181 + ret = v3d_add_dep(job, in_fence);
185 kref_init(&job->refcount);
188 @@ -590,8 +671,7 @@ v3d_submit_cl_ioctl(struct drm_device *d
192 - ret = v3d_lock_bo_reservations(render->base.bo, render->base.bo_count,
194 + ret = v3d_lock_bo_reservations(&render->base, &acquire_ctx);
198 @@ -601,7 +681,8 @@ v3d_submit_cl_ioctl(struct drm_device *d
202 - render->bin_done_fence = dma_fence_get(bin->base.done_fence);
203 + ret = v3d_add_dep(&render->base,
204 + dma_fence_get(bin->base.done_fence));
207 ret = v3d_push_job(v3d_priv, &render->base, V3D_RENDER);
208 @@ -692,8 +773,7 @@ v3d_submit_tfu_ioctl(struct drm_device *
210 spin_unlock(&file_priv->table_lock);
212 - ret = v3d_lock_bo_reservations(job->base.bo, job->base.bo_count,
214 + ret = v3d_lock_bo_reservations(&job->base, &acquire_ctx);
218 --- a/drivers/gpu/drm/v3d/v3d_sched.c
219 +++ b/drivers/gpu/drm/v3d/v3d_sched.c
220 @@ -67,47 +67,10 @@ v3d_job_dependency(struct drm_sched_job
221 struct drm_sched_entity *s_entity)
223 struct v3d_job *job = to_v3d_job(sched_job);
224 - struct dma_fence *fence;
226 - fence = job->in_fence;
228 - job->in_fence = NULL;
236 - * Returns the fences that the render job depends on, one by one.
237 - * v3d_job_run() won't be called until all of them have been signaled.
239 -static struct dma_fence *
240 -v3d_render_job_dependency(struct drm_sched_job *sched_job,
241 - struct drm_sched_entity *s_entity)
243 - struct v3d_render_job *job = to_render_job(sched_job);
244 - struct dma_fence *fence;
246 - fence = v3d_job_dependency(sched_job, s_entity);
250 - /* If we had a bin job, the render job definitely depends on
251 - * it. We first have to wait for bin to be scheduled, so that
252 - * its done_fence is created.
254 - fence = job->bin_done_fence;
256 - job->bin_done_fence = NULL;
260 - /* XXX: Wait on a fence for switching the GMP if necessary,
265 + if (!job->deps_count)
267 + return job->deps[--job->deps_count];
270 static struct dma_fence *v3d_bin_job_run(struct drm_sched_job *sched_job)
271 @@ -329,7 +292,7 @@ static const struct drm_sched_backend_op
274 static const struct drm_sched_backend_ops v3d_render_sched_ops = {
275 - .dependency = v3d_render_job_dependency,
276 + .dependency = v3d_job_dependency,
277 .run_job = v3d_render_job_run,
278 .timedout_job = v3d_render_job_timedout,
279 .free_job = v3d_job_free,