brcm2708: update linux 4.4 patches to latest version
[openwrt/staging/wigyori.git] / target / linux / brcm2708 / patches-4.4 / 0512-drm-vc4-Fix-races-when-the-CS-reads-from-render-targ.patch
1 From 5e074566a22f9fd5107c2c6d4d96a1e9a477294c Mon Sep 17 00:00:00 2001
2 From: Eric Anholt <eric@anholt.net>
3 Date: Tue, 27 Sep 2016 09:03:13 -0700
4 Subject: [PATCH] drm/vc4: Fix races when the CS reads from render targets.
5
6 With the introduction of bin/render pipelining, the previous job may
7 not be completed when we start binning the next one. If the previous
8 job wrote our VBO, IB, or CS textures, then the binning stage might
9 get stale or uninitialized results.
10
11 Fixes the major rendering failure in glmark2 -b terrain.
12
13 Signed-off-by: Eric Anholt <eric@anholt.net>
14 Fixes: ca26d28bbaa3 ("drm/vc4: improve throughput by pipelining binning and rendering jobs")
15 Cc: stable@vger.kernel.org
16 ---
17 drivers/gpu/drm/vc4/vc4_drv.h | 19 ++++++++++++++++++-
18 drivers/gpu/drm/vc4/vc4_gem.c | 13 +++++++++++++
19 drivers/gpu/drm/vc4/vc4_render_cl.c | 21 +++++++++++++++++----
20 drivers/gpu/drm/vc4/vc4_validate.c | 17 ++++++++++++++---
21 4 files changed, 62 insertions(+), 8 deletions(-)
22
23 --- a/drivers/gpu/drm/vc4/vc4_drv.h
24 +++ b/drivers/gpu/drm/vc4/vc4_drv.h
25 @@ -129,9 +129,16 @@ to_vc4_dev(struct drm_device *dev)
26 struct vc4_bo {
27 struct drm_gem_cma_object base;
28
29 - /* seqno of the last job to render to this BO. */
30 + /* seqno of the last job to render using this BO. */
31 uint64_t seqno;
32
33 + /* seqno of the last job to use the RCL to write to this BO.
34 + *
35 + * Note that this doesn't include binner overflow memory
36 + * writes.
37 + */
38 + uint64_t write_seqno;
39 +
40 /* List entry for the BO's position in either
41 * vc4_exec_info->unref_list or vc4_dev->bo_cache.time_list
42 */
43 @@ -227,6 +234,9 @@ struct vc4_exec_info {
44 /* Sequence number for this bin/render job. */
45 uint64_t seqno;
46
47 + /* Latest write_seqno of any BO that binning depends on. */
48 + uint64_t bin_dep_seqno;
49 +
50 /* Last current addresses the hardware was processing when the
51 * hangcheck timer checked on us.
52 */
53 @@ -241,6 +251,13 @@ struct vc4_exec_info {
54 struct drm_gem_cma_object **bo;
55 uint32_t bo_count;
56
57 + /* List of BOs that are being written by the RCL. Other than
58 + * the binner temporary storage, this is all the BOs written
59 + * by the job.
60 + */
61 + struct drm_gem_cma_object *rcl_write_bo[4];
62 + uint32_t rcl_write_bo_count;
63 +
64 /* Pointers for our position in vc4->job_list */
65 struct list_head head;
66
67 --- a/drivers/gpu/drm/vc4/vc4_gem.c
68 +++ b/drivers/gpu/drm/vc4/vc4_gem.c
69 @@ -483,6 +483,11 @@ vc4_update_bo_seqnos(struct vc4_exec_inf
70 list_for_each_entry(bo, &exec->unref_list, unref_head) {
71 bo->seqno = seqno;
72 }
73 +
74 + for (i = 0; i < exec->rcl_write_bo_count; i++) {
75 + bo = to_vc4_bo(&exec->rcl_write_bo[i]->base);
76 + bo->write_seqno = seqno;
77 + }
78 }
79
80 /* Queues a struct vc4_exec_info for execution. If no job is
81 @@ -685,6 +690,14 @@ vc4_get_bcl(struct drm_device *dev, stru
82 goto fail;
83
84 ret = vc4_validate_shader_recs(dev, exec);
85 + if (ret)
86 + goto fail;
87 +
88 + /* Block waiting on any previous rendering into the CS's VBO,
89 + * IB, or textures, so that pixels are actually written by the
90 + * time we try to read them.
91 + */
92 + ret = vc4_wait_for_seqno(dev, exec->bin_dep_seqno, ~0ull, true);
93
94 fail:
95 kfree(temp);
96 --- a/drivers/gpu/drm/vc4/vc4_render_cl.c
97 +++ b/drivers/gpu/drm/vc4/vc4_render_cl.c
98 @@ -45,6 +45,8 @@ struct vc4_rcl_setup {
99
100 struct drm_gem_cma_object *rcl;
101 u32 next_offset;
102 +
103 + u32 next_write_bo_index;
104 };
105
106 static inline void rcl_u8(struct vc4_rcl_setup *setup, u8 val)
107 @@ -407,6 +409,8 @@ static int vc4_rcl_msaa_surface_setup(st
108 if (!*obj)
109 return -EINVAL;
110
111 + exec->rcl_write_bo[exec->rcl_write_bo_count++] = *obj;
112 +
113 if (surf->offset & 0xf) {
114 DRM_ERROR("MSAA write must be 16b aligned.\n");
115 return -EINVAL;
116 @@ -417,7 +421,8 @@ static int vc4_rcl_msaa_surface_setup(st
117
118 static int vc4_rcl_surface_setup(struct vc4_exec_info *exec,
119 struct drm_gem_cma_object **obj,
120 - struct drm_vc4_submit_rcl_surface *surf)
121 + struct drm_vc4_submit_rcl_surface *surf,
122 + bool is_write)
123 {
124 uint8_t tiling = VC4_GET_FIELD(surf->bits,
125 VC4_LOADSTORE_TILE_BUFFER_TILING);
126 @@ -440,6 +445,9 @@ static int vc4_rcl_surface_setup(struct
127 if (!*obj)
128 return -EINVAL;
129
130 + if (is_write)
131 + exec->rcl_write_bo[exec->rcl_write_bo_count++] = *obj;
132 +
133 if (surf->flags & VC4_SUBMIT_RCL_SURFACE_READ_IS_FULL_RES) {
134 if (surf == &exec->args->zs_write) {
135 DRM_ERROR("general zs write may not be a full-res.\n");
136 @@ -542,6 +550,8 @@ vc4_rcl_render_config_surface_setup(stru
137 if (!*obj)
138 return -EINVAL;
139
140 + exec->rcl_write_bo[exec->rcl_write_bo_count++] = *obj;
141 +
142 if (tiling > VC4_TILING_FORMAT_LT) {
143 DRM_ERROR("Bad tiling format\n");
144 return -EINVAL;
145 @@ -599,15 +609,18 @@ int vc4_get_rcl(struct drm_device *dev,
146 if (ret)
147 return ret;
148
149 - ret = vc4_rcl_surface_setup(exec, &setup.color_read, &args->color_read);
150 + ret = vc4_rcl_surface_setup(exec, &setup.color_read, &args->color_read,
151 + false);
152 if (ret)
153 return ret;
154
155 - ret = vc4_rcl_surface_setup(exec, &setup.zs_read, &args->zs_read);
156 + ret = vc4_rcl_surface_setup(exec, &setup.zs_read, &args->zs_read,
157 + false);
158 if (ret)
159 return ret;
160
161 - ret = vc4_rcl_surface_setup(exec, &setup.zs_write, &args->zs_write);
162 + ret = vc4_rcl_surface_setup(exec, &setup.zs_write, &args->zs_write,
163 + true);
164 if (ret)
165 return ret;
166
167 --- a/drivers/gpu/drm/vc4/vc4_validate.c
168 +++ b/drivers/gpu/drm/vc4/vc4_validate.c
169 @@ -267,6 +267,9 @@ validate_indexed_prim_list(VALIDATE_ARGS
170 if (!ib)
171 return -EINVAL;
172
173 + exec->bin_dep_seqno = max(exec->bin_dep_seqno,
174 + to_vc4_bo(&ib->base)->write_seqno);
175 +
176 if (offset > ib->base.size ||
177 (ib->base.size - offset) / index_size < length) {
178 DRM_ERROR("IB access overflow (%d + %d*%d > %zd)\n",
179 @@ -555,8 +558,7 @@ static bool
180 reloc_tex(struct vc4_exec_info *exec,
181 void *uniform_data_u,
182 struct vc4_texture_sample_info *sample,
183 - uint32_t texture_handle_index)
184 -
185 + uint32_t texture_handle_index, bool is_cs)
186 {
187 struct drm_gem_cma_object *tex;
188 uint32_t p0 = *(uint32_t *)(uniform_data_u + sample->p_offset[0]);
189 @@ -714,6 +716,11 @@ reloc_tex(struct vc4_exec_info *exec,
190
191 *validated_p0 = tex->paddr + p0;
192
193 + if (is_cs) {
194 + exec->bin_dep_seqno = max(exec->bin_dep_seqno,
195 + to_vc4_bo(&tex->base)->write_seqno);
196 + }
197 +
198 return true;
199 fail:
200 DRM_INFO("Texture p0 at %d: 0x%08x\n", sample->p_offset[0], p0);
201 @@ -835,7 +842,8 @@ validate_gl_shader_rec(struct drm_device
202 if (!reloc_tex(exec,
203 uniform_data_u,
204 &validated_shader->texture_samples[tex],
205 - texture_handles_u[tex])) {
206 + texture_handles_u[tex],
207 + i == 2)) {
208 return -EINVAL;
209 }
210 }
211 @@ -867,6 +875,9 @@ validate_gl_shader_rec(struct drm_device
212 uint32_t stride = *(uint8_t *)(pkt_u + o + 5);
213 uint32_t max_index;
214
215 + exec->bin_dep_seqno = max(exec->bin_dep_seqno,
216 + to_vc4_bo(&vbo->base)->write_seqno);
217 +
218 if (state->addr & 0x8)
219 stride |= (*(uint32_t *)(pkt_u + 100 + i * 4)) & ~0xff;
220