brcm2708: update linux 4.4 patches to latest version
diff --git a/target/linux/brcm2708/patches-4.4/0290-drm-vc4-improve-throughput-by-pipelining-binning-and.patch b/target/linux/brcm2708/patches-4.4/0290-drm-vc4-improve-throughput-by-pipelining-binning-and.patch
new file mode 100644
index 0000000..11f7bd2
--- /dev/null
+++ b/target/linux/brcm2708/patches-4.4/0290-drm-vc4-improve-throughput-by-pipelining-binning-and.patch
@@ -0,0 +1,429 @@
+From 346367864363fa323ee502d9e8fb36b964cbbdb0 Mon Sep 17 00:00:00 2001
+From: Varad Gautam <varadgautam@gmail.com>
+Date: Wed, 17 Feb 2016 19:08:21 +0530
+Subject: [PATCH 290/423] drm/vc4: improve throughput by pipelining binning and
+ rendering jobs
+
+The hardware provides us with separate threads for binning and
+rendering, and the existing model waits for them both to complete
+before submitting the next job.
+
+Splitting the binning and rendering submissions reduces idle time and
+gives us approx 20-30% speedup with some x11perf tests such as -line10
+and -tilerect1.  Improves openarena performance by 1.01897% +/-
+0.247857% (n=16).
+
+Thanks to anholt for suggesting this.
+
+v2: Rebase on the spurious resets fix (change by anholt).
+
+Signed-off-by: Varad Gautam <varadgautam@gmail.com>
+Reviewed-by: Eric Anholt <eric@anholt.net>
+Signed-off-by: Eric Anholt <eric@anholt.net>
+(cherry picked from commit ca26d28bbaa39f31d5e7e4812603b015c8d54207)
+---
+ drivers/gpu/drm/vc4/vc4_drv.h |  37 +++++++++----
+ drivers/gpu/drm/vc4/vc4_gem.c | 123 ++++++++++++++++++++++++++++++------------
+ drivers/gpu/drm/vc4/vc4_irq.c |  58 ++++++++++++++++----
+ 3 files changed, 166 insertions(+), 52 deletions(-)
+
+--- a/drivers/gpu/drm/vc4/vc4_drv.h
++++ b/drivers/gpu/drm/vc4/vc4_drv.h
+@@ -53,7 +53,7 @@ struct vc4_dev {
+       /* Protects bo_cache and the BO stats. */
+       struct mutex bo_lock;
+-      /* Sequence number for the last job queued in job_list.
++      /* Sequence number for the last job queued in bin_job_list.
+        * Starts at 0 (no jobs emitted).
+        */
+       uint64_t emit_seqno;
+@@ -63,11 +63,19 @@ struct vc4_dev {
+        */
+       uint64_t finished_seqno;
+-      /* List of all struct vc4_exec_info for jobs to be executed.
+-       * The first job in the list is the one currently programmed
+-       * into ct0ca/ct1ca for execution.
++      /* List of all struct vc4_exec_info for jobs to be executed in
++       * the binner.  The first job in the list is the one currently
++       * programmed into ct0ca for execution.
++       */
++      struct list_head bin_job_list;
++
++      /* List of all struct vc4_exec_info for jobs that have
++       * completed binning and are ready for rendering.  The first
++       * job in the list is the one currently programmed into ct1ca
++       * for execution.
+        */
+-      struct list_head job_list;
++      struct list_head render_job_list;
++
+       /* List of the finished vc4_exec_infos waiting to be freed by
+        * job_done_work.
+        */
+@@ -291,11 +299,20 @@ struct vc4_exec_info {
+ };
+ static inline struct vc4_exec_info *
+-vc4_first_job(struct vc4_dev *vc4)
++vc4_first_bin_job(struct vc4_dev *vc4)
++{
++      if (list_empty(&vc4->bin_job_list))
++              return NULL;
++      return list_first_entry(&vc4->bin_job_list, struct vc4_exec_info, head);
++}
++
++static inline struct vc4_exec_info *
++vc4_first_render_job(struct vc4_dev *vc4)
+ {
+-      if (list_empty(&vc4->job_list))
++      if (list_empty(&vc4->render_job_list))
+               return NULL;
+-      return list_first_entry(&vc4->job_list, struct vc4_exec_info, head);
++      return list_first_entry(&vc4->render_job_list,
++                              struct vc4_exec_info, head);
+ }
+ /**
+@@ -410,7 +427,9 @@ int vc4_wait_seqno_ioctl(struct drm_devi
+                        struct drm_file *file_priv);
+ int vc4_wait_bo_ioctl(struct drm_device *dev, void *data,
+                     struct drm_file *file_priv);
+-void vc4_submit_next_job(struct drm_device *dev);
++void vc4_submit_next_bin_job(struct drm_device *dev);
++void vc4_submit_next_render_job(struct drm_device *dev);
++void vc4_move_job_to_render(struct drm_device *dev, struct vc4_exec_info *exec);
+ int vc4_wait_for_seqno(struct drm_device *dev, uint64_t seqno,
+                      uint64_t timeout_ns, bool interruptible);
+ void vc4_job_handle_completed(struct vc4_dev *vc4);
+--- a/drivers/gpu/drm/vc4/vc4_gem.c
++++ b/drivers/gpu/drm/vc4/vc4_gem.c
+@@ -154,10 +154,10 @@ vc4_save_hang_state(struct drm_device *d
+       struct vc4_dev *vc4 = to_vc4_dev(dev);
+       struct drm_vc4_get_hang_state *state;
+       struct vc4_hang_state *kernel_state;
+-      struct vc4_exec_info *exec;
++      struct vc4_exec_info *exec[2];
+       struct vc4_bo *bo;
+       unsigned long irqflags;
+-      unsigned int i, unref_list_count;
++      unsigned int i, j, unref_list_count, prev_idx;
+       kernel_state = kcalloc(1, sizeof(*kernel_state), GFP_KERNEL);
+       if (!kernel_state)
+@@ -166,37 +166,55 @@ vc4_save_hang_state(struct drm_device *d
+       state = &kernel_state->user_state;
+       spin_lock_irqsave(&vc4->job_lock, irqflags);
+-      exec = vc4_first_job(vc4);
+-      if (!exec) {
++      exec[0] = vc4_first_bin_job(vc4);
++      exec[1] = vc4_first_render_job(vc4);
++      if (!exec[0] && !exec[1]) {
+               spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+               return;
+       }
+-      unref_list_count = 0;
+-      list_for_each_entry(bo, &exec->unref_list, unref_head)
+-              unref_list_count++;
+-
+-      state->bo_count = exec->bo_count + unref_list_count;
+-      kernel_state->bo = kcalloc(state->bo_count, sizeof(*kernel_state->bo),
+-                                 GFP_ATOMIC);
++      /* Get the bos from both binner and renderer into hang state. */
++      state->bo_count = 0;
++      for (i = 0; i < 2; i++) {
++              if (!exec[i])
++                      continue;
++
++              unref_list_count = 0;
++              list_for_each_entry(bo, &exec[i]->unref_list, unref_head)
++                      unref_list_count++;
++              state->bo_count += exec[i]->bo_count + unref_list_count;
++      }
++
++      kernel_state->bo = kcalloc(state->bo_count,
++                                 sizeof(*kernel_state->bo), GFP_ATOMIC);
++
+       if (!kernel_state->bo) {
+               spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+               return;
+       }
+-      for (i = 0; i < exec->bo_count; i++) {
+-              drm_gem_object_reference(&exec->bo[i]->base);
+-              kernel_state->bo[i] = &exec->bo[i]->base;
+-      }
++      prev_idx = 0;
++      for (i = 0; i < 2; i++) {
++              if (!exec[i])
++                      continue;
++
++              for (j = 0; j < exec[i]->bo_count; j++) {
++                      drm_gem_object_reference(&exec[i]->bo[j]->base);
++                      kernel_state->bo[j + prev_idx] = &exec[i]->bo[j]->base;
++              }
+-      list_for_each_entry(bo, &exec->unref_list, unref_head) {
+-              drm_gem_object_reference(&bo->base.base);
+-              kernel_state->bo[i] = &bo->base.base;
+-              i++;
++              list_for_each_entry(bo, &exec[i]->unref_list, unref_head) {
++                      drm_gem_object_reference(&bo->base.base);
++                      kernel_state->bo[j + prev_idx] = &bo->base.base;
++                      j++;
++              }
++              prev_idx = j + 1;
+       }
+-      state->start_bin = exec->ct0ca;
+-      state->start_render = exec->ct1ca;
++      if (exec[0])
++              state->start_bin = exec[0]->ct0ca;
++      if (exec[1])
++              state->start_render = exec[1]->ct1ca;
+       spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+@@ -272,13 +290,15 @@ vc4_hangcheck_elapsed(unsigned long data
+       struct vc4_dev *vc4 = to_vc4_dev(dev);
+       uint32_t ct0ca, ct1ca;
+       unsigned long irqflags;
+-      struct vc4_exec_info *exec;
++      struct vc4_exec_info *bin_exec, *render_exec;
+       spin_lock_irqsave(&vc4->job_lock, irqflags);
+-      exec = vc4_first_job(vc4);
++
++      bin_exec = vc4_first_bin_job(vc4);
++      render_exec = vc4_first_render_job(vc4);
+       /* If idle, we can stop watching for hangs. */
+-      if (!exec) {
++      if (!bin_exec && !render_exec) {
+               spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+               return;
+       }
+@@ -289,9 +309,12 @@ vc4_hangcheck_elapsed(unsigned long data
+       /* If we've made any progress in execution, rearm the timer
+        * and wait.
+        */
+-      if (ct0ca != exec->last_ct0ca || ct1ca != exec->last_ct1ca) {
+-              exec->last_ct0ca = ct0ca;
+-              exec->last_ct1ca = ct1ca;
++      if ((bin_exec && ct0ca != bin_exec->last_ct0ca) ||
++          (render_exec && ct1ca != render_exec->last_ct1ca)) {
++              if (bin_exec)
++                      bin_exec->last_ct0ca = ct0ca;
++              if (render_exec)
++                      render_exec->last_ct1ca = ct1ca;
+               spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+               vc4_queue_hangcheck(dev);
+               return;
+@@ -391,11 +414,13 @@ vc4_flush_caches(struct drm_device *dev)
+  * The job_lock should be held during this.
+  */
+ void
+-vc4_submit_next_job(struct drm_device *dev)
++vc4_submit_next_bin_job(struct drm_device *dev)
+ {
+       struct vc4_dev *vc4 = to_vc4_dev(dev);
+-      struct vc4_exec_info *exec = vc4_first_job(vc4);
++      struct vc4_exec_info *exec;
++again:
++      exec = vc4_first_bin_job(vc4);
+       if (!exec)
+               return;
+@@ -405,11 +430,40 @@ vc4_submit_next_job(struct drm_device *d
+       V3D_WRITE(V3D_BPOA, 0);
+       V3D_WRITE(V3D_BPOS, 0);
+-      if (exec->ct0ca != exec->ct0ea)
++      /* Either put the job in the binner if it uses the binner, or
++       * immediately move it to the to-be-rendered queue.
++       */
++      if (exec->ct0ca != exec->ct0ea) {
+               submit_cl(dev, 0, exec->ct0ca, exec->ct0ea);
++      } else {
++              vc4_move_job_to_render(dev, exec);
++              goto again;
++      }
++}
++
++void
++vc4_submit_next_render_job(struct drm_device *dev)
++{
++      struct vc4_dev *vc4 = to_vc4_dev(dev);
++      struct vc4_exec_info *exec = vc4_first_render_job(vc4);
++
++      if (!exec)
++              return;
++
+       submit_cl(dev, 1, exec->ct1ca, exec->ct1ea);
+ }
++void
++vc4_move_job_to_render(struct drm_device *dev, struct vc4_exec_info *exec)
++{
++      struct vc4_dev *vc4 = to_vc4_dev(dev);
++      bool was_empty = list_empty(&vc4->render_job_list);
++
++      list_move_tail(&exec->head, &vc4->render_job_list);
++      if (was_empty)
++              vc4_submit_next_render_job(dev);
++}
++
+ static void
+ vc4_update_bo_seqnos(struct vc4_exec_info *exec, uint64_t seqno)
+ {
+@@ -448,14 +502,14 @@ vc4_queue_submit(struct drm_device *dev,
+       exec->seqno = seqno;
+       vc4_update_bo_seqnos(exec, seqno);
+-      list_add_tail(&exec->head, &vc4->job_list);
++      list_add_tail(&exec->head, &vc4->bin_job_list);
+       /* If no job was executing, kick ours off.  Otherwise, it'll
+-       * get started when the previous job's frame done interrupt
++       * get started when the previous job's flush done interrupt
+        * occurs.
+        */
+-      if (vc4_first_job(vc4) == exec) {
+-              vc4_submit_next_job(dev);
++      if (vc4_first_bin_job(vc4) == exec) {
++              vc4_submit_next_bin_job(dev);
+               vc4_queue_hangcheck(dev);
+       }
+@@ -849,7 +903,8 @@ vc4_gem_init(struct drm_device *dev)
+ {
+       struct vc4_dev *vc4 = to_vc4_dev(dev);
+-      INIT_LIST_HEAD(&vc4->job_list);
++      INIT_LIST_HEAD(&vc4->bin_job_list);
++      INIT_LIST_HEAD(&vc4->render_job_list);
+       INIT_LIST_HEAD(&vc4->job_done_list);
+       INIT_LIST_HEAD(&vc4->seqno_cb_list);
+       spin_lock_init(&vc4->job_lock);
+--- a/drivers/gpu/drm/vc4/vc4_irq.c
++++ b/drivers/gpu/drm/vc4/vc4_irq.c
+@@ -30,6 +30,10 @@
+  * disables that specific interrupt, and 0s written are ignored
+  * (reading either one returns the set of enabled interrupts).
+  *
++ * When we take a binning flush done interrupt, we need to submit the
++ * next frame for binning and move the finished frame to the render
++ * thread.
++ *
+  * When we take a render frame interrupt, we need to wake the
+  * processes waiting for some frame to be done, and get the next frame
+  * submitted ASAP (so the hardware doesn't sit idle when there's work
+@@ -44,6 +48,7 @@
+ #include "vc4_regs.h"
+ #define V3D_DRIVER_IRQS (V3D_INT_OUTOMEM | \
++                       V3D_INT_FLDONE | \
+                        V3D_INT_FRDONE)
+ DECLARE_WAIT_QUEUE_HEAD(render_wait);
+@@ -77,7 +82,7 @@ vc4_overflow_mem_work(struct work_struct
+               unsigned long irqflags;
+               spin_lock_irqsave(&vc4->job_lock, irqflags);
+-              current_exec = vc4_first_job(vc4);
++              current_exec = vc4_first_bin_job(vc4);
+               if (current_exec) {
+                       vc4->overflow_mem->seqno = vc4->finished_seqno + 1;
+                       list_add_tail(&vc4->overflow_mem->unref_head,
+@@ -98,17 +103,43 @@ vc4_overflow_mem_work(struct work_struct
+ }
+ static void
+-vc4_irq_finish_job(struct drm_device *dev)
++vc4_irq_finish_bin_job(struct drm_device *dev)
++{
++      struct vc4_dev *vc4 = to_vc4_dev(dev);
++      struct vc4_exec_info *exec = vc4_first_bin_job(vc4);
++
++      if (!exec)
++              return;
++
++      vc4_move_job_to_render(dev, exec);
++      vc4_submit_next_bin_job(dev);
++}
++
++static void
++vc4_cancel_bin_job(struct drm_device *dev)
++{
++      struct vc4_dev *vc4 = to_vc4_dev(dev);
++      struct vc4_exec_info *exec = vc4_first_bin_job(vc4);
++
++      if (!exec)
++              return;
++
++      list_move_tail(&exec->head, &vc4->bin_job_list);
++      vc4_submit_next_bin_job(dev);
++}
++
++static void
++vc4_irq_finish_render_job(struct drm_device *dev)
+ {
+       struct vc4_dev *vc4 = to_vc4_dev(dev);
+-      struct vc4_exec_info *exec = vc4_first_job(vc4);
++      struct vc4_exec_info *exec = vc4_first_render_job(vc4);
+       if (!exec)
+               return;
+       vc4->finished_seqno++;
+       list_move_tail(&exec->head, &vc4->job_done_list);
+-      vc4_submit_next_job(dev);
++      vc4_submit_next_render_job(dev);
+       wake_up_all(&vc4->job_wait_queue);
+       schedule_work(&vc4->job_done_work);
+@@ -125,9 +156,10 @@ vc4_irq(int irq, void *arg)
+       barrier();
+       intctl = V3D_READ(V3D_INTCTL);
+-      /* Acknowledge the interrupts we're handling here. The render
+-       * frame done interrupt will be cleared, while OUTOMEM will
+-       * stay high until the underlying cause is cleared.
++      /* Acknowledge the interrupts we're handling here. The binner
++       * last flush / render frame done interrupt will be cleared,
++       * while OUTOMEM will stay high until the underlying cause is
++       * cleared.
+        */
+       V3D_WRITE(V3D_INTCTL, intctl);
+@@ -138,9 +170,16 @@ vc4_irq(int irq, void *arg)
+               status = IRQ_HANDLED;
+       }
++      if (intctl & V3D_INT_FLDONE) {
++              spin_lock(&vc4->job_lock);
++              vc4_irq_finish_bin_job(dev);
++              spin_unlock(&vc4->job_lock);
++              status = IRQ_HANDLED;
++      }
++
+       if (intctl & V3D_INT_FRDONE) {
+               spin_lock(&vc4->job_lock);
+-              vc4_irq_finish_job(dev);
++              vc4_irq_finish_render_job(dev);
+               spin_unlock(&vc4->job_lock);
+               status = IRQ_HANDLED;
+       }
+@@ -205,6 +244,7 @@ void vc4_irq_reset(struct drm_device *de
+       V3D_WRITE(V3D_INTENA, V3D_DRIVER_IRQS);
+       spin_lock_irqsave(&vc4->job_lock, irqflags);
+-      vc4_irq_finish_job(dev);
++      vc4_cancel_bin_job(dev);
++      vc4_irq_finish_render_job(dev);
+       spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+ }
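
A minimal standalone sketch of the two-queue pipeline the patch above introduces, assuming equal binning and rendering cost per job; only the names bin_job_list and render_job_list come from the patch, every other name is invented for the illustration, and locking, register programming, and interrupt handling are elided:

/*
 * Illustration only, not driver code.  Before the patch, one queue held
 * each job until both hardware threads finished it; after it, jobs flow
 * through bin_job_list and then render_job_list, so job N+1 can be
 * binned while job N is rendered.  A printed index of -1 means that
 * thread is idle for the tick.
 */
#include <stdio.h>

#define NJOBS 4

enum stage { TODO, BINNED, DONE };

int main(void)
{
	enum stage job[NJOBS] = { TODO, TODO, TODO, TODO };
	int finished = 0, tick = 0;

	while (finished < NJOBS) {
		int bin = -1, render = -1, i;

		/* Heads of the simulated bin_job_list / render_job_list. */
		for (i = 0; i < NJOBS; i++)
			if (job[i] == TODO) { bin = i; break; }
		for (i = 0; i < NJOBS; i++)
			if (job[i] == BINNED) { render = i; break; }

		/* Both threads advance in the same tick: the pipelining. */
		if (render >= 0) { job[render] = DONE; finished++; }
		if (bin >= 0) job[bin] = BINNED;

		printf("tick %d: binning job %d, rendering job %d\n",
		       ++tick, bin, render);
	}
	return 0;
}

With four jobs this completes in five ticks instead of the eight a fully serialized bin-then-render loop would take; the toy model overstates the win, since real workloads rarely split evenly between the two threads, which is why the commit message reports roughly 20-30% on selected x11perf tests rather than 2x.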