brcm2708: organize kernel patches
diff --git a/target/linux/brcm2708/patches-4.19/950-0698-Ported-pcie-brcmstb-bounce-buffer-implementation-to-.patch b/target/linux/brcm2708/patches-4.19/950-0698-Ported-pcie-brcmstb-bounce-buffer-implementation-to-.patch
new file mode 100644
index 0000000..c69a8ca
--- /dev/null
+++ b/target/linux/brcm2708/patches-4.19/950-0698-Ported-pcie-brcmstb-bounce-buffer-implementation-to-.patch
@@ -0,0 +1,818 @@
+From ccd23ce562e8223ba7c6acf7dcb7058ff89ff7ec Mon Sep 17 00:00:00 2001
+From: yaroslavros <yaroslavros@gmail.com>
+Date: Wed, 14 Aug 2019 15:22:55 +0100
+Subject: [PATCH] Ported pcie-brcmstb bounce buffer implementation to
+ ARM64. (#3144)
+
+Ported pcie-brcmstb bounce buffer implementation to ARM64.
+This enables full 4G RAM usage on Raspberry Pi in 64-bit mode.
+
+Signed-off-by: Yaroslav Rosomakho <yaroslavros@gmail.com>
+---
+ arch/arm64/include/asm/dma-mapping.h          |  21 +
+ arch/arm64/mm/dma-mapping.c                   |  50 ++
+ drivers/pci/controller/Makefile               |   3 +
+ drivers/pci/controller/pcie-brcmstb-bounce.h  |   2 +-
+ .../pci/controller/pcie-brcmstb-bounce64.c    | 576 ++++++++++++++++++
+ drivers/pci/controller/pcie-brcmstb.c         |  30 +-
+ 6 files changed, 658 insertions(+), 24 deletions(-)
+ create mode 100644 drivers/pci/controller/pcie-brcmstb-bounce64.c
+
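A note on the mechanism: everything in this patch hinges on one predicate, taken from dmabounce_map_page() further down. A streaming mapping is bounced only when it ends above the window the PCIe block can reach (the driver's bounce_threshold). A minimal standalone sketch of that decision, with an illustrative function name and plain integer types standing in for the kernel ones:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    /* Mirror of the check in dmabounce_map_page(): bounce only mappings
     * that end above the device-reachable window. */
    static bool needs_bounce(uint64_t dma_addr, size_t size, uint64_t threshold)
    {
            return (dma_addr + size) > threshold;
    }
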
+--- a/arch/arm64/include/asm/dma-mapping.h
++++ b/arch/arm64/include/asm/dma-mapping.h
+@@ -24,6 +24,27 @@
+ #include <xen/xen.h>
+ #include <asm/xen/hypervisor.h>
++extern void *arm64_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
++                         gfp_t gfp, unsigned long attrs);
++extern void arm64_dma_free(struct device *dev, size_t size, void *cpu_addr,
++                       dma_addr_t handle, unsigned long attrs);
++extern int arm64_dma_mmap(struct device *dev, struct vm_area_struct *vma,
++                      void *cpu_addr, dma_addr_t dma_addr, size_t size,
++                      unsigned long attrs);
++extern int arm64_dma_get_sgtable(struct device *dev, struct sg_table *sgt,
++              void *cpu_addr, dma_addr_t dma_addr, size_t size,
++              unsigned long attrs);
++extern int arm64_dma_map_sg(struct device *dev, struct scatterlist *sgl, int nelems,
++              enum dma_data_direction dir, unsigned long attrs);
++extern void arm64_dma_unmap_sg(struct device *dev, struct scatterlist *sgl, int nelems,
++              enum dma_data_direction dir, unsigned long attrs);
++extern void arm64_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl, int nelems,
++              enum dma_data_direction dir);
++extern void arm64_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, int nelems,
++              enum dma_data_direction dir);
++
++
++
+ extern const struct dma_map_ops dummy_dma_ops;
+ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
+--- a/arch/arm64/mm/dma-mapping.c
++++ b/arch/arm64/mm/dma-mapping.c
+@@ -138,6 +138,12 @@ no_mem:
+       return NULL;
+ }
++void *arm64_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
++                           gfp_t gfp, unsigned long attrs)
++{
++        return __dma_alloc(dev, size, handle, gfp, attrs);
++}
++
+ static void __dma_free(struct device *dev, size_t size,
+                      void *vaddr, dma_addr_t dma_handle,
+                      unsigned long attrs)
+@@ -154,6 +160,12 @@ static void __dma_free(struct device *de
+       swiotlb_free(dev, size, swiotlb_addr, dma_handle, attrs);
+ }
++void arm64_dma_free(struct device *dev, size_t size, void *cpu_addr,
++                         dma_addr_t handle, unsigned long attrs)
++{
++        __dma_free(dev, size, cpu_addr, handle, attrs);
++}
++
+ static dma_addr_t __swiotlb_map_page(struct device *dev, struct page *page,
+                                    unsigned long offset, size_t size,
+                                    enum dma_data_direction dir,
+@@ -197,6 +209,12 @@ static int __swiotlb_map_sg_attrs(struct
+       return ret;
+ }
++int arm64_dma_map_sg(struct device *dev, struct scatterlist *sgl, int nelems,
++                enum dma_data_direction dir, unsigned long attrs)
++{
++      return __swiotlb_map_sg_attrs(dev, sgl, nelems, dir, attrs);
++}
++
+ static void __swiotlb_unmap_sg_attrs(struct device *dev,
+                                    struct scatterlist *sgl, int nelems,
+                                    enum dma_data_direction dir,
+@@ -213,6 +231,12 @@ static void __swiotlb_unmap_sg_attrs(str
+       swiotlb_unmap_sg_attrs(dev, sgl, nelems, dir, attrs);
+ }
++void arm64_dma_unmap_sg(struct device *dev, struct scatterlist *sgl, int nelems,
++                enum dma_data_direction dir, unsigned long attrs)
++{
++      __swiotlb_unmap_sg_attrs(dev, sgl, nelems, dir, attrs);
++}
++
+ static void __swiotlb_sync_single_for_cpu(struct device *dev,
+                                         dma_addr_t dev_addr, size_t size,
+                                         enum dma_data_direction dir)
+@@ -245,6 +269,12 @@ static void __swiotlb_sync_sg_for_cpu(st
+       swiotlb_sync_sg_for_cpu(dev, sgl, nelems, dir);
+ }
++void arm64_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl, int nelems,
++                enum dma_data_direction dir)
++{
++      __swiotlb_sync_sg_for_cpu(dev, sgl, nelems, dir);
++}
++
+ static void __swiotlb_sync_sg_for_device(struct device *dev,
+                                        struct scatterlist *sgl, int nelems,
+                                        enum dma_data_direction dir)
+@@ -259,6 +289,12 @@ static void __swiotlb_sync_sg_for_device
+                                      sg->length, dir);
+ }
++void arm64_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, int nelems,
++                enum dma_data_direction dir)
++{
++      __swiotlb_sync_sg_for_device(dev, sgl, nelems, dir);
++}
++
+ static int __swiotlb_mmap_pfn(struct vm_area_struct *vma,
+                             unsigned long pfn, size_t size)
+ {
+@@ -294,6 +330,13 @@ static int __swiotlb_mmap(struct device
+       return __swiotlb_mmap_pfn(vma, pfn, size);
+ }
++int arm64_dma_mmap(struct device *dev, struct vm_area_struct *vma,
++                        void *cpu_addr, dma_addr_t dma_addr, size_t size,
++                        unsigned long attrs)
++{
++      return __swiotlb_mmap(dev, vma, cpu_addr, dma_addr, size, attrs);
++}
++
+ static int __swiotlb_get_sgtable_page(struct sg_table *sgt,
+                                     struct page *page, size_t size)
+ {
+@@ -314,6 +357,13 @@ static int __swiotlb_get_sgtable(struct
+       return __swiotlb_get_sgtable_page(sgt, page, size);
+ }
++int arm64_dma_get_sgtable(struct device *dev, struct sg_table *sgt,
++                void *cpu_addr, dma_addr_t dma_addr, size_t size,
++                unsigned long attrs)
++{
++      return __swiotlb_get_sgtable(dev, sgt, cpu_addr, dma_addr, size, attrs);
++}
++
+ static int __swiotlb_dma_supported(struct device *hwdev, u64 mask)
+ {
+       if (swiotlb)
+--- a/drivers/pci/controller/Makefile
++++ b/drivers/pci/controller/Makefile
+@@ -32,6 +32,9 @@ obj-$(CONFIG_PCIE_BRCMSTB) += pcie-brcms
+ ifdef CONFIG_ARM
+ obj-$(CONFIG_PCIE_BRCMSTB) += pcie-brcmstb-bounce.o
+ endif
++ifdef CONFIG_ARM64
++obj-$(CONFIG_PCIE_BRCMSTB) += pcie-brcmstb-bounce64.o
++endif
+ obj-$(CONFIG_VMD) += vmd.o
+ # pcie-hisi.o quirks are needed even without CONFIG_PCIE_DW
+--- a/drivers/pci/controller/pcie-brcmstb-bounce.h
++++ b/drivers/pci/controller/pcie-brcmstb-bounce.h
+@@ -6,7 +6,7 @@
+ #ifndef _PCIE_BRCMSTB_BOUNCE_H
+ #define _PCIE_BRCMSTB_BOUNCE_H
+-#ifdef CONFIG_ARM
++#if defined(CONFIG_ARM) || defined(CONFIG_ARM64)
+ int brcm_pcie_bounce_init(struct device *dev, unsigned long buffer_size,
+                         dma_addr_t threshold);
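
Together with the exports added below, the header gives clients a three-call interface: brcm_pcie_bounce_init() creates the pool, brcm_pcie_bounce_register_dev() installs the bounce dma_map_ops on a device, and brcm_pcie_bounce_uninit() tears the pool down. A hedged sketch of the expected call sequence (the 4 MB size echoes the pool comment in bounce64.c below; the 3 GB threshold is an assumption for illustration):

    #include <linux/device.h>
    #include <linux/sizes.h>
    #include "pcie-brcmstb-bounce.h"

    static int example_attach(struct device *dev)
    {
            int ret;

            /* One-time pool creation; fails with -EBUSY if a client exists. */
            ret = brcm_pcie_bounce_init(dev, SZ_4M, (dma_addr_t)SZ_1G * 3);
            if (ret)
                    return ret;

            /* From here on, this device's streaming DMA uses the pool. */
            return brcm_pcie_bounce_register_dev(dev);
    }
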
+--- /dev/null
++++ b/drivers/pci/controller/pcie-brcmstb-bounce64.c
+@@ -0,0 +1,576 @@
++/*
++ *  This code started out as a version of arch/arm/common/dmabounce.c,
++ *  modified to cope with highmem pages. Now it has been changed heavily -
++ *  it now preallocates a large block (currently 4MB) and carves it up
++ *  sequentially in ring fashion, and DMA is used to copy the data - to the
++ *  point where very little of the original remains.
++ *
++ *  Copyright (C) 2019 Raspberry Pi (Trading) Ltd.
++ *
++ *  Original version by Brad Parker (brad@heeltoe.com)
++ *  Re-written by Christopher Hoover <ch@murgatroid.com>
++ *  Made generic by Deepak Saxena <dsaxena@plexity.net>
++ *
++ *  Copyright (C) 2002 Hewlett Packard Company.
++ *  Copyright (C) 2004 MontaVista Software, Inc.
++ *
++ *  This program is free software; you can redistribute it and/or
++ *  modify it under the terms of the GNU General Public License
++ *  version 2 as published by the Free Software Foundation.
++ */
++
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/slab.h>
++#include <linux/page-flags.h>
++#include <linux/device.h>
++#include <linux/dma-mapping.h>
++#include <linux/dma-direct.h>
++#include <linux/dmapool.h>
++#include <linux/list.h>
++#include <linux/scatterlist.h>
++#include <linux/bitmap.h>
++#include <linux/swiotlb.h>
++
++#include <asm/cacheflush.h>
++
++#define STATS
++
++#ifdef STATS
++#define DO_STATS(X) do { X ; } while (0)
++#else
++#define DO_STATS(X) do { } while (0)
++#endif
++
++/* ************************************************** */
++
++struct safe_buffer {
++      struct list_head node;
++
++      /* original request */
++      size_t          size;
++      int             direction;
++
++      struct dmabounce_pool *pool;
++      void            *safe;
++      dma_addr_t      unsafe_dma_addr;
++      dma_addr_t      safe_dma_addr;
++};
++
++struct dmabounce_pool {
++      unsigned long   pages;
++      void            *virt_addr;
++      dma_addr_t      dma_addr;
++      unsigned long   *alloc_map;
++      unsigned long   alloc_pos;
++      spinlock_t      lock;
++      struct device   *dev;
++      unsigned long   num_pages;
++#ifdef STATS
++      size_t          max_size;
++      unsigned long   num_bufs;
++      unsigned long   max_bufs;
++      unsigned long   max_pages;
++#endif
++};
++
++struct dmabounce_device_info {
++      struct device *dev;
++      dma_addr_t threshold;
++      struct list_head safe_buffers;
++      struct dmabounce_pool pool;
++      rwlock_t lock;
++#ifdef STATS
++      unsigned long map_count;
++      unsigned long unmap_count;
++      unsigned long sync_dev_count;
++      unsigned long sync_cpu_count;
++      unsigned long fail_count;
++      int attr_res;
++#endif
++};
++
++static struct dmabounce_device_info *g_dmabounce_device_info;
++
++extern int bcm2838_dma40_memcpy_init(void);
++extern void bcm2838_dma40_memcpy(dma_addr_t dst, dma_addr_t src, size_t size);
++
++#ifdef STATS
++static ssize_t
++bounce_show(struct device *dev, struct device_attribute *attr, char *buf)
++{
++      struct dmabounce_device_info *device_info = g_dmabounce_device_info;
++      return sprintf(buf, "m:%lu/%lu s:%lu/%lu f:%lu s:%zu b:%lu/%lu a:%lu/%lu\n",
++              device_info->map_count,
++              device_info->unmap_count,
++              device_info->sync_dev_count,
++              device_info->sync_cpu_count,
++              device_info->fail_count,
++              device_info->pool.max_size,
++              device_info->pool.num_bufs,
++              device_info->pool.max_bufs,
++              device_info->pool.num_pages * PAGE_SIZE,
++              device_info->pool.max_pages * PAGE_SIZE);
++}
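++
++/*
++ * The dmabounce_stats attribute below reads back as, for example
++ * (values illustrative): m:12/12 s:8/8 f:0 s:4096 b:0/3 a:0/12288
++ * i.e. map/unmap counts, sync-for-device/cpu counts, allocation
++ * failures, the largest request seen, live/peak buffer counts and
++ * live/peak pool bytes.
++ */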
++
++static DEVICE_ATTR(dmabounce_stats, 0444, bounce_show, NULL);
++#endif
++
++static int bounce_create(struct dmabounce_pool *pool, struct device *dev,
++                       unsigned long buffer_size)
++{
++      int ret = -ENOMEM;
++      pool->pages = (buffer_size + PAGE_SIZE - 1)/PAGE_SIZE;
++      pool->alloc_map = bitmap_zalloc(pool->pages, GFP_KERNEL);
++      if (!pool->alloc_map)
++              goto err_bitmap;
++      pool->virt_addr = dma_alloc_coherent(dev, pool->pages * PAGE_SIZE,
++                                           &pool->dma_addr, GFP_KERNEL);
++      if (!pool->virt_addr)
++              goto err_dmabuf;
++
++      pool->alloc_pos = 0;
++      spin_lock_init(&pool->lock);
++      pool->dev = dev;
++      pool->num_pages = 0;
++
++      DO_STATS(pool->max_size = 0);
++      DO_STATS(pool->num_bufs = 0);
++      DO_STATS(pool->max_bufs = 0);
++      DO_STATS(pool->max_pages = 0);
++
++      return 0;
++
++err_dmabuf:
++      bitmap_free(pool->alloc_map);
++err_bitmap:
++      return ret;
++}
++
++static void bounce_destroy(struct dmabounce_pool *pool)
++{
++      dma_free_coherent(pool->dev, pool->pages * PAGE_SIZE, pool->virt_addr,
++                        pool->dma_addr);
++
++      bitmap_free(pool->alloc_map);
++}
++
++static void *bounce_alloc(struct dmabounce_pool *pool, size_t size,
++                        dma_addr_t *dmaaddrp)
++{
++      unsigned long pages;
++      unsigned long flags;
++      unsigned long pos;
++
++      pages = (size + PAGE_SIZE - 1)/PAGE_SIZE;
++
++      DO_STATS(pool->max_size = max(size, pool->max_size));
++
++      spin_lock_irqsave(&pool->lock, flags);
++      pos = bitmap_find_next_zero_area(pool->alloc_map, pool->pages,
++                                       pool->alloc_pos, pages, 0);
++      /* If not found, try from the start */
++      if (pos >= pool->pages && pool->alloc_pos)
++              pos = bitmap_find_next_zero_area(pool->alloc_map, pool->pages,
++                                               0, pages, 0);
++
++      if (pos >= pool->pages) {
++              spin_unlock_irqrestore(&pool->lock, flags);
++              return NULL;
++      }
++
++      bitmap_set(pool->alloc_map, pos, pages);
++      pool->alloc_pos = (pos + pages) % pool->pages;
++      pool->num_pages += pages;
++
++      DO_STATS(pool->num_bufs++);
++      DO_STATS(pool->max_bufs = max(pool->num_bufs, pool->max_bufs));
++      DO_STATS(pool->max_pages = max(pool->num_pages, pool->max_pages));
++
++      spin_unlock_irqrestore(&pool->lock, flags);
++
++      *dmaaddrp = pool->dma_addr + pos * PAGE_SIZE;
++
++      return pool->virt_addr + pos * PAGE_SIZE;
++}
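++
++/*
++ * Worked example of the ring behaviour (the numbers are illustrative):
++ * with pool->pages = 4 and alloc_pos = 3, a two-page request first
++ * searches from page 3, runs off the end, then retries from page 0;
++ * if pages 0-1 are free it returns their address and leaves
++ * alloc_pos = 2. Callers pair it with bounce_free():
++ *
++ *      dma_addr_t dma;
++ *      void *va = bounce_alloc(pool, 2 * PAGE_SIZE, &dma);
++ *      if (va)
++ *              bounce_free(pool, va, 2 * PAGE_SIZE);
++ */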
++
++static void
++bounce_free(struct dmabounce_pool *pool, void *buf, size_t size)
++{
++      unsigned long pages;
++      unsigned long flags;
++      unsigned long pos;
++
++      pages = (size + PAGE_SIZE - 1)/PAGE_SIZE;
++      pos = (buf - pool->virt_addr)/PAGE_SIZE;
++
++      BUG_ON((buf - pool->virt_addr) & (PAGE_SIZE - 1));
++
++      spin_lock_irqsave(&pool->lock, flags);
++      bitmap_clear(pool->alloc_map, pos, pages);
++      pool->num_pages -= pages;
++      if (pool->num_pages == 0)
++              pool->alloc_pos = 0;
++      DO_STATS(pool->num_bufs--);
++      spin_unlock_irqrestore(&pool->lock, flags);
++}
++
++/* allocate a 'safe' buffer and keep track of it */
++static struct safe_buffer *
++alloc_safe_buffer(struct dmabounce_device_info *device_info,
++                dma_addr_t dma_addr, size_t size, enum dma_data_direction dir)
++{
++      struct safe_buffer *buf;
++      struct dmabounce_pool *pool = &device_info->pool;
++      struct device *dev = device_info->dev;
++      unsigned long flags;
++
++      /*
++       * Although one might expect this to be called in thread context,
++       * using GFP_KERNEL here leads to hard-to-debug lockups. in_atomic()
++       * was previously used to select the appropriate allocation mode,
++       * but this is unsafe.
++       */
++      buf = kmalloc(sizeof(struct safe_buffer), GFP_ATOMIC);
++      if (!buf) {
++              dev_warn(dev, "%s: kmalloc failed\n", __func__);
++              return NULL;
++      }
++
++      buf->unsafe_dma_addr = dma_addr;
++      buf->size = size;
++      buf->direction = dir;
++      buf->pool = pool;
++
++      buf->safe = bounce_alloc(pool, size, &buf->safe_dma_addr);
++
++      if (!buf->safe) {
++              dev_warn(dev,
++                       "%s: could not alloc dma memory (size=%zu)\n",
++                       __func__, size);
++              kfree(buf);
++              return NULL;
++      }
++
++      write_lock_irqsave(&device_info->lock, flags);
++      list_add(&buf->node, &device_info->safe_buffers);
++      write_unlock_irqrestore(&device_info->lock, flags);
++
++      return buf;
++}
++
++/* determine if a buffer is from our "safe" pool */
++static struct safe_buffer *
++find_safe_buffer(struct dmabounce_device_info *device_info,
++               dma_addr_t safe_dma_addr)
++{
++      struct safe_buffer *b, *rb = NULL;
++      unsigned long flags;
++
++      read_lock_irqsave(&device_info->lock, flags);
++
++      list_for_each_entry(b, &device_info->safe_buffers, node)
++              if (b->safe_dma_addr <= safe_dma_addr &&
++                  b->safe_dma_addr + b->size > safe_dma_addr) {
++                      rb = b;
++                      break;
++              }
++
++      read_unlock_irqrestore(&device_info->lock, flags);
++      return rb;
++}
++
++static void
++free_safe_buffer(struct dmabounce_device_info *device_info,
++               struct safe_buffer *buf)
++{
++      unsigned long flags;
++
++      write_lock_irqsave(&device_info->lock, flags);
++      list_del(&buf->node);
++      write_unlock_irqrestore(&device_info->lock, flags);
++
++      bounce_free(buf->pool, buf->safe, buf->size);
++
++      kfree(buf);
++}
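++
++/*
++ * Lifecycle of an entry: dmabounce_map_page() below creates it via
++ * alloc_safe_buffer() and copies towards the device in map_single()
++ * when the direction requires it; dmabounce_unmap_page() finds it
++ * again by its bounced (safe) DMA address, copies back in
++ * unmap_single(), then releases it here.
++ */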
++
++/* ************************************************** */
++
++static struct safe_buffer *
++find_safe_buffer_dev(struct device *dev, dma_addr_t dma_addr, const char *where)
++{
++      if (!dev || !g_dmabounce_device_info)
++              return NULL;
++      if (dma_mapping_error(dev, dma_addr)) {
++              dev_err(dev, "Trying to %s invalid mapping\n", where);
++              return NULL;
++      }
++      return find_safe_buffer(g_dmabounce_device_info, dma_addr);
++}
++
++static dma_addr_t
++map_single(struct device *dev, struct safe_buffer *buf, size_t size,
++         enum dma_data_direction dir, unsigned long attrs)
++{
++      BUG_ON(buf->size != size);
++      BUG_ON(buf->direction != dir);
++
++      dev_dbg(dev, "map: %llx->%llx\n", (u64)buf->unsafe_dma_addr,
++              (u64)buf->safe_dma_addr);
++
++      if ((dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) &&
++          !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
++              bcm2838_dma40_memcpy(buf->safe_dma_addr, buf->unsafe_dma_addr,
++                                   size);
++
++      return buf->safe_dma_addr;
++}
++
++static dma_addr_t
++unmap_single(struct device *dev, struct safe_buffer *buf, size_t size,
++           enum dma_data_direction dir, unsigned long attrs)
++{
++      BUG_ON(buf->size != size);
++      BUG_ON(buf->direction != dir);
++
++      if ((dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) &&
++          !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) {
++              dev_dbg(dev, "unmap: %llx->%llx\n", (u64)buf->safe_dma_addr,
++                      (u64)buf->unsafe_dma_addr);
++
++              bcm2838_dma40_memcpy(buf->unsafe_dma_addr, buf->safe_dma_addr,
++                                   size);
++      }
++      return buf->unsafe_dma_addr;
++}
++
++/* ************************************************** */
++
++/*
++ * see if a buffer address is in an 'unsafe' range.  if it is
++ * allocate a 'safe' buffer and copy the unsafe buffer into it.
++ * substitute the safe buffer for the unsafe one.
++ * (basically move the buffer from an unsafe area to a safe one)
++ */
++static dma_addr_t
++dmabounce_map_page(struct device *dev, struct page *page, unsigned long offset,
++                 size_t size, enum dma_data_direction dir,
++                 unsigned long attrs)
++{
++      struct dmabounce_device_info *device_info = g_dmabounce_device_info;
++      dma_addr_t dma_addr;
++
++      dma_addr = phys_to_dma(dev, page_to_phys(page)) + offset;
++
++      swiotlb_sync_single_for_device(dev, dma_addr, size, dir);
++      if (!is_device_dma_coherent(dev))
++              __dma_map_area(phys_to_virt(dma_to_phys(dev, dma_addr)), size, dir);
++
++      if (device_info && (dma_addr + size) > device_info->threshold) {
++              struct safe_buffer *buf;
++
++              buf = alloc_safe_buffer(device_info, dma_addr, size, dir);
++              if (!buf) {
++                      DO_STATS(device_info->fail_count++);
++                      return (~(dma_addr_t)0x0);
++              }
++
++              DO_STATS(device_info->map_count++);
++
++              dma_addr = map_single(dev, buf, size, dir, attrs);
++      }
++      return dma_addr;
++}
++
++/*
++ * see if a mapped address was really a "safe" buffer and if so, copy
++ * the data from the safe buffer back to the unsafe buffer and free up
++ * the safe buffer.  (basically return things back to the way they
++ * should be)
++ */
++static void
++dmabounce_unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size,
++                   enum dma_data_direction dir, unsigned long attrs)
++{
++      struct safe_buffer *buf;
++
++      buf = find_safe_buffer_dev(dev, dma_addr, __func__);
++      if (buf) {
++              DO_STATS(g_dmabounce_device_info->unmap_count++);
++              dma_addr = unmap_single(dev, buf, size, dir, attrs);
++              free_safe_buffer(g_dmabounce_device_info, buf);
++      }
++
++      if (!is_device_dma_coherent(dev))
++              __dma_unmap_area(phys_to_virt(dma_to_phys(dev, dma_addr)), size, dir);
++      swiotlb_sync_single_for_cpu(dev, dma_addr, size, dir);
++}
++
++/*
++ * A version of dmabounce_map_page that assumes the mapping has already
++ * been created - intended for streaming operation.
++ */
++static void
++dmabounce_sync_for_device(struct device *dev, dma_addr_t dma_addr, size_t size,
++                        enum dma_data_direction dir)
++{
++      struct safe_buffer *buf;
++
++      swiotlb_sync_single_for_device(dev, dma_addr, size, dir);
++      if (!is_device_dma_coherent(dev))
++              __dma_map_area(phys_to_virt(dma_to_phys(dev, dma_addr)), size, dir);
++
++      buf = find_safe_buffer_dev(dev, dma_addr, __func__);
++      if (buf) {
++              DO_STATS(g_dmabounce_device_info->sync_dev_count++);
++              map_single(dev, buf, size, dir, 0);
++      }
++}
++
++/*
++ * A version of dmabounce_unmap_page that doesn't destroy the mapping -
++ * intended for streaming operation.
++ */
++static void
++dmabounce_sync_for_cpu(struct device *dev, dma_addr_t dma_addr,
++                     size_t size, enum dma_data_direction dir)
++{
++      struct safe_buffer *buf;
++
++      buf = find_safe_buffer_dev(dev, dma_addr, __func__);
++      if (buf) {
++              DO_STATS(g_dmabounce_device_info->sync_cpu_count++);
++              dma_addr = unmap_single(dev, buf, size, dir, 0);
++      }
++
++      if (!is_device_dma_coherent(dev))
++              __dma_unmap_area(phys_to_virt(dma_to_phys(dev, dma_addr)), size, dir);
++      swiotlb_sync_single_for_cpu(dev, dma_addr, size, dir);
++}
++
++static int dmabounce_dma_supported(struct device *dev, u64 dma_mask)
++{
++      if (g_dmabounce_device_info)
++              return 0;
++
++      return swiotlb_dma_supported(dev, dma_mask);
++}
++
++static int dmabounce_mapping_error(struct device *dev, dma_addr_t dma_addr)
++{
++      return swiotlb_dma_mapping_error(dev, dma_addr);
++}
++
++static const struct dma_map_ops dmabounce_ops = {
++      .alloc                  = arm64_dma_alloc,
++      .free                   = arm64_dma_free,
++      .mmap                   = arm64_dma_mmap,
++      .get_sgtable            = arm64_dma_get_sgtable,
++      .map_page               = dmabounce_map_page,
++      .unmap_page             = dmabounce_unmap_page,
++      .sync_single_for_cpu    = dmabounce_sync_for_cpu,
++      .sync_single_for_device = dmabounce_sync_for_device,
++      .map_sg                 = arm64_dma_map_sg,
++      .unmap_sg               = arm64_dma_unmap_sg,
++      .sync_sg_for_cpu        = arm64_dma_sync_sg_for_cpu,
++      .sync_sg_for_device     = arm64_dma_sync_sg_for_device,
++      .dma_supported          = dmabounce_dma_supported,
++      .mapping_error          = dmabounce_mapping_error,
++};
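++
++/*
++ * Only the single-page map/unmap and single-address sync hooks above do
++ * any bouncing; alloc/free/mmap/get_sgtable and the scatter-gather
++ * paths are forwarded unchanged to the arm64 swiotlb helpers, so
++ * coherent allocations and SG lists never pass through the pool.
++ */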
++
++int brcm_pcie_bounce_init(struct device *dev,
++                        unsigned long buffer_size,
++                        dma_addr_t threshold)
++{
++      struct dmabounce_device_info *device_info;
++      int ret;
++
++      /* Only support a single client */
++      if (g_dmabounce_device_info)
++              return -EBUSY;
++
++      ret = bcm2838_dma40_memcpy_init();
++      if (ret)
++              return ret;
++
++      device_info = kmalloc(sizeof(struct dmabounce_device_info), GFP_ATOMIC);
++      if (!device_info) {
++              dev_err(dev,
++                      "Could not allocate dmabounce_device_info\n");
++              return -ENOMEM;
++      }
++
++      ret = bounce_create(&device_info->pool, dev, buffer_size);
++      if (ret) {
++              dev_err(dev,
++                      "dmabounce: could not allocate %lu byte DMA pool\n",
++                      buffer_size);
++              goto err_bounce;
++      }
++
++      device_info->dev = dev;
++      device_info->threshold = threshold;
++      INIT_LIST_HEAD(&device_info->safe_buffers);
++      rwlock_init(&device_info->lock);
++
++      DO_STATS(device_info->map_count = 0);
++      DO_STATS(device_info->unmap_count = 0);
++      DO_STATS(device_info->sync_dev_count = 0);
++      DO_STATS(device_info->sync_cpu_count = 0);
++      DO_STATS(device_info->fail_count = 0);
++      DO_STATS(device_info->attr_res =
++               device_create_file(dev, &dev_attr_dmabounce_stats));
++
++      g_dmabounce_device_info = device_info;
++
++      dev_info(dev, "dmabounce: initialised - %lu kB, threshold %pad\n",
++               buffer_size / 1024, &threshold);
++
++      return 0;
++
++ err_bounce:
++      kfree(device_info);
++      return ret;
++}
++EXPORT_SYMBOL(brcm_pcie_bounce_init);
++
++void brcm_pcie_bounce_uninit(struct device *dev)
++{
++      struct dmabounce_device_info *device_info = g_dmabounce_device_info;
++
++      g_dmabounce_device_info = NULL;
++
++      if (!device_info) {
++              dev_warn(dev,
++                       "Never registered with dmabounce but attempting "
++                       "to unregister!\n");
++              return;
++      }
++
++      if (!list_empty(&device_info->safe_buffers)) {
++              dev_err(dev,
++                      "Removing from dmabounce with pending buffers!\n");
++              BUG();
++      }
++
++      bounce_destroy(&device_info->pool);
++
++      DO_STATS(if (device_info->attr_res == 0)
++                       device_remove_file(dev, &dev_attr_dmabounce_stats));
++
++      kfree(device_info);
++}
++EXPORT_SYMBOL(brcm_pcie_bounce_uninit);
++
++int brcm_pcie_bounce_register_dev(struct device *dev)
++{
++      set_dma_ops(dev, &dmabounce_ops);
++
++      return 0;
++}
++EXPORT_SYMBOL(brcm_pcie_bounce_register_dev);
++
++MODULE_AUTHOR("Phil Elwell <phil@raspberrypi.org>");
++MODULE_DESCRIPTION("Dedicated DMA bounce support for pcie-brcmstb");
++MODULE_LICENSE("GPL");
+--- a/drivers/pci/controller/pcie-brcmstb.c
++++ b/drivers/pci/controller/pcie-brcmstb.c
+@@ -617,28 +617,6 @@ static const struct dma_map_ops brcm_dma
+ static void brcm_set_dma_ops(struct device *dev)
+ {
+-      int ret;
+-
+-      if (IS_ENABLED(CONFIG_ARM64)) {
+-              /*
+-               * We are going to invoke get_dma_ops().  That
+-               * function, at this point in time, invokes
+-               * get_arch_dma_ops(), and for ARM64 that function
+-               * returns a pointer to dummy_dma_ops.  So then we'd
+-               * like to call arch_setup_dma_ops(), but that isn't
+-               * exported.  Instead, we call of_dma_configure(),
+-               * which is exported, and this calls
+-               * arch_setup_dma_ops().  Once we do this the call to
+-               * get_dma_ops() will work properly because
+-               * dev->dma_ops will be set.
+-               */
+-              ret = of_dma_configure(dev, dev->of_node, true);
+-              if (ret) {
+-                      dev_err(dev, "of_dma_configure() failed: %d\n", ret);
+-                      return;
+-              }
+-      }
+-
+       arch_dma_ops = get_dma_ops(dev);
+       if (!arch_dma_ops) {
+               dev_err(dev, "failed to get arch_dma_ops\n");
+@@ -657,12 +635,12 @@ static int brcmstb_platform_notifier(str
+       extern unsigned long max_pfn;
+       struct device *dev = __dev;
+       const char *rc_name = "0000:00:00.0";
++      int ret;
+       switch (event) {
+       case BUS_NOTIFY_ADD_DEVICE:
+               if (max_pfn > (bounce_threshold/PAGE_SIZE) &&
+                   strcmp(dev->kobj.name, rc_name)) {
+-                      int ret;
+                       ret = brcm_pcie_bounce_register_dev(dev);
+                       if (ret) {
+@@ -671,6 +649,12 @@ static int brcmstb_platform_notifier(str
+                                       ret);
+                               return ret;
+                       }
++              } else if (IS_ENABLED(CONFIG_ARM64)) {
++                      ret = of_dma_configure(dev, dev->of_node, true);
++                      if (ret) {
++                              dev_err(dev, "of_dma_configure() failed: %d\n", ret);
++                              return ret;
++                      }
+               }
+               brcm_set_dma_ops(dev);
+               return NOTIFY_OK;