1 From 370c8243ec8e7f3abd8171b7d2dde170f4c5e63a Mon Sep 17 00:00:00 2001
2 From: Siarhei Siamashka <siarhei.siamashka@gmail.com>
3 Date: Mon, 17 Jun 2013 16:00:25 +0300
4 Subject: [PATCH 070/196] bcm2708_fb: DMA acceleration for fb_copyarea
6 Based on http://www.raspberrypi.org/phpBB3/viewtopic.php?p=62425#p62425
7 Also used Simon's dmaer_master module as a reference for tweaking DMA
8 settings for better performance.
10 For now, busy-waiting only. IRQ support might be added later.
11 With a non-overclocked Raspberry Pi, the performance is ~360 MB/s
12 for simple copy or ~260 MB/s for two-pass copy (used when dragging
13 windows to the right).
15 In the case of using DMA channel 0, the performance improves
18 For comparison, VFP optimized CPU copy can only do ~114 MB/s in
19 the same conditions (hindered by reading uncached source buffer).
21 Signed-off-by: Siarhei Siamashka <siarhei.siamashka@gmail.com>
23 drivers/video/bcm2708_fb.c | 162 ++++++++++++++++++++++++++++++++++++++++++++-
24 1 file changed, 159 insertions(+), 3 deletions(-)
26 diff --git a/drivers/video/bcm2708_fb.c b/drivers/video/bcm2708_fb.c
27 index 08d9238..c10c5ee 100644
28 --- a/drivers/video/bcm2708_fb.c
29 +++ b/drivers/video/bcm2708_fb.c
31 #include <linux/printk.h>
32 #include <linux/console.h>
34 +#include <mach/dma.h>
35 #include <mach/platform.h>
36 #include <mach/vcio.h>
38 @@ -63,6 +64,11 @@ struct bcm2708_fb {
39 struct fbinfo_s *info;
44 + void __iomem *dma_chan_base;
45 + void *cb_base; /* DMA control blocks */
46 + dma_addr_t cb_handle;
49 #define to_bcm2708(info) container_of(info, struct bcm2708_fb, fb)
50 @@ -312,11 +318,133 @@ static void bcm2708_fb_fillrect(struct fb_info *info,
51 cfb_fillrect(info, rect);
54 +/* A helper function for configuring dma control block */
55 +static void set_dma_cb(struct bcm2708_dma_cb *cb,
64 + cb->info = BCM2708_DMA_BURST(burst_size) | BCM2708_DMA_S_WIDTH |
65 + BCM2708_DMA_S_INC | BCM2708_DMA_D_WIDTH |
66 + BCM2708_DMA_D_INC | BCM2708_DMA_TDMODE;
70 + * This is not really obvious from the DMA documentation,
71 + * but the top 16 bits must be programmed to "height -1"
72 + * and not "height" in 2D mode.
74 + cb->length = ((h - 1) << 16) | w;
75 + cb->stride = ((dst_stride - w) << 16) | (u16)(src_stride - w);
80 static void bcm2708_fb_copyarea(struct fb_info *info,
81 const struct fb_copyarea *region)
83 - /*print_debug("bcm2708_fb_copyarea\n"); */
84 - cfb_copyarea(info, region);
85 + struct bcm2708_fb *fb = to_bcm2708(info);
86 + struct bcm2708_dma_cb *cb = fb->cb_base;
87 + int bytes_per_pixel = (info->var.bits_per_pixel + 7) >> 3;
88 + /* Channel 0 supports larger bursts and is a bit faster */
89 + int burst_size = (fb->dma_chan == 0) ? 8 : 2;
91 + /* Fallback to cfb_copyarea() if we don't like something */
92 + if (bytes_per_pixel > 4 ||
93 + info->var.xres > 1920 || info->var.yres > 1200 ||
94 + region->width <= 0 || region->width > info->var.xres ||
95 + region->height <= 0 || region->height > info->var.yres ||
96 + region->sx < 0 || region->sx >= info->var.xres ||
97 + region->sy < 0 || region->sy >= info->var.yres ||
98 + region->dx < 0 || region->dx >= info->var.xres ||
99 + region->dy < 0 || region->dy >= info->var.yres ||
100 + region->sx + region->width > info->var.xres ||
101 + region->dx + region->width > info->var.xres ||
102 + region->sy + region->height > info->var.yres ||
103 + region->dy + region->height > info->var.yres) {
104 + cfb_copyarea(info, region);
108 + if (region->dy == region->sy && region->dx > region->sx) {
110 + * A difficult case of overlapped copy. Because DMA can't
111 + * copy individual scanlines in backwards direction, we need
112 + * two-pass processing. We do it by programming a chain of dma
113 + * control blocks in the first 16K part of the buffer and use
114 + * the remaining 48K as the intermediate temporary scratch
115 + * buffer. The buffer size is sufficient to handle up to
116 + * 1920x1200 resolution at 32bpp pixel depth.
119 + dma_addr_t control_block_pa = fb->cb_handle;
120 + dma_addr_t scratchbuf = fb->cb_handle + 16 * 1024;
121 + int scanline_size = bytes_per_pixel * region->width;
122 + int scanlines_per_cb = (64 * 1024 - 16 * 1024) / scanline_size;
124 + for (y = 0; y < region->height; y += scanlines_per_cb) {
126 + fb->fb.fix.smem_start +
127 + bytes_per_pixel * region->sx +
128 + (region->sy + y) * fb->fb.fix.line_length;
130 + fb->fb.fix.smem_start +
131 + bytes_per_pixel * region->dx +
132 + (region->dy + y) * fb->fb.fix.line_length;
134 + if (region->height - y < scanlines_per_cb)
135 + scanlines_per_cb = region->height - y;
137 + set_dma_cb(cb, burst_size, scratchbuf, scanline_size,
138 + src, fb->fb.fix.line_length,
139 + scanline_size, scanlines_per_cb);
140 + control_block_pa += sizeof(struct bcm2708_dma_cb);
141 + cb->next = control_block_pa;
144 + set_dma_cb(cb, burst_size, dst, fb->fb.fix.line_length,
145 + scratchbuf, scanline_size,
146 + scanline_size, scanlines_per_cb);
147 + control_block_pa += sizeof(struct bcm2708_dma_cb);
148 + cb->next = control_block_pa;
151 + /* move the pointer back to the last dma control block */
154 + /* A single dma control block is enough. */
155 + int sy, dy, stride;
156 + if (region->dy <= region->sy) {
157 + /* processing from top to bottom */
160 + stride = fb->fb.fix.line_length;
162 + /* processing from bottom to top */
163 + dy = region->dy + region->height - 1;
164 + sy = region->sy + region->height - 1;
165 + stride = -fb->fb.fix.line_length;
167 + set_dma_cb(cb, burst_size,
168 + fb->fb.fix.smem_start + dy * fb->fb.fix.line_length +
169 + bytes_per_pixel * region->dx,
171 + fb->fb.fix.smem_start + sy * fb->fb.fix.line_length +
172 + bytes_per_pixel * region->sx,
174 + region->width * bytes_per_pixel,
178 + /* end of dma control blocks chain */
181 + bcm_dma_start(fb->dma_chan_base, fb->cb_handle);
182 + bcm_dma_wait_idle(fb->dma_chan_base);
185 static void bcm2708_fb_imageblit(struct fb_info *info,
186 @@ -359,7 +487,7 @@ static int bcm2708_fb_register(struct bcm2708_fb *fb)
189 fb->fb.fbops = &bcm2708_fb_ops;
190 - fb->fb.flags = FBINFO_FLAG_DEFAULT;
191 + fb->fb.flags = FBINFO_FLAG_DEFAULT | FBINFO_HWACCEL_COPYAREA;
192 fb->fb.pseudo_palette = fb->cmap;
194 strncpy(fb->fb.fix.id, bcm2708_name, sizeof(fb->fb.fix.id));
195 @@ -424,6 +552,28 @@ static int bcm2708_fb_probe(struct platform_device *dev)
197 memset(fb, 0, sizeof(struct bcm2708_fb));
199 + fb->cb_base = dma_alloc_writecombine(&dev->dev, SZ_64K,
200 + &fb->cb_handle, GFP_KERNEL);
201 + if (!fb->cb_base) {
202 + dev_err(&dev->dev, "cannot allocate DMA CBs\n");
207 + pr_info("BCM2708FB: allocated DMA memory %08x\n",
210 + ret = bcm_dma_chan_alloc(BCM_DMA_FEATURE_BULK,
211 + &fb->dma_chan_base, &fb->dma_irq);
213 + dev_err(&dev->dev, "couldn't allocate a DMA channel\n");
216 + fb->dma_chan = ret;
218 + pr_info("BCM2708FB: allocated DMA channel %d @ %p\n",
219 + fb->dma_chan, fb->dma_chan_base);
223 ret = bcm2708_fb_register(fb);
224 @@ -432,6 +582,9 @@ static int bcm2708_fb_probe(struct platform_device *dev)
229 + dma_free_writecombine(&dev->dev, SZ_64K, fb->cb_base, fb->cb_handle);
233 dev_err(&dev->dev, "probe failed, err %d\n", ret);
234 @@ -449,6 +602,9 @@ static int bcm2708_fb_remove(struct platform_device *dev)
235 iounmap(fb->fb.screen_base);
236 unregister_framebuffer(&fb->fb);
238 + dma_free_writecombine(&dev->dev, SZ_64K, fb->cb_base, fb->cb_handle);
239 + bcm_dma_chan_free(fb->dma_chan);
241 dma_free_coherent(NULL, PAGE_ALIGN(sizeof(*fb->info)), (void *)fb->info,