kernel: merge upstream bgmac driver improvements
author Felix Fietkau <nbd@openwrt.org>
Wed, 25 Mar 2015 14:30:46 +0000 (14:30 +0000)
committer Felix Fietkau <nbd@openwrt.org>
Wed, 25 Mar 2015 14:30:46 +0000 (14:30 +0000)
Signed-off-by: Felix Fietkau <nbd@openwrt.org>
SVN-Revision: 44978

target/linux/generic/patches-3.18/077-01-bgmac-fix-descriptor-frame-start-end-definitions.patch [new file with mode: 0644]
target/linux/generic/patches-3.18/077-02-bgmac-implement-GRO-and-use-build_skb.patch [new file with mode: 0644]
target/linux/generic/patches-3.18/077-03-bgmac-implement-scatter-gather-support.patch [new file with mode: 0644]

diff --git a/target/linux/generic/patches-3.18/077-01-bgmac-fix-descriptor-frame-start-end-definitions.patch b/target/linux/generic/patches-3.18/077-01-bgmac-fix-descriptor-frame-start-end-definitions.patch
new file mode 100644 (file)
index 0000000..fdfae3a
--- /dev/null
@@ -0,0 +1,24 @@
+From: Felix Fietkau <nbd@openwrt.org>
+Date: Mon, 23 Mar 2015 02:40:06 +0100
+Subject: [PATCH] bgmac: fix descriptor frame start/end definitions
+
+The start-of-frame and end-of-frame bits were accidentally swapped.
+In the current code it does not make any difference, since they are
+always used together.
+
+Signed-off-by: Felix Fietkau <nbd@openwrt.org>
+---
+
+--- a/drivers/net/ethernet/broadcom/bgmac.h
++++ b/drivers/net/ethernet/broadcom/bgmac.h
+@@ -345,8 +345,8 @@
+ #define BGMAC_DESC_CTL0_EOT                   0x10000000      /* End of ring */
+ #define BGMAC_DESC_CTL0_IOC                   0x20000000      /* IRQ on complete */
+-#define BGMAC_DESC_CTL0_SOF                   0x40000000      /* Start of frame */
+-#define BGMAC_DESC_CTL0_EOF                   0x80000000      /* End of frame */
++#define BGMAC_DESC_CTL0_EOF                   0x40000000      /* End of frame */
++#define BGMAC_DESC_CTL0_SOF                   0x80000000      /* Start of frame */
+ #define BGMAC_DESC_CTL1_LEN                   0x00001FFF
+ #define BGMAC_PHY_NOREGS                      0x1E
diff --git a/target/linux/generic/patches-3.18/077-02-bgmac-implement-GRO-and-use-build_skb.patch b/target/linux/generic/patches-3.18/077-02-bgmac-implement-GRO-and-use-build_skb.patch
new file mode 100644 (file)
index 0000000..3636fb6
--- /dev/null
@@ -0,0 +1,189 @@
+From: Felix Fietkau <nbd@openwrt.org>
+Date: Mon, 23 Mar 2015 02:41:25 +0100
+Subject: [PATCH] bgmac: implement GRO and use build_skb
+
+This improves performance for routing and local rx
+
+Signed-off-by: Felix Fietkau <nbd@openwrt.org>
+---
+
+--- a/drivers/net/ethernet/broadcom/bgmac.c
++++ b/drivers/net/ethernet/broadcom/bgmac.c
+@@ -276,31 +276,31 @@ static int bgmac_dma_rx_skb_for_slot(str
+                                    struct bgmac_slot_info *slot)
+ {
+       struct device *dma_dev = bgmac->core->dma_dev;
+-      struct sk_buff *skb;
+       dma_addr_t dma_addr;
+       struct bgmac_rx_header *rx;
++      void *buf;
+       /* Alloc skb */
+-      skb = netdev_alloc_skb(bgmac->net_dev, BGMAC_RX_BUF_SIZE);
+-      if (!skb)
++      buf = netdev_alloc_frag(BGMAC_RX_ALLOC_SIZE);
++      if (!buf)
+               return -ENOMEM;
+       /* Poison - if everything goes fine, hardware will overwrite it */
+-      rx = (struct bgmac_rx_header *)skb->data;
++      rx = buf;
+       rx->len = cpu_to_le16(0xdead);
+       rx->flags = cpu_to_le16(0xbeef);
+       /* Map skb for the DMA */
+-      dma_addr = dma_map_single(dma_dev, skb->data,
+-                                BGMAC_RX_BUF_SIZE, DMA_FROM_DEVICE);
++      dma_addr = dma_map_single(dma_dev, buf, BGMAC_RX_BUF_SIZE,
++                                DMA_FROM_DEVICE);
+       if (dma_mapping_error(dma_dev, dma_addr)) {
+               bgmac_err(bgmac, "DMA mapping error\n");
+-              dev_kfree_skb(skb);
++              put_page(virt_to_head_page(buf));
+               return -ENOMEM;
+       }
+       /* Update the slot */
+-      slot->skb = skb;
++      slot->buf = buf;
+       slot->dma_addr = dma_addr;
+       return 0;
+@@ -343,8 +343,9 @@ static int bgmac_dma_rx_read(struct bgma
+       while (ring->start != ring->end) {
+               struct device *dma_dev = bgmac->core->dma_dev;
+               struct bgmac_slot_info *slot = &ring->slots[ring->start];
+-              struct sk_buff *skb = slot->skb;
+-              struct bgmac_rx_header *rx;
++              struct bgmac_rx_header *rx = slot->buf;
++              struct sk_buff *skb;
++              void *buf = slot->buf;
+               u16 len, flags;
+               /* Unmap buffer to make it accessible to the CPU */
+@@ -352,7 +353,6 @@ static int bgmac_dma_rx_read(struct bgma
+                                       BGMAC_RX_BUF_SIZE, DMA_FROM_DEVICE);
+               /* Get info from the header */
+-              rx = (struct bgmac_rx_header *)skb->data;
+               len = le16_to_cpu(rx->len);
+               flags = le16_to_cpu(rx->flags);
+@@ -393,12 +393,13 @@ static int bgmac_dma_rx_read(struct bgma
+                       dma_unmap_single(dma_dev, old_dma_addr,
+                                        BGMAC_RX_BUF_SIZE, DMA_FROM_DEVICE);
++                      skb = build_skb(buf, BGMAC_RX_ALLOC_SIZE);
+                       skb_put(skb, BGMAC_RX_FRAME_OFFSET + len);
+                       skb_pull(skb, BGMAC_RX_FRAME_OFFSET);
+                       skb_checksum_none_assert(skb);
+                       skb->protocol = eth_type_trans(skb, bgmac->net_dev);
+-                      netif_receive_skb(skb);
++                      napi_gro_receive(&bgmac->napi, skb);
+                       handled++;
+               } while (0);
+@@ -434,12 +435,11 @@ static bool bgmac_dma_unaligned(struct b
+       return false;
+ }
+-static void bgmac_dma_ring_free(struct bgmac *bgmac,
+-                              struct bgmac_dma_ring *ring)
++static void bgmac_dma_tx_ring_free(struct bgmac *bgmac,
++                                 struct bgmac_dma_ring *ring)
+ {
+       struct device *dma_dev = bgmac->core->dma_dev;
+       struct bgmac_slot_info *slot;
+-      int size;
+       int i;
+       for (i = 0; i < ring->num_slots; i++) {
+@@ -451,23 +451,55 @@ static void bgmac_dma_ring_free(struct b
+                       dev_kfree_skb(slot->skb);
+               }
+       }
++}
+-      if (ring->cpu_base) {
+-              /* Free ring of descriptors */
+-              size = ring->num_slots * sizeof(struct bgmac_dma_desc);
+-              dma_free_coherent(dma_dev, size, ring->cpu_base,
+-                                ring->dma_base);
++static void bgmac_dma_rx_ring_free(struct bgmac *bgmac,
++                                 struct bgmac_dma_ring *ring)
++{
++      struct device *dma_dev = bgmac->core->dma_dev;
++      struct bgmac_slot_info *slot;
++      int i;
++
++      for (i = 0; i < ring->num_slots; i++) {
++              slot = &ring->slots[i];
++              if (!slot->buf)
++                      continue;
++
++              if (slot->dma_addr)
++                      dma_unmap_single(dma_dev, slot->dma_addr,
++                                       BGMAC_RX_BUF_SIZE,
++                                       DMA_FROM_DEVICE);
++              put_page(virt_to_head_page(slot->buf));
+       }
+ }
++static void bgmac_dma_ring_desc_free(struct bgmac *bgmac,
++                                   struct bgmac_dma_ring *ring)
++{
++      struct device *dma_dev = bgmac->core->dma_dev;
++      int size;
++
++      if (!ring->cpu_base)
++          return;
++
++      /* Free ring of descriptors */
++      size = ring->num_slots * sizeof(struct bgmac_dma_desc);
++      dma_free_coherent(dma_dev, size, ring->cpu_base,
++                        ring->dma_base);
++}
++
+ static void bgmac_dma_free(struct bgmac *bgmac)
+ {
+       int i;
+-      for (i = 0; i < BGMAC_MAX_TX_RINGS; i++)
+-              bgmac_dma_ring_free(bgmac, &bgmac->tx_ring[i]);
+-      for (i = 0; i < BGMAC_MAX_RX_RINGS; i++)
+-              bgmac_dma_ring_free(bgmac, &bgmac->rx_ring[i]);
++      for (i = 0; i < BGMAC_MAX_TX_RINGS; i++) {
++              bgmac_dma_tx_ring_free(bgmac, &bgmac->tx_ring[i]);
++              bgmac_dma_ring_desc_free(bgmac, &bgmac->tx_ring[i]);
++      }
++      for (i = 0; i < BGMAC_MAX_RX_RINGS; i++) {
++              bgmac_dma_rx_ring_free(bgmac, &bgmac->rx_ring[i]);
++              bgmac_dma_ring_desc_free(bgmac, &bgmac->rx_ring[i]);
++      }
+ }
+ static int bgmac_dma_alloc(struct bgmac *bgmac)
+--- a/drivers/net/ethernet/broadcom/bgmac.h
++++ b/drivers/net/ethernet/broadcom/bgmac.h
+@@ -362,6 +362,8 @@
+ #define BGMAC_RX_FRAME_OFFSET                 30              /* There are 2 unused bytes between header and real data */
+ #define BGMAC_RX_MAX_FRAME_SIZE                       1536            /* Copied from b44/tg3 */
+ #define BGMAC_RX_BUF_SIZE                     (BGMAC_RX_FRAME_OFFSET + BGMAC_RX_MAX_FRAME_SIZE)
++#define BGMAC_RX_ALLOC_SIZE                   (SKB_DATA_ALIGN(BGMAC_RX_BUF_SIZE) + \
++                                               SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
+ #define BGMAC_BFL_ENETROBO                    0x0010          /* has ephy roboswitch spi */
+ #define BGMAC_BFL_ENETADM                     0x0080          /* has ADMtek switch */
+@@ -383,7 +385,10 @@
+ #define ETHER_MAX_LEN   1518
+ struct bgmac_slot_info {
+-      struct sk_buff *skb;
++      union {
++              struct sk_buff *skb;
++              void *buf;
++      };
+       dma_addr_t dma_addr;
+ };
diff --git a/target/linux/generic/patches-3.18/077-03-bgmac-implement-scatter-gather-support.patch b/target/linux/generic/patches-3.18/077-03-bgmac-implement-scatter-gather-support.patch
new file mode 100644 (file)
index 0000000..5cb21a5
--- /dev/null
@@ -0,0 +1,267 @@
+From: Felix Fietkau <nbd@openwrt.org>
+Date: Mon, 23 Mar 2015 02:42:26 +0100
+Subject: [PATCH] bgmac: implement scatter/gather support
+
+Always use software checksumming, since the hardware does not have any
+checksum offload support.
+This significantly improves local TCP tx performance.
+
+Signed-off-by: Felix Fietkau <nbd@openwrt.org>
+---
+
+--- a/drivers/net/ethernet/broadcom/bgmac.c
++++ b/drivers/net/ethernet/broadcom/bgmac.c
+@@ -115,53 +115,91 @@ static void bgmac_dma_tx_enable(struct b
+       bgmac_write(bgmac, ring->mmio_base + BGMAC_DMA_TX_CTL, ctl);
+ }
++static void
++bgmac_dma_tx_add_buf(struct bgmac *bgmac, struct bgmac_dma_ring *ring,
++                   int i, int len, u32 ctl0)
++{
++      struct bgmac_slot_info *slot;
++      struct bgmac_dma_desc *dma_desc;
++      u32 ctl1;
++
++      if (i == ring->num_slots - 1)
++              ctl0 |= BGMAC_DESC_CTL0_EOT;
++
++      ctl1 = len & BGMAC_DESC_CTL1_LEN;
++
++      slot = &ring->slots[i];
++      dma_desc = &ring->cpu_base[i];
++      dma_desc->addr_low = cpu_to_le32(lower_32_bits(slot->dma_addr));
++      dma_desc->addr_high = cpu_to_le32(upper_32_bits(slot->dma_addr));
++      dma_desc->ctl0 = cpu_to_le32(ctl0);
++      dma_desc->ctl1 = cpu_to_le32(ctl1);
++}
++
+ static netdev_tx_t bgmac_dma_tx_add(struct bgmac *bgmac,
+                                   struct bgmac_dma_ring *ring,
+                                   struct sk_buff *skb)
+ {
+       struct device *dma_dev = bgmac->core->dma_dev;
+       struct net_device *net_dev = bgmac->net_dev;
+-      struct bgmac_dma_desc *dma_desc;
+-      struct bgmac_slot_info *slot;
+-      u32 ctl0, ctl1;
++      struct bgmac_slot_info *slot = &ring->slots[ring->end];
+       int free_slots;
++      int nr_frags;
++      u32 flags;
++      int index = ring->end;
++      int i;
+       if (skb->len > BGMAC_DESC_CTL1_LEN) {
+               bgmac_err(bgmac, "Too long skb (%d)\n", skb->len);
+-              goto err_stop_drop;
++              goto err_drop;
+       }
++      if (skb->ip_summed == CHECKSUM_PARTIAL)
++              skb_checksum_help(skb);
++
++      nr_frags = skb_shinfo(skb)->nr_frags;
++
+       if (ring->start <= ring->end)
+               free_slots = ring->start - ring->end + BGMAC_TX_RING_SLOTS;
+       else
+               free_slots = ring->start - ring->end;
+-      if (free_slots == 1) {
++
++      if (free_slots <= nr_frags + 1) {
+               bgmac_err(bgmac, "TX ring is full, queue should be stopped!\n");
+               netif_stop_queue(net_dev);
+               return NETDEV_TX_BUSY;
+       }
+-      slot = &ring->slots[ring->end];
+-      slot->skb = skb;
+-      slot->dma_addr = dma_map_single(dma_dev, skb->data, skb->len,
++      slot->dma_addr = dma_map_single(dma_dev, skb->data, skb_headlen(skb),
+                                       DMA_TO_DEVICE);
+-      if (dma_mapping_error(dma_dev, slot->dma_addr)) {
+-              bgmac_err(bgmac, "Mapping error of skb on ring 0x%X\n",
+-                        ring->mmio_base);
+-              goto err_stop_drop;
+-      }
++      if (unlikely(dma_mapping_error(dma_dev, slot->dma_addr)))
++              goto err_dma_head;
+-      ctl0 = BGMAC_DESC_CTL0_IOC | BGMAC_DESC_CTL0_SOF | BGMAC_DESC_CTL0_EOF;
+-      if (ring->end == ring->num_slots - 1)
+-              ctl0 |= BGMAC_DESC_CTL0_EOT;
+-      ctl1 = skb->len & BGMAC_DESC_CTL1_LEN;
++      flags = BGMAC_DESC_CTL0_SOF;
++      if (!nr_frags)
++              flags |= BGMAC_DESC_CTL0_EOF | BGMAC_DESC_CTL0_IOC;
++
++      bgmac_dma_tx_add_buf(bgmac, ring, index, skb_headlen(skb), flags);
++      flags = 0;
++
++      for (i = 0; i < nr_frags; i++) {
++              struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[i];
++              int len = skb_frag_size(frag);
++
++              index = (index + 1) % BGMAC_TX_RING_SLOTS;
++              slot = &ring->slots[index];
++              slot->dma_addr = skb_frag_dma_map(dma_dev, frag, 0,
++                                                len, DMA_TO_DEVICE);
++              if (unlikely(dma_mapping_error(dma_dev, slot->dma_addr)))
++                      goto err_dma;
+-      dma_desc = ring->cpu_base;
+-      dma_desc += ring->end;
+-      dma_desc->addr_low = cpu_to_le32(lower_32_bits(slot->dma_addr));
+-      dma_desc->addr_high = cpu_to_le32(upper_32_bits(slot->dma_addr));
+-      dma_desc->ctl0 = cpu_to_le32(ctl0);
+-      dma_desc->ctl1 = cpu_to_le32(ctl1);
++              if (i == nr_frags - 1)
++                      flags |= BGMAC_DESC_CTL0_EOF | BGMAC_DESC_CTL0_IOC;
++
++              bgmac_dma_tx_add_buf(bgmac, ring, index, len, flags);
++      }
++
++      slot->skb = skb;
+       netdev_sent_queue(net_dev, skb->len);
+@@ -170,20 +208,35 @@ static netdev_tx_t bgmac_dma_tx_add(stru
+       /* Increase ring->end to point empty slot. We tell hardware the first
+        * slot it should *not* read.
+        */
+-      if (++ring->end >= BGMAC_TX_RING_SLOTS)
+-              ring->end = 0;
++      ring->end = (index + 1) % BGMAC_TX_RING_SLOTS;
+       bgmac_write(bgmac, ring->mmio_base + BGMAC_DMA_TX_INDEX,
+                   ring->index_base +
+                   ring->end * sizeof(struct bgmac_dma_desc));
+-      /* Always keep one slot free to allow detecting bugged calls. */
+-      if (--free_slots == 1)
++      free_slots -= nr_frags + 1;
++      if (free_slots < 8)
+               netif_stop_queue(net_dev);
+       return NETDEV_TX_OK;
+-err_stop_drop:
+-      netif_stop_queue(net_dev);
++err_dma:
++      /* 'slot' is the frag that failed to map; the head buffer is at ring->end */
++      dma_unmap_single(dma_dev, ring->slots[ring->end].dma_addr,
++                       skb_headlen(skb), DMA_TO_DEVICE);
++      /* Unwind the frags that did map: slots ring->end + i down to ring->end + 1 */
++      for (; i > 0; i--) {
++              int index = (ring->end + i) % BGMAC_TX_RING_SLOTS;
++              struct bgmac_slot_info *slot = &ring->slots[index];
++              u32 ctl1 = le32_to_cpu(ring->cpu_base[index].ctl1);
++              int len = ctl1 & BGMAC_DESC_CTL1_LEN;
++              dma_unmap_page(dma_dev, slot->dma_addr, len, DMA_TO_DEVICE);
++      }
++
++err_dma_head:
++      bgmac_err(bgmac, "Mapping error of skb on ring 0x%X\n",
++                ring->mmio_base);
++
++err_drop:
+       dev_kfree_skb(skb);
+       return NETDEV_TX_OK;
+ }
+@@ -205,32 +258,45 @@ static void bgmac_dma_tx_free(struct bgm
+       while (ring->start != empty_slot) {
+               struct bgmac_slot_info *slot = &ring->slots[ring->start];
++              u32 ctl1 = le32_to_cpu(ring->cpu_base[ring->start].ctl1);
++              int len = ctl1 & BGMAC_DESC_CTL1_LEN;
+-              if (slot->skb) {
++              if (!slot->dma_addr) {
++                      bgmac_err(bgmac, "Hardware reported transmission for empty TX ring slot %d! End of ring: %d\n",
++                                ring->start, ring->end);
++                      goto next;
++              }
++              /* SOF is a ctl0 bit; it marks the head, which was mapped with dma_map_single */
++              if (le32_to_cpu(ring->cpu_base[ring->start].ctl0) & BGMAC_DESC_CTL0_SOF)
+                       /* Unmap no longer used buffer */
+-                      dma_unmap_single(dma_dev, slot->dma_addr,
+-                                       slot->skb->len, DMA_TO_DEVICE);
+-                      slot->dma_addr = 0;
++                      dma_unmap_single(dma_dev, slot->dma_addr, len,
++                                       DMA_TO_DEVICE);
++              else
++                      dma_unmap_page(dma_dev, slot->dma_addr, len,
++                                     DMA_TO_DEVICE);
++              if (slot->skb) {
+                       bytes_compl += slot->skb->len;
+                       pkts_compl++;
+                       /* Free memory! :) */
+                       dev_kfree_skb(slot->skb);
+                       slot->skb = NULL;
+-              } else {
+-                      bgmac_err(bgmac, "Hardware reported transmission for empty TX ring slot %d! End of ring: %d\n",
+-                                ring->start, ring->end);
+               }
++next:
++              slot->dma_addr = 0;
+               if (++ring->start >= BGMAC_TX_RING_SLOTS)
+                       ring->start = 0;
+               freed = true;
+       }
++      if (!pkts_compl)
++              return;
++
+       netdev_completed_queue(bgmac->net_dev, pkts_compl, bytes_compl);
+-      if (freed && netif_queue_stopped(bgmac->net_dev))
++      if (netif_queue_stopped(bgmac->net_dev))
+               netif_wake_queue(bgmac->net_dev);
+ }
+@@ -439,17 +505,25 @@ static void bgmac_dma_tx_ring_free(struc
+                                  struct bgmac_dma_ring *ring)
+ {
+       struct device *dma_dev = bgmac->core->dma_dev;
++      struct bgmac_dma_desc *dma_desc = ring->cpu_base;
+       struct bgmac_slot_info *slot;
+       int i;
+       for (i = 0; i < ring->num_slots; i++) {
++              int len = le32_to_cpu(dma_desc[i].ctl1) & BGMAC_DESC_CTL1_LEN;
++
+               slot = &ring->slots[i];
+-              if (slot->skb) {
+-                      if (slot->dma_addr)
+-                              dma_unmap_single(dma_dev, slot->dma_addr,
+-                                               slot->skb->len, DMA_TO_DEVICE);
+-                      dev_kfree_skb(slot->skb);
+-              }
++              dev_kfree_skb(slot->skb);
++
++              if (!slot->dma_addr)
++                      continue;
++
++              if (slot->skb)
++                      dma_unmap_single(dma_dev, slot->dma_addr,
++                                       len, DMA_TO_DEVICE);
++              else
++                      dma_unmap_page(dma_dev, slot->dma_addr,
++                                     len, DMA_TO_DEVICE);
+       }
+ }
+@@ -1583,6 +1657,10 @@ static int bgmac_probe(struct bcma_devic
+               goto err_dma_free;
+       }
++      net_dev->features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
++      net_dev->hw_features = net_dev->features;
++      net_dev->vlan_features = net_dev->features;
++
+       err = register_netdev(bgmac->net_dev);
+       if (err) {
+               bgmac_err(bgmac, "Cannot register net device\n");