kernel: add support for threaded network backlog processing
author     Felix Fietkau <nbd@nbd.name>     Sun, 19 Feb 2023 14:01:43 +0000 (15:01 +0100)
committer  Felix Fietkau <nbd@nbd.name>     Fri, 24 Mar 2023 17:22:39 +0000 (18:22 +0100)
This can improve load balancing by pushing backlog (and RPS) processing
to separate threads, allowing the scheduler to distribute the load.
It can be enabled with: echo 1 > /proc/sys/net/core/backlog_threaded

Signed-off-by: Felix Fietkau <nbd@nbd.name>
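
For reference, a minimal sketch of enabling the feature at runtime and keeping it
across reboots. The sysctl name net.core.backlog_threaded comes from the patch
below; the /etc/sysctl.d/ drop-in location is an assumption about the target's
sysctl setup, not something this commit installs:

    # enable threaded backlog/RPS processing at runtime
    echo 1 > /proc/sys/net/core/backlog_threaded

    # assumed persistence mechanism: a drop-in read by the sysctl init script at boot
    echo "net.core.backlog_threaded=1" > /etc/sysctl.d/99-backlog-threaded.conf
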
target/linux/generic/hack-5.15/721-net-add-packet-mangeling.patch
target/linux/generic/pending-5.15/760-net-core-add-optional-threading-for-backlog-processi.patch [new file with mode: 0644]

diff --git a/target/linux/generic/hack-5.15/721-net-add-packet-mangeling.patch b/target/linux/generic/hack-5.15/721-net-add-packet-mangeling.patch
index 9dc86303a7aa83b07ec9d7d9ca377e92f03a030d..a1d621a7a9e51957593b5aeecfb86e324f2eb52d 100644
--- a/target/linux/generic/hack-5.15/721-net-add-packet-mangeling.patch
+++ b/target/linux/generic/hack-5.15/721-net-add-packet-mangeling.patch
@@ -19,7 +19,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
 
 --- a/include/linux/netdevice.h
 +++ b/include/linux/netdevice.h
-@@ -1676,6 +1676,10 @@ enum netdev_priv_flags {
+@@ -1677,6 +1677,10 @@ enum netdev_priv_flags {
        IFF_TX_SKB_NO_LINEAR            = BIT_ULL(31),
  };
  
@@ -30,7 +30,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
  #define IFF_802_1Q_VLAN                       IFF_802_1Q_VLAN
  #define IFF_EBRIDGE                   IFF_EBRIDGE
  #define IFF_BONDING                   IFF_BONDING
-@@ -1708,6 +1712,7 @@ enum netdev_priv_flags {
+@@ -1709,6 +1713,7 @@ enum netdev_priv_flags {
  #define IFF_L3MDEV_RX_HANDLER         IFF_L3MDEV_RX_HANDLER
  #define IFF_LIVE_RENAME_OK            IFF_LIVE_RENAME_OK
  #define IFF_TX_SKB_NO_LINEAR          IFF_TX_SKB_NO_LINEAR
@@ -38,7 +38,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
  
  /* Specifies the type of the struct net_device::ml_priv pointer */
  enum netdev_ml_priv_type {
-@@ -2009,6 +2014,7 @@ struct net_device {
+@@ -2010,6 +2015,7 @@ struct net_device {
        /* Read-mostly cache-line for fast-path access */
        unsigned int            flags;
        unsigned int            priv_flags;
@@ -46,7 +46,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
        const struct net_device_ops *netdev_ops;
        int                     ifindex;
        unsigned short          gflags;
-@@ -2069,6 +2075,11 @@ struct net_device {
+@@ -2070,6 +2076,11 @@ struct net_device {
        const struct tlsdev_ops *tlsdev_ops;
  #endif
  
@@ -58,7 +58,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
        const struct header_ops *header_ops;
  
        unsigned char           operstate;
-@@ -2143,6 +2154,10 @@ struct net_device {
+@@ -2144,6 +2155,10 @@ struct net_device {
        struct mctp_dev __rcu   *mctp_ptr;
  #endif
  
diff --git a/target/linux/generic/pending-5.15/760-net-core-add-optional-threading-for-backlog-processi.patch b/target/linux/generic/pending-5.15/760-net-core-add-optional-threading-for-backlog-processi.patch
new file mode 100644
index 0000000..463f405
--- /dev/null
+++ b/target/linux/generic/pending-5.15/760-net-core-add-optional-threading-for-backlog-processi.patch
@@ -0,0 +1,224 @@
+From: Felix Fietkau <nbd@nbd.name>
+Date: Thu, 16 Feb 2023 18:39:04 +0100
+Subject: [PATCH] net/core: add optional threading for backlog processing
+
+When dealing with few flows or an imbalance in CPU utilization, static RPS
+CPU assignment can be too inflexible. Add support for enabling threaded NAPI
+for backlog processing in order to allow the scheduler to better balance
+processing. This helps better spread the load across idle CPUs.
+
+Signed-off-by: Felix Fietkau <nbd@nbd.name>
+---
+
+--- a/include/linux/netdevice.h
++++ b/include/linux/netdevice.h
+@@ -500,6 +500,7 @@ static inline bool napi_complete(struct
+ }
+ int dev_set_threaded(struct net_device *dev, bool threaded);
++int backlog_set_threaded(bool threaded);
+ /**
+  *    napi_disable - prevent NAPI from scheduling
+@@ -3363,6 +3364,7 @@ struct softnet_data {
+       unsigned int            processed;
+       unsigned int            time_squeeze;
+       unsigned int            received_rps;
++      unsigned int            process_queue_empty;
+ #ifdef CONFIG_RPS
+       struct softnet_data     *rps_ipi_list;
+ #endif
+--- a/net/core/dev.c
++++ b/net/core/dev.c
+@@ -4574,7 +4574,7 @@ static int rps_ipi_queued(struct softnet
+ #ifdef CONFIG_RPS
+       struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
+-      if (sd != mysd) {
++      if (sd != mysd && !test_bit(NAPI_STATE_THREADED, &sd->backlog.state)) {
+               sd->rps_ipi_next = mysd->rps_ipi_list;
+               mysd->rps_ipi_list = sd;
+@@ -5755,6 +5755,8 @@ static DEFINE_PER_CPU(struct work_struct
+ /* Network device is going away, flush any packets still pending */
+ static void flush_backlog(struct work_struct *work)
+ {
++      unsigned int process_queue_empty;
++      bool threaded, flush_processq;
+       struct sk_buff *skb, *tmp;
+       struct softnet_data *sd;
+@@ -5770,9 +5772,18 @@ static void flush_backlog(struct work_st
+                       input_queue_head_incr(sd);
+               }
+       }
++
++      threaded = test_bit(NAPI_STATE_THREADED, &sd->backlog.state);
++      flush_processq = threaded &&
++                       !skb_queue_empty_lockless(&sd->process_queue);
++      if (flush_processq)
++              process_queue_empty = sd->process_queue_empty;
+       rps_unlock(sd);
+       local_irq_enable();
++      if (threaded)
++              goto out;
++
+       skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
+               if (skb->dev->reg_state == NETREG_UNREGISTERING) {
+                       __skb_unlink(skb, &sd->process_queue);
+@@ -5780,7 +5791,18 @@ static void flush_backlog(struct work_st
+                       input_queue_head_incr(sd);
+               }
+       }
++
++out:
+       local_bh_enable();
++
++      while (flush_processq) {
++              msleep(1);
++              local_irq_disable();
++              rps_lock(sd);
++              flush_processq = process_queue_empty == sd->process_queue_empty;
++              rps_unlock(sd);
++              local_irq_enable();
++      }
+ }
+ static bool flush_required(int cpu)
+@@ -6463,6 +6485,7 @@ static int process_backlog(struct napi_s
+               local_irq_disable();
+               rps_lock(sd);
++              sd->process_queue_empty++;
+               if (skb_queue_empty(&sd->input_pkt_queue)) {
+                       /*
+                        * Inline a custom version of __napi_complete().
+@@ -6472,7 +6495,8 @@ static int process_backlog(struct napi_s
+                        * We can use a plain write instead of clear_bit(),
+                        * and we dont need an smp_mb() memory barrier.
+                        */
+-                      napi->state = 0;
++                      napi->state &= ~(NAPIF_STATE_SCHED |
++                                       NAPIF_STATE_SCHED_THREADED);
+                       again = false;
+               } else {
+                       skb_queue_splice_tail_init(&sd->input_pkt_queue,
+@@ -6889,6 +6913,57 @@ int dev_set_threaded(struct net_device *
+ }
+ EXPORT_SYMBOL(dev_set_threaded);
++int backlog_set_threaded(bool threaded)
++{
++      static bool backlog_threaded;
++      int err = 0;
++      int i;
++
++      if (backlog_threaded == threaded)
++              return 0;
++
++      for_each_possible_cpu(i) {
++              struct softnet_data *sd = &per_cpu(softnet_data, i);
++              struct napi_struct *n = &sd->backlog;
++
++              if (n->thread)
++                      continue;
++              n->thread = kthread_run(napi_threaded_poll, n, "napi/backlog-%d", i);
++              if (IS_ERR(n->thread)) {
++                      err = PTR_ERR(n->thread);
++                      pr_err("kthread_run failed with err %d\n", err);
++                      n->thread = NULL;
++                      threaded = false;
++                      break;
++              }
++
++      }
++
++      backlog_threaded = threaded;
++
++      /* Make sure kthread is created before THREADED bit
++       * is set.
++       */
++      smp_mb__before_atomic();
++
++      for_each_possible_cpu(i) {
++              struct softnet_data *sd = &per_cpu(softnet_data, i);
++              struct napi_struct *n = &sd->backlog;
++              unsigned long flags;
++
++              local_irq_save(flags);
++              rps_lock(sd);
++              if (threaded)
++                      n->state |= NAPIF_STATE_THREADED;
++              else
++                      n->state &= ~NAPIF_STATE_THREADED;
++              rps_unlock(sd);
++              local_irq_restore(flags);
++      }
++
++      return err;
++}
++
+ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
+                   int (*poll)(struct napi_struct *, int), int weight)
+ {
+@@ -11367,6 +11442,9 @@ static int dev_cpu_dead(unsigned int old
+       raise_softirq_irqoff(NET_TX_SOFTIRQ);
+       local_irq_enable();
++      if (test_bit(NAPI_STATE_THREADED, &oldsd->backlog.state))
++              return 0;
++
+ #ifdef CONFIG_RPS
+       remsd = oldsd->rps_ipi_list;
+       oldsd->rps_ipi_list = NULL;
+--- a/net/core/sysctl_net_core.c
++++ b/net/core/sysctl_net_core.c
+@@ -28,6 +28,7 @@ static int int_3600 = 3600;
+ static int min_sndbuf = SOCK_MIN_SNDBUF;
+ static int min_rcvbuf = SOCK_MIN_RCVBUF;
+ static int max_skb_frags = MAX_SKB_FRAGS;
++static int backlog_threaded;
+ static long long_one __maybe_unused = 1;
+ static long long_max __maybe_unused = LONG_MAX;
+@@ -114,6 +115,23 @@ static int rps_sock_flow_sysctl(struct c
+ }
+ #endif /* CONFIG_RPS */
++static int backlog_threaded_sysctl(struct ctl_table *table, int write,
++                             void *buffer, size_t *lenp, loff_t *ppos)
++{
++      static DEFINE_MUTEX(backlog_threaded_mutex);
++      int ret;
++
++      mutex_lock(&backlog_threaded_mutex);
++
++      ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
++      if (write && !ret)
++              ret = backlog_set_threaded(backlog_threaded);
++
++      mutex_unlock(&backlog_threaded_mutex);
++
++      return ret;
++}
++
+ #ifdef CONFIG_NET_FLOW_LIMIT
+ static DEFINE_MUTEX(flow_limit_update_mutex);
+@@ -470,6 +488,15 @@ static struct ctl_table net_core_table[]
+               .proc_handler   = rps_sock_flow_sysctl
+       },
+ #endif
++      {
++              .procname       = "backlog_threaded",
++              .data           = &backlog_threaded,
++              .maxlen         = sizeof(unsigned int),
++              .mode           = 0644,
++              .proc_handler   = backlog_threaded_sysctl,
++              .extra1         = SYSCTL_ZERO,
++              .extra2         = SYSCTL_ONE
++      },
+ #ifdef CONFIG_NET_FLOW_LIMIT
+       {
+               .procname       = "flow_limit_cpu_bitmap",
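
A rough way to check that the new mode is active after applying this patch (a
sketch; the kthread name pattern "napi/backlog-<cpu>" is taken from
backlog_set_threaded() above, and the BusyBox-style ps invocation is an
assumption about the userspace):

    echo 1 > /proc/sys/net/core/backlog_threaded
    # one backlog kthread per possible CPU should now be running
    ps | grep '[n]api/backlog'
    # reads back 1 once the kthreads have been created
    cat /proc/sys/net/core/backlog_threaded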