--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
-@@ -1676,6 +1676,10 @@ enum netdev_priv_flags {
+@@ -1677,6 +1677,10 @@ enum netdev_priv_flags {
IFF_TX_SKB_NO_LINEAR = BIT_ULL(31),
};
#define IFF_802_1Q_VLAN IFF_802_1Q_VLAN
#define IFF_EBRIDGE IFF_EBRIDGE
#define IFF_BONDING IFF_BONDING
-@@ -1708,6 +1712,7 @@ enum netdev_priv_flags {
+@@ -1709,6 +1713,7 @@ enum netdev_priv_flags {
#define IFF_L3MDEV_RX_HANDLER IFF_L3MDEV_RX_HANDLER
#define IFF_LIVE_RENAME_OK IFF_LIVE_RENAME_OK
#define IFF_TX_SKB_NO_LINEAR IFF_TX_SKB_NO_LINEAR
/* Specifies the type of the struct net_device::ml_priv pointer */
enum netdev_ml_priv_type {
-@@ -2009,6 +2014,7 @@ struct net_device {
+@@ -2010,6 +2015,7 @@ struct net_device {
/* Read-mostly cache-line for fast-path access */
unsigned int flags;
unsigned int priv_flags;
const struct net_device_ops *netdev_ops;
int ifindex;
unsigned short gflags;
-@@ -2069,6 +2075,11 @@ struct net_device {
+@@ -2070,6 +2076,11 @@ struct net_device {
const struct tlsdev_ops *tlsdev_ops;
#endif
const struct header_ops *header_ops;
unsigned char operstate;
-@@ -2143,6 +2154,10 @@ struct net_device {
+@@ -2144,6 +2155,10 @@ struct net_device {
struct mctp_dev __rcu *mctp_ptr;
#endif
--- /dev/null
+From: Felix Fietkau <nbd@nbd.name>
+Date: Thu, 16 Feb 2023 18:39:04 +0100
+Subject: [PATCH] net/core: add optional threading for backlog processing
+
+When dealing with few flows or an imbalance in CPU utilization, static RPS
+CPU assignment can be too inflexible. Add support for enabling threaded NAPI
+for backlog processing in order to let the scheduler balance the work
+itself. This helps spread the load across otherwise idle CPUs.
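+
+The behaviour is controlled through the net.core.backlog_threaded sysctl
+added by this patch. Per-CPU "napi/backlog-<cpu>" kthreads are created the
+first time it is enabled, e.g.:
+
+  sysctl -w net.core.backlog_threaded=1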
+
+Signed-off-by: Felix Fietkau <nbd@nbd.name>
+---
+
+--- a/include/linux/netdevice.h
++++ b/include/linux/netdevice.h
+@@ -500,6 +500,7 @@ static inline bool napi_complete(struct
+ }
+
+ int dev_set_threaded(struct net_device *dev, bool threaded);
++int backlog_set_threaded(bool threaded);
+
+ /**
+ * napi_disable - prevent NAPI from scheduling
+@@ -3363,6 +3364,7 @@ struct softnet_data {
+ unsigned int processed;
+ unsigned int time_squeeze;
+ unsigned int received_rps;
++ unsigned int process_queue_empty;
+ #ifdef CONFIG_RPS
+ struct softnet_data *rps_ipi_list;
+ #endif
+--- a/net/core/dev.c
++++ b/net/core/dev.c
+@@ -4574,7 +4574,7 @@ static int rps_ipi_queued(struct softnet
+ #ifdef CONFIG_RPS
+ struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
+
+- if (sd != mysd) {
++ if (sd != mysd && !test_bit(NAPI_STATE_THREADED, &sd->backlog.state)) {
+ sd->rps_ipi_next = mysd->rps_ipi_list;
+ mysd->rps_ipi_list = sd;
+
+@@ -5755,6 +5755,8 @@ static DEFINE_PER_CPU(struct work_struct
+ /* Network device is going away, flush any packets still pending */
+ static void flush_backlog(struct work_struct *work)
+ {
++ unsigned int process_queue_empty;
++ bool threaded, flush_processq;
+ struct sk_buff *skb, *tmp;
+ struct softnet_data *sd;
+
+@@ -5770,9 +5772,18 @@ static void flush_backlog(struct work_st
+ input_queue_head_incr(sd);
+ }
+ }
++
++ threaded = test_bit(NAPI_STATE_THREADED, &sd->backlog.state);
++ flush_processq = threaded &&
++ !skb_queue_empty_lockless(&sd->process_queue);
++ if (flush_processq)
++ process_queue_empty = sd->process_queue_empty;
+ rps_unlock(sd);
+ local_irq_enable();
+
++ if (threaded)
++ goto out;
++
+ skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
+ if (skb->dev->reg_state == NETREG_UNREGISTERING) {
+ __skb_unlink(skb, &sd->process_queue);
+@@ -5780,7 +5791,18 @@ static void flush_backlog(struct work_st
+ input_queue_head_incr(sd);
+ }
+ }
++
++out:
+ local_bh_enable();
++
++ while (flush_processq) {
++ msleep(1);
++ local_irq_disable();
++ rps_lock(sd);
++ flush_processq = process_queue_empty == sd->process_queue_empty;
++ rps_unlock(sd);
++ local_irq_enable();
++ }
+ }
+
+ static bool flush_required(int cpu)
+@@ -6463,6 +6485,7 @@ static int process_backlog(struct napi_s
+
+ local_irq_disable();
+ rps_lock(sd);
++ sd->process_queue_empty++;
+ if (skb_queue_empty(&sd->input_pkt_queue)) {
+ /*
+ * Inline a custom version of __napi_complete().
+@@ -6472,7 +6495,8 @@ static int process_backlog(struct napi_s
+ * We can use a plain write instead of clear_bit(),
+ * and we dont need an smp_mb() memory barrier.
+ */
+- napi->state = 0;
++ napi->state &= ~(NAPIF_STATE_SCHED |
++ NAPIF_STATE_SCHED_THREADED);
+ again = false;
+ } else {
+ skb_queue_splice_tail_init(&sd->input_pkt_queue,
+@@ -6889,6 +6913,56 @@ int dev_set_threaded(struct net_device *
+ }
+ EXPORT_SYMBOL(dev_set_threaded);
+
++int backlog_set_threaded(bool threaded)
++{
++ static bool backlog_threaded;
++ int err = 0;
++ int i;
++
++ if (backlog_threaded == threaded)
++ return 0;
++
++ for_each_possible_cpu(i) {
++ struct softnet_data *sd = &per_cpu(softnet_data, i);
++ struct napi_struct *n = &sd->backlog;
++
++ if (n->thread)
++ continue;
++ n->thread = kthread_run(napi_threaded_poll, n, "napi/backlog-%d", i);
++ if (IS_ERR(n->thread)) {
++ err = PTR_ERR(n->thread);
++ pr_err("kthread_run failed with err %d\n", err);
++ n->thread = NULL;
++ threaded = false;
++ break;
++ }
++ }
++
++ backlog_threaded = threaded;
++
++ /* Make sure kthread is created before THREADED bit
++ * is set.
++ */
++ smp_mb__before_atomic();
++
++ for_each_possible_cpu(i) {
++ struct softnet_data *sd = &per_cpu(softnet_data, i);
++ struct napi_struct *n = &sd->backlog;
++ unsigned long flags;
++
++ local_irq_save(flags);
++ rps_lock(sd);
++ if (threaded)
++ n->state |= NAPIF_STATE_THREADED;
++ else
++ n->state &= ~NAPIF_STATE_THREADED;
++ rps_unlock(sd);
++ local_irq_restore(flags);
++ }
++
++ return err;
++}
++
+ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
+ int (*poll)(struct napi_struct *, int), int weight)
+ {
+@@ -11367,6 +11441,9 @@ static int dev_cpu_dead(unsigned int old
+ raise_softirq_irqoff(NET_TX_SOFTIRQ);
+ local_irq_enable();
+
++ if (test_bit(NAPI_STATE_THREADED, &oldsd->backlog.state))
++ return 0;
++
+ #ifdef CONFIG_RPS
+ remsd = oldsd->rps_ipi_list;
+ oldsd->rps_ipi_list = NULL;
+--- a/net/core/sysctl_net_core.c
++++ b/net/core/sysctl_net_core.c
+@@ -28,6 +28,7 @@ static int int_3600 = 3600;
+ static int min_sndbuf = SOCK_MIN_SNDBUF;
+ static int min_rcvbuf = SOCK_MIN_RCVBUF;
+ static int max_skb_frags = MAX_SKB_FRAGS;
++static int backlog_threaded;
+ static long long_one __maybe_unused = 1;
+ static long long_max __maybe_unused = LONG_MAX;
+
+@@ -114,6 +115,23 @@ static int rps_sock_flow_sysctl(struct c
+ }
+ #endif /* CONFIG_RPS */
+
++static int backlog_threaded_sysctl(struct ctl_table *table, int write,
++ void *buffer, size_t *lenp, loff_t *ppos)
++{
++ static DEFINE_MUTEX(backlog_threaded_mutex);
++ int ret;
++
++ mutex_lock(&backlog_threaded_mutex);
++
++ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
++ if (write && !ret)
++ ret = backlog_set_threaded(backlog_threaded);
++
++ mutex_unlock(&backlog_threaded_mutex);
++
++ return ret;
++}
++
+ #ifdef CONFIG_NET_FLOW_LIMIT
+ static DEFINE_MUTEX(flow_limit_update_mutex);
+
+@@ -470,6 +488,15 @@ static struct ctl_table net_core_table[]
+ .proc_handler = rps_sock_flow_sysctl
+ },
+ #endif
++ {
++ .procname = "backlog_threaded",
++ .data = &backlog_threaded,
++ .maxlen = sizeof(unsigned int),
++ .mode = 0644,
++ .proc_handler = backlog_threaded_sysctl,
++ .extra1 = SYSCTL_ZERO,
++ .extra2 = SYSCTL_ONE
++ },
+ #ifdef CONFIG_NET_FLOW_LIMIT
+ {
+ .procname = "flow_limit_cpu_bitmap",