1 From: Felix Fietkau <nbd@nbd.name>
2 Date: Thu, 16 Feb 2023 18:39:04 +0100
3 Subject: [PATCH] net/core: add optional threading for backlog processing
5 When dealing with few flows or an imbalance in CPU utilization, static RPS
6 CPU assignment can be too inflexible. Add support for enabling threaded NAPI
7 for backlog processing in order to allow the scheduler to better balance
8 processing. This helps spread the load more evenly across idle CPUs.
10 Signed-off-by: Felix Fietkau <nbd@nbd.name>
13 --- a/include/linux/netdevice.h
14 +++ b/include/linux/netdevice.h
15 @@ -558,6 +558,7 @@ static inline bool napi_complete(struct
18 int dev_set_threaded(struct net_device *dev, bool threaded);
19 +int backlog_set_threaded(bool threaded);
22 * napi_disable - prevent NAPI from scheduling
23 @@ -3238,6 +3239,7 @@ struct softnet_data {
25 unsigned int processed;
26 unsigned int time_squeeze;
27 + unsigned int process_queue_empty;
29 struct softnet_data *rps_ipi_list;
33 @@ -4729,7 +4729,7 @@ static void napi_schedule_rps(struct sof
34 struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
38 + if (sd != mysd && !test_bit(NAPI_STATE_THREADED, &sd->backlog.state)) {
39 sd->rps_ipi_next = mysd->rps_ipi_list;
40 mysd->rps_ipi_list = sd;
42 @@ -5848,6 +5848,8 @@ static DEFINE_PER_CPU(struct work_struct
43 /* Network device is going away, flush any packets still pending */
44 static void flush_backlog(struct work_struct *work)
46 + unsigned int process_queue_empty;
47 + bool threaded, flush_processq;
48 struct sk_buff *skb, *tmp;
49 struct softnet_data *sd;
51 @@ -5862,8 +5864,17 @@ static void flush_backlog(struct work_st
52 input_queue_head_incr(sd);
56 + threaded = test_bit(NAPI_STATE_THREADED, &sd->backlog.state);
57 + flush_processq = threaded &&
58 + !skb_queue_empty_lockless(&sd->process_queue);
60 + process_queue_empty = sd->process_queue_empty;
61 rps_unlock_irq_enable(sd);
66 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
67 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
68 __skb_unlink(skb, &sd->process_queue);
69 @@ -5871,7 +5882,16 @@ static void flush_backlog(struct work_st
70 input_queue_head_incr(sd);
77 + while (flush_processq) {
79 + rps_lock_irq_disable(sd);
80 + flush_processq = process_queue_empty == sd->process_queue_empty;
81 + rps_unlock_irq_enable(sd);
85 static bool flush_required(int cpu)
86 @@ -6003,6 +6023,7 @@ static int process_backlog(struct napi_s
89 rps_lock_irq_disable(sd);
90 + sd->process_queue_empty++;
91 if (skb_queue_empty(&sd->input_pkt_queue)) {
93 * Inline a custom version of __napi_complete().
94 @@ -6012,7 +6033,8 @@ static int process_backlog(struct napi_s
95 * We can use a plain write instead of clear_bit(),
96 * and we dont need an smp_mb() memory barrier.
99 + napi->state &= ~(NAPIF_STATE_SCHED |
100 + NAPIF_STATE_SCHED_THREADED);
103 skb_queue_splice_tail_init(&sd->input_pkt_queue,
104 @@ -6426,6 +6448,55 @@ int dev_set_threaded(struct net_device *
106 EXPORT_SYMBOL(dev_set_threaded);
108 +int backlog_set_threaded(bool threaded)
110 + static bool backlog_threaded;
114 + if (backlog_threaded == threaded)
117 + for_each_possible_cpu(i) {
118 + struct softnet_data *sd = &per_cpu(softnet_data, i);
119 + struct napi_struct *n = &sd->backlog;
123 + n->thread = kthread_run(napi_threaded_poll, n, "napi/backlog-%d", i);
124 + if (IS_ERR(n->thread)) {
125 + err = PTR_ERR(n->thread);
126 + pr_err("kthread_run failed with err %d\n", err);
134 + backlog_threaded = threaded;
136 + /* Make sure kthread is created before THREADED bit
139 + smp_mb__before_atomic();
141 + for_each_possible_cpu(i) {
142 + struct softnet_data *sd = &per_cpu(softnet_data, i);
143 + struct napi_struct *n = &sd->backlog;
144 + unsigned long flags;
146 + rps_lock_irqsave(sd, &flags);
148 + n->state |= NAPIF_STATE_THREADED;
150 + n->state &= ~NAPIF_STATE_THREADED;
151 + rps_unlock_irq_restore(sd, &flags);
157 void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
158 int (*poll)(struct napi_struct *, int), int weight)
160 @@ -11351,6 +11422,9 @@ static int dev_cpu_dead(unsigned int old
161 raise_softirq_irqoff(NET_TX_SOFTIRQ);
164 + if (test_bit(NAPI_STATE_THREADED, &oldsd->backlog.state))
168 remsd = oldsd->rps_ipi_list;
169 oldsd->rps_ipi_list = NULL;
170 @@ -11666,6 +11740,7 @@ static int __init net_dev_init(void)
171 INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd);
172 spin_lock_init(&sd->defer_lock);
174 + INIT_LIST_HEAD(&sd->backlog.poll_list);
175 init_gro_hash(&sd->backlog);
176 sd->backlog.poll = process_backlog;
177 sd->backlog.weight = weight_p;
178 --- a/net/core/sysctl_net_core.c
179 +++ b/net/core/sysctl_net_core.c
180 @@ -30,6 +30,7 @@ static int int_3600 = 3600;
181 static int min_sndbuf = SOCK_MIN_SNDBUF;
182 static int min_rcvbuf = SOCK_MIN_RCVBUF;
183 static int max_skb_frags = MAX_SKB_FRAGS;
184 +static int backlog_threaded;
186 static int net_msg_warn; /* Unused, but still a sysctl */
188 @@ -188,6 +189,23 @@ static int rps_sock_flow_sysctl(struct c
190 #endif /* CONFIG_RPS */
192 +static int backlog_threaded_sysctl(struct ctl_table *table, int write,
193 + void *buffer, size_t *lenp, loff_t *ppos)
195 + static DEFINE_MUTEX(backlog_threaded_mutex);
198 + mutex_lock(&backlog_threaded_mutex);
200 + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
202 + ret = backlog_set_threaded(backlog_threaded);
204 + mutex_unlock(&backlog_threaded_mutex);
209 #ifdef CONFIG_NET_FLOW_LIMIT
210 static DEFINE_MUTEX(flow_limit_update_mutex);
212 @@ -532,6 +550,15 @@ static struct ctl_table net_core_table[]
213 .proc_handler = rps_sock_flow_sysctl
217 + .procname = "backlog_threaded",
218 + .data = &backlog_threaded,
219 + .maxlen = sizeof(unsigned int),
221 + .proc_handler = backlog_threaded_sysctl,
222 + .extra1 = SYSCTL_ZERO,
223 + .extra2 = SYSCTL_ONE
225 #ifdef CONFIG_NET_FLOW_LIMIT
227 .procname = "flow_limit_cpu_bitmap",