openwrt/openwrt.git: target/linux/generic/pending-5.15/760-net-core-add-optional-threading-for-backlog-processi.patch
From: Felix Fietkau <nbd@nbd.name>
Date: Thu, 16 Feb 2023 18:39:04 +0100
Subject: [PATCH] net/core: add optional threading for backlog processing

When dealing with only a few flows or an imbalance in CPU utilization,
static RPS CPU assignment can be too inflexible. Add support for enabling
threaded NAPI for backlog processing in order to allow the scheduler to
better balance processing. This helps spread the load more evenly across
idle CPUs.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
---

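A quick usage sketch (assuming the sysctl added below lands in the usual
place under net.core): backlog threading can then be toggled at runtime
with

    echo 1 > /proc/sys/net/core/backlog_threaded
    sysctl -w net.core.backlog_threaded=0

Note that disabling only clears NAPIF_STATE_THREADED on the per-CPU
backlog NAPI instances; kthreads ("napi/backlog-<cpu>") created while
the knob was enabled are not stopped.
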
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -502,6 +502,7 @@ static inline bool napi_complete(struct
 }

 int dev_set_threaded(struct net_device *dev, bool threaded);
+int backlog_set_threaded(bool threaded);

 /**
  * napi_disable - prevent NAPI from scheduling
@@ -3365,6 +3366,7 @@ struct softnet_data {
         unsigned int processed;
         unsigned int time_squeeze;
         unsigned int received_rps;
+        unsigned int process_queue_empty;
 #ifdef CONFIG_RPS
         struct softnet_data *rps_ipi_list;
 #endif
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4576,7 +4576,7 @@ static int rps_ipi_queued(struct softnet
 #ifdef CONFIG_RPS
         struct softnet_data *mysd = this_cpu_ptr(&softnet_data);

-        if (sd != mysd) {
+        if (sd != mysd && !test_bit(NAPI_STATE_THREADED, &sd->backlog.state)) {
                 sd->rps_ipi_next = mysd->rps_ipi_list;
                 mysd->rps_ipi_list = sd;

@@ -5757,6 +5757,8 @@ static DEFINE_PER_CPU(struct work_struct
 /* Network device is going away, flush any packets still pending */
 static void flush_backlog(struct work_struct *work)
 {
+        unsigned int process_queue_empty;
+        bool threaded, flush_processq;
         struct sk_buff *skb, *tmp;
         struct softnet_data *sd;

@@ -5772,9 +5774,18 @@ static void flush_backlog(struct work_st
                         input_queue_head_incr(sd);
                 }
         }
+
+        threaded = test_bit(NAPI_STATE_THREADED, &sd->backlog.state);
+        flush_processq = threaded &&
+                         !skb_queue_empty_lockless(&sd->process_queue);
+        if (flush_processq)
+                process_queue_empty = sd->process_queue_empty;
         rps_unlock(sd);
         local_irq_enable();

+        if (threaded)
+                goto out;
+
         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
                 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
                         __skb_unlink(skb, &sd->process_queue);
@@ -5782,7 +5793,18 @@ static void flush_backlog(struct work_st
                         input_queue_head_incr(sd);
                 }
         }
+
+out:
         local_bh_enable();
+
+        while (flush_processq) {
+                msleep(1);
+                local_irq_disable();
+                rps_lock(sd);
+                flush_processq = process_queue_empty == sd->process_queue_empty;
+                rps_unlock(sd);
+                local_irq_enable();
+        }
 }

 static bool flush_required(int cpu)
@@ -6465,6 +6487,7 @@ static int process_backlog(struct napi_s

                 local_irq_disable();
                 rps_lock(sd);
+                sd->process_queue_empty++;
                 if (skb_queue_empty(&sd->input_pkt_queue)) {
                         /*
                          * Inline a custom version of __napi_complete().
@@ -6474,7 +6497,8 @@ static int process_backlog(struct napi_s
                          * We can use a plain write instead of clear_bit(),
                          * and we dont need an smp_mb() memory barrier.
                          */
-                        napi->state = 0;
+                        napi->state &= ~(NAPIF_STATE_SCHED |
+                                         NAPIF_STATE_SCHED_THREADED);
                         again = false;
                 } else {
                         skb_queue_splice_tail_init(&sd->input_pkt_queue,
@@ -6891,6 +6915,57 @@ int dev_set_threaded(struct net_device *
 }
 EXPORT_SYMBOL(dev_set_threaded);

+int backlog_set_threaded(bool threaded)
+{
+        static bool backlog_threaded;
+        int err = 0;
+        int i;
+
+        if (backlog_threaded == threaded)
+                return 0;
+
+        for_each_possible_cpu(i) {
+                struct softnet_data *sd = &per_cpu(softnet_data, i);
+                struct napi_struct *n = &sd->backlog;
+
+                if (n->thread)
+                        continue;
+                n->thread = kthread_run(napi_threaded_poll, n, "napi/backlog-%d", i);
+                if (IS_ERR(n->thread)) {
+                        err = PTR_ERR(n->thread);
+                        pr_err("kthread_run failed with err %d\n", err);
+                        n->thread = NULL;
+                        threaded = false;
+                        break;
+                }
+
+        }
+
+        backlog_threaded = threaded;
+
+        /* Make sure kthread is created before THREADED bit
+         * is set.
+         */
+        smp_mb__before_atomic();
+
+        for_each_possible_cpu(i) {
+                struct softnet_data *sd = &per_cpu(softnet_data, i);
+                struct napi_struct *n = &sd->backlog;
+                unsigned long flags;
+
+                local_irq_save(flags);
+                rps_lock(sd);
+                if (threaded)
+                        n->state |= NAPIF_STATE_THREADED;
+                else
+                        n->state &= ~NAPIF_STATE_THREADED;
+                rps_unlock(sd);
+                local_irq_restore(flags);
+        }
+
+        return err;
+}
+
 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
                     int (*poll)(struct napi_struct *, int), int weight)
 {
@@ -11369,6 +11444,9 @@ static int dev_cpu_dead(unsigned int old
         raise_softirq_irqoff(NET_TX_SOFTIRQ);
         local_irq_enable();

+        if (test_bit(NAPI_STATE_THREADED, &oldsd->backlog.state))
+                return 0;
+
 #ifdef CONFIG_RPS
         remsd = oldsd->rps_ipi_list;
         oldsd->rps_ipi_list = NULL;
@@ -11708,6 +11786,7 @@ static int __init net_dev_init(void)
                 sd->cpu = i;
 #endif

+                INIT_LIST_HEAD(&sd->backlog.poll_list);
                 init_gro_hash(&sd->backlog);
                 sd->backlog.poll = process_backlog;
                 sd->backlog.weight = weight_p;
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -28,6 +28,7 @@ static int int_3600 = 3600;
 static int min_sndbuf = SOCK_MIN_SNDBUF;
 static int min_rcvbuf = SOCK_MIN_RCVBUF;
 static int max_skb_frags = MAX_SKB_FRAGS;
+static int backlog_threaded;
 static long long_one __maybe_unused = 1;
 static long long_max __maybe_unused = LONG_MAX;

@@ -114,6 +115,23 @@ static int rps_sock_flow_sysctl(struct c
 }
 #endif /* CONFIG_RPS */

+static int backlog_threaded_sysctl(struct ctl_table *table, int write,
+                                   void *buffer, size_t *lenp, loff_t *ppos)
+{
+        static DEFINE_MUTEX(backlog_threaded_mutex);
+        int ret;
+
+        mutex_lock(&backlog_threaded_mutex);
+
+        ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+        if (write && !ret)
+                ret = backlog_set_threaded(backlog_threaded);
+
+        mutex_unlock(&backlog_threaded_mutex);
+
+        return ret;
+}
+
 #ifdef CONFIG_NET_FLOW_LIMIT
 static DEFINE_MUTEX(flow_limit_update_mutex);

@@ -470,6 +488,15 @@ static struct ctl_table net_core_table[]
                 .proc_handler = rps_sock_flow_sysctl
         },
 #endif
+        {
+                .procname = "backlog_threaded",
+                .data = &backlog_threaded,
+                .maxlen = sizeof(unsigned int),
+                .mode = 0644,
+                .proc_handler = backlog_threaded_sysctl,
+                .extra1 = SYSCTL_ZERO,
+                .extra2 = SYSCTL_ONE
+        },
 #ifdef CONFIG_NET_FLOW_LIMIT
         {
                 .procname = "flow_limit_cpu_bitmap",