openwrt/openwrt.git: target/linux/generic/pending-5.4/690-net-add-support-for-threaded-NAPI-polling.patch

From: Felix Fietkau <nbd@nbd.name>
Date: Sun, 26 Jul 2020 14:03:21 +0200
Subject: [PATCH] net: add support for threaded NAPI polling

For some drivers (especially 802.11 drivers), doing a lot of work in the NAPI
poll function does not perform well. Since NAPI poll is bound to the CPU it
was scheduled from, we can easily end up with a few very busy CPUs spending
most of their time in softirq/ksoftirqd while others sit idle.

Introduce threaded NAPI for such drivers, based on a workqueue. The API is the
same except for using netif_threaded_napi_add instead of netif_napi_add.

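As an illustration, converting a driver is a one-line change at registration
time; scheduling and completion are untouched. In the hypothetical sketch
below, my_wifi_dev, my_poll and the weight of 64 are made up for the example;
only netif_threaded_napi_add() comes from this patch:

#include <linux/netdevice.h>

/* Hypothetical driver sketch: only the *_napi_add call differs from a
 * conventional NAPI driver.
 */
struct my_wifi_dev {
	struct net_device *ndev;
	struct napi_struct napi;
};

static int my_poll(struct napi_struct *napi, int budget)
{
	int done = 0;

	/* ... process up to 'budget' rx frames, counting them in 'done' ... */

	if (done < budget)
		napi_complete_done(napi, done);
	return done;
}

static void my_setup_napi(struct my_wifi_dev *priv)
{
	/* was: netif_napi_add(priv->ndev, &priv->napi, my_poll, 64); */
	netif_threaded_napi_add(priv->ndev, &priv->napi, my_poll, 64);
	napi_enable(&priv->napi);
}
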
In my tests with mt76 on MT7621, using threaded NAPI plus a separate thread
for tx scheduling improves LAN->WLAN bridging throughput by 10-50%. Without
threaded NAPI, throughput is wildly inconsistent, depending on which CPU runs
the tx scheduling thread.

With threaded NAPI it is stable and consistent (and higher than the best
results I got without it).

Based on a patch by Hillf Danton

Cc: Hillf Danton <hdanton@sina.com>
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---

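Besides the new netif_threaded_napi_add() helper, the net-sysfs.c hunk below
also adds a per-device runtime switch, so threaded mode can be toggled on an
interface whose NAPI contexts are already registered. A rough userspace
sketch, assuming the attribute appears as /sys/class/net/<iface>/napi_threaded
(as the DEVICE_ATTR_RW(napi_threaded) entry suggests); the helper name and
"wlan0" are illustrative:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Write "1"/"0" to the napi_threaded attribute of one network interface.
 * The write fails (change_napi_threaded() returns -EOPNOTSUPP) if the
 * device has no NAPI contexts on its napi_list.
 */
static int set_napi_threaded(const char *ifname, int enable)
{
	char path[128];
	int fd, ok;

	snprintf(path, sizeof(path), "/sys/class/net/%s/napi_threaded", ifname);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	ok = (write(fd, enable ? "1" : "0", 1) == 1);
	close(fd);
	return ok ? 0 : -1;
}

int main(void)
{
	return set_napi_threaded("wlan0", 1) ? 1 : 0;
}
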
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -340,6 +340,7 @@ struct napi_struct {
 	struct list_head	dev_list;
 	struct hlist_node	napi_hash_node;
 	unsigned int		napi_id;
+	struct work_struct	work;
 };

 enum {
@@ -350,6 +351,7 @@ enum {
 	NAPI_STATE_HASHED,	/* In NAPI hash (busy polling possible) */
 	NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
 	NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
+	NAPI_STATE_THREADED,	/* Use threaded NAPI */
 };

 enum {
@@ -360,6 +362,7 @@ enum {
 	NAPIF_STATE_HASHED	 = BIT(NAPI_STATE_HASHED),
 	NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
 	NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
+	NAPIF_STATE_THREADED	 = BIT(NAPI_STATE_THREADED),
 };

 enum gro_result {
@@ -2101,6 +2104,7 @@ struct net_device {
 	struct lock_class_key	addr_list_lock_key;
 	bool			proto_down;
 	unsigned		wol_enabled:1;
+	unsigned		threaded:1;
 };
 #define to_net_dev(d) container_of(d, struct net_device, dev)

@@ -2281,6 +2285,26 @@ void netif_napi_add(struct net_device *d
 		    int (*poll)(struct napi_struct *, int), int weight);

 /**
+ * netif_threaded_napi_add - initialize a NAPI context
+ * @dev: network device
+ * @napi: NAPI context
+ * @poll: polling function
+ * @weight: default weight
+ *
+ * This variant of netif_napi_add() should be used from drivers using NAPI
+ * with CPU intensive poll functions.
+ * This will schedule polling from a high priority workqueue
+ */
+static inline void netif_threaded_napi_add(struct net_device *dev,
+					   struct napi_struct *napi,
+					   int (*poll)(struct napi_struct *, int),
+					   int weight)
+{
+	set_bit(NAPI_STATE_THREADED, &napi->state);
+	netif_napi_add(dev, napi, poll, weight);
+}
+
+/**
 * netif_tx_napi_add - initialize a NAPI context
 * @dev: network device
 * @napi: NAPI context
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -156,6 +156,7 @@ static DEFINE_SPINLOCK(offload_lock);
 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 struct list_head ptype_all __read_mostly;	/* Taps */
 static struct list_head offload_base __read_mostly;
+static struct workqueue_struct *napi_workq __read_mostly;

 static int netif_rx_internal(struct sk_buff *skb);
 static int call_netdevice_notifiers_info(unsigned long val,
@@ -5912,6 +5913,11 @@ void __napi_schedule(struct napi_struct
 {
 	unsigned long flags;

+	if (test_bit(NAPI_STATE_THREADED, &n->state)) {
+		queue_work(napi_workq, &n->work);
+		return;
+	}
+
 	local_irq_save(flags);
 	____napi_schedule(this_cpu_ptr(&softnet_data), n);
 	local_irq_restore(flags);
@@ -5959,6 +5965,11 @@ EXPORT_SYMBOL(napi_schedule_prep);
  */
 void __napi_schedule_irqoff(struct napi_struct *n)
 {
+	if (test_bit(NAPI_STATE_THREADED, &n->state)) {
+		queue_work(napi_workq, &n->work);
+		return;
+	}
+
 	____napi_schedule(this_cpu_ptr(&softnet_data), n);
 }
 EXPORT_SYMBOL(__napi_schedule_irqoff);
@@ -6220,9 +6231,89 @@ static void init_gro_hash(struct napi_st
 	napi->gro_bitmask = 0;
 }

+static int __napi_poll(struct napi_struct *n, bool *repoll)
+{
+	int work, weight;
+
+	weight = n->weight;
+
+	/* This NAPI_STATE_SCHED test is for avoiding a race
+	 * with netpoll's poll_napi(). Only the entity which
+	 * obtains the lock and sees NAPI_STATE_SCHED set will
+	 * actually make the ->poll() call. Therefore we avoid
+	 * accidentally calling ->poll() when NAPI is not scheduled.
+	 */
+	work = 0;
+	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
+		work = n->poll(n, weight);
+		trace_napi_poll(n, work, weight);
+	}
+
+	WARN_ON_ONCE(work > weight);
+
+	if (likely(work < weight))
+		return work;
+
+	/* Drivers must not modify the NAPI state if they
+	 * consume the entire weight. In such cases this code
+	 * still "owns" the NAPI instance and therefore can
+	 * move the instance around on the list at-will.
+	 */
+	if (unlikely(napi_disable_pending(n))) {
+		napi_complete(n);
+		return work;
+	}
+
+	if (n->gro_bitmask) {
+		/* flush too old packets
+		 * If HZ < 1000, flush all packets.
+		 */
+		napi_gro_flush(n, HZ >= 1000);
+	}
+
+	gro_normal_list(n);
+
+	*repoll = true;
+
+	return work;
+}
+
+static void napi_workfn(struct work_struct *work)
+{
+	struct napi_struct *n = container_of(work, struct napi_struct, work);
+	void *have;
+
+	for (;;) {
+		bool repoll = false;
+
+		local_bh_disable();
+
+		have = netpoll_poll_lock(n);
+		__napi_poll(n, &repoll);
+		netpoll_poll_unlock(have);
+
+		local_bh_enable();
+
+		if (!repoll)
+			return;
+
+		if (!need_resched())
+			continue;
+
+		/*
+		 * have to pay for the latency of task switch even if
+		 * napi is scheduled
+		 */
+		queue_work(napi_workq, work);
+		return;
+	}
+}
+
 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
 		    int (*poll)(struct napi_struct *, int), int weight)
 {
+	if (dev->threaded)
+		set_bit(NAPI_STATE_THREADED, &napi->state);
 	INIT_LIST_HEAD(&napi->poll_list);
 	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
 	napi->timer.function = napi_watchdog;
@@ -6239,6 +6330,7 @@ void netif_napi_add(struct net_device *d
 #ifdef CONFIG_NETPOLL
 	napi->poll_owner = -1;
 #endif
+	INIT_WORK(&napi->work, napi_workfn);
 	set_bit(NAPI_STATE_SCHED, &napi->state);
 	set_bit(NAPI_STATE_NPSVC, &napi->state);
 	list_add_rcu(&napi->dev_list, &dev->napi_list);
@@ -6279,6 +6371,7 @@ static void flush_gro_hash(struct napi_s
 void netif_napi_del(struct napi_struct *napi)
 {
 	might_sleep();
+	cancel_work_sync(&napi->work);
 	if (napi_hash_del(napi))
 		synchronize_net();
 	list_del_init(&napi->dev_list);
@@ -6291,50 +6384,18 @@ EXPORT_SYMBOL(netif_napi_del);

 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
 {
+	bool do_repoll = false;
 	void *have;
-	int work, weight;
+	int work;

 	list_del_init(&n->poll_list);

 	have = netpoll_poll_lock(n);

-	weight = n->weight;
+	work = __napi_poll(n, &do_repoll);

-	/* This NAPI_STATE_SCHED test is for avoiding a race
-	 * with netpoll's poll_napi(). Only the entity which
-	 * obtains the lock and sees NAPI_STATE_SCHED set will
-	 * actually make the ->poll() call. Therefore we avoid
-	 * accidentally calling ->poll() when NAPI is not scheduled.
-	 */
-	work = 0;
-	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
-		work = n->poll(n, weight);
-		trace_napi_poll(n, work, weight);
-	}
-
-	WARN_ON_ONCE(work > weight);
-
-	if (likely(work < weight))
-		goto out_unlock;
-
-	/* Drivers must not modify the NAPI state if they
-	 * consume the entire weight. In such cases this code
-	 * still "owns" the NAPI instance and therefore can
-	 * move the instance around on the list at-will.
-	 */
-	if (unlikely(napi_disable_pending(n))) {
-		napi_complete(n);
+	if (!do_repoll)
 		goto out_unlock;
-	}
-
-	if (n->gro_bitmask) {
-		/* flush too old packets
-		 * If HZ < 1000, flush all packets.
-		 */
-		napi_gro_flush(n, HZ >= 1000);
-	}
-
-	gro_normal_list(n);
-
 	/* Some drivers may have called napi_schedule
 	 * prior to exhausting their budget.
@@ -10314,6 +10375,10 @@ static int __init net_dev_init(void)
 		sd->backlog.weight = weight_p;
 	}

+	napi_workq = alloc_workqueue("napi_workq", WQ_UNBOUND | WQ_HIGHPRI,
+				     WQ_UNBOUND_MAX_ACTIVE | WQ_SYSFS);
+	BUG_ON(!napi_workq);
+
 	dev_boot_phase = 0;

 	/* The loopback device is special if any other network devices
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -442,6 +442,52 @@ static ssize_t proto_down_store(struct d
 }
 NETDEVICE_SHOW_RW(proto_down, fmt_dec);

+static int change_napi_threaded(struct net_device *dev, unsigned long val)
+{
+	struct napi_struct *napi;
+
+	if (list_empty(&dev->napi_list))
+		return -EOPNOTSUPP;
+
+	list_for_each_entry(napi, &dev->napi_list, dev_list) {
+		if (val)
+			set_bit(NAPI_STATE_THREADED, &napi->state);
+		else
+			clear_bit(NAPI_STATE_THREADED, &napi->state);
+	}
+
+	return 0;
+}
+
+static ssize_t napi_threaded_store(struct device *dev,
+				   struct device_attribute *attr,
+				   const char *buf, size_t len)
+{
+	return netdev_store(dev, attr, buf, len, change_napi_threaded);
+}
+
+static ssize_t napi_threaded_show(struct device *dev,
+				  struct device_attribute *attr,
+				  char *buf)
+{
+	struct net_device *netdev = to_net_dev(dev);
+	struct napi_struct *napi;
+	bool enabled = false;
+
+	if (!rtnl_trylock())
+		return restart_syscall();
+
+	list_for_each_entry(napi, &netdev->napi_list, dev_list) {
+		if (test_bit(NAPI_STATE_THREADED, &napi->state))
+			enabled = true;
+	}
+
+	rtnl_unlock();
+
+	return sprintf(buf, fmt_dec, enabled);
+}
+static DEVICE_ATTR_RW(napi_threaded);
+
 static ssize_t phys_port_id_show(struct device *dev,
 				 struct device_attribute *attr, char *buf)
 {
@@ -532,6 +578,7 @@ static struct attribute *net_class_attrs
 	&dev_attr_flags.attr,
 	&dev_attr_tx_queue_len.attr,
 	&dev_attr_gro_flush_timeout.attr,
+	&dev_attr_napi_threaded.attr,
 	&dev_attr_phys_port_id.attr,
 	&dev_attr_phys_port_name.attr,
 	&dev_attr_phys_switch_id.attr,