target/linux/generic/pending-5.10/690-net-add-support-for-threaded-NAPI-polling.patch
From: Felix Fietkau <nbd@nbd.name>
Date: Sun, 26 Jul 2020 14:03:21 +0200
Subject: [PATCH] net: add support for threaded NAPI polling

For some drivers (especially 802.11 drivers), doing a lot of work in the NAPI
poll function does not perform well. Since NAPI poll is bound to the CPU it
was scheduled from, we can easily end up with a few very busy CPUs spending
most of their time in softirq/ksoftirqd and some idle ones.

Introduce threaded NAPI for such drivers based on a workqueue. The API is the
same except for using netif_threaded_napi_add instead of netif_napi_add.

In my tests with mt76 on MT7621, using threaded NAPI + a thread for tx scheduling
improves LAN->WLAN bridging throughput by 10-50%. Throughput without threaded
NAPI is wildly inconsistent, depending on the CPU that runs the tx scheduling
thread.

With threaded NAPI, throughput seems stable and consistent (and higher than
the best results I got without it).

Based on a patch by Hillf Danton

Cc: Hillf Danton <hdanton@sina.com>
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---

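As the commit message says, the driver-side API is identical to netif_napi_add(),
so opting in to threaded polling is a one-call change. The sketch below is an
illustration only and is not part of the patch: my_priv, ndev, my_poll and
my_dev_setup_napi are hypothetical driver names, while netif_threaded_napi_add(),
napi_enable(), napi_complete_done() and NAPI_POLL_WEIGHT come from
<linux/netdevice.h>.

  #include <linux/netdevice.h>

  /* Hypothetical driver state; the field names are placeholders. */
  struct my_priv {
  	struct net_device *ndev;
  	struct napi_struct napi;
  };

  /* Ordinary NAPI poll function; nothing about it changes for threaded NAPI. */
  static int my_poll(struct napi_struct *napi, int budget)
  {
  	int done = 0;

  	/* ... process up to @budget packets, counting them in done ... */
  	if (done < budget)
  		napi_complete_done(napi, done);
  	return done;
  }

  static void my_dev_setup_napi(struct my_priv *priv)
  {
  	/* was: netif_napi_add(priv->ndev, &priv->napi, my_poll, NAPI_POLL_WEIGHT); */
  	netif_threaded_napi_add(priv->ndev, &priv->napi, my_poll,
  				NAPI_POLL_WEIGHT);
  	napi_enable(&priv->napi);
  }

Scheduling is unchanged as well: the driver keeps calling napi_schedule(), and
__napi_schedule() redirects the poll to the napi_workq workqueue whenever
NAPI_STATE_THREADED is set.
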
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -347,6 +347,7 @@ struct napi_struct {
 	struct list_head	dev_list;
 	struct hlist_node	napi_hash_node;
 	unsigned int		napi_id;
+	struct work_struct	work;
 };
 
 enum {
@@ -357,6 +358,7 @@ enum {
 	NAPI_STATE_LISTED,	/* NAPI added to system lists */
 	NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
 	NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
+	NAPI_STATE_THREADED,	/* Use threaded NAPI */
 };
 
 enum {
@@ -367,6 +369,7 @@ enum {
 	NAPIF_STATE_LISTED	 = BIT(NAPI_STATE_LISTED),
 	NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
 	NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
+	NAPIF_STATE_THREADED	 = BIT(NAPI_STATE_THREADED),
 };
 
 enum gro_result {
@@ -2411,6 +2414,26 @@ void netif_napi_add(struct net_device *d
 		    int (*poll)(struct napi_struct *, int), int weight);
 
 /**
+ * netif_threaded_napi_add - initialize a NAPI context
+ * @dev: network device
+ * @napi: NAPI context
+ * @poll: polling function
+ * @weight: default weight
+ *
+ * This variant of netif_napi_add() should be used from drivers using NAPI
+ * with CPU intensive poll functions.
+ * This will schedule polling from a high priority workqueue
+ */
+static inline void netif_threaded_napi_add(struct net_device *dev,
+					    struct napi_struct *napi,
+					    int (*poll)(struct napi_struct *, int),
+					    int weight)
+{
+	set_bit(NAPI_STATE_THREADED, &napi->state);
+	netif_napi_add(dev, napi, poll, weight);
+}
+
+/**
  * netif_tx_napi_add - initialize a NAPI context
  * @dev: network device
  * @napi: NAPI context
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -159,6 +159,7 @@ static DEFINE_SPINLOCK(offload_lock);
 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 struct list_head ptype_all __read_mostly;	/* Taps */
 static struct list_head offload_base __read_mostly;
+static struct workqueue_struct *napi_workq __read_mostly;
 
 static int netif_rx_internal(struct sk_buff *skb);
 static int call_netdevice_notifiers_info(unsigned long val,
@@ -6404,6 +6405,11 @@ void __napi_schedule(struct napi_struct
 {
 	unsigned long flags;
 
+	if (test_bit(NAPI_STATE_THREADED, &n->state)) {
+		queue_work(napi_workq, &n->work);
+		return;
+	}
+
 	local_irq_save(flags);
 	____napi_schedule(this_cpu_ptr(&softnet_data), n);
 	local_irq_restore(flags);
@@ -6451,6 +6457,11 @@ EXPORT_SYMBOL(napi_schedule_prep);
  */
 void __napi_schedule_irqoff(struct napi_struct *n)
 {
+	if (test_bit(NAPI_STATE_THREADED, &n->state)) {
+		queue_work(napi_workq, &n->work);
+		return;
+	}
+
 	____napi_schedule(this_cpu_ptr(&softnet_data), n);
 }
 EXPORT_SYMBOL(__napi_schedule_irqoff);
@@ -6712,6 +6723,86 @@ static void init_gro_hash(struct napi_st
 	napi->gro_bitmask = 0;
 }
 
+static int __napi_poll(struct napi_struct *n, bool *repoll)
+{
+	int work, weight;
+
+	weight = n->weight;
+
+	/* This NAPI_STATE_SCHED test is for avoiding a race
+	 * with netpoll's poll_napi(). Only the entity which
+	 * obtains the lock and sees NAPI_STATE_SCHED set will
+	 * actually make the ->poll() call. Therefore we avoid
+	 * accidentally calling ->poll() when NAPI is not scheduled.
+	 */
+	work = 0;
+	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
+		work = n->poll(n, weight);
+		trace_napi_poll(n, work, weight);
+	}
+
+	if (unlikely(work > weight))
+		pr_err_once("NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
+			    n->poll, work, weight);
+
+	if (likely(work < weight))
+		return work;
+
+	/* Drivers must not modify the NAPI state if they
+	 * consume the entire weight. In such cases this code
+	 * still "owns" the NAPI instance and therefore can
+	 * move the instance around on the list at-will.
+	 */
+	if (unlikely(napi_disable_pending(n))) {
+		napi_complete(n);
+		return work;
+	}
+
+	if (n->gro_bitmask) {
+		/* flush too old packets
+		 * If HZ < 1000, flush all packets.
+		 */
+		napi_gro_flush(n, HZ >= 1000);
+	}
+
+	gro_normal_list(n);
+
+	*repoll = true;
+
+	return work;
+}
+
+static void napi_workfn(struct work_struct *work)
+{
+	struct napi_struct *n = container_of(work, struct napi_struct, work);
+	void *have;
+
+	for (;;) {
+		bool repoll = false;
+
+		local_bh_disable();
+
+		have = netpoll_poll_lock(n);
+		__napi_poll(n, &repoll);
+		netpoll_poll_unlock(have);
+
+		local_bh_enable();
+
+		if (!repoll)
+			return;
+
+		if (!need_resched())
+			continue;
+
+		/*
+		 * have to pay for the latency of task switch even if
+		 * napi is scheduled
+		 */
+		queue_work(napi_workq, work);
+		return;
+	}
+}
+
 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
 		    int (*poll)(struct napi_struct *, int), int weight)
 {
@@ -6735,6 +6826,7 @@ void netif_napi_add(struct net_device *d
 #ifdef CONFIG_NETPOLL
 	napi->poll_owner = -1;
 #endif
+	INIT_WORK(&napi->work, napi_workfn);
 	set_bit(NAPI_STATE_SCHED, &napi->state);
 	set_bit(NAPI_STATE_NPSVC, &napi->state);
 	list_add_rcu(&napi->dev_list, &dev->napi_list);
@@ -6777,6 +6869,7 @@ void __netif_napi_del(struct napi_struct
 	if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
 		return;
 
+	cancel_work_sync(&napi->work);
 	napi_hash_del(napi);
 	list_del_rcu(&napi->dev_list);
 	napi_free_frags(napi);
@@ -6788,53 +6881,19 @@ EXPORT_SYMBOL(__netif_napi_del);
 
 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
 {
+	bool do_repoll = false;
 	void *have;
-	int work, weight;
+	int work;
 
 	list_del_init(&n->poll_list);
 
 	have = netpoll_poll_lock(n);
 
-	weight = n->weight;
+	work = __napi_poll(n, &do_repoll);
 
-	/* This NAPI_STATE_SCHED test is for avoiding a race
-	 * with netpoll's poll_napi(). Only the entity which
-	 * obtains the lock and sees NAPI_STATE_SCHED set will
-	 * actually make the ->poll() call. Therefore we avoid
-	 * accidentally calling ->poll() when NAPI is not scheduled.
-	 */
-	work = 0;
-	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
-		work = n->poll(n, weight);
-		trace_napi_poll(n, work, weight);
-	}
-
-	if (unlikely(work > weight))
-		pr_err_once("NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
-			    n->poll, work, weight);
-
-	if (likely(work < weight))
+	if (!do_repoll)
 		goto out_unlock;
 
-	/* Drivers must not modify the NAPI state if they
-	 * consume the entire weight. In such cases this code
-	 * still "owns" the NAPI instance and therefore can
-	 * move the instance around on the list at-will.
-	 */
-	if (unlikely(napi_disable_pending(n))) {
-		napi_complete(n);
-		goto out_unlock;
-	}
-
-	if (n->gro_bitmask) {
-		/* flush too old packets
-		 * If HZ < 1000, flush all packets.
-		 */
-		napi_gro_flush(n, HZ >= 1000);
-	}
-
-	gro_normal_list(n);
-
 	/* Some drivers may have called napi_schedule
 	 * prior to exhausting their budget.
 	 */
@@ -11288,6 +11347,10 @@ static int __init net_dev_init(void)
 		sd->backlog.weight = weight_p;
 	}
 
+	napi_workq = alloc_workqueue("napi_workq", WQ_UNBOUND | WQ_HIGHPRI,
+				     WQ_UNBOUND_MAX_ACTIVE | WQ_SYSFS);
+	BUG_ON(!napi_workq);
+
 	dev_boot_phase = 0;
 
 	/* The loopback device is special if any other network devices