kernel: add minimal TCP state tracking to flow offload support
[openwrt/staging/blogic.git] / target / linux / generic / backport-4.14 / 360-netfilter-nf_flow_table-add-hardware-offload-support.patch
1 From: Pablo Neira Ayuso <pablo@netfilter.org>
2 Date: Thu, 11 Jan 2018 16:32:00 +0100
3 Subject: [PATCH] netfilter: nf_flow_table: add hardware offload support
4
5 This patch adds the infrastructure to offload flows to hardware, in case
6 the NIC/switch comes with built-in flow table capabilities.
7
8 If the hardware comes with no hardware flow tables or they have
9 limitations in terms of features, the existing infrastructure falls back
10 to the software flow table implementation.
11
12 The software flow table garbage collector skips entries that reside in
13 the hardware, so the hardware will be responsible for releasing these
14 flow table entries too via flow_offload_dead().
15
16 Hardware configuration, either to add or to delete entries, is done from
17 the hardware offload workqueue, to ensure this is done from user context
18 given that we may sleep when grabbing the mdio mutex.
19
20 Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
21 ---
22 create mode 100644 net/netfilter/nf_flow_table_hw.c
23
24 --- a/include/linux/netdevice.h
25 +++ b/include/linux/netdevice.h
26 @@ -826,6 +826,13 @@ struct xfrmdev_ops {
27 };
28 #endif
29
30 +struct flow_offload;
31 +
32 +enum flow_offload_type {
33 + FLOW_OFFLOAD_ADD = 0,
34 + FLOW_OFFLOAD_DEL,
35 +};
36 +
37 /*
38 * This structure defines the management hooks for network devices.
39 * The following hooks can be defined; unless noted otherwise, they are
40 @@ -1057,6 +1064,10 @@ struct xfrmdev_ops {
41 * int (*ndo_bridge_dellink)(struct net_device *dev, struct nlmsghdr *nlh,
42 * u16 flags);
43 *
44 + * int (*ndo_flow_offload)(enum flow_offload_type type,
45 + * struct flow_offload *flow);
46 + * Adds/deletes flow entry to/from net device flowtable.
47 + *
48 * int (*ndo_change_carrier)(struct net_device *dev, bool new_carrier);
49 * Called to change device carrier. Soft-devices (like dummy, team, etc)
50 * which do not represent real hardware may define this to allow their
51 @@ -1281,6 +1292,8 @@ struct net_device_ops {
52 int (*ndo_bridge_dellink)(struct net_device *dev,
53 struct nlmsghdr *nlh,
54 u16 flags);
55 + int (*ndo_flow_offload)(enum flow_offload_type type,
56 + struct flow_offload *flow);
57 int (*ndo_change_carrier)(struct net_device *dev,
58 bool new_carrier);
59 int (*ndo_get_phys_port_id)(struct net_device *dev,
60 --- a/include/net/netfilter/nf_flow_table.h
61 +++ b/include/net/netfilter/nf_flow_table.h
62 @@ -20,11 +20,17 @@ struct nf_flowtable_type {
63 struct module *owner;
64 };
65
66 +enum nf_flowtable_flags {
67 + NF_FLOWTABLE_F_HW = 0x1,
68 +};
69 +
70 struct nf_flowtable {
71 struct list_head list;
72 struct rhashtable rhashtable;
73 const struct nf_flowtable_type *type;
74 + u32 flags;
75 struct delayed_work gc_work;
76 + possible_net_t ft_net;
77 };
78
79 enum flow_offload_tuple_dir {
80 @@ -68,6 +74,7 @@ struct flow_offload_tuple_rhash {
81 #define FLOW_OFFLOAD_SNAT 0x1
82 #define FLOW_OFFLOAD_DNAT 0x2
83 #define FLOW_OFFLOAD_DYING 0x4
84 +#define FLOW_OFFLOAD_HW 0x8
85
86 struct flow_offload {
87 struct flow_offload_tuple_rhash tuplehash[FLOW_OFFLOAD_DIR_MAX];
88 @@ -121,6 +128,22 @@ unsigned int nf_flow_offload_ip_hook(voi
89 unsigned int nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
90 const struct nf_hook_state *state);
91
92 +void nf_flow_offload_hw_add(struct net *net, struct flow_offload *flow,
93 + struct nf_conn *ct);
94 +void nf_flow_offload_hw_del(struct net *net, struct flow_offload *flow);
95 +
96 +struct nf_flow_table_hw {
97 + struct module *owner;
98 + void (*add)(struct net *net, struct flow_offload *flow,
99 + struct nf_conn *ct);
100 + void (*del)(struct net *net, struct flow_offload *flow);
101 +};
102 +
103 +int nf_flow_table_hw_register(const struct nf_flow_table_hw *offload);
104 +void nf_flow_table_hw_unregister(const struct nf_flow_table_hw *offload);
105 +
106 +extern struct work_struct nf_flow_offload_hw_work;
107 +
108 #define MODULE_ALIAS_NF_FLOWTABLE(family) \
109 MODULE_ALIAS("nf-flowtable-" __stringify(family))
110
111 --- a/include/uapi/linux/netfilter/nf_tables.h
112 +++ b/include/uapi/linux/netfilter/nf_tables.h
113 @@ -1341,6 +1341,7 @@ enum nft_object_attributes {
114 * @NFTA_FLOWTABLE_HOOK: netfilter hook configuration(NLA_U32)
115 * @NFTA_FLOWTABLE_USE: number of references to this flow table (NLA_U32)
116 * @NFTA_FLOWTABLE_HANDLE: object handle (NLA_U64)
117 + * @NFTA_FLOWTABLE_FLAGS: flags (NLA_U32)
118 */
119 enum nft_flowtable_attributes {
120 NFTA_FLOWTABLE_UNSPEC,
121 @@ -1350,6 +1351,7 @@ enum nft_flowtable_attributes {
122 NFTA_FLOWTABLE_USE,
123 NFTA_FLOWTABLE_HANDLE,
124 NFTA_FLOWTABLE_PAD,
125 + NFTA_FLOWTABLE_FLAGS,
126 __NFTA_FLOWTABLE_MAX
127 };
128 #define NFTA_FLOWTABLE_MAX (__NFTA_FLOWTABLE_MAX - 1)
129 --- a/net/netfilter/Kconfig
130 +++ b/net/netfilter/Kconfig
131 @@ -686,6 +686,15 @@ config NF_FLOW_TABLE
132
133 To compile it as a module, choose M here.
134
135 +config NF_FLOW_TABLE_HW
136 + tristate "Netfilter flow table hardware offload module"
137 + depends on NF_FLOW_TABLE
138 + help
139 + This option adds hardware offload support for the flow table core
140 + infrastructure.
141 +
142 + To compile it as a module, choose M here.
143 +
144 config NETFILTER_XTABLES
145 tristate "Netfilter Xtables support (required for ip_tables)"
146 default m if NETFILTER_ADVANCED=n
147 --- a/net/netfilter/Makefile
148 +++ b/net/netfilter/Makefile
149 @@ -116,6 +116,7 @@ obj-$(CONFIG_NF_FLOW_TABLE) += nf_flow_t
150 nf_flow_table-objs := nf_flow_table_core.o nf_flow_table_ip.o
151
152 obj-$(CONFIG_NF_FLOW_TABLE_INET) += nf_flow_table_inet.o
153 +obj-$(CONFIG_NF_FLOW_TABLE_HW) += nf_flow_table_hw.o
154
155 # generic X tables
156 obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o
157 --- a/net/netfilter/nf_flow_table_core.c
158 +++ b/net/netfilter/nf_flow_table_core.c
159 @@ -167,9 +167,16 @@ int flow_offload_add(struct nf_flowtable
160 }
161 EXPORT_SYMBOL_GPL(flow_offload_add);
162
163 +static inline bool nf_flow_in_hw(const struct flow_offload *flow)
164 +{
165 + return flow->flags & FLOW_OFFLOAD_HW;
166 +}
167 +
168 static void flow_offload_del(struct nf_flowtable *flow_table,
169 struct flow_offload *flow)
170 {
171 + struct net *net = read_pnet(&flow_table->ft_net);
172 +
173 rhashtable_remove_fast(&flow_table->rhashtable,
174 &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
175 nf_flow_offload_rhash_params);
176 @@ -177,6 +184,9 @@ static void flow_offload_del(struct nf_f
177 &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
178 nf_flow_offload_rhash_params);
179
180 + if (nf_flow_in_hw(flow))
181 + nf_flow_offload_hw_del(net, flow);
182 +
183 flow_offload_free(flow);
184 }
185
186 @@ -263,6 +273,10 @@ static int nf_flow_offload_gc_step(struc
187
188 flow = container_of(tuplehash, struct flow_offload, tuplehash[0]);
189
190 + if (nf_flow_in_hw(flow) &&
191 + !nf_flow_is_dying(flow))
192 + continue;
193 +
194 if (nf_flow_has_expired(flow) ||
195 nf_flow_is_dying(flow))
196 flow_offload_del(flow_table, flow);
197 @@ -399,10 +413,43 @@ int nf_flow_dnat_port(const struct flow_
198 }
199 EXPORT_SYMBOL_GPL(nf_flow_dnat_port);
200
201 +static const struct nf_flow_table_hw __rcu *nf_flow_table_hw_hook __read_mostly;
202 +
203 +static int nf_flow_offload_hw_init(struct nf_flowtable *flow_table)
204 +{
205 + const struct nf_flow_table_hw *offload;
206 +
207 + if (!rcu_access_pointer(nf_flow_table_hw_hook))
208 + request_module("nf-flow-table-hw");
209 +
210 + rcu_read_lock();
211 + offload = rcu_dereference(nf_flow_table_hw_hook);
212 + if (!offload)
213 + goto err_no_hw_offload;
214 +
215 + if (!try_module_get(offload->owner))
216 + goto err_no_hw_offload;
217 +
218 + rcu_read_unlock();
219 +
220 + return 0;
221 +
222 +err_no_hw_offload:
223 + rcu_read_unlock();
224 +
225 + return -EOPNOTSUPP;
226 +}
227 +
228 int nf_flow_table_init(struct nf_flowtable *flowtable)
229 {
230 int err;
231
232 + if (flowtable->flags & NF_FLOWTABLE_F_HW) {
233 + err = nf_flow_offload_hw_init(flowtable);
234 + if (err)
235 + return err;
236 + }
237 +
238 INIT_DEFERRABLE_WORK(&flowtable->gc_work, nf_flow_offload_work_gc);
239
240 err = rhashtable_init(&flowtable->rhashtable,
241 @@ -436,6 +483,8 @@ static void nf_flow_table_iterate_cleanu
242 {
243 nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, dev);
244 flush_delayed_work(&flowtable->gc_work);
245 + if (flowtable->flags & NF_FLOWTABLE_F_HW)
246 + flush_work(&nf_flow_offload_hw_work);
247 }
248
249 void nf_flow_table_cleanup(struct net *net, struct net_device *dev)
250 @@ -449,6 +498,26 @@ void nf_flow_table_cleanup(struct net *n
251 }
252 EXPORT_SYMBOL_GPL(nf_flow_table_cleanup);
253
254 +struct work_struct nf_flow_offload_hw_work;
255 +EXPORT_SYMBOL_GPL(nf_flow_offload_hw_work);
256 +
257 +/* Give the hardware workqueue the chance to remove entries from hardware. */
258 +static void nf_flow_offload_hw_free(struct nf_flowtable *flowtable)
259 +{
260 + const struct nf_flow_table_hw *offload;
261 +
262 + flush_work(&nf_flow_offload_hw_work);
263 +
264 + rcu_read_lock();
265 + offload = rcu_dereference(nf_flow_table_hw_hook);
266 + if (!offload) {
267 + rcu_read_unlock();
268 + return;
269 + }
270 + module_put(offload->owner);
271 + rcu_read_unlock();
272 +}
273 +
274 void nf_flow_table_free(struct nf_flowtable *flow_table)
275 {
276 mutex_lock(&flowtable_lock);
277 @@ -458,9 +527,58 @@ void nf_flow_table_free(struct nf_flowta
278 nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL);
279 WARN_ON(!nf_flow_offload_gc_step(flow_table));
280 rhashtable_destroy(&flow_table->rhashtable);
281 + if (flow_table->flags & NF_FLOWTABLE_F_HW)
282 + nf_flow_offload_hw_free(flow_table);
283 }
284 EXPORT_SYMBOL_GPL(nf_flow_table_free);
285
286 +/* Must be called from user context. */
287 +void nf_flow_offload_hw_add(struct net *net, struct flow_offload *flow,
288 + struct nf_conn *ct)
289 +{
290 + const struct nf_flow_table_hw *offload;
291 +
292 + rcu_read_lock();
293 + offload = rcu_dereference(nf_flow_table_hw_hook);
294 + if (offload)
295 + offload->add(net, flow, ct);
296 + rcu_read_unlock();
297 +}
298 +EXPORT_SYMBOL_GPL(nf_flow_offload_hw_add);
299 +
300 +/* Must be called from user context. */
301 +void nf_flow_offload_hw_del(struct net *net, struct flow_offload *flow)
302 +{
303 + const struct nf_flow_table_hw *offload;
304 +
305 + rcu_read_lock();
306 + offload = rcu_dereference(nf_flow_table_hw_hook);
307 + if (offload)
308 + offload->del(net, flow);
309 + rcu_read_unlock();
310 +}
311 +EXPORT_SYMBOL_GPL(nf_flow_offload_hw_del);
312 +
313 +int nf_flow_table_hw_register(const struct nf_flow_table_hw *offload)
314 +{
315 + if (rcu_access_pointer(nf_flow_table_hw_hook))
316 + return -EBUSY;
317 +
318 + rcu_assign_pointer(nf_flow_table_hw_hook, offload);
319 +
320 + return 0;
321 +}
322 +EXPORT_SYMBOL_GPL(nf_flow_table_hw_register);
323 +
324 +void nf_flow_table_hw_unregister(const struct nf_flow_table_hw *offload)
325 +{
326 + WARN_ON(rcu_access_pointer(nf_flow_table_hw_hook) != offload);
327 + rcu_assign_pointer(nf_flow_table_hw_hook, NULL);
328 +
329 + synchronize_rcu();
330 +}
331 +EXPORT_SYMBOL_GPL(nf_flow_table_hw_unregister);
332 +
333 static int nf_flow_table_netdev_event(struct notifier_block *this,
334 unsigned long event, void *ptr)
335 {
336 --- /dev/null
337 +++ b/net/netfilter/nf_flow_table_hw.c
338 @@ -0,0 +1,169 @@
339 +#include <linux/kernel.h>
340 +#include <linux/init.h>
341 +#include <linux/module.h>
342 +#include <linux/netfilter.h>
343 +#include <linux/rhashtable.h>
344 +#include <linux/netdevice.h>
345 +#include <net/netfilter/nf_flow_table.h>
346 +#include <net/netfilter/nf_conntrack.h>
347 +#include <net/netfilter/nf_conntrack_core.h>
348 +#include <net/netfilter/nf_conntrack_tuple.h>
349 +
350 +static DEFINE_SPINLOCK(flow_offload_hw_pending_list_lock);
351 +static LIST_HEAD(flow_offload_hw_pending_list);
352 +
353 +static DEFINE_MUTEX(nf_flow_offload_hw_mutex);
354 +
355 +struct flow_offload_hw {
356 + struct list_head list;
357 + enum flow_offload_type type;
358 + struct flow_offload *flow;
359 + struct nf_conn *ct;
360 + possible_net_t flow_hw_net;
361 +};
362 +
363 +static int do_flow_offload_hw(struct net *net, struct flow_offload *flow,
364 + int type)
365 +{
366 + struct net_device *indev;
367 + int ret, ifindex;
368 +
369 + ifindex = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.iifidx;
370 + indev = dev_get_by_index(net, ifindex);
371 + if (WARN_ON(!indev))
372 + return -ENODEV; /* not 0: the ADD path reads ret >= 0 as "offloaded" */
373 +
374 + mutex_lock(&nf_flow_offload_hw_mutex);
375 + ret = indev->netdev_ops->ndo_flow_offload(type, flow);
376 + mutex_unlock(&nf_flow_offload_hw_mutex);
377 +
378 + dev_put(indev);
379 +
380 + return ret;
381 +}
382 +
383 +static void flow_offload_hw_work_add(struct flow_offload_hw *offload)
384 +{
385 + struct net *net;
386 + int ret;
387 +
388 + if (nf_ct_is_dying(offload->ct))
389 + return;
390 +
391 + net = read_pnet(&offload->flow_hw_net);
392 + ret = do_flow_offload_hw(net, offload->flow, FLOW_OFFLOAD_ADD);
393 + if (ret >= 0)
394 + offload->flow->flags |= FLOW_OFFLOAD_HW;
395 +}
396 +
397 +static void flow_offload_hw_work_del(struct flow_offload_hw *offload)
398 +{
399 + struct net *net = read_pnet(&offload->flow_hw_net);
400 +
401 + do_flow_offload_hw(net, offload->flow, FLOW_OFFLOAD_DEL);
402 +}
403 +
404 +static void flow_offload_hw_work(struct work_struct *work)
405 +{
406 + struct flow_offload_hw *offload, *next;
407 + LIST_HEAD(hw_offload_pending);
408 +
409 + spin_lock_bh(&flow_offload_hw_pending_list_lock);
410 + list_replace_init(&flow_offload_hw_pending_list, &hw_offload_pending);
411 + spin_unlock_bh(&flow_offload_hw_pending_list_lock);
412 +
413 + list_for_each_entry_safe(offload, next, &hw_offload_pending, list) {
414 + switch (offload->type) {
415 + case FLOW_OFFLOAD_ADD:
416 + flow_offload_hw_work_add(offload);
417 + break;
418 + case FLOW_OFFLOAD_DEL:
419 + flow_offload_hw_work_del(offload);
420 + break;
421 + }
422 + if (offload->ct)
423 + nf_conntrack_put(&offload->ct->ct_general);
424 + list_del(&offload->list);
425 + kfree(offload);
426 + }
427 +}
428 +
429 +static void flow_offload_queue_work(struct flow_offload_hw *offload)
430 +{
431 + spin_lock_bh(&flow_offload_hw_pending_list_lock);
432 + list_add_tail(&offload->list, &flow_offload_hw_pending_list);
433 + spin_unlock_bh(&flow_offload_hw_pending_list_lock);
434 +
435 + schedule_work(&nf_flow_offload_hw_work);
436 +}
437 +
438 +static void flow_offload_hw_add(struct net *net, struct flow_offload *flow,
439 + struct nf_conn *ct)
440 +{
441 + struct flow_offload_hw *offload;
442 +
443 + offload = kmalloc(sizeof(struct flow_offload_hw), GFP_ATOMIC);
444 + if (!offload)
445 + return;
446 +
447 + nf_conntrack_get(&ct->ct_general);
448 + offload->type = FLOW_OFFLOAD_ADD;
449 + offload->ct = ct;
450 + offload->flow = flow;
451 + write_pnet(&offload->flow_hw_net, net);
452 +
453 + flow_offload_queue_work(offload);
454 +}
455 +
456 +static void flow_offload_hw_del(struct net *net, struct flow_offload *flow)
457 +{
458 + struct flow_offload_hw *offload;
459 +
460 + offload = kmalloc(sizeof(struct flow_offload_hw), GFP_ATOMIC);
461 + if (!offload)
462 + return;
463 +
464 + offload->type = FLOW_OFFLOAD_DEL;
465 + offload->ct = NULL;
466 + offload->flow = flow;
467 + write_pnet(&offload->flow_hw_net, net);
468 +
469 + flow_offload_queue_work(offload);
470 +}
471 +
472 +static const struct nf_flow_table_hw flow_offload_hw = {
473 + .add = flow_offload_hw_add,
474 + .del = flow_offload_hw_del,
475 + .owner = THIS_MODULE,
476 +};
477 +
478 +static int __init nf_flow_table_hw_module_init(void)
479 +{
480 + INIT_WORK(&nf_flow_offload_hw_work, flow_offload_hw_work);
481 +
482 + /* Fail the load with -EBUSY if another backend already owns the hook. */
483 + return nf_flow_table_hw_register(&flow_offload_hw);
484 +}
485 +
486 +static void __exit nf_flow_table_hw_module_exit(void)
487 +{
488 + struct flow_offload_hw *offload, *next;
489 +
490 + nf_flow_table_hw_unregister(&flow_offload_hw);
491 + cancel_work_sync(&nf_flow_offload_hw_work);
492 + /* Hook removed, worker cancelled: drop requests still on the queue. */
493 + list_for_each_entry_safe(offload, next,
494 + &flow_offload_hw_pending_list, list) {
495 + if (offload->ct)
496 + nf_conntrack_put(&offload->ct->ct_general);
497 + list_del(&offload->list);
498 + kfree(offload);
499 + }
500 +}
501 +
502 +module_init(nf_flow_table_hw_module_init);
503 +module_exit(nf_flow_table_hw_module_exit);
504 +
505 +MODULE_LICENSE("GPL");
506 +MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
507 +MODULE_ALIAS("nf-flow-table-hw");
508 --- a/net/netfilter/nf_tables_api.c
509 +++ b/net/netfilter/nf_tables_api.c
510 @@ -4866,6 +4866,14 @@ static int nf_tables_flowtable_parse_hoo
511 if (err < 0)
512 goto err1;
513
514 + for (i = 0; i < n; i++) {
515 + if (flowtable->data.flags & NF_FLOWTABLE_F_HW &&
516 + !dev_array[i]->netdev_ops->ndo_flow_offload) {
517 + err = -EOPNOTSUPP;
518 + goto err1;
519 + }
520 + }
521 +
522 ops = kzalloc(sizeof(struct nf_hook_ops) * n, GFP_KERNEL);
523 if (!ops) {
524 err = -ENOMEM;
525 @@ -4996,10 +5004,19 @@ static int nf_tables_newflowtable(struct
526 }
527
528 flowtable->data.type = type;
529 + write_pnet(&flowtable->data.ft_net, net);
530 +
531 err = type->init(&flowtable->data);
532 if (err < 0)
533 goto err3;
534
535 + if (nla[NFTA_FLOWTABLE_FLAGS]) {
536 + flowtable->data.flags =
537 + ntohl(nla_get_be32(nla[NFTA_FLOWTABLE_FLAGS]));
538 + if (flowtable->data.flags & ~NF_FLOWTABLE_F_HW)
539 + goto err4;
540 + }
541 +
542 err = nf_tables_flowtable_parse_hook(&ctx, nla[NFTA_FLOWTABLE_HOOK],
543 flowtable);
544 if (err < 0)
545 @@ -5097,7 +5114,8 @@ static int nf_tables_fill_flowtable_info
546 nla_put_string(skb, NFTA_FLOWTABLE_NAME, flowtable->name) ||
547 nla_put_be32(skb, NFTA_FLOWTABLE_USE, htonl(flowtable->use)) ||
548 nla_put_be64(skb, NFTA_FLOWTABLE_HANDLE, cpu_to_be64(flowtable->handle),
549 - NFTA_FLOWTABLE_PAD))
550 + NFTA_FLOWTABLE_PAD) ||
551 + nla_put_be32(skb, NFTA_FLOWTABLE_FLAGS, htonl(flowtable->data.flags)))
552 goto nla_put_failure;
553
554 nest = nla_nest_start(skb, NFTA_FLOWTABLE_HOOK);
555 --- a/net/netfilter/nft_flow_offload.c
556 +++ b/net/netfilter/nft_flow_offload.c
557 @@ -110,6 +110,9 @@ static void nft_flow_offload_eval(const
558 if (ret < 0)
559 goto err_flow_add;
560
561 + if (flowtable->flags & NF_FLOWTABLE_F_HW)
562 + nf_flow_offload_hw_add(nft_net(pkt), flow, ct);
563 +
564 return;
565
566 err_flow_add: