1 From: Pablo Neira Ayuso <pablo@netfilter.org>
2 Date: Thu, 11 Jan 2018 16:32:00 +0100
3 Subject: [PATCH] netfilter: nf_flow_table: add hardware offload support
5 This patch adds the infrastructure to offload flows to hardware, in case
6 the NIC/switch comes with built-in flow table capabilities.
8 If the hardware comes with no hardware flow tables or they have
9 limitations in terms of features, the existing infrastructure falls back
10 to the software flow table implementation.
12 The software flow table garbage collector skips entries that reside in
13 the hardware, so the hardware will be responsible for releasing this
14 flow table entry too via flow_offload_dead().
16 Hardware configuration, either to add or to delete entries, is done from
17 the hardware offload workqueue, to ensure this is done from user context
18 given that we may sleep when grabbing the mdio mutex.
20 Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
22 create mode 100644 net/netfilter/nf_flow_table_hw.c
24 --- a/include/linux/netdevice.h
25 +++ b/include/linux/netdevice.h
26 @@ -918,6 +918,13 @@ struct dev_ifalias {
32 +enum flow_offload_type {
33 + FLOW_OFFLOAD_ADD = 0,
38 * This structure defines the management hooks for network devices.
39 * The following hooks can be defined; unless noted otherwise, they are
40 @@ -1150,6 +1157,10 @@ struct dev_ifalias {
41 * int (*ndo_bridge_dellink)(struct net_device *dev, struct nlmsghdr *nlh,
44 + * int (*ndo_flow_offload)(enum flow_offload_type type,
45 + * struct flow_offload *flow);
46 + * Adds/deletes flow entry to/from net device flowtable.
48 * int (*ndo_change_carrier)(struct net_device *dev, bool new_carrier);
49 * Called to change device carrier. Soft-devices (like dummy, team, etc)
50 * which do not represent real hardware may define this to allow their
51 @@ -1377,6 +1388,8 @@ struct net_device_ops {
52 int (*ndo_bridge_dellink)(struct net_device *dev,
55 + int (*ndo_flow_offload)(enum flow_offload_type type,
56 + struct flow_offload *flow);
57 int (*ndo_change_carrier)(struct net_device *dev,
59 int (*ndo_get_phys_port_id)(struct net_device *dev,
60 --- a/include/net/netfilter/nf_flow_table.h
61 +++ b/include/net/netfilter/nf_flow_table.h
62 @@ -20,11 +20,17 @@ struct nf_flowtable_type {
66 +enum nf_flowtable_flags {
67 + NF_FLOWTABLE_F_HW = 0x1,
71 struct list_head list;
72 struct rhashtable rhashtable;
73 const struct nf_flowtable_type *type;
75 struct delayed_work gc_work;
76 + possible_net_t ft_net;
79 enum flow_offload_tuple_dir {
80 @@ -69,6 +75,7 @@ struct flow_offload_tuple_rhash {
81 #define FLOW_OFFLOAD_DNAT 0x2
82 #define FLOW_OFFLOAD_DYING 0x4
83 #define FLOW_OFFLOAD_TEARDOWN 0x8
84 +#define FLOW_OFFLOAD_HW 0x10
87 struct flow_offload_tuple_rhash tuplehash[FLOW_OFFLOAD_DIR_MAX];
88 @@ -125,6 +132,22 @@ unsigned int nf_flow_offload_ip_hook(voi
89 unsigned int nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
90 const struct nf_hook_state *state);
92 +void nf_flow_offload_hw_add(struct net *net, struct flow_offload *flow,
93 + struct nf_conn *ct);
94 +void nf_flow_offload_hw_del(struct net *net, struct flow_offload *flow);
96 +struct nf_flow_table_hw {
97 + struct module *owner;
98 + void (*add)(struct net *net, struct flow_offload *flow,
99 + struct nf_conn *ct);
100 + void (*del)(struct net *net, struct flow_offload *flow);
103 +int nf_flow_table_hw_register(const struct nf_flow_table_hw *offload);
104 +void nf_flow_table_hw_unregister(const struct nf_flow_table_hw *offload);
106 +extern struct work_struct nf_flow_offload_hw_work;
108 #define MODULE_ALIAS_NF_FLOWTABLE(family) \
109 MODULE_ALIAS("nf-flowtable-" __stringify(family))
111 --- a/include/uapi/linux/netfilter/nf_tables.h
112 +++ b/include/uapi/linux/netfilter/nf_tables.h
113 @@ -1464,6 +1464,7 @@ enum nft_object_attributes {
114 * @NFTA_FLOWTABLE_HOOK: netfilter hook configuration(NLA_U32)
115 * @NFTA_FLOWTABLE_USE: number of references to this flow table (NLA_U32)
116 * @NFTA_FLOWTABLE_HANDLE: object handle (NLA_U64)
117 + * @NFTA_FLOWTABLE_FLAGS: flags (NLA_U32)
119 enum nft_flowtable_attributes {
120 NFTA_FLOWTABLE_UNSPEC,
121 @@ -1473,6 +1474,7 @@ enum nft_flowtable_attributes {
123 NFTA_FLOWTABLE_HANDLE,
125 + NFTA_FLOWTABLE_FLAGS,
128 #define NFTA_FLOWTABLE_MAX (__NFTA_FLOWTABLE_MAX - 1)
129 --- a/net/netfilter/Kconfig
130 +++ b/net/netfilter/Kconfig
131 @@ -714,6 +714,15 @@ config NF_FLOW_TABLE
133 To compile it as a module, choose M here.
135 +config NF_FLOW_TABLE_HW
136 + tristate "Netfilter flow table hardware offload module"
137 + depends on NF_FLOW_TABLE
139 + This option adds hardware offload support for the flow table core
142 + To compile it as a module, choose M here.
144 config NETFILTER_XTABLES
145 tristate "Netfilter Xtables support (required for ip_tables)"
146 default m if NETFILTER_ADVANCED=n
147 --- a/net/netfilter/Makefile
148 +++ b/net/netfilter/Makefile
149 @@ -126,6 +126,7 @@ obj-$(CONFIG_NF_FLOW_TABLE) += nf_flow_t
150 nf_flow_table-objs := nf_flow_table_core.o nf_flow_table_ip.o
152 obj-$(CONFIG_NF_FLOW_TABLE_INET) += nf_flow_table_inet.o
153 +obj-$(CONFIG_NF_FLOW_TABLE_HW) += nf_flow_table_hw.o
156 obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o
157 --- a/net/netfilter/nf_flow_table_core.c
158 +++ b/net/netfilter/nf_flow_table_core.c
159 @@ -216,10 +216,16 @@ int flow_offload_add(struct nf_flowtable
161 EXPORT_SYMBOL_GPL(flow_offload_add);
163 +static inline bool nf_flow_in_hw(const struct flow_offload *flow)
165 + return flow->flags & FLOW_OFFLOAD_HW;
168 static void flow_offload_del(struct nf_flowtable *flow_table,
169 struct flow_offload *flow)
171 struct flow_offload_entry *e;
172 + struct net *net = read_pnet(&flow_table->ft_net);
174 rhashtable_remove_fast(&flow_table->rhashtable,
175 &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
176 @@ -234,6 +240,9 @@ static void flow_offload_del(struct nf_f
177 if (!(flow->flags & FLOW_OFFLOAD_TEARDOWN))
178 flow_offload_fixup_ct_state(e->ct);
180 + if (nf_flow_in_hw(flow))
181 + nf_flow_offload_hw_del(net, flow);
183 flow_offload_free(flow);
186 @@ -347,6 +356,9 @@ static int nf_flow_offload_gc_step(struc
188 nf_ct_offload_timeout(flow);
190 + if (nf_flow_in_hw(flow) && !teardown)
193 if (nf_flow_has_expired(flow) || teardown)
194 flow_offload_del(flow_table, flow);
196 @@ -482,10 +494,43 @@ int nf_flow_dnat_port(const struct flow_
198 EXPORT_SYMBOL_GPL(nf_flow_dnat_port);
200 +static const struct nf_flow_table_hw __rcu *nf_flow_table_hw_hook __read_mostly;
202 +static int nf_flow_offload_hw_init(struct nf_flowtable *flow_table)
204 + const struct nf_flow_table_hw *offload;
206 + if (!rcu_access_pointer(nf_flow_table_hw_hook))
207 + request_module("nf-flow-table-hw");
210 + offload = rcu_dereference(nf_flow_table_hw_hook);
212 + goto err_no_hw_offload;
214 + if (!try_module_get(offload->owner))
215 + goto err_no_hw_offload;
224 + return -EOPNOTSUPP;
227 int nf_flow_table_init(struct nf_flowtable *flowtable)
231 + if (flowtable->flags & NF_FLOWTABLE_F_HW) {
232 + err = nf_flow_offload_hw_init(flowtable);
237 INIT_DEFERRABLE_WORK(&flowtable->gc_work, nf_flow_offload_work_gc);
239 err = rhashtable_init(&flowtable->rhashtable,
240 @@ -523,6 +568,8 @@ static void nf_flow_table_iterate_cleanu
242 nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, dev);
243 flush_delayed_work(&flowtable->gc_work);
244 + if (flowtable->flags & NF_FLOWTABLE_F_HW)
245 + flush_work(&nf_flow_offload_hw_work);
248 void nf_flow_table_cleanup(struct net *net, struct net_device *dev)
249 @@ -536,6 +583,26 @@ void nf_flow_table_cleanup(struct net *n
251 EXPORT_SYMBOL_GPL(nf_flow_table_cleanup);
253 +struct work_struct nf_flow_offload_hw_work;
254 +EXPORT_SYMBOL_GPL(nf_flow_offload_hw_work);
256 +/* Give the hardware workqueue the chance to remove entries from hardware. */
257 +static void nf_flow_offload_hw_free(struct nf_flowtable *flowtable)
259 + const struct nf_flow_table_hw *offload;
261 + flush_work(&nf_flow_offload_hw_work);
264 + offload = rcu_dereference(nf_flow_table_hw_hook);
269 + module_put(offload->owner);
273 void nf_flow_table_free(struct nf_flowtable *flow_table)
275 mutex_lock(&flowtable_lock);
276 @@ -545,9 +612,58 @@ void nf_flow_table_free(struct nf_flowta
277 nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL);
278 WARN_ON(!nf_flow_offload_gc_step(flow_table));
279 rhashtable_destroy(&flow_table->rhashtable);
280 + if (flow_table->flags & NF_FLOWTABLE_F_HW)
281 + nf_flow_offload_hw_free(flow_table);
283 EXPORT_SYMBOL_GPL(nf_flow_table_free);
285 +/* Must be called from user context. */
286 +void nf_flow_offload_hw_add(struct net *net, struct flow_offload *flow,
287 + struct nf_conn *ct)
289 + const struct nf_flow_table_hw *offload;
292 + offload = rcu_dereference(nf_flow_table_hw_hook);
294 + offload->add(net, flow, ct);
297 +EXPORT_SYMBOL_GPL(nf_flow_offload_hw_add);
299 +/* Must be called from user context. */
300 +void nf_flow_offload_hw_del(struct net *net, struct flow_offload *flow)
302 + const struct nf_flow_table_hw *offload;
305 + offload = rcu_dereference(nf_flow_table_hw_hook);
307 + offload->del(net, flow);
310 +EXPORT_SYMBOL_GPL(nf_flow_offload_hw_del);
312 +int nf_flow_table_hw_register(const struct nf_flow_table_hw *offload)
314 + if (rcu_access_pointer(nf_flow_table_hw_hook))
317 + rcu_assign_pointer(nf_flow_table_hw_hook, offload);
321 +EXPORT_SYMBOL_GPL(nf_flow_table_hw_register);
323 +void nf_flow_table_hw_unregister(const struct nf_flow_table_hw *offload)
325 + WARN_ON(rcu_access_pointer(nf_flow_table_hw_hook) != offload);
326 + rcu_assign_pointer(nf_flow_table_hw_hook, NULL);
330 +EXPORT_SYMBOL_GPL(nf_flow_table_hw_unregister);
332 static int nf_flow_table_netdev_event(struct notifier_block *this,
333 unsigned long event, void *ptr)
336 +++ b/net/netfilter/nf_flow_table_hw.c
338 +#include <linux/kernel.h>
339 +#include <linux/init.h>
340 +#include <linux/module.h>
341 +#include <linux/netfilter.h>
342 +#include <linux/rhashtable.h>
343 +#include <linux/netdevice.h>
344 +#include <net/netfilter/nf_flow_table.h>
345 +#include <net/netfilter/nf_conntrack.h>
346 +#include <net/netfilter/nf_conntrack_core.h>
347 +#include <net/netfilter/nf_conntrack_tuple.h>
349 +static DEFINE_SPINLOCK(flow_offload_hw_pending_list_lock);
350 +static LIST_HEAD(flow_offload_hw_pending_list);
352 +static DEFINE_MUTEX(nf_flow_offload_hw_mutex);
354 +struct flow_offload_hw {
355 + struct list_head list;
356 + enum flow_offload_type type;
357 + struct flow_offload *flow;
358 + struct nf_conn *ct;
359 + possible_net_t flow_hw_net;
362 +static int do_flow_offload_hw(struct net *net, struct flow_offload *flow,
365 + struct net_device *indev;
368 + ifindex = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.iifidx;
369 + indev = dev_get_by_index(net, ifindex);
370 + if (WARN_ON(!indev))
373 + mutex_lock(&nf_flow_offload_hw_mutex);
374 + ret = indev->netdev_ops->ndo_flow_offload(type, flow);
375 + mutex_unlock(&nf_flow_offload_hw_mutex);
382 +static void flow_offload_hw_work_add(struct flow_offload_hw *offload)
387 + if (nf_ct_is_dying(offload->ct))
390 + net = read_pnet(&offload->flow_hw_net);
391 + ret = do_flow_offload_hw(net, offload->flow, FLOW_OFFLOAD_ADD);
393 + offload->flow->flags |= FLOW_OFFLOAD_HW;
396 +static void flow_offload_hw_work_del(struct flow_offload_hw *offload)
398 + struct net *net = read_pnet(&offload->flow_hw_net);
400 + do_flow_offload_hw(net, offload->flow, FLOW_OFFLOAD_DEL);
403 +static void flow_offload_hw_work(struct work_struct *work)
405 + struct flow_offload_hw *offload, *next;
406 + LIST_HEAD(hw_offload_pending);
408 + spin_lock_bh(&flow_offload_hw_pending_list_lock);
409 + list_replace_init(&flow_offload_hw_pending_list, &hw_offload_pending);
410 + spin_unlock_bh(&flow_offload_hw_pending_list_lock);
412 + list_for_each_entry_safe(offload, next, &hw_offload_pending, list) {
413 + switch (offload->type) {
414 + case FLOW_OFFLOAD_ADD:
415 + flow_offload_hw_work_add(offload);
417 + case FLOW_OFFLOAD_DEL:
418 + flow_offload_hw_work_del(offload);
422 + nf_conntrack_put(&offload->ct->ct_general);
423 + list_del(&offload->list);
428 +static void flow_offload_queue_work(struct flow_offload_hw *offload)
430 + spin_lock_bh(&flow_offload_hw_pending_list_lock);
431 + list_add_tail(&offload->list, &flow_offload_hw_pending_list);
432 + spin_unlock_bh(&flow_offload_hw_pending_list_lock);
434 + schedule_work(&nf_flow_offload_hw_work);
437 +static void flow_offload_hw_add(struct net *net, struct flow_offload *flow,
438 + struct nf_conn *ct)
440 + struct flow_offload_hw *offload;
442 + offload = kmalloc(sizeof(struct flow_offload_hw), GFP_ATOMIC);
446 + nf_conntrack_get(&ct->ct_general);
447 + offload->type = FLOW_OFFLOAD_ADD;
449 + offload->flow = flow;
450 + write_pnet(&offload->flow_hw_net, net);
452 + flow_offload_queue_work(offload);
455 +static void flow_offload_hw_del(struct net *net, struct flow_offload *flow)
457 + struct flow_offload_hw *offload;
459 + offload = kmalloc(sizeof(struct flow_offload_hw), GFP_ATOMIC);
463 + offload->type = FLOW_OFFLOAD_DEL;
464 + offload->ct = NULL;
465 + offload->flow = flow;
466 + write_pnet(&offload->flow_hw_net, net);
468 + flow_offload_queue_work(offload);
471 +static const struct nf_flow_table_hw flow_offload_hw = {
472 + .add = flow_offload_hw_add,
473 + .del = flow_offload_hw_del,
474 + .owner = THIS_MODULE,
477 +static int __init nf_flow_table_hw_module_init(void)
479 + INIT_WORK(&nf_flow_offload_hw_work, flow_offload_hw_work);
480 + nf_flow_table_hw_register(&flow_offload_hw);
485 +static void __exit nf_flow_table_hw_module_exit(void)
487 + struct flow_offload_hw *offload, *next;
488 + LIST_HEAD(hw_offload_pending);
490 + nf_flow_table_hw_unregister(&flow_offload_hw);
491 + cancel_work_sync(&nf_flow_offload_hw_work);
493 + list_for_each_entry_safe(offload, next, &hw_offload_pending, list) {
495 + nf_conntrack_put(&offload->ct->ct_general);
496 + list_del(&offload->list);
501 +module_init(nf_flow_table_hw_module_init);
502 +module_exit(nf_flow_table_hw_module_exit);
504 +MODULE_LICENSE("GPL");
505 +MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
506 +MODULE_ALIAS("nf-flow-table-hw");
507 --- a/net/netfilter/nf_tables_api.c
508 +++ b/net/netfilter/nf_tables_api.c
509 @@ -5399,6 +5399,13 @@ static int nf_tables_flowtable_parse_hoo
513 + for (i = 0; i < n; i++) {
514 + if (flowtable->data.flags & NF_FLOWTABLE_F_HW &&
515 + !dev_array[i]->netdev_ops->ndo_flow_offload) {
516 + return -EOPNOTSUPP;
520 ops = kcalloc(n, sizeof(struct nf_hook_ops), GFP_KERNEL);
523 @@ -5530,10 +5537,19 @@ static int nf_tables_newflowtable(struct
526 flowtable->data.type = type;
527 + write_pnet(&flowtable->data.ft_net, net);
529 err = type->init(&flowtable->data);
533 + if (nla[NFTA_FLOWTABLE_FLAGS]) {
534 + flowtable->data.flags =
535 + ntohl(nla_get_be32(nla[NFTA_FLOWTABLE_FLAGS]));
536 + if (flowtable->data.flags & ~NF_FLOWTABLE_F_HW)
540 err = nf_tables_flowtable_parse_hook(&ctx, nla[NFTA_FLOWTABLE_HOOK],
543 @@ -5659,7 +5675,8 @@ static int nf_tables_fill_flowtable_info
544 nla_put_string(skb, NFTA_FLOWTABLE_NAME, flowtable->name) ||
545 nla_put_be32(skb, NFTA_FLOWTABLE_USE, htonl(flowtable->use)) ||
546 nla_put_be64(skb, NFTA_FLOWTABLE_HANDLE, cpu_to_be64(flowtable->handle),
547 - NFTA_FLOWTABLE_PAD))
548 + NFTA_FLOWTABLE_PAD) ||
549 + nla_put_be32(skb, NFTA_FLOWTABLE_FLAGS, htonl(flowtable->data.flags)))
550 goto nla_put_failure;
552 nest = nla_nest_start(skb, NFTA_FLOWTABLE_HOOK);
553 --- a/net/netfilter/nft_flow_offload.c
554 +++ b/net/netfilter/nft_flow_offload.c
555 @@ -124,6 +124,9 @@ static void nft_flow_offload_eval(const
559 + if (flowtable->flags & NF_FLOWTABLE_F_HW)
560 + nf_flow_offload_hw_add(nft_net(pkt), flow, ct);