1 From: Pablo Neira Ayuso <pablo@netfilter.org>
2 Date: Sun, 7 Jan 2018 01:04:11 +0100
3 Subject: [PATCH] netfilter: add generic flow table infrastructure
5 This patch defines the API to interact with flow tables. It allows
6 adding, deleting and looking up entries in the flow table. It also adds
7 the generic garbage collection code that removes entries that have
8 expired, i.e. entries for which no traffic has been seen for a while.
10 Users of the flow table infrastructure can delete entries via
11 flow_offload_dead(), which sets the dying bit. This signals the garbage
12 collector to release the entry from user context.
14 Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
16 create mode 100644 net/netfilter/nf_flow_table.c
18 --- a/include/net/netfilter/nf_flow_table.h
19 +++ b/include/net/netfilter/nf_flow_table.h
21 #ifndef _NF_FLOW_TABLE_H
22 #define _NF_FLOW_TABLE_H
24 +#include <linux/in.h>
25 +#include <linux/in6.h>
26 +#include <linux/netdevice.h>
27 #include <linux/rhashtable.h>
28 +#include <linux/rcupdate.h>
33 @@ -20,4 +25,93 @@ struct nf_flowtable {
34 struct delayed_work gc_work;
37 +enum flow_offload_tuple_dir {
38 + FLOW_OFFLOAD_DIR_ORIGINAL,
39 + FLOW_OFFLOAD_DIR_REPLY,
40 + __FLOW_OFFLOAD_DIR_MAX = FLOW_OFFLOAD_DIR_REPLY,
42 +#define FLOW_OFFLOAD_DIR_MAX (__FLOW_OFFLOAD_DIR_MAX + 1)
44 +struct flow_offload_tuple {
46 + struct in_addr src_v4;
47 + struct in6_addr src_v6;
50 + struct in_addr dst_v4;
51 + struct in6_addr dst_v6;
66 + struct dst_entry *dst_cache;
69 +struct flow_offload_tuple_rhash {
70 + struct rhash_head node;
71 + struct flow_offload_tuple tuple;
74 +#define FLOW_OFFLOAD_SNAT 0x1
75 +#define FLOW_OFFLOAD_DNAT 0x2
76 +#define FLOW_OFFLOAD_DYING 0x4
78 +struct flow_offload {
79 + struct flow_offload_tuple_rhash tuplehash[FLOW_OFFLOAD_DIR_MAX];
82 + /* Your private driver data here. */
87 +#define NF_FLOW_TIMEOUT (30 * HZ)
89 +struct nf_flow_route {
91 + struct dst_entry *dst;
93 + } tuple[FLOW_OFFLOAD_DIR_MAX];
96 +struct flow_offload *flow_offload_alloc(struct nf_conn *ct,
97 + struct nf_flow_route *route);
98 +void flow_offload_free(struct flow_offload *flow);
100 +int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow);
101 +void flow_offload_del(struct nf_flowtable *flow_table, struct flow_offload *flow);
102 +struct flow_offload_tuple_rhash *flow_offload_lookup(struct nf_flowtable *flow_table,
103 + struct flow_offload_tuple *tuple);
104 +int nf_flow_table_iterate(struct nf_flowtable *flow_table,
105 + void (*iter)(struct flow_offload *flow, void *data),
107 +void nf_flow_offload_work_gc(struct work_struct *work);
108 +extern const struct rhashtable_params nf_flow_offload_rhash_params;
110 +void flow_offload_dead(struct flow_offload *flow);
112 +int nf_flow_snat_port(const struct flow_offload *flow,
113 + struct sk_buff *skb, unsigned int thoff,
114 + u8 protocol, enum flow_offload_tuple_dir dir);
115 +int nf_flow_dnat_port(const struct flow_offload *flow,
116 + struct sk_buff *skb, unsigned int thoff,
117 + u8 protocol, enum flow_offload_tuple_dir dir);
120 + __be16 source, dest;
123 +#define MODULE_ALIAS_NF_FLOWTABLE(family) \
124 + MODULE_ALIAS("nf-flowtable-" __stringify(family))
126 #endif /* _FLOW_OFFLOAD_H */
127 --- a/net/netfilter/Kconfig
128 +++ b/net/netfilter/Kconfig
129 @@ -661,6 +661,13 @@ endif # NF_TABLES_NETDEV
133 +config NF_FLOW_TABLE
134 + tristate "Netfilter flow table module"
136 + This option adds the flow table core infrastructure.
138 + To compile it as a module, choose M here.
140 config NETFILTER_XTABLES
141 tristate "Netfilter Xtables support (required for ip_tables)"
142 default m if NETFILTER_ADVANCED=n
143 --- a/net/netfilter/Makefile
144 +++ b/net/netfilter/Makefile
145 @@ -110,6 +110,9 @@ obj-$(CONFIG_NFT_FIB_NETDEV) += nft_fib_
146 obj-$(CONFIG_NFT_DUP_NETDEV) += nft_dup_netdev.o
147 obj-$(CONFIG_NFT_FWD_NETDEV) += nft_fwd_netdev.o
149 +# flow table infrastructure
150 +obj-$(CONFIG_NF_FLOW_TABLE) += nf_flow_table.o
153 obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o
156 +++ b/net/netfilter/nf_flow_table.c
158 +#include <linux/kernel.h>
159 +#include <linux/init.h>
160 +#include <linux/module.h>
161 +#include <linux/netfilter.h>
162 +#include <linux/rhashtable.h>
163 +#include <linux/netdevice.h>
164 +#include <net/netfilter/nf_flow_table.h>
165 +#include <net/netfilter/nf_conntrack.h>
166 +#include <net/netfilter/nf_conntrack_core.h>
167 +#include <net/netfilter/nf_conntrack_tuple.h>
169 +struct flow_offload_entry {
170 + struct flow_offload flow;
171 + struct nf_conn *ct;
172 + struct rcu_head rcu_head;
175 +struct flow_offload *
176 +flow_offload_alloc(struct nf_conn *ct, struct nf_flow_route *route)
178 + struct flow_offload_entry *entry;
179 + struct flow_offload *flow;
181 + if (unlikely(nf_ct_is_dying(ct) ||
182 + !atomic_inc_not_zero(&ct->ct_general.use)))
185 + entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
187 + goto err_ct_refcnt;
189 + flow = &entry->flow;
191 + if (!dst_hold_safe(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst))
192 + goto err_dst_cache_original;
194 + if (!dst_hold_safe(route->tuple[FLOW_OFFLOAD_DIR_REPLY].dst))
195 + goto err_dst_cache_reply;
199 + switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num) {
201 + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4 =
202 + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.in;
203 + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4 =
204 + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in;
205 + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4 =
206 + ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.in;
207 + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4 =
208 + ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.in;
211 + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6 =
212 + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.in6;
213 + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6 =
214 + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in6;
215 + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v6 =
216 + ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.in6;
217 + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v6 =
218 + ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.in6;
222 + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l3proto =
223 + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num;
224 + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l4proto =
225 + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum;
226 + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.l3proto =
227 + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num;
228 + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.l4proto =
229 + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum;
231 + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache =
232 + route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst;
233 + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache =
234 + route->tuple[FLOW_OFFLOAD_DIR_REPLY].dst;
236 + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port =
237 + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.tcp.port;
238 + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port =
239 + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.tcp.port;
240 + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port =
241 + ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u.tcp.port;
242 + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port =
243 + ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.tcp.port;
245 + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dir =
246 + FLOW_OFFLOAD_DIR_ORIGINAL;
247 + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dir =
248 + FLOW_OFFLOAD_DIR_REPLY;
250 + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.iifidx =
251 + route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].ifindex;
252 + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.oifidx =
253 + route->tuple[FLOW_OFFLOAD_DIR_REPLY].ifindex;
254 + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.iifidx =
255 + route->tuple[FLOW_OFFLOAD_DIR_REPLY].ifindex;
256 + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.oifidx =
257 + route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].ifindex;
259 + if (ct->status & IPS_SRC_NAT)
260 + flow->flags |= FLOW_OFFLOAD_SNAT;
261 + else if (ct->status & IPS_DST_NAT)
262 + flow->flags |= FLOW_OFFLOAD_DNAT;
266 +err_dst_cache_reply:
267 + dst_release(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst);
268 +err_dst_cache_original:
275 +EXPORT_SYMBOL_GPL(flow_offload_alloc);
277 +void flow_offload_free(struct flow_offload *flow)
279 + struct flow_offload_entry *e;
281 + dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache);
282 + dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache);
283 + e = container_of(flow, struct flow_offload_entry, flow);
286 +EXPORT_SYMBOL_GPL(flow_offload_free);
288 +void flow_offload_dead(struct flow_offload *flow)
290 + flow->flags |= FLOW_OFFLOAD_DYING;
292 +EXPORT_SYMBOL_GPL(flow_offload_dead);
294 +int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
296 + flow->timeout = (u32)jiffies;
298 + rhashtable_insert_fast(&flow_table->rhashtable,
299 + &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
300 + *flow_table->type->params);
301 + rhashtable_insert_fast(&flow_table->rhashtable,
302 + &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
303 + *flow_table->type->params);
306 +EXPORT_SYMBOL_GPL(flow_offload_add);
308 +void flow_offload_del(struct nf_flowtable *flow_table,
309 + struct flow_offload *flow)
311 + struct flow_offload_entry *e;
313 + rhashtable_remove_fast(&flow_table->rhashtable,
314 + &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
315 + *flow_table->type->params);
316 + rhashtable_remove_fast(&flow_table->rhashtable,
317 + &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
318 + *flow_table->type->params);
320 + e = container_of(flow, struct flow_offload_entry, flow);
321 + kfree_rcu(e, rcu_head);
323 +EXPORT_SYMBOL_GPL(flow_offload_del);
325 +struct flow_offload_tuple_rhash *
326 +flow_offload_lookup(struct nf_flowtable *flow_table,
327 + struct flow_offload_tuple *tuple)
329 + return rhashtable_lookup_fast(&flow_table->rhashtable, tuple,
330 + *flow_table->type->params);
332 +EXPORT_SYMBOL_GPL(flow_offload_lookup);
334 +static void nf_flow_release_ct(const struct flow_offload *flow)
336 + struct flow_offload_entry *e;
338 + e = container_of(flow, struct flow_offload_entry, flow);
339 + nf_ct_delete(e->ct, 0, 0);
343 +int nf_flow_table_iterate(struct nf_flowtable *flow_table,
344 + void (*iter)(struct flow_offload *flow, void *data),
347 + struct flow_offload_tuple_rhash *tuplehash;
348 + struct rhashtable_iter hti;
349 + struct flow_offload *flow;
352 + err = rhashtable_walk_init(&flow_table->rhashtable, &hti, GFP_KERNEL);
356 + rhashtable_walk_start(&hti);
358 + while ((tuplehash = rhashtable_walk_next(&hti))) {
359 + if (IS_ERR(tuplehash)) {
360 + err = PTR_ERR(tuplehash);
361 + if (err != -EAGAIN)
366 + if (tuplehash->tuple.dir)
369 + flow = container_of(tuplehash, struct flow_offload, tuplehash[0]);
374 + rhashtable_walk_stop(&hti);
375 + rhashtable_walk_exit(&hti);
379 +EXPORT_SYMBOL_GPL(nf_flow_table_iterate);
381 +static inline bool nf_flow_has_expired(const struct flow_offload *flow)
383 + return (__s32)(flow->timeout - (u32)jiffies) <= 0;
386 +static inline bool nf_flow_is_dying(const struct flow_offload *flow)
388 + return flow->flags & FLOW_OFFLOAD_DYING;
391 +void nf_flow_offload_work_gc(struct work_struct *work)
393 + struct flow_offload_tuple_rhash *tuplehash;
394 + struct nf_flowtable *flow_table;
395 + struct rhashtable_iter hti;
396 + struct flow_offload *flow;
399 + flow_table = container_of(work, struct nf_flowtable, gc_work.work);
401 + err = rhashtable_walk_init(&flow_table->rhashtable, &hti, GFP_KERNEL);
405 + rhashtable_walk_start(&hti);
407 + while ((tuplehash = rhashtable_walk_next(&hti))) {
408 + if (IS_ERR(tuplehash)) {
409 + err = PTR_ERR(tuplehash);
410 + if (err != -EAGAIN)
415 + if (tuplehash->tuple.dir)
418 + flow = container_of(tuplehash, struct flow_offload, tuplehash[0]);
420 + if (nf_flow_has_expired(flow) ||
421 + nf_flow_is_dying(flow)) {
422 + flow_offload_del(flow_table, flow);
423 + nf_flow_release_ct(flow);
427 + rhashtable_walk_stop(&hti);
428 + rhashtable_walk_exit(&hti);
430 + queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ);
432 +EXPORT_SYMBOL_GPL(nf_flow_offload_work_gc);
434 +static u32 flow_offload_hash(const void *data, u32 len, u32 seed)
436 + const struct flow_offload_tuple *tuple = data;
438 + return jhash(tuple, offsetof(struct flow_offload_tuple, dir), seed);
441 +static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed)
443 + const struct flow_offload_tuple_rhash *tuplehash = data;
445 + return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, dir), seed);
448 +static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg,
451 + const struct flow_offload_tuple *tuple = arg->key;
452 + const struct flow_offload_tuple_rhash *x = ptr;
454 + if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, dir)))
460 +const struct rhashtable_params nf_flow_offload_rhash_params = {
461 + .head_offset = offsetof(struct flow_offload_tuple_rhash, node),
462 + .hashfn = flow_offload_hash,
463 + .obj_hashfn = flow_offload_hash_obj,
464 + .obj_cmpfn = flow_offload_hash_cmp,
465 + .automatic_shrinking = true,
467 +EXPORT_SYMBOL_GPL(nf_flow_offload_rhash_params);
469 +static int nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff,
470 + __be16 port, __be16 new_port)
472 + struct tcphdr *tcph;
474 + if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
475 + skb_try_make_writable(skb, thoff + sizeof(*tcph)))
478 + tcph = (void *)(skb_network_header(skb) + thoff);
479 + inet_proto_csum_replace2(&tcph->check, skb, port, new_port, true);
484 +static int nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff,
485 + __be16 port, __be16 new_port)
487 + struct udphdr *udph;
489 + if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
490 + skb_try_make_writable(skb, thoff + sizeof(*udph)))
493 + udph = (void *)(skb_network_header(skb) + thoff);
494 + if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
495 + inet_proto_csum_replace2(&udph->check, skb, port,
498 + udph->check = CSUM_MANGLED_0;
504 +static int nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff,
505 + u8 protocol, __be16 port, __be16 new_port)
507 + switch (protocol) {
509 + if (nf_flow_nat_port_tcp(skb, thoff, port, new_port) < 0)
513 + if (nf_flow_nat_port_udp(skb, thoff, port, new_port) < 0)
521 +int nf_flow_snat_port(const struct flow_offload *flow,
522 + struct sk_buff *skb, unsigned int thoff,
523 + u8 protocol, enum flow_offload_tuple_dir dir)
525 + struct flow_ports *hdr;
526 + __be16 port, new_port;
528 + if (!pskb_may_pull(skb, thoff + sizeof(*hdr)) ||
529 + skb_try_make_writable(skb, thoff + sizeof(*hdr)))
532 + hdr = (void *)(skb_network_header(skb) + thoff);
535 + case FLOW_OFFLOAD_DIR_ORIGINAL:
536 + port = hdr->source;
537 + new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port;
538 + hdr->source = new_port;
540 + case FLOW_OFFLOAD_DIR_REPLY:
542 + new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port;
543 + hdr->dest = new_port;
549 + return nf_flow_nat_port(skb, thoff, protocol, port, new_port);
551 +EXPORT_SYMBOL_GPL(nf_flow_snat_port);
553 +int nf_flow_dnat_port(const struct flow_offload *flow,
554 + struct sk_buff *skb, unsigned int thoff,
555 + u8 protocol, enum flow_offload_tuple_dir dir)
557 + struct flow_ports *hdr;
558 + __be16 port, new_port;
560 + if (!pskb_may_pull(skb, thoff + sizeof(*hdr)) ||
561 + skb_try_make_writable(skb, thoff + sizeof(*hdr)))
564 + hdr = (void *)(skb_network_header(skb) + thoff);
567 + case FLOW_OFFLOAD_DIR_ORIGINAL:
569 + new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port;
570 + hdr->dest = new_port;
572 + case FLOW_OFFLOAD_DIR_REPLY:
573 + port = hdr->source;
574 + new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port;
575 + hdr->source = new_port;
581 + return nf_flow_nat_port(skb, thoff, protocol, port, new_port);
583 +EXPORT_SYMBOL_GPL(nf_flow_dnat_port);
585 +MODULE_LICENSE("GPL");
586 +MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");