From 67981fefb20e717cea55b42f9081a833fa46b3be Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 11 Sep 2014 16:55:04 +0200 Subject: [PATCH 01/34] netfilter: fix compilation of masquerading without IP_NF_TARGET_MASQUERADE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CONFIG_NF_NAT_MASQUERADE_IPV6=m # CONFIG_IP6_NF_TARGET_MASQUERADE is not set results in: net/ipv6/netfilter/nf_nat_masquerade_ipv6.c: In function ‘nf_nat_masquerade_ipv6’: net/ipv6/netfilter/nf_nat_masquerade_ipv6.c:41:14: error: ‘struct nf_conn_nat’ has no member named ‘masq_index’ nfct_nat(ct)->masq_index = out->ifindex; ^ net/ipv6/netfilter/nf_nat_masquerade_ipv6.c: In function ‘device_cmp’: net/ipv6/netfilter/nf_nat_masquerade_ipv6.c:61:12: error: ‘const struct nf_conn_nat’ has no member named ‘masq_index’ return nat->masq_index == (int)(long)ifindex; ^ net/ipv6/netfilter/nf_nat_masquerade_ipv6.c:62:1: warning: control reaches end of non-void function [-Wreturn-type] } ^ make[3]: *** [net/ipv6/netfilter/nf_nat_masquerade_ipv6.o] Error 1 Fix this by using the new NF_NAT_MASQUERADE_IPV4 and _IPV6 symbols in include/net/netfilter/nf_nat.h. Reported-by: Jim Davis Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_nat.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h index a71dd333ac68..344b1ab19220 100644 --- a/include/net/netfilter/nf_nat.h +++ b/include/net/netfilter/nf_nat.h @@ -32,10 +32,8 @@ struct nf_conn_nat { struct hlist_node bysource; struct nf_conn *ct; union nf_conntrack_nat_help help; -#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \ - defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE) || \ - defined(CONFIG_IP6_NF_TARGET_MASQUERADE) || \ - defined(CONFIG_IP6_NF_TARGET_MASQUERADE_MODULE) +#if IS_ENABLED(CONFIG_NF_NAT_MASQUERADE_IPV4) || \ + IS_ENABLED(CONFIG_NF_NAT_MASQUERADE_IPV6) int masq_index; #endif }; @@ -68,8 +66,8 @@ static inline bool nf_nat_oif_changed(unsigned int hooknum, struct nf_conn_nat *nat, const struct net_device *out) { -#if IS_ENABLED(CONFIG_IP_NF_TARGET_MASQUERADE) || \ - IS_ENABLED(CONFIG_IP6_NF_TARGET_MASQUERADE) +#if IS_ENABLED(CONFIG_NF_NAT_MASQUERADE_IPV4) || \ + IS_ENABLED(CONFIG_NF_NAT_MASQUERADE_IPV6) return nat->masq_index && hooknum == NF_INET_POST_ROUTING && CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL && nat->masq_index != out->ifindex; From 39e393bb4f653d38aea40190e1aa9a49062eed4d Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 11 Sep 2014 11:02:39 +0200 Subject: [PATCH 02/34] netfilter: nf_tables: add NFTA_MASQ_UNSPEC to nft_masq_attributes To keep this consistent with other nft_*_attributes. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index eeec0ae845ef..66d66dd3ff79 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -806,6 +806,7 @@ enum nft_nat_attributes { * @NFTA_MASQ_FLAGS: NAT flags (see NF_NAT_RANGE_* in linux/netfilter/nf_nat.h) (NLA_U32) */ enum nft_masq_attributes { + NFTA_MASQ_UNSPEC, NFTA_MASQ_FLAGS, __NFTA_MASQ_MAX }; From 3e8dc212a0e68a9a90c97f34a92c4cdd97d19dd3 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 11 Sep 2014 17:42:00 +0200 Subject: [PATCH 03/34] netfilter: NFT_CHAIN_NAT_IPV* is independent of NFT_NAT Now that we have masquerading support in nf_tables, the NAT chain can be use with it, not only for SNAT/DNAT. So make this chain type independent of it. While at it, move it inside the scope of 'if NF_NAT_IPV*' to simplify dependencies. Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/Kconfig | 19 +++++++++---------- net/ipv6/netfilter/Kconfig | 23 +++++++++++++---------- 2 files changed, 22 insertions(+), 20 deletions(-) diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index d189c5262bdb..eb6995fc4f67 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -61,16 +61,6 @@ config NFT_CHAIN_ROUTE_IPV4 fields such as the source, destination, type of service and the packet mark. -config NFT_CHAIN_NAT_IPV4 - depends on NF_TABLES_IPV4 - depends on NF_NAT_IPV4 && NFT_NAT - tristate "IPv4 nf_tables nat chain support" - help - This option enables the "nat" chain for IPv4 in nf_tables. This - chain type is used to perform Network Address Translation (NAT) - packet transformations such as the source, destination address and - source and destination ports. - config NFT_REJECT_IPV4 depends on NF_TABLES_IPV4 default NFT_REJECT @@ -94,6 +84,15 @@ config NF_NAT_IPV4 if NF_NAT_IPV4 +config NFT_CHAIN_NAT_IPV4 + depends on NF_TABLES_IPV4 + tristate "IPv4 nf_tables nat chain support" + help + This option enables the "nat" chain for IPv4 in nf_tables. This + chain type is used to perform Network Address Translation (NAT) + packet transformations such as the source, destination address and + source and destination ports. + config NF_NAT_SNMP_BASIC tristate "Basic SNMP-ALG support" depends on NF_CONNTRACK_SNMP diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index a8f25306a46a..e854062d0c36 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -40,16 +40,6 @@ config NFT_CHAIN_ROUTE_IPV6 fields such as the source, destination, flowlabel, hop-limit and the packet mark. -config NFT_CHAIN_NAT_IPV6 - depends on NF_TABLES_IPV6 - depends on NF_NAT_IPV6 && NFT_NAT - tristate "IPv6 nf_tables nat chain support" - help - This option enables the "nat" chain for IPv6 in nf_tables. This - chain type is used to perform Network Address Translation (NAT) - packet transformations such as the source, destination address and - source and destination ports. - config NFT_REJECT_IPV6 depends on NF_TABLES_IPV6 default NFT_REJECT @@ -70,6 +60,19 @@ config NF_NAT_IPV6 forms of full Network Address Port Translation. This can be controlled by iptables or nft. +if NF_NAT_IPV6 + +config NFT_CHAIN_NAT_IPV6 + depends on NF_TABLES_IPV6 + tristate "IPv6 nf_tables nat chain support" + help + This option enables the "nat" chain for IPv6 in nf_tables. This + chain type is used to perform Network Address Translation (NAT) + packet transformations such as the source, destination address and + source and destination ports. + +endif # NF_NAT_IPV6 + config IP6_NF_IPTABLES tristate "IP6 tables support (required for filtering)" depends on INET && IPV6 From 0bbe80e571c7b866afd92a98edd32a969467a7a9 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 11 Sep 2014 17:51:27 +0200 Subject: [PATCH 04/34] netfilter: masquerading needs to be independent of x_tables in Kconfig Users are starting to test nf_tables with no x_tables support. Therefore, masquerading needs to be indenpendent of it from Kconfig. Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/Kconfig | 27 +++++++++++++++------------ net/ipv6/netfilter/Kconfig | 27 +++++++++++++++------------ 2 files changed, 30 insertions(+), 24 deletions(-) diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index eb6995fc4f67..345242a79db6 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -93,6 +93,21 @@ config NFT_CHAIN_NAT_IPV4 packet transformations such as the source, destination address and source and destination ports. +config NF_NAT_MASQUERADE_IPV4 + tristate "IPv4 masquerade support" + help + This is the kernel functionality to provide NAT in the masquerade + flavour (automatic source address selection). + +config NFT_MASQ_IPV4 + tristate "IPv4 masquerading support for nf_tables" + depends on NF_TABLES_IPV4 + depends on NFT_MASQ + select NF_NAT_MASQUERADE_IPV4 + help + This is the expression that provides IPv4 masquerading support for + nf_tables. + config NF_NAT_SNMP_BASIC tristate "Basic SNMP-ALG support" depends on NF_CONNTRACK_SNMP @@ -231,18 +246,6 @@ config IP_NF_NAT if IP_NF_NAT -config NF_NAT_MASQUERADE_IPV4 - tristate "IPv4 masquerade support" - help - This is the kernel functionality to provide NAT in the masquerade - flavour (automatic source address selection). - -config NFT_MASQ_IPV4 - tristate "IPv4 masquerading support for nf_tables" - depends on NF_TABLES_IPV4 - depends on NFT_MASQ - select NF_NAT_MASQUERADE_IPV4 - config IP_NF_TARGET_MASQUERADE tristate "MASQUERADE target support" select NF_NAT_MASQUERADE_IPV4 diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index e854062d0c36..bb1a40db7be1 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -71,6 +71,21 @@ config NFT_CHAIN_NAT_IPV6 packet transformations such as the source, destination address and source and destination ports. +config NF_NAT_MASQUERADE_IPV6 + tristate "IPv6 masquerade support" + help + This is the kernel functionality to provide NAT in the masquerade + flavour (automatic source address selection) for IPv6. + +config NFT_MASQ_IPV6 + tristate "IPv6 masquerade support for nf_tables" + depends on NF_TABLES_IPV6 + depends on NFT_MASQ + select NF_NAT_MASQUERADE_IPV6 + help + This is the expression that provides IPv4 masquerading support for + nf_tables. + endif # NF_NAT_IPV6 config IP6_NF_IPTABLES @@ -261,18 +276,6 @@ config IP6_NF_NAT if IP6_NF_NAT -config NF_NAT_MASQUERADE_IPV6 - tristate "IPv6 masquerade support" - help - This is the kernel functionality to provide NAT in the masquerade - flavour (automatic source address selection) for IPv6. - -config NFT_MASQ_IPV6 - tristate "IPv6 masquerade support for nf_tables" - depends on NF_TABLES_IPV6 - depends on NFT_MASQ - select NF_NAT_MASQUERADE_IPV6 - config IP6_NF_TARGET_MASQUERADE tristate "MASQUERADE target support" select NF_NAT_MASQUERADE_IPV6 From 73e64e1813e9ea45885419d0fff1e628a6ab95d4 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Mon, 15 Sep 2014 20:48:26 +0200 Subject: [PATCH 05/34] netfilter: ipset: Fix static checker warning in ip_set_core.c Dan Carpenter reported the following static checker warning: net/netfilter/ipset/ip_set_core.c:1414 call_ad() error: 'nlh->nlmsg_len' from user is not capped properly The payload size is limited now by the max size of size_t. Signed-off-by: Jozsef Kadlecsik --- net/netfilter/ipset/ip_set_core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 5593e97426c4..4ca4e5ca6f57 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -1397,7 +1397,8 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, struct nlmsghdr *rep, *nlh = nlmsg_hdr(skb); struct sk_buff *skb2; struct nlmsgerr *errmsg; - size_t payload = sizeof(*errmsg) + nlmsg_len(nlh); + size_t payload = min(SIZE_MAX, + sizeof(*errmsg) + nlmsg_len(nlh)); int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); struct nlattr *cda[IPSET_ATTR_CMD_MAX+1]; struct nlattr *cmdattr; From 0e9871e3f79fd17c691b50a9669220c54ff084a2 Mon Sep 17 00:00:00 2001 From: Anton Danilov Date: Thu, 28 Aug 2014 10:11:27 +0400 Subject: [PATCH 06/34] netfilter: ipset: Add skbinfo extension kernel support in the ipset core. Skbinfo extension provides mapping of metainformation with lookup in the ipset tables. This patch defines the flags, the constants, the functions and the structures for the data type independent support of the extension. Note the firewall mark stores in the kernel structures as two 32bit values, but transfered through netlink as one 64bit value. Signed-off-by: Anton Danilov Signed-off-by: Jozsef Kadlecsik --- include/linux/netfilter/ipset/ip_set.h | 56 ++++++++++++++++++++- include/uapi/linux/netfilter/ipset/ip_set.h | 12 +++++ net/netfilter/ipset/ip_set_core.c | 27 +++++++++- 3 files changed, 93 insertions(+), 2 deletions(-) diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index 96afc29184be..b97aac5142ed 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -57,6 +57,8 @@ enum ip_set_extension { IPSET_EXT_COUNTER = (1 << IPSET_EXT_BIT_COUNTER), IPSET_EXT_BIT_COMMENT = 2, IPSET_EXT_COMMENT = (1 << IPSET_EXT_BIT_COMMENT), + IPSET_EXT_BIT_SKBINFO = 3, + IPSET_EXT_SKBINFO = (1 << IPSET_EXT_BIT_SKBINFO), /* Mark set with an extension which needs to call destroy */ IPSET_EXT_BIT_DESTROY = 7, IPSET_EXT_DESTROY = (1 << IPSET_EXT_BIT_DESTROY), @@ -65,12 +67,14 @@ enum ip_set_extension { #define SET_WITH_TIMEOUT(s) ((s)->extensions & IPSET_EXT_TIMEOUT) #define SET_WITH_COUNTER(s) ((s)->extensions & IPSET_EXT_COUNTER) #define SET_WITH_COMMENT(s) ((s)->extensions & IPSET_EXT_COMMENT) +#define SET_WITH_SKBINFO(s) ((s)->extensions & IPSET_EXT_SKBINFO) #define SET_WITH_FORCEADD(s) ((s)->flags & IPSET_CREATE_FLAG_FORCEADD) /* Extension id, in size order */ enum ip_set_ext_id { IPSET_EXT_ID_COUNTER = 0, IPSET_EXT_ID_TIMEOUT, + IPSET_EXT_ID_SKBINFO, IPSET_EXT_ID_COMMENT, IPSET_EXT_ID_MAX, }; @@ -92,6 +96,10 @@ struct ip_set_ext { u64 packets; u64 bytes; u32 timeout; + u32 skbmark; + u32 skbmarkmask; + u32 skbprio; + u16 skbqueue; char *comment; }; @@ -104,6 +112,13 @@ struct ip_set_comment { char *str; }; +struct ip_set_skbinfo { + u32 skbmark; + u32 skbmarkmask; + u32 skbprio; + u16 skbqueue; +}; + struct ip_set; #define ext_timeout(e, s) \ @@ -112,7 +127,8 @@ struct ip_set; (struct ip_set_counter *)(((void *)(e)) + (s)->offset[IPSET_EXT_ID_COUNTER]) #define ext_comment(e, s) \ (struct ip_set_comment *)(((void *)(e)) + (s)->offset[IPSET_EXT_ID_COMMENT]) - +#define ext_skbinfo(e, s) \ +(struct ip_set_skbinfo *)(((void *)(e)) + (s)->offset[IPSET_EXT_ID_SKBINFO]) typedef int (*ipset_adtfn)(struct ip_set *set, void *value, const struct ip_set_ext *ext, @@ -256,6 +272,8 @@ ip_set_put_flags(struct sk_buff *skb, struct ip_set *set) cadt_flags |= IPSET_FLAG_WITH_COUNTERS; if (SET_WITH_COMMENT(set)) cadt_flags |= IPSET_FLAG_WITH_COMMENT; + if (SET_WITH_SKBINFO(set)) + cadt_flags |= IPSET_FLAG_WITH_SKBINFO; if (SET_WITH_FORCEADD(set)) cadt_flags |= IPSET_FLAG_WITH_FORCEADD; @@ -304,6 +322,39 @@ ip_set_update_counter(struct ip_set_counter *counter, } } +static inline void +ip_set_get_skbinfo(struct ip_set_skbinfo *skbinfo, + const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags) +{ + mext->skbmark = skbinfo->skbmark; + mext->skbmarkmask = skbinfo->skbmarkmask; + mext->skbprio = skbinfo->skbprio; + mext->skbqueue = skbinfo->skbqueue; +} +static inline bool +ip_set_put_skbinfo(struct sk_buff *skb, struct ip_set_skbinfo *skbinfo) +{ + return nla_put_net64(skb, IPSET_ATTR_SKBMARK, + cpu_to_be64((u64)skbinfo->skbmark << 32 | + skbinfo->skbmarkmask)) || + nla_put_net32(skb, IPSET_ATTR_SKBPRIO, + cpu_to_be32(skbinfo->skbprio)) || + nla_put_net16(skb, IPSET_ATTR_SKBQUEUE, + cpu_to_be16(skbinfo->skbqueue)); + +} + +static inline void +ip_set_init_skbinfo(struct ip_set_skbinfo *skbinfo, + const struct ip_set_ext *ext) +{ + skbinfo->skbmark = ext->skbmark; + skbinfo->skbmarkmask = ext->skbmarkmask; + skbinfo->skbprio = ext->skbprio; + skbinfo->skbqueue = ext->skbqueue; +} + static inline bool ip_set_put_counter(struct sk_buff *skb, struct ip_set_counter *counter) { @@ -497,6 +548,9 @@ ip_set_put_extensions(struct sk_buff *skb, const struct ip_set *set, if (SET_WITH_COMMENT(set) && ip_set_put_comment(skb, ext_comment(e, set))) return -EMSGSIZE; + if (SET_WITH_SKBINFO(set) && + ip_set_put_skbinfo(skb, ext_skbinfo(e, set))) + return -EMSGSIZE; return 0; } diff --git a/include/uapi/linux/netfilter/ipset/ip_set.h b/include/uapi/linux/netfilter/ipset/ip_set.h index 78c2f2e79920..ca03119111a2 100644 --- a/include/uapi/linux/netfilter/ipset/ip_set.h +++ b/include/uapi/linux/netfilter/ipset/ip_set.h @@ -115,6 +115,9 @@ enum { IPSET_ATTR_BYTES, IPSET_ATTR_PACKETS, IPSET_ATTR_COMMENT, + IPSET_ATTR_SKBMARK, + IPSET_ATTR_SKBPRIO, + IPSET_ATTR_SKBQUEUE, __IPSET_ATTR_ADT_MAX, }; #define IPSET_ATTR_ADT_MAX (__IPSET_ATTR_ADT_MAX - 1) @@ -147,6 +150,7 @@ enum ipset_errno { IPSET_ERR_COUNTER, IPSET_ERR_COMMENT, IPSET_ERR_INVALID_MARKMASK, + IPSET_ERR_SKBINFO, /* Type specific error codes */ IPSET_ERR_TYPE_SPECIFIC = 4352, @@ -170,6 +174,12 @@ enum ipset_cmd_flags { IPSET_FLAG_MATCH_COUNTERS = (1 << IPSET_FLAG_BIT_MATCH_COUNTERS), IPSET_FLAG_BIT_RETURN_NOMATCH = 7, IPSET_FLAG_RETURN_NOMATCH = (1 << IPSET_FLAG_BIT_RETURN_NOMATCH), + IPSET_FLAG_BIT_MAP_SKBMARK = 8, + IPSET_FLAG_MAP_SKBMARK = (1 << IPSET_FLAG_BIT_MAP_SKBMARK), + IPSET_FLAG_BIT_MAP_SKBPRIO = 9, + IPSET_FLAG_MAP_SKBPRIO = (1 << IPSET_FLAG_BIT_MAP_SKBPRIO), + IPSET_FLAG_BIT_MAP_SKBQUEUE = 10, + IPSET_FLAG_MAP_SKBQUEUE = (1 << IPSET_FLAG_BIT_MAP_SKBQUEUE), IPSET_FLAG_CMD_MAX = 15, }; @@ -187,6 +197,8 @@ enum ipset_cadt_flags { IPSET_FLAG_WITH_COMMENT = (1 << IPSET_FLAG_BIT_WITH_COMMENT), IPSET_FLAG_BIT_WITH_FORCEADD = 5, IPSET_FLAG_WITH_FORCEADD = (1 << IPSET_FLAG_BIT_WITH_FORCEADD), + IPSET_FLAG_BIT_WITH_SKBINFO = 6, + IPSET_FLAG_WITH_SKBINFO = (1 << IPSET_FLAG_BIT_WITH_SKBINFO), IPSET_FLAG_CADT_MAX = 15, }; diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 4ca4e5ca6f57..26c795e6b57f 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -337,6 +337,12 @@ const struct ip_set_ext_type ip_set_extensions[] = { .len = sizeof(unsigned long), .align = __alignof__(unsigned long), }, + [IPSET_EXT_ID_SKBINFO] = { + .type = IPSET_EXT_SKBINFO, + .flag = IPSET_FLAG_WITH_SKBINFO, + .len = sizeof(struct ip_set_skbinfo), + .align = __alignof__(struct ip_set_skbinfo), + }, [IPSET_EXT_ID_COMMENT] = { .type = IPSET_EXT_COMMENT | IPSET_EXT_DESTROY, .flag = IPSET_FLAG_WITH_COMMENT, @@ -382,6 +388,7 @@ int ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[], struct ip_set_ext *ext) { + u64 fullmark; if (tb[IPSET_ATTR_TIMEOUT]) { if (!(set->extensions & IPSET_EXT_TIMEOUT)) return -IPSET_ERR_TIMEOUT; @@ -402,7 +409,25 @@ ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[], return -IPSET_ERR_COMMENT; ext->comment = ip_set_comment_uget(tb[IPSET_ATTR_COMMENT]); } - + if (tb[IPSET_ATTR_SKBMARK]) { + if (!(set->extensions & IPSET_EXT_SKBINFO)) + return -IPSET_ERR_SKBINFO; + fullmark = be64_to_cpu(nla_get_be64(tb[IPSET_ATTR_SKBMARK])); + ext->skbmark = fullmark >> 32; + ext->skbmarkmask = fullmark & 0xffffffff; + } + if (tb[IPSET_ATTR_SKBPRIO]) { + if (!(set->extensions & IPSET_EXT_SKBINFO)) + return -IPSET_ERR_SKBINFO; + ext->skbprio = be32_to_cpu(nla_get_be32( + tb[IPSET_ATTR_SKBPRIO])); + } + if (tb[IPSET_ATTR_SKBQUEUE]) { + if (!(set->extensions & IPSET_EXT_SKBINFO)) + return -IPSET_ERR_SKBINFO; + ext->skbqueue = be16_to_cpu(nla_get_be16( + tb[IPSET_ATTR_SKBQUEUE])); + } return 0; } EXPORT_SYMBOL_GPL(ip_set_get_extensions); From 39d1ecf1ad0e19145e1f3a6cd838b7354ef71bf7 Mon Sep 17 00:00:00 2001 From: Anton Danilov Date: Thu, 28 Aug 2014 10:11:28 +0400 Subject: [PATCH 07/34] netfilter: ipset: Add skbinfo extension kernel support for the bitmap set types. Add skbinfo extension kernel support for the bitmap set types. Inroduce the new revisions of bitmap_ip, bitmap_ipmac and bitmap_port set types. Signed-off-by: Anton Danilov Signed-off-by: Jozsef Kadlecsik --- net/netfilter/ipset/ip_set_bitmap_gen.h | 4 ++++ net/netfilter/ipset/ip_set_bitmap_ip.c | 11 +++++++++-- net/netfilter/ipset/ip_set_bitmap_ipmac.c | 11 +++++++++-- net/netfilter/ipset/ip_set_bitmap_port.c | 11 +++++++++-- 4 files changed, 31 insertions(+), 6 deletions(-) diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h index f2c7d83dc23f..6f024a8a1534 100644 --- a/net/netfilter/ipset/ip_set_bitmap_gen.h +++ b/net/netfilter/ipset/ip_set_bitmap_gen.h @@ -128,6 +128,8 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext, return 0; if (SET_WITH_COUNTER(set)) ip_set_update_counter(ext_counter(x, set), ext, mext, flags); + if (SET_WITH_SKBINFO(set)) + ip_set_get_skbinfo(ext_skbinfo(x, set), ext, mext, flags); return 1; } @@ -161,6 +163,8 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, ip_set_init_counter(ext_counter(x, set), ext); if (SET_WITH_COMMENT(set)) ip_set_init_comment(ext_comment(x, set), ext); + if (SET_WITH_SKBINFO(set)) + ip_set_init_skbinfo(ext_skbinfo(x, set), ext); return 0; } diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c index dafdb39ef042..55b083ec587a 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -27,7 +27,8 @@ #define IPSET_TYPE_REV_MIN 0 /* 1 Counter support added */ -#define IPSET_TYPE_REV_MAX 2 /* Comment support added */ +/* 2 Comment support added */ +#define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik "); @@ -139,7 +140,10 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[], if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -357,6 +361,9 @@ static struct ip_set_type bitmap_ip_type __read_mostly = { [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, + [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, + [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c index dbad505e79e3..86104744b00f 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c +++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -27,7 +27,8 @@ #define IPSET_TYPE_REV_MIN 0 /* 1 Counter support added */ -#define IPSET_TYPE_REV_MAX 2 /* Comment support added */ +/* 2 Comment support added */ +#define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik "); @@ -240,7 +241,10 @@ bitmap_ipmac_uadt(struct ip_set *set, struct nlattr *tb[], if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -394,6 +398,9 @@ static struct ip_set_type bitmap_ipmac_type = { [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, + [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, + [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c index a4b65ae1986c..005dd36444c3 100644 --- a/net/netfilter/ipset/ip_set_bitmap_port.c +++ b/net/netfilter/ipset/ip_set_bitmap_port.c @@ -22,7 +22,8 @@ #define IPSET_TYPE_REV_MIN 0 /* 1 Counter support added */ -#define IPSET_TYPE_REV_MAX 2 /* Comment support added */ +/* 2 Comment support added */ +#define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik "); @@ -139,7 +140,10 @@ bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -291,6 +295,9 @@ static struct ip_set_type bitmap_port_type = { [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, + [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, + [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; From af331419d34e2fc0e2d0c629734f8d160f95a3ec Mon Sep 17 00:00:00 2001 From: Anton Danilov Date: Thu, 28 Aug 2014 10:11:29 +0400 Subject: [PATCH 08/34] netfilter: ipset: Add skbinfo extension kernel support for the hash set types. Add skbinfo extension kernel support for the hash set types. Inroduce the new revisions of all hash set types. Signed-off-by: Anton Danilov Signed-off-by: Jozsef Kadlecsik --- net/netfilter/ipset/ip_set_hash_gen.h | 5 +++++ net/netfilter/ipset/ip_set_hash_ip.c | 14 ++++++++++++-- net/netfilter/ipset/ip_set_hash_ipmark.c | 14 ++++++++++++-- net/netfilter/ipset/ip_set_hash_ipport.c | 14 ++++++++++++-- net/netfilter/ipset/ip_set_hash_ipportip.c | 14 ++++++++++++-- net/netfilter/ipset/ip_set_hash_ipportnet.c | 14 ++++++++++++-- net/netfilter/ipset/ip_set_hash_net.c | 16 +++++++++++++--- net/netfilter/ipset/ip_set_hash_netiface.c | 16 +++++++++++++--- net/netfilter/ipset/ip_set_hash_netnet.c | 16 +++++++++++++--- net/netfilter/ipset/ip_set_hash_netport.c | 16 +++++++++++++--- net/netfilter/ipset/ip_set_hash_netportnet.c | 16 +++++++++++++--- 11 files changed, 130 insertions(+), 25 deletions(-) diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 8a38890cbe5e..ac3a268287d9 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -720,6 +720,8 @@ reuse_slot: ip_set_init_counter(ext_counter(data, set), ext); if (SET_WITH_COMMENT(set)) ip_set_init_comment(ext_comment(data, set), ext); + if (SET_WITH_SKBINFO(set)) + ip_set_init_skbinfo(ext_skbinfo(data, set), ext); out: rcu_read_unlock_bh(); @@ -797,6 +799,9 @@ mtype_data_match(struct mtype_elem *data, const struct ip_set_ext *ext, if (SET_WITH_COUNTER(set)) ip_set_update_counter(ext_counter(data, set), ext, mext, flags); + if (SET_WITH_SKBINFO(set)) + ip_set_get_skbinfo(ext_skbinfo(data, set), + ext, mext, flags); return mtype_do_data_match(data); } diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index e52739938533..76959d79e9d1 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -26,7 +26,8 @@ #define IPSET_TYPE_REV_MIN 0 /* 1 Counters support */ /* 2 Comments support */ -#define IPSET_TYPE_REV_MAX 3 /* Forceadd support */ +/* 3 Forceadd support */ +#define IPSET_TYPE_REV_MAX 4 /* skbinfo support */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik "); @@ -111,7 +112,10 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -247,6 +251,9 @@ hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) || tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) return -IPSET_ERR_PROTOCOL; @@ -295,6 +302,9 @@ static struct ip_set_type hash_ip_type __read_mostly = { [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, + [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, + [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c index 4eff0a297254..7abf9788cfa8 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmark.c +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -25,7 +25,8 @@ #include #define IPSET_TYPE_REV_MIN 0 -#define IPSET_TYPE_REV_MAX 1 /* Forceadd support */ +/* 1 Forceadd support */ +#define IPSET_TYPE_REV_MAX 2 /* skbinfo support */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Vytas Dauksa "); @@ -113,7 +114,10 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_attr_netorder(tb, IPSET_ATTR_MARK) || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -244,6 +248,9 @@ hash_ipmark6_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) || tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) return -IPSET_ERR_PROTOCOL; @@ -301,6 +308,9 @@ static struct ip_set_type hash_ipmark_type __read_mostly = { [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, + [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, + [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index f37a5ae8a5e0..dcbcceb9a52f 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -28,7 +28,8 @@ /* 1 SCTP and UDPLITE support added */ /* 2 Counters support added */ /* 3 Comments support added */ -#define IPSET_TYPE_REV_MAX 4 /* Forceadd support added */ +/* 4 Forceadd support added */ +#define IPSET_TYPE_REV_MAX 5 /* skbinfo support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik "); @@ -122,7 +123,10 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -287,6 +291,9 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) || tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) return -IPSET_ERR_PROTOCOL; @@ -370,6 +377,9 @@ static struct ip_set_type hash_ipport_type __read_mostly = { [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, + [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, + [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index 41ef00eda874..7ef93fc887a1 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -28,7 +28,8 @@ /* 1 SCTP and UDPLITE support added */ /* 2 Counters support added */ /* 3 Comments support added */ -#define IPSET_TYPE_REV_MAX 4 /* Forceadd support added */ +/* 4 Forceadd support added */ +#define IPSET_TYPE_REV_MAX 5 /* skbinfo support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik "); @@ -124,7 +125,10 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -295,6 +299,9 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) || tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) return -IPSET_ERR_PROTOCOL; @@ -382,6 +389,9 @@ static struct ip_set_type hash_ipportip_type __read_mostly = { [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, + [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, + [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index 7308d84f9277..b6012ad92781 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -30,7 +30,8 @@ /* 3 nomatch flag support added */ /* 4 Counters support added */ /* 5 Comments support added */ -#define IPSET_TYPE_REV_MAX 6 /* Forceadd support added */ +/* 6 Forceadd support added */ +#define IPSET_TYPE_REV_MAX 7 /* skbinfo support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik "); @@ -179,7 +180,10 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -432,6 +436,9 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) || tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) return -IPSET_ERR_PROTOCOL; @@ -541,6 +548,9 @@ static struct ip_set_type hash_ipportnet_type __read_mostly = { [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, + [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, + [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index 4c7d495783a3..6b3ac10ac2f1 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -27,7 +27,8 @@ /* 2 nomatch flag support added */ /* 3 Counters support added */ /* 4 Comments support added */ -#define IPSET_TYPE_REV_MAX 5 /* Forceadd support added */ +/* 5 Forceadd support added */ +#define IPSET_TYPE_REV_MAX 6 /* skbinfo mapping support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik "); @@ -150,7 +151,10 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -318,7 +322,10 @@ hash_net6_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; @@ -377,6 +384,9 @@ static struct ip_set_type hash_net_type __read_mostly = { [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, + [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, + [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index db2606805b35..03cdb69ac9bf 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -28,7 +28,8 @@ /* 2 /0 support added */ /* 3 Counters support added */ /* 4 Comments support added */ -#define IPSET_TYPE_REV_MAX 5 /* Forceadd support added */ +/* 5 Forceadd support added */ +#define IPSET_TYPE_REV_MAX 6 /* skbinfo support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik "); @@ -281,7 +282,10 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -514,7 +518,10 @@ hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; @@ -590,6 +597,9 @@ static struct ip_set_type hash_netiface_type __read_mostly = { [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, + [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, + [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index 96b131366e7b..da00284b3571 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -24,7 +24,8 @@ #include #define IPSET_TYPE_REV_MIN 0 -#define IPSET_TYPE_REV_MAX 1 /* Forceadd support added */ +/* 1 Forceadd support added */ +#define IPSET_TYPE_REV_MAX 2 /* skbinfo support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Oliver Smith "); @@ -171,7 +172,10 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -394,7 +398,10 @@ hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_IP2_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; @@ -462,6 +469,9 @@ static struct ip_set_type hash_netnet_type __read_mostly = { [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, + [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, + [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index 1c645fbd09c7..c0ddb58d19dc 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -29,7 +29,8 @@ /* 3 nomatch flag support added */ /* 4 Counters support added */ /* 5 Comments support added */ -#define IPSET_TYPE_REV_MAX 6 /* Forceadd support added */ +/* 6 Forceadd support added */ +#define IPSET_TYPE_REV_MAX 7 /* skbinfo support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik "); @@ -172,7 +173,10 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -389,7 +393,10 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; @@ -489,6 +496,9 @@ static struct ip_set_type hash_netport_type __read_mostly = { [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, + [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, + [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index 2f0034347189..b8053d675fc3 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -26,7 +26,8 @@ #define IPSET_TYPE_REV_MIN 0 /* 0 Comments support added */ -#define IPSET_TYPE_REV_MAX 1 /* Forceadd support added */ +/* 1 Forceadd support added */ +#define IPSET_TYPE_REV_MAX 2 /* skbinfo support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Oliver Smith "); @@ -189,7 +190,10 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -460,7 +464,10 @@ hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_IP2_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; @@ -569,6 +576,9 @@ static struct ip_set_type hash_netportnet_type __read_mostly = { [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, + [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, + [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; From cbee93d7b71bf9d73382e503a4f60848eec60ea8 Mon Sep 17 00:00:00 2001 From: Anton Danilov Date: Thu, 28 Aug 2014 10:11:30 +0400 Subject: [PATCH 09/34] netfilter: ipset: Add skbinfo extension kernel support for the list set type. Add skbinfo extension kernel support for the list set type. Introduce the new revision of the list set type. Signed-off-by: Anton Danilov Signed-off-by: Jozsef Kadlecsik --- net/netfilter/ipset/ip_set_list_set.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index f87adbad6076..f8f682806e36 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -17,7 +17,8 @@ #define IPSET_TYPE_REV_MIN 0 /* 1 Counters support added */ -#define IPSET_TYPE_REV_MAX 2 /* Comments support added */ +/* 2 Comments support added */ +#define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik "); @@ -73,6 +74,10 @@ list_set_ktest(struct ip_set *set, const struct sk_buff *skb, ip_set_update_counter(ext_counter(e, set), ext, &opt->ext, cmdflags); + if (SET_WITH_SKBINFO(set)) + ip_set_get_skbinfo(ext_skbinfo(e, set), + ext, &opt->ext, + cmdflags); return ret; } } @@ -197,6 +202,8 @@ list_set_add(struct ip_set *set, u32 i, struct set_adt_elem *d, ip_set_init_counter(ext_counter(e, set), ext); if (SET_WITH_COMMENT(set)) ip_set_init_comment(ext_comment(e, set), ext); + if (SET_WITH_SKBINFO(set)) + ip_set_init_skbinfo(ext_skbinfo(e, set), ext); return 0; } @@ -307,6 +314,8 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext, ip_set_init_counter(ext_counter(e, set), ext); if (SET_WITH_COMMENT(set)) ip_set_init_comment(ext_comment(e, set), ext); + if (SET_WITH_SKBINFO(set)) + ip_set_init_skbinfo(ext_skbinfo(e, set), ext); /* Set is already added to the list */ ip_set_put_byindex(map->net, d->id); return 0; @@ -378,7 +387,10 @@ list_set_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -667,6 +679,9 @@ static struct ip_set_type list_set_type __read_mostly = { [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, + [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, + [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; From 76cea4109ca89dea218fdc652d2e1535fd9b5fc7 Mon Sep 17 00:00:00 2001 From: Anton Danilov Date: Tue, 2 Sep 2014 14:21:20 +0400 Subject: [PATCH 10/34] netfilter: ipset: Add skbinfo extension support to SET target. Signed-off-by: Anton Danilov Signed-off-by: Jozsef Kadlecsik --- include/uapi/linux/netfilter/xt_set.h | 10 ++ net/netfilter/xt_set.c | 155 ++++++++++++++++++++++++++ 2 files changed, 165 insertions(+) diff --git a/include/uapi/linux/netfilter/xt_set.h b/include/uapi/linux/netfilter/xt_set.h index 964d3d42f874..d6a1df1f2947 100644 --- a/include/uapi/linux/netfilter/xt_set.h +++ b/include/uapi/linux/netfilter/xt_set.h @@ -71,4 +71,14 @@ struct xt_set_info_match_v3 { __u32 flags; }; +/* Revision 3 target */ + +struct xt_set_info_target_v3 { + struct xt_set_info add_set; + struct xt_set_info del_set; + struct xt_set_info map_set; + __u32 flags; + __u32 timeout; +}; + #endif /*_XT_SET_H*/ diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c index cb70f6ec5695..5732cd64acc0 100644 --- a/net/netfilter/xt_set.c +++ b/net/netfilter/xt_set.c @@ -366,6 +366,140 @@ set_target_v2(struct sk_buff *skb, const struct xt_action_param *par) #define set_target_v2_checkentry set_target_v1_checkentry #define set_target_v2_destroy set_target_v1_destroy +/* Revision 3 target */ + +static unsigned int +set_target_v3(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_set_info_target_v3 *info = par->targinfo; + ADT_OPT(add_opt, par->family, info->add_set.dim, + info->add_set.flags, info->flags, info->timeout); + ADT_OPT(del_opt, par->family, info->del_set.dim, + info->del_set.flags, 0, UINT_MAX); + ADT_OPT(map_opt, par->family, info->map_set.dim, + info->map_set.flags, 0, UINT_MAX); + + int ret; + + /* Normalize to fit into jiffies */ + if (add_opt.ext.timeout != IPSET_NO_TIMEOUT && + add_opt.ext.timeout > UINT_MAX/MSEC_PER_SEC) + add_opt.ext.timeout = UINT_MAX/MSEC_PER_SEC; + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_add(info->add_set.index, skb, par, &add_opt); + if (info->del_set.index != IPSET_INVALID_ID) + ip_set_del(info->del_set.index, skb, par, &del_opt); + if (info->map_set.index != IPSET_INVALID_ID) { + map_opt.cmdflags |= info->flags & (IPSET_FLAG_MAP_SKBMARK | + IPSET_FLAG_MAP_SKBPRIO | + IPSET_FLAG_MAP_SKBQUEUE); + ret = match_set(info->map_set.index, skb, par, &map_opt, + info->map_set.flags & IPSET_INV_MATCH); + if (!ret) + return XT_CONTINUE; + if (map_opt.cmdflags & IPSET_FLAG_MAP_SKBMARK) + skb->mark = (skb->mark & ~(map_opt.ext.skbmarkmask)) + ^ (map_opt.ext.skbmark); + if (map_opt.cmdflags & IPSET_FLAG_MAP_SKBPRIO) + skb->priority = map_opt.ext.skbprio; + if ((map_opt.cmdflags & IPSET_FLAG_MAP_SKBQUEUE) && + skb->dev && + skb->dev->real_num_tx_queues > map_opt.ext.skbqueue) + skb_set_queue_mapping(skb, map_opt.ext.skbqueue); + } + return XT_CONTINUE; +} + + +static int +set_target_v3_checkentry(const struct xt_tgchk_param *par) +{ + const struct xt_set_info_target_v3 *info = par->targinfo; + ip_set_id_t index; + + if (info->add_set.index != IPSET_INVALID_ID) { + index = ip_set_nfnl_get_byindex(par->net, + info->add_set.index); + if (index == IPSET_INVALID_ID) { + pr_warn("Cannot find add_set index %u as target\n", + info->add_set.index); + return -ENOENT; + } + } + + if (info->del_set.index != IPSET_INVALID_ID) { + index = ip_set_nfnl_get_byindex(par->net, + info->del_set.index); + if (index == IPSET_INVALID_ID) { + pr_warn("Cannot find del_set index %u as target\n", + info->del_set.index); + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, + info->add_set.index); + return -ENOENT; + } + } + + if (info->map_set.index != IPSET_INVALID_ID) { + if (strncmp(par->table, "mangle", 7)) { + pr_warn("--map-set only usable from mangle table\n"); + return -EINVAL; + } + if (((info->flags & IPSET_FLAG_MAP_SKBPRIO) | + (info->flags & IPSET_FLAG_MAP_SKBQUEUE)) && + !(par->hook_mask & (1 << NF_INET_FORWARD | + 1 << NF_INET_LOCAL_OUT | + 1 << NF_INET_POST_ROUTING))) { + pr_warn("mapping of prio or/and queue is allowed only" + "from OUTPUT/FORWARD/POSTROUTING chains\n"); + return -EINVAL; + } + index = ip_set_nfnl_get_byindex(par->net, + info->map_set.index); + if (index == IPSET_INVALID_ID) { + pr_warn("Cannot find map_set index %u as target\n", + info->map_set.index); + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, + info->add_set.index); + if (info->del_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, + info->del_set.index); + return -ENOENT; + } + } + + if (info->add_set.dim > IPSET_DIM_MAX || + info->del_set.dim > IPSET_DIM_MAX || + info->map_set.dim > IPSET_DIM_MAX) { + pr_warn("Protocol error: SET target dimension " + "is over the limit!\n"); + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->add_set.index); + if (info->del_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->del_set.index); + if (info->map_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->map_set.index); + return -ERANGE; + } + + return 0; +} + +static void +set_target_v3_destroy(const struct xt_tgdtor_param *par) +{ + const struct xt_set_info_target_v3 *info = par->targinfo; + + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->add_set.index); + if (info->del_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->del_set.index); + if (info->map_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->map_set.index); +} + + static struct xt_match set_matches[] __read_mostly = { { .name = "set", @@ -493,6 +627,27 @@ static struct xt_target set_targets[] __read_mostly = { .destroy = set_target_v2_destroy, .me = THIS_MODULE }, + /* --map-set support */ + { + .name = "SET", + .revision = 3, + .family = NFPROTO_IPV4, + .target = set_target_v3, + .targetsize = sizeof(struct xt_set_info_target_v3), + .checkentry = set_target_v3_checkentry, + .destroy = set_target_v3_destroy, + .me = THIS_MODULE + }, + { + .name = "SET", + .revision = 3, + .family = NFPROTO_IPV6, + .target = set_target_v3, + .targetsize = sizeof(struct xt_set_info_target_v3), + .checkentry = set_target_v3_checkentry, + .destroy = set_target_v3_destroy, + .me = THIS_MODULE + }, }; static int __init xt_set_init(void) From aef96193fe7b2791c4a3b19fe75426b929769471 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Mon, 15 Sep 2014 17:30:54 +0200 Subject: [PATCH 11/34] netfilter: ipset: send nonzero skbinfo extensions only Do not send zero valued skbinfo extensions to userspace at listing. Signed-off-by: Jozsef Kadlecsik --- include/linux/netfilter/ipset/ip_set.h | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index b97aac5142ed..f1606fa6132d 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -335,13 +335,17 @@ ip_set_get_skbinfo(struct ip_set_skbinfo *skbinfo, static inline bool ip_set_put_skbinfo(struct sk_buff *skb, struct ip_set_skbinfo *skbinfo) { - return nla_put_net64(skb, IPSET_ATTR_SKBMARK, - cpu_to_be64((u64)skbinfo->skbmark << 32 | - skbinfo->skbmarkmask)) || - nla_put_net32(skb, IPSET_ATTR_SKBPRIO, - cpu_to_be32(skbinfo->skbprio)) || - nla_put_net16(skb, IPSET_ATTR_SKBQUEUE, - cpu_to_be16(skbinfo->skbqueue)); + /* Send nonzero parameters only */ + return ((skbinfo->skbmark || skbinfo->skbmarkmask) && + nla_put_net64(skb, IPSET_ATTR_SKBMARK, + cpu_to_be64((u64)skbinfo->skbmark << 32 | + skbinfo->skbmarkmask))) || + (skbinfo->skbprio && + nla_put_net32(skb, IPSET_ATTR_SKBPRIO, + cpu_to_be32(skbinfo->skbprio))) || + (skbinfo->skbqueue && + nla_put_net16(skb, IPSET_ATTR_SKBQUEUE, + cpu_to_be16(skbinfo->skbqueue))); } From 07034aeae152de52c29f032ca995bf9dafbe24e2 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Mon, 15 Sep 2014 17:36:06 +0200 Subject: [PATCH 12/34] netfilter: ipset: hash:mac type added to ipset Signed-off-by: Jozsef Kadlecsik --- net/netfilter/ipset/Kconfig | 9 ++ net/netfilter/ipset/Makefile | 1 + net/netfilter/ipset/ip_set_hash_gen.h | 11 +- net/netfilter/ipset/ip_set_hash_mac.c | 173 ++++++++++++++++++++++++++ 4 files changed, 193 insertions(+), 1 deletion(-) create mode 100644 net/netfilter/ipset/ip_set_hash_mac.c diff --git a/net/netfilter/ipset/Kconfig b/net/netfilter/ipset/Kconfig index 2f7f5c32c6f9..234a8ec82076 100644 --- a/net/netfilter/ipset/Kconfig +++ b/net/netfilter/ipset/Kconfig @@ -99,6 +99,15 @@ config IP_SET_HASH_IPPORTNET To compile it as a module, choose M here. If unsure, say N. +config IP_SET_HASH_MAC + tristate "hash:mac set support" + depends on IP_SET + help + This option adds the hash:mac set type support, by which + one can store MAC (ethernet address) elements in a set. + + To compile it as a module, choose M here. If unsure, say N. + config IP_SET_HASH_NETPORTNET tristate "hash:net,port,net set support" depends on IP_SET diff --git a/net/netfilter/ipset/Makefile b/net/netfilter/ipset/Makefile index 231f10196cb9..3dbd5e958489 100644 --- a/net/netfilter/ipset/Makefile +++ b/net/netfilter/ipset/Makefile @@ -18,6 +18,7 @@ obj-$(CONFIG_IP_SET_HASH_IPMARK) += ip_set_hash_ipmark.o obj-$(CONFIG_IP_SET_HASH_IPPORT) += ip_set_hash_ipport.o obj-$(CONFIG_IP_SET_HASH_IPPORTIP) += ip_set_hash_ipportip.o obj-$(CONFIG_IP_SET_HASH_IPPORTNET) += ip_set_hash_ipportnet.o +obj-$(CONFIG_IP_SET_HASH_MAC) += ip_set_hash_mac.o obj-$(CONFIG_IP_SET_HASH_NET) += ip_set_hash_net.o obj-$(CONFIG_IP_SET_HASH_NETPORT) += ip_set_hash_netport.o obj-$(CONFIG_IP_SET_HASH_NETIFACE) += ip_set_hash_netiface.o diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index ac3a268287d9..fee7c64e4dd1 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -1054,8 +1054,10 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, struct HTYPE *h; struct htable *t; +#ifndef IP_SET_PROTO_UNDEF if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6)) return -IPSET_ERR_INVALID_FAMILY; +#endif #ifdef IP_SET_HASH_WITH_MARKMASK markmask = 0xffffffff; @@ -1137,25 +1139,32 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, rcu_assign_pointer(h->table, t); set->data = h; +#ifndef IP_SET_PROTO_UNDEF if (set->family == NFPROTO_IPV4) { +#endif set->variant = &IPSET_TOKEN(HTYPE, 4_variant); set->dsize = ip_set_elem_len(set, tb, sizeof(struct IPSET_TOKEN(HTYPE, 4_elem))); +#ifndef IP_SET_PROTO_UNDEF } else { set->variant = &IPSET_TOKEN(HTYPE, 6_variant); set->dsize = ip_set_elem_len(set, tb, sizeof(struct IPSET_TOKEN(HTYPE, 6_elem))); } +#endif if (tb[IPSET_ATTR_TIMEOUT]) { set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); +#ifndef IP_SET_PROTO_UNDEF if (set->family == NFPROTO_IPV4) +#endif IPSET_TOKEN(HTYPE, 4_gc_init)(set, IPSET_TOKEN(HTYPE, 4_gc)); +#ifndef IP_SET_PROTO_UNDEF else IPSET_TOKEN(HTYPE, 6_gc_init)(set, IPSET_TOKEN(HTYPE, 6_gc)); +#endif } - pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n", set->name, jhash_size(t->htable_bits), t->htable_bits, h->maxelem, set->data, t); diff --git a/net/netfilter/ipset/ip_set_hash_mac.c b/net/netfilter/ipset/ip_set_hash_mac.c new file mode 100644 index 000000000000..65690b52a4d5 --- /dev/null +++ b/net/netfilter/ipset/ip_set_hash_mac.c @@ -0,0 +1,173 @@ +/* Copyright (C) 2014 Jozsef Kadlecsik + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* Kernel module implementing an IP set type: the hash:mac type */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#define IPSET_TYPE_REV_MIN 0 +#define IPSET_TYPE_REV_MAX 0 + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik "); +IP_SET_MODULE_DESC("hash:mac", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); +MODULE_ALIAS("ip_set_hash:mac"); + +/* Type specific function prefix */ +#define HTYPE hash_mac + +/* Member elements */ +struct hash_mac4_elem { + /* Zero valued IP addresses cannot be stored */ + union { + unsigned char ether[ETH_ALEN]; + __be32 foo[2]; + }; +}; + +/* Common functions */ + +static inline bool +hash_mac4_data_equal(const struct hash_mac4_elem *e1, + const struct hash_mac4_elem *e2, + u32 *multi) +{ + return ether_addr_equal(e1->ether, e2->ether); +} + +static inline bool +hash_mac4_data_list(struct sk_buff *skb, const struct hash_mac4_elem *e) +{ + return nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, e->ether); +} + +static inline void +hash_mac4_data_next(struct hash_mac4_elem *next, + const struct hash_mac4_elem *e) +{ +} + +#define MTYPE hash_mac4 +#define PF 4 +#define HOST_MASK 32 +#define IP_SET_EMIT_CREATE +#define IP_SET_PROTO_UNDEF +#include "ip_set_hash_gen.h" + +/* Zero valued element is not supported */ +static const unsigned char invalid_ether[ETH_ALEN] = { 0 }; + +static int +hash_mac4_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_mac4_elem e = { { .foo[0] = 0, .foo[1] = 0 } }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + + /* MAC can be src only */ + if (!(opt->flags & IPSET_DIM_ONE_SRC)) + return 0; + + if (skb_mac_header(skb) < skb->head || + (skb_mac_header(skb) + ETH_HLEN) > skb->data) + return -EINVAL; + + memcpy(e.ether, eth_hdr(skb)->h_source, ETH_ALEN); + if (memcmp(e.ether, invalid_ether, ETH_ALEN) == 0) + return -EINVAL; + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_mac4_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_mac4_elem e = { { .foo[0] = 0, .foo[1] = 0 } }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + int ret; + + if (unlikely(!tb[IPSET_ATTR_ETHER] || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + memcpy(e.ether, nla_data(tb[IPSET_ATTR_ETHER]), ETH_ALEN); + if (memcmp(e.ether, invalid_ether, ETH_ALEN) == 0) + return -IPSET_ERR_HASH_ELEM; + + return adtfn(set, &e, &ext, &ext, flags); +} + +static struct ip_set_type hash_mac_type __read_mostly = { + .name = "hash:mac", + .protocol = IPSET_PROTOCOL, + .features = IPSET_TYPE_MAC, + .dimension = IPSET_DIM_ONE, + .family = NFPROTO_UNSPEC, + .revision_min = IPSET_TYPE_REV_MIN, + .revision_max = IPSET_TYPE_REV_MAX, + .create = hash_mac_create, + .create_policy = { + [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, + [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, + [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + }, + .adt_policy = { + [IPSET_ATTR_ETHER] = { .type = NLA_BINARY, + .len = ETH_ALEN }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, + [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, + [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, + }, + .me = THIS_MODULE, +}; + +static int __init +hash_mac_init(void) +{ + return ip_set_type_register(&hash_mac_type); +} + +static void __exit +hash_mac_fini(void) +{ + ip_set_type_unregister(&hash_mac_type); +} + +module_init(hash_mac_init); +module_exit(hash_mac_fini); From 616a9be25cb9516e546c0de55d61e1e46e54ade9 Mon Sep 17 00:00:00 2001 From: Kenny Mathis Date: Tue, 9 Sep 2014 09:20:15 -0400 Subject: [PATCH 13/34] ipvs: Add simple weighted failover scheduler Add simple weighted IPVS failover support to the Linux kernel. All other scheduling modules implement some form of load balancing, while this offers a simple failover solution. Connections are directed to the appropriate server based solely on highest weight value and server availability. Tested functionality with keepalived. Signed-off-by: Kenny Mathis Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/Kconfig | 10 +++++ net/netfilter/ipvs/Makefile | 1 + net/netfilter/ipvs/ip_vs_fo.c | 79 +++++++++++++++++++++++++++++++++++ 3 files changed, 90 insertions(+) create mode 100644 net/netfilter/ipvs/ip_vs_fo.c diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig index 0c3b1670b0d1..3b6929dec748 100644 --- a/net/netfilter/ipvs/Kconfig +++ b/net/netfilter/ipvs/Kconfig @@ -152,6 +152,16 @@ config IP_VS_WLC If you want to compile it in kernel, say Y. To compile it as a module, choose M here. If unsure, say N. +config IP_VS_FO + tristate "weighted failover scheduling" + ---help--- + The weighted failover scheduling algorithm directs network + connections to the server with the highest weight that is + currently available. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + config IP_VS_LBLC tristate "locality-based least-connection scheduling" ---help--- diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile index 34ee602ddb66..38b2723b2e3d 100644 --- a/net/netfilter/ipvs/Makefile +++ b/net/netfilter/ipvs/Makefile @@ -26,6 +26,7 @@ obj-$(CONFIG_IP_VS_RR) += ip_vs_rr.o obj-$(CONFIG_IP_VS_WRR) += ip_vs_wrr.o obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o +obj-$(CONFIG_IP_VS_FO) += ip_vs_fo.o obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o diff --git a/net/netfilter/ipvs/ip_vs_fo.c b/net/netfilter/ipvs/ip_vs_fo.c new file mode 100644 index 000000000000..6a2647d471ff --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_fo.c @@ -0,0 +1,79 @@ +/* + * IPVS: Weighted Fail Over module + * + * Authors: Kenny Mathis + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * Kenny Mathis : added initial functionality based on weight + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include + +#include + +/* Weighted Fail Over Module */ +static struct ip_vs_dest * +ip_vs_fo_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) +{ + struct ip_vs_dest *dest, *hweight = NULL; + int hw = 0; /* Track highest weight */ + + IP_VS_DBG(6, "ip_vs_fo_schedule(): Scheduling...\n"); + + /* Basic failover functionality + * Find virtual server with highest weight and send it traffic + */ + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { + if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && + atomic_read(&dest->weight) > hw) { + hweight = dest; + hw = atomic_read(&dest->weight); + } + } + + if (hweight) { + IP_VS_DBG_BUF(6, "FO: server %s:%u activeconns %d weight %d\n", + IP_VS_DBG_ADDR(svc->af, &hweight->addr), + ntohs(hweight->port), + atomic_read(&hweight->activeconns), + atomic_read(&hweight->weight)); + return hweight; + } + + ip_vs_scheduler_err(svc, "no destination available"); + return NULL; +} + +static struct ip_vs_scheduler ip_vs_fo_scheduler = { + .name = "fo", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_fo_scheduler.n_list), + .schedule = ip_vs_fo_schedule, +}; + +static int __init ip_vs_fo_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_fo_scheduler); +} + +static void __exit ip_vs_fo_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_fo_scheduler); + synchronize_rcu(); +} + +module_init(ip_vs_fo_init); +module_exit(ip_vs_fo_cleanup); +MODULE_LICENSE("GPL"); From 6cff339bbd5f9eda7a5e8a521f91a88d046e6d0c Mon Sep 17 00:00:00 2001 From: Alex Gartrell Date: Tue, 9 Sep 2014 16:40:20 -0700 Subject: [PATCH 14/34] ipvs: Add destination address family to netlink interface This is necessary to support heterogeneous pools. For example, if you have an ipv6 addressed network, you'll want to be able to forward ipv4 traffic into it. This patch enforces that destination address family is the same as service family, as none of the forwarding mechanisms support anything else. For the old setsockopt mechanism, we simply set the dest address family to AF_INET as we do with the service. Signed-off-by: Alex Gartrell Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 3 +++ include/uapi/linux/ip_vs.h | 3 +++ net/netfilter/ipvs/ip_vs_ctl.c | 45 ++++++++++++++++++++++++++++------ 3 files changed, 44 insertions(+), 7 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 624a8a54806d..b7e2b624d58e 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -648,6 +648,9 @@ struct ip_vs_dest_user_kern { /* thresholds for active connections */ u32 u_threshold; /* upper threshold */ u32 l_threshold; /* lower threshold */ + + /* Address family of addr */ + u16 af; }; diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h index fbcffe8041f7..cabe95d5b461 100644 --- a/include/uapi/linux/ip_vs.h +++ b/include/uapi/linux/ip_vs.h @@ -384,6 +384,9 @@ enum { IPVS_DEST_ATTR_PERSIST_CONNS, /* persistent connections */ IPVS_DEST_ATTR_STATS, /* nested attribute for dest stats */ + + IPVS_DEST_ATTR_ADDR_FAMILY, /* Address family of address */ + __IPVS_DEST_ATTR_MAX, }; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index bd2b208ba56c..594cec7ebc43 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -816,6 +816,8 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, dest->u_threshold = udest->u_threshold; dest->l_threshold = udest->l_threshold; + dest->af = udest->af; + spin_lock_bh(&dest->dst_lock); __ip_vs_dst_cache_reset(dest); spin_unlock_bh(&dest->dst_lock); @@ -846,8 +848,12 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, EnterFunction(2); + /* Temporary for consistency */ + if (udest->af != svc->af) + return -EINVAL; + #ifdef CONFIG_IP_VS_IPV6 - if (svc->af == AF_INET6) { + if (udest->af == AF_INET6) { atype = ipv6_addr_type(&udest->addr.in6); if ((!(atype & IPV6_ADDR_UNICAST) || atype & IPV6_ADDR_LINKLOCAL) && @@ -875,12 +881,12 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, u64_stats_init(&ip_vs_dest_stats->syncp); } - dest->af = svc->af; + dest->af = udest->af; dest->protocol = svc->protocol; dest->vaddr = svc->addr; dest->vport = svc->port; dest->vfwmark = svc->fwmark; - ip_vs_addr_copy(svc->af, &dest->addr, &udest->addr); + ip_vs_addr_copy(udest->af, &dest->addr, &udest->addr); dest->port = udest->port; atomic_set(&dest->activeconns, 0); @@ -928,7 +934,7 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) return -ERANGE; } - ip_vs_addr_copy(svc->af, &daddr, &udest->addr); + ip_vs_addr_copy(udest->af, &daddr, &udest->addr); /* We use function that requires RCU lock */ rcu_read_lock(); @@ -949,7 +955,7 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) if (dest != NULL) { IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, " "dest->refcnt=%d, service %u/%s:%u\n", - IP_VS_DBG_ADDR(svc->af, &daddr), ntohs(dport), + IP_VS_DBG_ADDR(udest->af, &daddr), ntohs(dport), atomic_read(&dest->refcnt), dest->vfwmark, IP_VS_DBG_ADDR(svc->af, &dest->vaddr), @@ -992,7 +998,7 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) return -ERANGE; } - ip_vs_addr_copy(svc->af, &daddr, &udest->addr); + ip_vs_addr_copy(udest->af, &daddr, &udest->addr); /* We use function that requires RCU lock */ rcu_read_lock(); @@ -2244,6 +2250,7 @@ static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest, udest->weight = udest_compat->weight; udest->u_threshold = udest_compat->u_threshold; udest->l_threshold = udest_compat->l_threshold; + udest->af = AF_INET; } static int @@ -2480,6 +2487,12 @@ __ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get, if (count >= get->num_dests) break; + /* Cannot expose heterogeneous members via sockopt + * interface + */ + if (dest->af != svc->af) + continue; + entry.addr = dest->addr.ip; entry.port = dest->port; entry.conn_flags = atomic_read(&dest->conn_flags); @@ -2777,6 +2790,7 @@ static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = { [IPVS_DEST_ATTR_INACT_CONNS] = { .type = NLA_U32 }, [IPVS_DEST_ATTR_PERSIST_CONNS] = { .type = NLA_U32 }, [IPVS_DEST_ATTR_STATS] = { .type = NLA_NESTED }, + [IPVS_DEST_ATTR_ADDR_FAMILY] = { .type = NLA_U16 }, }; static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type, @@ -3032,7 +3046,8 @@ static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) nla_put_u32(skb, IPVS_DEST_ATTR_INACT_CONNS, atomic_read(&dest->inactconns)) || nla_put_u32(skb, IPVS_DEST_ATTR_PERSIST_CONNS, - atomic_read(&dest->persistconns))) + atomic_read(&dest->persistconns)) || + nla_put_u16(skb, IPVS_DEST_ATTR_ADDR_FAMILY, dest->af)) goto nla_put_failure; if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats)) goto nla_put_failure; @@ -3113,6 +3128,7 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, { struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1]; struct nlattr *nla_addr, *nla_port; + struct nlattr *nla_addr_family; /* Parse mandatory identifying destination fields first */ if (nla == NULL || @@ -3121,6 +3137,7 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, nla_addr = attrs[IPVS_DEST_ATTR_ADDR]; nla_port = attrs[IPVS_DEST_ATTR_PORT]; + nla_addr_family = attrs[IPVS_DEST_ATTR_ADDR_FAMILY]; if (!(nla_addr && nla_port)) return -EINVAL; @@ -3130,6 +3147,11 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr)); udest->port = nla_get_be16(nla_port); + if (nla_addr_family) + udest->af = nla_get_u16(nla_addr_family); + else + udest->af = 0; + /* If a full entry was requested, check for the additional fields */ if (full_entry) { struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh, @@ -3357,6 +3379,15 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) need_full_dest); if (ret) goto out; + + /* Old protocols did not allow the user to specify address + * family, so we set it to zero instead. We also didn't + * allow heterogeneous pools in the old code, so it's safe + * to assume that this will have the same address family as + * the service. + */ + if (udest.af == 0) + udest.af = svc->af; } switch (cmd) { From 655eef103d0bd99f540a52f7ede032e120756846 Mon Sep 17 00:00:00 2001 From: Alex Gartrell Date: Tue, 9 Sep 2014 16:40:21 -0700 Subject: [PATCH 15/34] ipvs: Supply destination addr family to ip_vs_{lookup_dest,find_dest} We need to remove the assumption that virtual address family is the same as real address family in order to support heterogeneous services (that is, services with v4 vips and v6 backends or the opposite). Signed-off-by: Alex Gartrell Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 5 +++-- net/netfilter/ipvs/ip_vs_conn.c | 8 +++++++- net/netfilter/ipvs/ip_vs_ctl.c | 24 ++++++++++++------------ net/netfilter/ipvs/ip_vs_sync.c | 10 ++++++++-- 4 files changed, 30 insertions(+), 17 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index b7e2b624d58e..2fa1155b24f7 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -1399,8 +1399,9 @@ void ip_vs_unregister_nl_ioctl(void); int ip_vs_control_init(void); void ip_vs_control_cleanup(void); struct ip_vs_dest * -ip_vs_find_dest(struct net *net, int af, const union nf_inet_addr *daddr, - __be16 dport, const union nf_inet_addr *vaddr, __be16 vport, +ip_vs_find_dest(struct net *net, int svc_af, int dest_af, + const union nf_inet_addr *daddr, __be16 dport, + const union nf_inet_addr *vaddr, __be16 vport, __u16 protocol, __u32 fwmark, __u32 flags); void ip_vs_try_bind_dest(struct ip_vs_conn *cp); diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 610e19c0e13f..8f4c602bb34b 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -616,7 +616,13 @@ void ip_vs_try_bind_dest(struct ip_vs_conn *cp) struct ip_vs_dest *dest; rcu_read_lock(); - dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, &cp->daddr, + + /* This function is only invoked by the synchronization code. We do + * not currently support heterogeneous pools with synchronization, + * so we can make the assumption that the svc_af is the same as the + * dest_af + */ + dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, cp->af, &cp->daddr, cp->dport, &cp->vaddr, cp->vport, cp->protocol, cp->fwmark, cp->flags); if (dest) { diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 594cec7ebc43..c840d895bd1a 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -574,8 +574,8 @@ bool ip_vs_has_real_service(struct net *net, int af, __u16 protocol, * Called under RCU lock. */ static struct ip_vs_dest * -ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, - __be16 dport) +ip_vs_lookup_dest(struct ip_vs_service *svc, int dest_af, + const union nf_inet_addr *daddr, __be16 dport) { struct ip_vs_dest *dest; @@ -583,9 +583,9 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, * Find the destination for the given service */ list_for_each_entry_rcu(dest, &svc->destinations, n_list) { - if ((dest->af == svc->af) - && ip_vs_addr_equal(svc->af, &dest->addr, daddr) - && (dest->port == dport)) { + if ((dest->af == dest_af) && + ip_vs_addr_equal(dest_af, &dest->addr, daddr) && + (dest->port == dport)) { /* HIT */ return dest; } @@ -602,7 +602,7 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, * on the backup. * Called under RCU lock, no refcnt is returned. */ -struct ip_vs_dest *ip_vs_find_dest(struct net *net, int af, +struct ip_vs_dest *ip_vs_find_dest(struct net *net, int svc_af, int dest_af, const union nf_inet_addr *daddr, __be16 dport, const union nf_inet_addr *vaddr, @@ -613,14 +613,14 @@ struct ip_vs_dest *ip_vs_find_dest(struct net *net, int af, struct ip_vs_service *svc; __be16 port = dport; - svc = ip_vs_service_find(net, af, fwmark, protocol, vaddr, vport); + svc = ip_vs_service_find(net, svc_af, fwmark, protocol, vaddr, vport); if (!svc) return NULL; if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) port = 0; - dest = ip_vs_lookup_dest(svc, daddr, port); + dest = ip_vs_lookup_dest(svc, dest_af, daddr, port); if (!dest) - dest = ip_vs_lookup_dest(svc, daddr, port ^ dport); + dest = ip_vs_lookup_dest(svc, dest_af, daddr, port ^ dport); return dest; } @@ -938,7 +938,7 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) /* We use function that requires RCU lock */ rcu_read_lock(); - dest = ip_vs_lookup_dest(svc, &daddr, dport); + dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport); rcu_read_unlock(); if (dest != NULL) { @@ -1002,7 +1002,7 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) /* We use function that requires RCU lock */ rcu_read_lock(); - dest = ip_vs_lookup_dest(svc, &daddr, dport); + dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport); rcu_read_unlock(); if (dest == NULL) { @@ -1084,7 +1084,7 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) /* We use function that requires RCU lock */ rcu_read_lock(); - dest = ip_vs_lookup_dest(svc, &udest->addr, dport); + dest = ip_vs_lookup_dest(svc, udest->af, &udest->addr, dport); rcu_read_unlock(); if (dest == NULL) { diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index eadffb29dec0..edd266414b7d 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -880,8 +880,14 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, * but still handled. */ rcu_read_lock(); - dest = ip_vs_find_dest(net, type, daddr, dport, param->vaddr, - param->vport, protocol, fwmark, flags); + /* This function is only invoked by the synchronization + * code. We do not currently support heterogeneous pools + * with synchronization, so we can make the assumption that + * the svc_af is the same as the dest_af + */ + dest = ip_vs_find_dest(net, type, type, daddr, dport, + param->vaddr, param->vport, protocol, + fwmark, flags); cp = ip_vs_conn_new(param, daddr, dport, flags, dest, fwmark); rcu_read_unlock(); From ad147aa4dd2135e6d86e3329d4009283ba64287f Mon Sep 17 00:00:00 2001 From: Alex Gartrell Date: Tue, 9 Sep 2014 16:40:22 -0700 Subject: [PATCH 16/34] ipvs: Pass destination address family to ip_vs_trash_get_dest Part of a series of diffs to tease out destination family from virtual family. This diff just adds a parameter to ip_vs_trash_get and then uses it for comparison rather than svc->af. Signed-off-by: Alex Gartrell Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_ctl.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index c840d895bd1a..6bd2cc682137 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -657,8 +657,8 @@ static void __ip_vs_dst_cache_reset(struct ip_vs_dest *dest) * scheduling. */ static struct ip_vs_dest * -ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, - __be16 dport) +ip_vs_trash_get_dest(struct ip_vs_service *svc, int dest_af, + const union nf_inet_addr *daddr, __be16 dport) { struct ip_vs_dest *dest; struct netns_ipvs *ipvs = net_ipvs(svc->net); @@ -671,11 +671,11 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, " "dest->refcnt=%d\n", dest->vfwmark, - IP_VS_DBG_ADDR(svc->af, &dest->addr), + IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), atomic_read(&dest->refcnt)); - if (dest->af == svc->af && - ip_vs_addr_equal(svc->af, &dest->addr, daddr) && + if (dest->af == dest_af && + ip_vs_addr_equal(dest_af, &dest->addr, daddr) && dest->port == dport && dest->vfwmark == svc->fwmark && dest->protocol == svc->protocol && @@ -950,7 +950,7 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) * Check if the dest already exists in the trash and * is from the same service */ - dest = ip_vs_trash_get_dest(svc, &daddr, dport); + dest = ip_vs_trash_get_dest(svc, udest->af, &daddr, dport); if (dest != NULL) { IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, " From ba38528aae6ee2d22226c6a78727ddc13512b068 Mon Sep 17 00:00:00 2001 From: Alex Gartrell Date: Tue, 9 Sep 2014 16:40:23 -0700 Subject: [PATCH 17/34] ipvs: Supply destination address family to ip_vs_conn_new The assumption that dest af is equal to service af is now unreliable, so we must specify it manually so as not to copy just the first 4 bytes of a v6 address or doing an illegal read of 16 butes on a v6 address. We "lie" in two places: for synchronization (which we will explicitly disallow from happening when we have heterogeneous pools) and for black hole addresses where there's no real dest. Signed-off-by: Alex Gartrell Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 3 ++- net/netfilter/ipvs/ip_vs_conn.c | 5 +++-- net/netfilter/ipvs/ip_vs_core.c | 9 +++++---- net/netfilter/ipvs/ip_vs_ftp.c | 6 ++++-- net/netfilter/ipvs/ip_vs_sync.c | 3 ++- 5 files changed, 16 insertions(+), 10 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 2fa1155b24f7..7600dbe5780e 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -535,6 +535,7 @@ struct ip_vs_conn { union nf_inet_addr daddr; /* destination address */ volatile __u32 flags; /* status flags */ __u16 protocol; /* Which protocol (TCP/UDP) */ + __u16 daf; /* Address family of the dest */ #ifdef CONFIG_NET_NS struct net *net; /* Name space */ #endif @@ -1213,7 +1214,7 @@ static inline void __ip_vs_conn_put(struct ip_vs_conn *cp) void ip_vs_conn_put(struct ip_vs_conn *cp); void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport); -struct ip_vs_conn *ip_vs_conn_new(const struct ip_vs_conn_param *p, +struct ip_vs_conn *ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, const union nf_inet_addr *daddr, __be16 dport, unsigned int flags, struct ip_vs_dest *dest, __u32 fwmark); diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 8f4c602bb34b..fdb4880a3a79 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -854,7 +854,7 @@ void ip_vs_conn_expire_now(struct ip_vs_conn *cp) * Create a new connection entry and hash it into the ip_vs_conn_tab */ struct ip_vs_conn * -ip_vs_conn_new(const struct ip_vs_conn_param *p, +ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, const union nf_inet_addr *daddr, __be16 dport, unsigned int flags, struct ip_vs_dest *dest, __u32 fwmark) { @@ -873,6 +873,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp); ip_vs_conn_net_set(cp, p->net); cp->af = p->af; + cp->daf = dest_af; cp->protocol = p->protocol; ip_vs_addr_set(p->af, &cp->caddr, p->caddr); cp->cport = p->cport; @@ -880,7 +881,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, ip_vs_addr_set(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af, &cp->vaddr, p->vaddr); cp->vport = p->vport; - ip_vs_addr_set(p->af, &cp->daddr, daddr); + ip_vs_addr_set(cp->daf, &cp->daddr, daddr); cp->dport = dport; cp->flags = flags; cp->fwmark = fwmark; diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 5c34e8d42e01..1f6ecb74821f 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -328,7 +328,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc, * This adds param.pe_data to the template, * and thus param.pe_data will be destroyed * when the template expires */ - ct = ip_vs_conn_new(¶m, &dest->addr, dport, + ct = ip_vs_conn_new(¶m, dest->af, &dest->addr, dport, IP_VS_CONN_F_TEMPLATE, dest, skb->mark); if (ct == NULL) { kfree(param.pe_data); @@ -357,7 +357,8 @@ ip_vs_sched_persist(struct ip_vs_service *svc, ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol, &iph->saddr, src_port, &iph->daddr, dst_port, ¶m); - cp = ip_vs_conn_new(¶m, &dest->addr, dport, flags, dest, skb->mark); + cp = ip_vs_conn_new(¶m, dest->af, &dest->addr, dport, flags, dest, + skb->mark); if (cp == NULL) { ip_vs_conn_put(ct); *ignored = -1; @@ -479,7 +480,7 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol, &iph->saddr, pptr[0], &iph->daddr, pptr[1], &p); - cp = ip_vs_conn_new(&p, &dest->addr, + cp = ip_vs_conn_new(&p, dest->af, &dest->addr, dest->port ? dest->port : pptr[1], flags, dest, skb->mark); if (!cp) { @@ -550,7 +551,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol, &iph->saddr, pptr[0], &iph->daddr, pptr[1], &p); - cp = ip_vs_conn_new(&p, &daddr, 0, + cp = ip_vs_conn_new(&p, svc->af, &daddr, 0, IP_VS_CONN_F_BYPASS | flags, NULL, skb->mark); if (!cp) diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index 77c173282f38..a64fa15790e5 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -233,7 +233,8 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, ip_vs_conn_fill_param(ip_vs_conn_net(cp), AF_INET, IPPROTO_TCP, &cp->caddr, 0, &cp->vaddr, port, &p); - n_cp = ip_vs_conn_new(&p, &from, port, + /* As above, this is ipv4 only */ + n_cp = ip_vs_conn_new(&p, AF_INET, &from, port, IP_VS_CONN_F_NO_CPORT | IP_VS_CONN_F_NFCT, cp->dest, skb->mark); @@ -396,7 +397,8 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, htons(ntohs(cp->vport)-1), &p); n_cp = ip_vs_conn_in_get(&p); if (!n_cp) { - n_cp = ip_vs_conn_new(&p, &cp->daddr, + /* This is ipv4 only */ + n_cp = ip_vs_conn_new(&p, AF_INET, &cp->daddr, htons(ntohs(cp->dport)-1), IP_VS_CONN_F_NFCT, cp->dest, skb->mark); diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index edd266414b7d..7162c86fd50d 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -889,7 +889,8 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, param->vaddr, param->vport, protocol, fwmark, flags); - cp = ip_vs_conn_new(param, daddr, dport, flags, dest, fwmark); + cp = ip_vs_conn_new(param, type, daddr, dport, flags, dest, + fwmark); rcu_read_unlock(); if (!cp) { kfree(param->pe_data); From 391f503d69779867f05e9296ae523e9002c2d7ee Mon Sep 17 00:00:00 2001 From: Alex Gartrell Date: Tue, 9 Sep 2014 16:40:24 -0700 Subject: [PATCH 18/34] ipvs: prevent mixing heterogeneous pools and synchronization The synchronization protocol is not compatible with heterogeneous pools, so we need to verify that we're not turning both on at the same time. Signed-off-by: Alex Gartrell Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 4 ++++ net/netfilter/ipvs/ip_vs_ctl.c | 15 +++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 7600dbe5780e..576d7f0bed5d 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -990,6 +990,10 @@ struct netns_ipvs { char backup_mcast_ifn[IP_VS_IFNAME_MAXLEN]; /* net name space ptr */ struct net *net; /* Needed by timer routines */ + /* Number of heterogeneous destinations, needed because + * heterogeneous are not supported when synchronization is + * enabled */ + unsigned int mixed_address_family_dests; }; #define DEFAULT_SYNC_THRESHOLD 3 diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 6bd2cc682137..462760eded94 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -779,6 +779,12 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, struct ip_vs_scheduler *sched; int conn_flags; + /* We cannot modify an address and change the address family */ + BUG_ON(!add && udest->af != dest->af); + + if (add && udest->af != svc->af) + ipvs->mixed_address_family_dests++; + /* set the weight and the flags */ atomic_set(&dest->weight, udest->weight); conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK; @@ -1061,6 +1067,9 @@ static void __ip_vs_unlink_dest(struct ip_vs_service *svc, list_del_rcu(&dest->n_list); svc->num_dests--; + if (dest->af != svc->af) + net_ipvs(svc->net)->mixed_address_family_dests--; + if (svcupd) { struct ip_vs_scheduler *sched; @@ -3256,6 +3265,12 @@ static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs) attrs[IPVS_DAEMON_ATTR_SYNC_ID])) return -EINVAL; + /* The synchronization protocol is incompatible with mixed family + * services + */ + if (net_ipvs(net)->mixed_address_family_dests > 0) + return -EINVAL; + return start_sync_thread(net, nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]), nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]), From 4a4739d56b006c4b34dfba03c356056e110521ca Mon Sep 17 00:00:00 2001 From: Alex Gartrell Date: Tue, 9 Sep 2014 16:40:25 -0700 Subject: [PATCH 19/34] ipvs: Pull out crosses_local_route_boundary logic This logic is repeated in both out_rt functions so it was redundant. Additionally, we'll need to be able to do checks to route v4 to v6 and vice versa in order to deal with heterogeneous pools. This patch also updates the callsites to add an additional parameter to the out route functions. Signed-off-by: Alex Gartrell Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_xmit.c | 144 +++++++++++++++++--------------- 1 file changed, 77 insertions(+), 67 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 56896a412bce..b3b54d7a6c17 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -157,9 +157,56 @@ retry: return rt; } +#ifdef CONFIG_IP_VS_IPV6 +static inline int __ip_vs_is_local_route6(struct rt6_info *rt) +{ + return rt->dst.dev && rt->dst.dev->flags & IFF_LOOPBACK; +} +#endif + +static inline bool crosses_local_route_boundary(int skb_af, struct sk_buff *skb, + int rt_mode, + bool new_rt_is_local) +{ + bool rt_mode_allow_local = !!(rt_mode & IP_VS_RT_MODE_LOCAL); + bool rt_mode_allow_non_local = !!(rt_mode & IP_VS_RT_MODE_LOCAL); + bool rt_mode_allow_redirect = !!(rt_mode & IP_VS_RT_MODE_RDR); + bool source_is_loopback; + bool old_rt_is_local; + +#ifdef CONFIG_IP_VS_IPV6 + if (skb_af == AF_INET6) { + int addr_type = ipv6_addr_type(&ipv6_hdr(skb)->saddr); + + source_is_loopback = + (!skb->dev || skb->dev->flags & IFF_LOOPBACK) && + (addr_type & IPV6_ADDR_LOOPBACK); + old_rt_is_local = __ip_vs_is_local_route6( + (struct rt6_info *)skb_dst(skb)); + } else +#endif + { + source_is_loopback = ipv4_is_loopback(ip_hdr(skb)->saddr); + old_rt_is_local = skb_rtable(skb)->rt_flags & RTCF_LOCAL; + } + + if (unlikely(new_rt_is_local)) { + if (!rt_mode_allow_local) + return true; + if (!rt_mode_allow_redirect && !old_rt_is_local) + return true; + } else { + if (!rt_mode_allow_non_local) + return true; + if (source_is_loopback) + return true; + } + return false; +} + /* Get route to destination or remote server */ static int -__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest, +__ip_vs_get_out_rt(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, __be32 daddr, int rt_mode, __be32 *ret_saddr) { struct net *net = dev_net(skb_dst(skb)->dev); @@ -218,30 +265,15 @@ __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest, } local = (rt->rt_flags & RTCF_LOCAL) ? 1 : 0; - if (!((local ? IP_VS_RT_MODE_LOCAL : IP_VS_RT_MODE_NON_LOCAL) & - rt_mode)) { - IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI4\n", - (rt->rt_flags & RTCF_LOCAL) ? - "local":"non-local", &daddr); + if (unlikely(crosses_local_route_boundary(skb_af, skb, rt_mode, + local))) { + IP_VS_DBG_RL("We are crossing local and non-local addresses" + " daddr=%pI4\n", &dest->addr.ip); goto err_put; } iph = ip_hdr(skb); - if (likely(!local)) { - if (unlikely(ipv4_is_loopback(iph->saddr))) { - IP_VS_DBG_RL("Stopping traffic from loopback address " - "%pI4 to non-local address, dest: %pI4\n", - &iph->saddr, &daddr); - goto err_put; - } - } else { - ort = skb_rtable(skb); - if (!(rt_mode & IP_VS_RT_MODE_RDR) && - !(ort->rt_flags & RTCF_LOCAL)) { - IP_VS_DBG_RL("Redirect from non-local address %pI4 to " - "local requires NAT method, dest: %pI4\n", - &iph->daddr, &daddr); - goto err_put; - } + + if (unlikely(local)) { /* skb to local stack, preserve old route */ if (!noref) ip_rt_put(rt); @@ -295,12 +327,6 @@ err_unreach: } #ifdef CONFIG_IP_VS_IPV6 - -static inline int __ip_vs_is_local_route6(struct rt6_info *rt) -{ - return rt->dst.dev && rt->dst.dev->flags & IFF_LOOPBACK; -} - static struct dst_entry * __ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr, struct in6_addr *ret_saddr, int do_xfrm) @@ -339,7 +365,7 @@ out_err: * Get route to destination or remote server */ static int -__ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest, +__ip_vs_get_out_rt_v6(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, struct in6_addr *daddr, struct in6_addr *ret_saddr, struct ip_vs_iphdr *ipvsh, int do_xfrm, int rt_mode) { @@ -393,32 +419,15 @@ __ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest, } local = __ip_vs_is_local_route6(rt); - if (!((local ? IP_VS_RT_MODE_LOCAL : IP_VS_RT_MODE_NON_LOCAL) & - rt_mode)) { - IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI6c\n", - local ? "local":"non-local", daddr); + + if (unlikely(crosses_local_route_boundary(skb_af, skb, rt_mode, + local))) { + IP_VS_DBG_RL("We are crossing local and non-local addresses" + " daddr=%pI6\n", &dest->addr.in6); goto err_put; } - if (likely(!local)) { - if (unlikely((!skb->dev || skb->dev->flags & IFF_LOOPBACK) && - ipv6_addr_type(&ipv6_hdr(skb)->saddr) & - IPV6_ADDR_LOOPBACK)) { - IP_VS_DBG_RL("Stopping traffic from loopback address " - "%pI6c to non-local address, " - "dest: %pI6c\n", - &ipv6_hdr(skb)->saddr, daddr); - goto err_put; - } - } else { - ort = (struct rt6_info *) skb_dst(skb); - if (!(rt_mode & IP_VS_RT_MODE_RDR) && - !__ip_vs_is_local_route6(ort)) { - IP_VS_DBG_RL("Redirect from non-local address %pI6c " - "to local requires NAT method, " - "dest: %pI6c\n", - &ipv6_hdr(skb)->daddr, daddr); - goto err_put; - } + + if (unlikely(local)) { /* skb to local stack, preserve old route */ if (!noref) dst_release(&rt->dst); @@ -556,8 +565,8 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); rcu_read_lock(); - if (__ip_vs_get_out_rt(skb, NULL, iph->daddr, IP_VS_RT_MODE_NON_LOCAL, - NULL) < 0) + if (__ip_vs_get_out_rt(cp->af, skb, NULL, iph->daddr, + IP_VS_RT_MODE_NON_LOCAL, NULL) < 0) goto tx_error; ip_send_check(iph); @@ -586,7 +595,7 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); rcu_read_lock(); - if (__ip_vs_get_out_rt_v6(skb, NULL, &ipvsh->daddr.in6, NULL, + if (__ip_vs_get_out_rt_v6(cp->af, skb, NULL, &ipvsh->daddr.in6, NULL, ipvsh, 0, IP_VS_RT_MODE_NON_LOCAL) < 0) goto tx_error; @@ -633,7 +642,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, } was_input = rt_is_input_route(skb_rtable(skb)); - local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, + local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_RDR, NULL); @@ -721,8 +730,8 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); } - local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, - ipvsh, 0, + local = __ip_vs_get_out_rt_v6(cp->af, skb, cp->dest, &cp->daddr.in6, + NULL, ipvsh, 0, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_RDR); @@ -829,7 +838,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); rcu_read_lock(); - local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, + local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_CONNECT | @@ -928,7 +937,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); rcu_read_lock(); - local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, + local = __ip_vs_get_out_rt_v6(cp->af, skb, cp->dest, &cp->daddr.in6, &saddr, ipvsh, 1, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | @@ -1021,7 +1030,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); rcu_read_lock(); - local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, + local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_KNOWN_NH, NULL); @@ -1060,8 +1069,8 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); rcu_read_lock(); - local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, - ipvsh, 0, + local = __ip_vs_get_out_rt_v6(cp->af, skb, cp->dest, &cp->daddr.in6, + NULL, ipvsh, 0, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL); if (local < 0) @@ -1128,7 +1137,8 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL; rcu_read_lock(); - local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, rt_mode, NULL); + local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip, rt_mode, + NULL); if (local < 0) goto tx_error; rt = skb_rtable(skb); @@ -1219,8 +1229,8 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL; rcu_read_lock(); - local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, - ipvsh, 0, rt_mode); + local = __ip_vs_get_out_rt_v6(cp->af, skb, cp->dest, &cp->daddr.in6, + NULL, ipvsh, 0, rt_mode); if (local < 0) goto tx_error; rt = (struct rt6_info *) skb_dst(skb); From 919aa0b2bbcf013467295dc9736db6fb575a4fb0 Mon Sep 17 00:00:00 2001 From: Alex Gartrell Date: Tue, 9 Sep 2014 16:40:26 -0700 Subject: [PATCH 20/34] ipvs: Pull out update_pmtu code Another step toward heterogeneous pools, this removes another piece of functionality currently specific to each address family type. Signed-off-by: Alex Gartrell Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_xmit.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index b3b54d7a6c17..034a282a6f8c 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -204,6 +204,15 @@ static inline bool crosses_local_route_boundary(int skb_af, struct sk_buff *skb, return false; } +static inline void maybe_update_pmtu(int skb_af, struct sk_buff *skb, int mtu) +{ + struct sock *sk = skb->sk; + struct rtable *ort = skb_rtable(skb); + + if (!skb->dev && sk && sk->sk_state != TCP_TIME_WAIT) + ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu); +} + /* Get route to destination or remote server */ static int __ip_vs_get_out_rt(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, @@ -213,7 +222,6 @@ __ip_vs_get_out_rt(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_dest_dst *dest_dst; struct rtable *rt; /* Route to the other host */ - struct rtable *ort; /* Original route */ struct iphdr *iph; __be16 df; int mtu; @@ -284,16 +292,12 @@ __ip_vs_get_out_rt(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, mtu = dst_mtu(&rt->dst); df = iph->frag_off & htons(IP_DF); } else { - struct sock *sk = skb->sk; - mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); if (mtu < 68) { IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__); goto err_put; } - ort = skb_rtable(skb); - if (!skb->dev && sk && sk->sk_state != TCP_TIME_WAIT) - ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu); + maybe_update_pmtu(skb_af, skb, mtu); /* MTU check allowed? */ df = sysctl_pmtu_disc(ipvs) ? iph->frag_off & htons(IP_DF) : 0; } @@ -372,7 +376,6 @@ __ip_vs_get_out_rt_v6(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, struct net *net = dev_net(skb_dst(skb)->dev); struct ip_vs_dest_dst *dest_dst; struct rt6_info *rt; /* Route to the other host */ - struct rt6_info *ort; /* Original route */ struct dst_entry *dst; int mtu; int local, noref = 1; @@ -438,17 +441,13 @@ __ip_vs_get_out_rt_v6(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) mtu = dst_mtu(&rt->dst); else { - struct sock *sk = skb->sk; - mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr); if (mtu < IPV6_MIN_MTU) { IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__, IPV6_MIN_MTU); goto err_put; } - ort = (struct rt6_info *) skb_dst(skb); - if (!skb->dev && sk && sk->sk_state != TCP_TIME_WAIT) - ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu); + maybe_update_pmtu(skb_af, skb, mtu); } if (unlikely(__mtu_check_toobig_v6(skb, mtu))) { From c63e4de2be5e1d253adce16dbba57ed42868bc22 Mon Sep 17 00:00:00 2001 From: Alex Gartrell Date: Tue, 9 Sep 2014 16:40:27 -0700 Subject: [PATCH 21/34] ipvs: Add generic ensure_mtu_is_adequate to handle mixed pools The out_rt functions check to see if the mtu is large enough for the packet and, if not, send icmp messages (TOOBIG or DEST_UNREACH) to the source and bail out. We needed the ability to send ICMP from the out_rt_v6 function and DEST_UNREACH from the out_rt function, so we just pulled it out into a common function. Signed-off-by: Alex Gartrell Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_xmit.c | 77 ++++++++++++++++++++++----------- 1 file changed, 51 insertions(+), 26 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 034a282a6f8c..fa2fdd7421b7 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -213,17 +213,57 @@ static inline void maybe_update_pmtu(int skb_af, struct sk_buff *skb, int mtu) ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu); } +static inline bool ensure_mtu_is_adequate(int skb_af, int rt_mode, + struct ip_vs_iphdr *ipvsh, + struct sk_buff *skb, int mtu) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (skb_af == AF_INET6) { + struct net *net = dev_net(skb_dst(skb)->dev); + + if (unlikely(__mtu_check_toobig_v6(skb, mtu))) { + if (!skb->dev) + skb->dev = net->loopback_dev; + /* only send ICMP too big on first fragment */ + if (!ipvsh->fragoffs) + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + IP_VS_DBG(1, "frag needed for %pI6c\n", + &ipv6_hdr(skb)->saddr); + return false; + } + } else +#endif + { + struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); + + /* If we're going to tunnel the packet and pmtu discovery + * is disabled, we'll just fragment it anyway + */ + if ((rt_mode & IP_VS_RT_MODE_TUNNEL) && !sysctl_pmtu_disc(ipvs)) + return true; + + if (unlikely(ip_hdr(skb)->frag_off & htons(IP_DF) && + skb->len > mtu && !skb_is_gso(skb))) { + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, + htonl(mtu)); + IP_VS_DBG(1, "frag needed for %pI4\n", + &ip_hdr(skb)->saddr); + return false; + } + } + + return true; +} + /* Get route to destination or remote server */ static int __ip_vs_get_out_rt(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, - __be32 daddr, int rt_mode, __be32 *ret_saddr) + __be32 daddr, int rt_mode, __be32 *ret_saddr, + struct ip_vs_iphdr *ipvsh) { struct net *net = dev_net(skb_dst(skb)->dev); - struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_dest_dst *dest_dst; struct rtable *rt; /* Route to the other host */ - struct iphdr *iph; - __be16 df; int mtu; int local, noref = 1; @@ -279,7 +319,6 @@ __ip_vs_get_out_rt(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, " daddr=%pI4\n", &dest->addr.ip); goto err_put; } - iph = ip_hdr(skb); if (unlikely(local)) { /* skb to local stack, preserve old route */ @@ -290,7 +329,6 @@ __ip_vs_get_out_rt(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) { mtu = dst_mtu(&rt->dst); - df = iph->frag_off & htons(IP_DF); } else { mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); if (mtu < 68) { @@ -298,16 +336,10 @@ __ip_vs_get_out_rt(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, goto err_put; } maybe_update_pmtu(skb_af, skb, mtu); - /* MTU check allowed? */ - df = sysctl_pmtu_disc(ipvs) ? iph->frag_off & htons(IP_DF) : 0; } - /* MTU checking */ - if (unlikely(df && skb->len > mtu && !skb_is_gso(skb))) { - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); - IP_VS_DBG(1, "frag needed for %pI4\n", &iph->saddr); + if (!ensure_mtu_is_adequate(skb_af, rt_mode, ipvsh, skb, mtu)) goto err_put; - } skb_dst_drop(skb); if (noref) { @@ -450,15 +482,8 @@ __ip_vs_get_out_rt_v6(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, maybe_update_pmtu(skb_af, skb, mtu); } - if (unlikely(__mtu_check_toobig_v6(skb, mtu))) { - if (!skb->dev) - skb->dev = net->loopback_dev; - /* only send ICMP too big on first fragment */ - if (!ipvsh->fragoffs) - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - IP_VS_DBG(1, "frag needed for %pI6c\n", &ipv6_hdr(skb)->saddr); + if (!ensure_mtu_is_adequate(skb_af, rt_mode, ipvsh, skb, mtu)) goto err_put; - } skb_dst_drop(skb); if (noref) { @@ -565,7 +590,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, rcu_read_lock(); if (__ip_vs_get_out_rt(cp->af, skb, NULL, iph->daddr, - IP_VS_RT_MODE_NON_LOCAL, NULL) < 0) + IP_VS_RT_MODE_NON_LOCAL, NULL, ipvsh) < 0) goto tx_error; ip_send_check(iph); @@ -644,7 +669,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | - IP_VS_RT_MODE_RDR, NULL); + IP_VS_RT_MODE_RDR, NULL, ipvsh); if (local < 0) goto tx_error; rt = skb_rtable(skb); @@ -841,7 +866,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_CONNECT | - IP_VS_RT_MODE_TUNNEL, &saddr); + IP_VS_RT_MODE_TUNNEL, &saddr, ipvsh); if (local < 0) goto tx_error; if (local) { @@ -1032,7 +1057,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | - IP_VS_RT_MODE_KNOWN_NH, NULL); + IP_VS_RT_MODE_KNOWN_NH, NULL, ipvsh); if (local < 0) goto tx_error; if (local) { @@ -1137,7 +1162,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL; rcu_read_lock(); local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip, rt_mode, - NULL); + NULL, iph); if (local < 0) goto tx_error; rt = skb_rtable(skb); From 8052ba292559f907ea2ad4c827d83c195046dfe1 Mon Sep 17 00:00:00 2001 From: Alex Gartrell Date: Tue, 9 Sep 2014 16:40:28 -0700 Subject: [PATCH 22/34] ipvs: support ipv4 in ipv6 and ipv6 in ipv4 tunnel forwarding Pull the common logic for preparing an skb to prepend the header into a single function and then set fields such that they can be used in either case (generalize tos and tclass to dscp, hop_limit and ttl to ttl, etc) Signed-off-by: Alex Gartrell Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_conn.c | 12 ++- net/netfilter/ipvs/ip_vs_xmit.c | 148 +++++++++++++++++++++++--------- 2 files changed, 117 insertions(+), 43 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index fdb4880a3a79..13e9cee02c81 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -488,7 +488,12 @@ static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp) break; case IP_VS_CONN_F_TUNNEL: - cp->packet_xmit = ip_vs_tunnel_xmit; +#ifdef CONFIG_IP_VS_IPV6 + if (cp->daf == AF_INET6) + cp->packet_xmit = ip_vs_tunnel_xmit_v6; + else +#endif + cp->packet_xmit = ip_vs_tunnel_xmit; break; case IP_VS_CONN_F_DROUTE: @@ -514,7 +519,10 @@ static inline void ip_vs_bind_xmit_v6(struct ip_vs_conn *cp) break; case IP_VS_CONN_F_TUNNEL: - cp->packet_xmit = ip_vs_tunnel_xmit_v6; + if (cp->daf == AF_INET6) + cp->packet_xmit = ip_vs_tunnel_xmit_v6; + else + cp->packet_xmit = ip_vs_tunnel_xmit; break; case IP_VS_CONN_F_DROUTE: diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index fa2fdd7421b7..91f17c1eb8a2 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -824,6 +824,81 @@ tx_error: } #endif +/* When forwarding a packet, we must ensure that we've got enough headroom + * for the encapsulation packet in the skb. This also gives us an + * opportunity to figure out what the payload_len, dsfield, ttl, and df + * values should be, so that we won't need to look at the old ip header + * again + */ +static struct sk_buff * +ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af, + unsigned int max_headroom, __u8 *next_protocol, + __u32 *payload_len, __u8 *dsfield, __u8 *ttl, + __be16 *df) +{ + struct sk_buff *new_skb = NULL; + struct iphdr *old_iph = NULL; +#ifdef CONFIG_IP_VS_IPV6 + struct ipv6hdr *old_ipv6h = NULL; +#endif + + if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) { + new_skb = skb_realloc_headroom(skb, max_headroom); + if (!new_skb) + goto error; + consume_skb(skb); + skb = new_skb; + } + +#ifdef CONFIG_IP_VS_IPV6 + if (skb_af == AF_INET6) { + old_ipv6h = ipv6_hdr(skb); + *next_protocol = IPPROTO_IPV6; + if (payload_len) + *payload_len = + ntohs(old_ipv6h->payload_len) + + sizeof(*old_ipv6h); + *dsfield = ipv6_get_dsfield(old_ipv6h); + *ttl = old_ipv6h->hop_limit; + if (df) + *df = 0; + } else +#endif + { + old_iph = ip_hdr(skb); + /* Copy DF, reset fragment offset and MF */ + if (df) + *df = (old_iph->frag_off & htons(IP_DF)); + *next_protocol = IPPROTO_IPIP; + + /* fix old IP header checksum */ + ip_send_check(old_iph); + *dsfield = ipv4_get_dsfield(old_iph); + *ttl = old_iph->ttl; + if (payload_len) + *payload_len = ntohs(old_iph->tot_len); + } + + return skb; +error: + kfree_skb(skb); + return ERR_PTR(-ENOMEM); +} + +static inline int __tun_gso_type_mask(int encaps_af, int orig_af) +{ + if (encaps_af == AF_INET) { + if (orig_af == AF_INET) + return SKB_GSO_IPIP; + + return SKB_GSO_SIT; + } + + /* GSO: we need to provide proper SKB_GSO_ value for IPv6: + * SKB_GSO_SIT/IPV6 + */ + return 0; +} /* * IP Tunneling transmitter @@ -852,9 +927,11 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct rtable *rt; /* Route to the other host */ __be32 saddr; /* Source for tunnel */ struct net_device *tdev; /* Device to other host */ - struct iphdr *old_iph = ip_hdr(skb); - u8 tos = old_iph->tos; - __be16 df; + __u8 next_protocol = 0; + __u8 dsfield = 0; + __u8 ttl = 0; + __be16 df = 0; + __be16 *dfp = NULL; struct iphdr *iph; /* Our new IP header */ unsigned int max_headroom; /* The extra header space needed */ int ret, local; @@ -877,29 +954,21 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, rt = skb_rtable(skb); tdev = rt->dst.dev; - /* Copy DF, reset fragment offset and MF */ - df = sysctl_pmtu_disc(ipvs) ? old_iph->frag_off & htons(IP_DF) : 0; - /* * Okay, now see if we can stuff it in the buffer as-is. */ max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr); - if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) { - struct sk_buff *new_skb = - skb_realloc_headroom(skb, max_headroom); + /* We only care about the df field if sysctl_pmtu_disc(ipvs) is set */ + dfp = sysctl_pmtu_disc(ipvs) ? &df : NULL; + skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom, + &next_protocol, NULL, &dsfield, + &ttl, dfp); + if (IS_ERR(skb)) + goto tx_error; - if (!new_skb) - goto tx_error; - consume_skb(skb); - skb = new_skb; - old_iph = ip_hdr(skb); - } - - /* fix old IP header checksum */ - ip_send_check(old_iph); - - skb = iptunnel_handle_offloads(skb, false, SKB_GSO_IPIP); + skb = iptunnel_handle_offloads( + skb, false, __tun_gso_type_mask(AF_INET, cp->af)); if (IS_ERR(skb)) goto tx_error; @@ -916,11 +985,11 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, iph->version = 4; iph->ihl = sizeof(struct iphdr)>>2; iph->frag_off = df; - iph->protocol = IPPROTO_IPIP; - iph->tos = tos; + iph->protocol = next_protocol; + iph->tos = dsfield; iph->daddr = cp->daddr.ip; iph->saddr = saddr; - iph->ttl = old_iph->ttl; + iph->ttl = ttl; ip_select_ident(skb, NULL); /* Another hack: avoid icmp_send in ip_fragment */ @@ -953,7 +1022,10 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct rt6_info *rt; /* Route to the other host */ struct in6_addr saddr; /* Source for tunnel */ struct net_device *tdev; /* Device to other host */ - struct ipv6hdr *old_iph = ipv6_hdr(skb); + __u8 next_protocol = 0; + __u32 payload_len = 0; + __u8 dsfield = 0; + __u8 ttl = 0; struct ipv6hdr *iph; /* Our new IP header */ unsigned int max_headroom; /* The extra header space needed */ int ret, local; @@ -981,19 +1053,14 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, */ max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr); - if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) { - struct sk_buff *new_skb = - skb_realloc_headroom(skb, max_headroom); + skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom, + &next_protocol, &payload_len, + &dsfield, &ttl, NULL); + if (IS_ERR(skb)) + goto tx_error; - if (!new_skb) - goto tx_error; - consume_skb(skb); - skb = new_skb; - old_iph = ipv6_hdr(skb); - } - - /* GSO: we need to provide proper SKB_GSO_ value for IPv6 */ - skb = iptunnel_handle_offloads(skb, false, 0); /* SKB_GSO_SIT/IPV6 */ + skb = iptunnel_handle_offloads( + skb, false, __tun_gso_type_mask(AF_INET6, cp->af)); if (IS_ERR(skb)) goto tx_error; @@ -1008,14 +1075,13 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, */ iph = ipv6_hdr(skb); iph->version = 6; - iph->nexthdr = IPPROTO_IPV6; - iph->payload_len = old_iph->payload_len; - be16_add_cpu(&iph->payload_len, sizeof(*old_iph)); + iph->nexthdr = next_protocol; + iph->payload_len = htons(payload_len); memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl)); - ipv6_change_dsfield(iph, 0, ipv6_get_dsfield(old_iph)); + ipv6_change_dsfield(iph, 0, dsfield); iph->daddr = cp->daddr.in6; iph->saddr = saddr; - iph->hop_limit = old_iph->hop_limit; + iph->hop_limit = ttl; /* Another hack: avoid icmp_send in ip_fragment */ skb->ignore_df = 1; From f7fa38006983c0933a550fa790a3b3d3856394d1 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 9 Sep 2014 16:40:29 -0700 Subject: [PATCH 23/34] ipvs: address family of LBLC entry depends on svc family The LBLC entries should use svc->af, not dest->af. Needed to support svc->af != dest->af. Signed-off-by: Julian Anastasov Signed-off-by: Alex Gartrell Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_lblc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index 547ff33c1efd..127f14046c51 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -199,11 +199,11 @@ ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl, */ static inline struct ip_vs_lblc_entry * ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr, - struct ip_vs_dest *dest) + u16 af, struct ip_vs_dest *dest) { struct ip_vs_lblc_entry *en; - en = ip_vs_lblc_get(dest->af, tbl, daddr); + en = ip_vs_lblc_get(af, tbl, daddr); if (en) { if (en->dest == dest) return en; @@ -213,8 +213,8 @@ ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr, if (!en) return NULL; - en->af = dest->af; - ip_vs_addr_copy(dest->af, &en->addr, daddr); + en->af = af; + ip_vs_addr_copy(af, &en->addr, daddr); en->lastuse = jiffies; ip_vs_dest_hold(dest); @@ -521,13 +521,13 @@ ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, /* If we fail to create a cache entry, we'll just use the valid dest */ spin_lock_bh(&svc->sched_lock); if (!tbl->dead) - ip_vs_lblc_new(tbl, &iph->daddr, dest); + ip_vs_lblc_new(tbl, &iph->daddr, svc->af, dest); spin_unlock_bh(&svc->sched_lock); out: IP_VS_DBG_BUF(6, "LBLC: destination IP address %s --> server %s:%d\n", IP_VS_DBG_ADDR(svc->af, &iph->daddr), - IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port)); + IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port)); return dest; } From cf34e646dad101170e00712fe51986cbcdad3044 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 9 Sep 2014 16:40:30 -0700 Subject: [PATCH 24/34] ipvs: address family of LBLCR entry depends on svc family The LBLCR entries should use svc->af, not dest->af. Needed to support svc->af != dest->af. Signed-off-by: Julian Anastasov Signed-off-by: Alex Gartrell Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_lblcr.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index 3f21a2f47de1..2229d2d8bbe0 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -362,18 +362,18 @@ ip_vs_lblcr_get(int af, struct ip_vs_lblcr_table *tbl, */ static inline struct ip_vs_lblcr_entry * ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr, - struct ip_vs_dest *dest) + u16 af, struct ip_vs_dest *dest) { struct ip_vs_lblcr_entry *en; - en = ip_vs_lblcr_get(dest->af, tbl, daddr); + en = ip_vs_lblcr_get(af, tbl, daddr); if (!en) { en = kmalloc(sizeof(*en), GFP_ATOMIC); if (!en) return NULL; - en->af = dest->af; - ip_vs_addr_copy(dest->af, &en->addr, daddr); + en->af = af; + ip_vs_addr_copy(af, &en->addr, daddr); en->lastuse = jiffies; /* initialize its dest set */ @@ -706,13 +706,13 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, /* If we fail to create a cache entry, we'll just use the valid dest */ spin_lock_bh(&svc->sched_lock); if (!tbl->dead) - ip_vs_lblcr_new(tbl, &iph->daddr, dest); + ip_vs_lblcr_new(tbl, &iph->daddr, svc->af, dest); spin_unlock_bh(&svc->sched_lock); out: IP_VS_DBG_BUF(6, "LBLCR: destination IP address %s --> server %s:%d\n", IP_VS_DBG_ADDR(svc->af, &iph->daddr), - IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port)); + IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port)); return dest; } From 4d316f3f9ae3d5fad8d3198eec0a4ef2511471d7 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Wed, 17 Sep 2014 00:09:00 +0300 Subject: [PATCH 25/34] ipvs: use correct address family in scheduler logs Needed to support svc->af != dest->af. Signed-off-by: Julian Anastasov Signed-off-by: Alex Gartrell Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_dh.c | 2 +- net/netfilter/ipvs/ip_vs_fo.c | 2 +- net/netfilter/ipvs/ip_vs_lc.c | 2 +- net/netfilter/ipvs/ip_vs_nq.c | 3 ++- net/netfilter/ipvs/ip_vs_rr.c | 2 +- net/netfilter/ipvs/ip_vs_sed.c | 3 ++- net/netfilter/ipvs/ip_vs_sh.c | 8 ++++---- net/netfilter/ipvs/ip_vs_wlc.c | 3 ++- net/netfilter/ipvs/ip_vs_wrr.c | 2 +- 9 files changed, 15 insertions(+), 12 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c index c3b84546ea9e..6be5c538b71e 100644 --- a/net/netfilter/ipvs/ip_vs_dh.c +++ b/net/netfilter/ipvs/ip_vs_dh.c @@ -234,7 +234,7 @@ ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, IP_VS_DBG_BUF(6, "DH: destination IP address %s --> server %s:%d\n", IP_VS_DBG_ADDR(svc->af, &iph->daddr), - IP_VS_DBG_ADDR(svc->af, &dest->addr), + IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port)); return dest; diff --git a/net/netfilter/ipvs/ip_vs_fo.c b/net/netfilter/ipvs/ip_vs_fo.c index 6a2647d471ff..e09874d02938 100644 --- a/net/netfilter/ipvs/ip_vs_fo.c +++ b/net/netfilter/ipvs/ip_vs_fo.c @@ -44,7 +44,7 @@ ip_vs_fo_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, if (hweight) { IP_VS_DBG_BUF(6, "FO: server %s:%u activeconns %d weight %d\n", - IP_VS_DBG_ADDR(svc->af, &hweight->addr), + IP_VS_DBG_ADDR(hweight->af, &hweight->addr), ntohs(hweight->port), atomic_read(&hweight->activeconns), atomic_read(&hweight->weight)); diff --git a/net/netfilter/ipvs/ip_vs_lc.c b/net/netfilter/ipvs/ip_vs_lc.c index 2bdcb1cf2127..19a0769a989a 100644 --- a/net/netfilter/ipvs/ip_vs_lc.c +++ b/net/netfilter/ipvs/ip_vs_lc.c @@ -59,7 +59,7 @@ ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, else IP_VS_DBG_BUF(6, "LC: server %s:%u activeconns %d " "inactconns %d\n", - IP_VS_DBG_ADDR(svc->af, &least->addr), + IP_VS_DBG_ADDR(least->af, &least->addr), ntohs(least->port), atomic_read(&least->activeconns), atomic_read(&least->inactconns)); diff --git a/net/netfilter/ipvs/ip_vs_nq.c b/net/netfilter/ipvs/ip_vs_nq.c index 961a6de9bb29..a8b63401e773 100644 --- a/net/netfilter/ipvs/ip_vs_nq.c +++ b/net/netfilter/ipvs/ip_vs_nq.c @@ -107,7 +107,8 @@ ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, out: IP_VS_DBG_BUF(6, "NQ: server %s:%u " "activeconns %d refcnt %d weight %d overhead %d\n", - IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), + IP_VS_DBG_ADDR(least->af, &least->addr), + ntohs(least->port), atomic_read(&least->activeconns), atomic_read(&least->refcnt), atomic_read(&least->weight), loh); diff --git a/net/netfilter/ipvs/ip_vs_rr.c b/net/netfilter/ipvs/ip_vs_rr.c index 176b87c35e34..58bacfc461ee 100644 --- a/net/netfilter/ipvs/ip_vs_rr.c +++ b/net/netfilter/ipvs/ip_vs_rr.c @@ -95,7 +95,7 @@ stop: spin_unlock_bh(&svc->sched_lock); IP_VS_DBG_BUF(6, "RR: server %s:%u " "activeconns %d refcnt %d weight %d\n", - IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port), + IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), atomic_read(&dest->activeconns), atomic_read(&dest->refcnt), atomic_read(&dest->weight)); diff --git a/net/netfilter/ipvs/ip_vs_sed.c b/net/netfilter/ipvs/ip_vs_sed.c index e446b9fa7424..f8e2d00f528b 100644 --- a/net/netfilter/ipvs/ip_vs_sed.c +++ b/net/netfilter/ipvs/ip_vs_sed.c @@ -108,7 +108,8 @@ ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, IP_VS_DBG_BUF(6, "SED: server %s:%u " "activeconns %d refcnt %d weight %d overhead %d\n", - IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), + IP_VS_DBG_ADDR(least->af, &least->addr), + ntohs(least->port), atomic_read(&least->activeconns), atomic_read(&least->refcnt), atomic_read(&least->weight), loh); diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c index cc65b2f42cd4..98a13433b68c 100644 --- a/net/netfilter/ipvs/ip_vs_sh.c +++ b/net/netfilter/ipvs/ip_vs_sh.c @@ -138,7 +138,7 @@ ip_vs_sh_get_fallback(struct ip_vs_service *svc, struct ip_vs_sh_state *s, return dest; IP_VS_DBG_BUF(6, "SH: selected unavailable server %s:%d, reselecting", - IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port)); + IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port)); /* if the original dest is unavailable, loop around the table * starting from ihash to find a new dest @@ -153,7 +153,7 @@ ip_vs_sh_get_fallback(struct ip_vs_service *svc, struct ip_vs_sh_state *s, return dest; IP_VS_DBG_BUF(6, "SH: selected unavailable " "server %s:%d (offset %d), reselecting", - IP_VS_DBG_ADDR(svc->af, &dest->addr), + IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), roffset); } @@ -192,7 +192,7 @@ ip_vs_sh_reassign(struct ip_vs_sh_state *s, struct ip_vs_service *svc) RCU_INIT_POINTER(b->dest, dest); IP_VS_DBG_BUF(6, "assigned i: %d dest: %s weight: %d\n", - i, IP_VS_DBG_ADDR(svc->af, &dest->addr), + i, IP_VS_DBG_ADDR(dest->af, &dest->addr), atomic_read(&dest->weight)); /* Don't move to next dest until filling weight */ @@ -342,7 +342,7 @@ ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, IP_VS_DBG_BUF(6, "SH: source IP address %s --> server %s:%d\n", IP_VS_DBG_ADDR(svc->af, &iph->saddr), - IP_VS_DBG_ADDR(svc->af, &dest->addr), + IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port)); return dest; diff --git a/net/netfilter/ipvs/ip_vs_wlc.c b/net/netfilter/ipvs/ip_vs_wlc.c index b5b4650d50a9..6b366fd90554 100644 --- a/net/netfilter/ipvs/ip_vs_wlc.c +++ b/net/netfilter/ipvs/ip_vs_wlc.c @@ -80,7 +80,8 @@ ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, IP_VS_DBG_BUF(6, "WLC: server %s:%u " "activeconns %d refcnt %d weight %d overhead %d\n", - IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), + IP_VS_DBG_ADDR(least->af, &least->addr), + ntohs(least->port), atomic_read(&least->activeconns), atomic_read(&least->refcnt), atomic_read(&least->weight), loh); diff --git a/net/netfilter/ipvs/ip_vs_wrr.c b/net/netfilter/ipvs/ip_vs_wrr.c index 0546cd572d6b..17e6d4406ca7 100644 --- a/net/netfilter/ipvs/ip_vs_wrr.c +++ b/net/netfilter/ipvs/ip_vs_wrr.c @@ -216,7 +216,7 @@ ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, found: IP_VS_DBG_BUF(6, "WRR: server %s:%u " "activeconns %d refcnt %d weight %d\n", - IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port), + IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), atomic_read(&dest->activeconns), atomic_read(&dest->refcnt), atomic_read(&dest->weight)); From f18ae7206eaebfecc2dd8b017b0d6a0950eabf8b Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 9 Sep 2014 16:40:38 -0700 Subject: [PATCH 26/34] ipvs: use the new dest addr family field Use the new address family field cp->daf when printing cp->daddr in logs or connection listing. Signed-off-by: Julian Anastasov Signed-off-by: Alex Gartrell Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_conn.c | 49 +++++++++++++++++++++------ net/netfilter/ipvs/ip_vs_core.c | 6 ++-- net/netfilter/ipvs/ip_vs_proto_sctp.c | 2 +- net/netfilter/ipvs/ip_vs_proto_tcp.c | 2 +- 4 files changed, 43 insertions(+), 16 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 13e9cee02c81..b0f7b626b56d 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -27,6 +27,7 @@ #include #include +#include #include #include #include @@ -77,6 +78,13 @@ static unsigned int ip_vs_conn_rnd __read_mostly; #define CT_LOCKARRAY_SIZE (1<protocol), IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), - IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport), + IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport), ip_vs_fwd_tag(cp), cp->state, cp->flags, atomic_read(&cp->refcnt), atomic_read(&dest->refcnt)); @@ -685,7 +693,7 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) ip_vs_proto_name(cp->protocol), IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), - IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport), + IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport), ip_vs_fwd_tag(cp), cp->state, cp->flags, atomic_read(&cp->refcnt), atomic_read(&dest->refcnt)); @@ -754,7 +762,7 @@ int ip_vs_check_template(struct ip_vs_conn *ct) ntohs(ct->cport), IP_VS_DBG_ADDR(ct->af, &ct->vaddr), ntohs(ct->vport), - IP_VS_DBG_ADDR(ct->af, &ct->daddr), + IP_VS_DBG_ADDR(ct->daf, &ct->daddr), ntohs(ct->dport)); /* @@ -1051,6 +1059,7 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) struct net *net = seq_file_net(seq); char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3]; size_t len = 0; + char dbuf[IP_VS_ADDRSTRLEN]; if (!ip_vs_conn_net_eq(cp, net)) return 0; @@ -1064,25 +1073,33 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) } pe_data[len] = '\0'; +#ifdef CONFIG_IP_VS_IPV6 + if (cp->daf == AF_INET6) + snprintf(dbuf, sizeof(dbuf), "%pI6", &cp->daddr.in6); + else +#endif + snprintf(dbuf, sizeof(dbuf), "%08X", + ntohl(cp->daddr.ip)); + #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X " - "%pI6 %04X %-11s %7lu%s\n", + "%s %04X %-11s %7lu%s\n", ip_vs_proto_name(cp->protocol), &cp->caddr.in6, ntohs(cp->cport), &cp->vaddr.in6, ntohs(cp->vport), - &cp->daddr.in6, ntohs(cp->dport), + dbuf, ntohs(cp->dport), ip_vs_state_name(cp->protocol, cp->state), (cp->timer.expires-jiffies)/HZ, pe_data); else #endif seq_printf(seq, "%-3s %08X %04X %08X %04X" - " %08X %04X %-11s %7lu%s\n", + " %s %04X %-11s %7lu%s\n", ip_vs_proto_name(cp->protocol), ntohl(cp->caddr.ip), ntohs(cp->cport), ntohl(cp->vaddr.ip), ntohs(cp->vport), - ntohl(cp->daddr.ip), ntohs(cp->dport), + dbuf, ntohs(cp->dport), ip_vs_state_name(cp->protocol, cp->state), (cp->timer.expires-jiffies)/HZ, pe_data); } @@ -1120,6 +1137,7 @@ static const char *ip_vs_origin_name(unsigned int flags) static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) { + char dbuf[IP_VS_ADDRSTRLEN]; if (v == SEQ_START_TOKEN) seq_puts(seq, @@ -1131,13 +1149,22 @@ static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) if (!ip_vs_conn_net_eq(cp, net)) return 0; +#ifdef CONFIG_IP_VS_IPV6 + if (cp->daf == AF_INET6) + snprintf(dbuf, sizeof(dbuf), "%pI6", &cp->daddr.in6); + else +#endif + snprintf(dbuf, sizeof(dbuf), "%08X", + ntohl(cp->daddr.ip)); + #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) - seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X %pI6 %04X %-11s %-6s %7lu\n", + seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X " + "%s %04X %-11s %-6s %7lu\n", ip_vs_proto_name(cp->protocol), &cp->caddr.in6, ntohs(cp->cport), &cp->vaddr.in6, ntohs(cp->vport), - &cp->daddr.in6, ntohs(cp->dport), + dbuf, ntohs(cp->dport), ip_vs_state_name(cp->protocol, cp->state), ip_vs_origin_name(cp->flags), (cp->timer.expires-jiffies)/HZ); @@ -1145,11 +1172,11 @@ static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) #endif seq_printf(seq, "%-3s %08X %04X %08X %04X " - "%08X %04X %-11s %-6s %7lu\n", + "%s %04X %-11s %-6s %7lu\n", ip_vs_proto_name(cp->protocol), ntohl(cp->caddr.ip), ntohs(cp->cport), ntohl(cp->vaddr.ip), ntohs(cp->vport), - ntohl(cp->daddr.ip), ntohs(cp->dport), + dbuf, ntohs(cp->dport), ip_vs_state_name(cp->protocol, cp->state), ip_vs_origin_name(cp->flags), (cp->timer.expires-jiffies)/HZ); diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 1f6ecb74821f..990decba1fe4 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -492,9 +492,9 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u " "d:%s:%u conn->flags:%X conn->refcnt:%d\n", ip_vs_fwd_tag(cp), - IP_VS_DBG_ADDR(svc->af, &cp->caddr), ntohs(cp->cport), - IP_VS_DBG_ADDR(svc->af, &cp->vaddr), ntohs(cp->vport), - IP_VS_DBG_ADDR(svc->af, &cp->daddr), ntohs(cp->dport), + IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), + IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport), cp->flags, atomic_read(&cp->refcnt)); ip_vs_conn_stats(cp, svc); diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index 2f7ea7564044..5b84c0b56642 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -432,7 +432,7 @@ set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, pd->pp->name, ((direction == IP_VS_DIR_OUTPUT) ? "output " : "input "), - IP_VS_DBG_ADDR(cp->af, &cp->daddr), + IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport), IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index e3a697234a98..8e92beb0cca9 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -510,7 +510,7 @@ set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, th->fin ? 'F' : '.', th->ack ? 'A' : '.', th->rst ? 'R' : '.', - IP_VS_DBG_ADDR(cp->af, &cp->daddr), + IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport), IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), From bc18d37f676f76edbb5e0c37def78c704b5fbed0 Mon Sep 17 00:00:00 2001 From: Alex Gartrell Date: Tue, 9 Sep 2014 16:40:39 -0700 Subject: [PATCH 27/34] ipvs: Allow heterogeneous pools now that we support them Remove the temporary consistency check and add a case statement to only allow ipip mixed dests. Signed-off-by: Alex Gartrell Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_ctl.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 462760eded94..ac7ba689efe7 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -854,10 +854,6 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, EnterFunction(2); - /* Temporary for consistency */ - if (udest->af != svc->af) - return -EINVAL; - #ifdef CONFIG_IP_VS_IPV6 if (udest->af == AF_INET6) { atype = ipv6_addr_type(&udest->addr.in6); @@ -3403,6 +3399,26 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) */ if (udest.af == 0) udest.af = svc->af; + + if (udest.af != svc->af) { + /* The synchronization protocol is incompatible + * with mixed family services + */ + if (net_ipvs(net)->sync_state) { + ret = -EINVAL; + goto out; + } + + /* Which connection types do we support? */ + switch (udest.conn_flags) { + case IP_VS_CONN_F_TUNNEL: + /* We are able to forward this */ + break; + default: + ret = -EINVAL; + goto out; + } + } } switch (cmd) { From fc04733a1a71af26bf30830571b71f5f2a354a06 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 11 Sep 2014 14:53:17 +0200 Subject: [PATCH 28/34] netfilter: nfnetlink: use original skbuff when committing/aborting This allows us to access the original content of the batch from the commit and the abort paths. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index c138b8fbe280..f77d3f7f22b5 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -333,7 +333,7 @@ replay: * original skb. */ if (err == -EAGAIN) { - ss->abort(skb); + ss->abort(oskb); nfnl_unlock(subsys_id); kfree_skb(nskb); goto replay; @@ -357,9 +357,9 @@ ack: } done: if (success && done) - ss->commit(skb); + ss->commit(oskb); else - ss->abort(skb); + ss->abort(oskb); nfnl_unlock(subsys_id); kfree_skb(nskb); From 84d7fce693884897c6196cc98228a2ad56ae2a9a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 4 Sep 2014 14:30:22 +0200 Subject: [PATCH 29/34] netfilter: nf_tables: export rule-set generation ID This patch exposes the ruleset generation ID in three ways: 1) The new command NFT_MSG_GETGEN that exposes the 32-bits ruleset generation ID. This ID is incremented in every commit and it should be large enough to avoid wraparound problems. 2) The less significant 16-bits of the generation ID are exposed through the nfgenmsg->res_id header field. This allows us to quickly catch if the ruleset has change between two consecutive list dumps from different object lists (in this specific case I think the risk of wraparound is unlikely). 3) Userspace subscribers may receive notifications of new rule-set generation after every commit. This also provides an alternative way to monitor the generation ID. If the events are lost, the userspace process hits a overrun error, so it knows that it is working with a stale ruleset anyway. Patrick spotted that rule-set transformations in userspace may take quite some time. In that case, it annotates the 32-bits generation ID before fetching the rule-set, then: 1) it compares it to what we obtain after the transformation to make sure it is not working with a stale rule-set and no wraparound has ocurred. 2) it subscribes to ruleset notifications, so it can watch for new generation ID. This is complementary to the NLM_F_DUMP_INTR approach, which allows us to detect an interference in the middle one single list dumping. There is no way to explicitly check that an interference has occurred between two list dumps from the kernel, since it doesn't know how many lists the userspace client is actually going to dump. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 16 +++ net/netfilter/nf_tables_api.c | 140 ++++++++++++++++++----- 2 files changed, 130 insertions(+), 26 deletions(-) diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 66d66dd3ff79..b72ccfeaf865 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -51,6 +51,8 @@ enum nft_verdicts { * @NFT_MSG_NEWSETELEM: create a new set element (enum nft_set_elem_attributes) * @NFT_MSG_GETSETELEM: get a set element (enum nft_set_elem_attributes) * @NFT_MSG_DELSETELEM: delete a set element (enum nft_set_elem_attributes) + * @NFT_MSG_NEWGEN: announce a new generation, only for events (enum nft_gen_attributes) + * @NFT_MSG_GETGEN: get the rule-set generation (enum nft_gen_attributes) */ enum nf_tables_msg_types { NFT_MSG_NEWTABLE, @@ -68,6 +70,8 @@ enum nf_tables_msg_types { NFT_MSG_NEWSETELEM, NFT_MSG_GETSETELEM, NFT_MSG_DELSETELEM, + NFT_MSG_NEWGEN, + NFT_MSG_GETGEN, NFT_MSG_MAX, }; @@ -812,4 +816,16 @@ enum nft_masq_attributes { }; #define NFTA_MASQ_MAX (__NFTA_MASQ_MAX - 1) +/** + * enum nft_gen_attributes - nf_tables ruleset generation attributes + * + * @NFTA_GEN_ID: Ruleset generation ID (NLA_U32) + */ +enum nft_gen_attributes { + NFTA_GEN_UNSPEC, + NFTA_GEN_ID, + __NFTA_GEN_MAX +}; +#define NFTA_GEN_MAX (__NFTA_GEN_MAX - 1) + #endif /* _LINUX_NF_TABLES_H */ diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 82374601577e..a476b9962155 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -405,9 +405,9 @@ static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = { [NFTA_TABLE_FLAGS] = { .type = NLA_U32 }, }; -static int nf_tables_fill_table_info(struct sk_buff *skb, u32 portid, u32 seq, - int event, u32 flags, int family, - const struct nft_table *table) +static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net, + u32 portid, u32 seq, int event, u32 flags, + int family, const struct nft_table *table) { struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; @@ -420,7 +420,7 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, u32 portid, u32 seq, nfmsg = nlmsg_data(nlh); nfmsg->nfgen_family = family; nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; + nfmsg->res_id = htons(net->nft.base_seq & 0xffff); if (nla_put_string(skb, NFTA_TABLE_NAME, table->name) || nla_put_be32(skb, NFTA_TABLE_FLAGS, htonl(table->flags)) || @@ -448,8 +448,8 @@ static int nf_tables_table_notify(const struct nft_ctx *ctx, int event) if (skb == NULL) goto err; - err = nf_tables_fill_table_info(skb, ctx->portid, ctx->seq, event, 0, - ctx->afi->family, ctx->table); + err = nf_tables_fill_table_info(skb, ctx->net, ctx->portid, ctx->seq, + event, 0, ctx->afi->family, ctx->table); if (err < 0) { kfree_skb(skb); goto err; @@ -488,7 +488,7 @@ static int nf_tables_dump_tables(struct sk_buff *skb, if (idx > s_idx) memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); - if (nf_tables_fill_table_info(skb, + if (nf_tables_fill_table_info(skb, net, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFT_MSG_NEWTABLE, @@ -540,7 +540,7 @@ static int nf_tables_gettable(struct sock *nlsk, struct sk_buff *skb, if (!skb2) return -ENOMEM; - err = nf_tables_fill_table_info(skb2, NETLINK_CB(skb).portid, + err = nf_tables_fill_table_info(skb2, net, NETLINK_CB(skb).portid, nlh->nlmsg_seq, NFT_MSG_NEWTABLE, 0, family, table); if (err < 0) @@ -914,9 +914,9 @@ nla_put_failure: return -ENOSPC; } -static int nf_tables_fill_chain_info(struct sk_buff *skb, u32 portid, u32 seq, - int event, u32 flags, int family, - const struct nft_table *table, +static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, + u32 portid, u32 seq, int event, u32 flags, + int family, const struct nft_table *table, const struct nft_chain *chain) { struct nlmsghdr *nlh; @@ -930,7 +930,7 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, u32 portid, u32 seq, nfmsg = nlmsg_data(nlh); nfmsg->nfgen_family = family; nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; + nfmsg->res_id = htons(net->nft.base_seq & 0xffff); if (nla_put_string(skb, NFTA_CHAIN_TABLE, table->name)) goto nla_put_failure; @@ -988,8 +988,8 @@ static int nf_tables_chain_notify(const struct nft_ctx *ctx, int event) if (skb == NULL) goto err; - err = nf_tables_fill_chain_info(skb, ctx->portid, ctx->seq, event, 0, - ctx->afi->family, ctx->table, + err = nf_tables_fill_chain_info(skb, ctx->net, ctx->portid, ctx->seq, + event, 0, ctx->afi->family, ctx->table, ctx->chain); if (err < 0) { kfree_skb(skb); @@ -1031,7 +1031,8 @@ static int nf_tables_dump_chains(struct sk_buff *skb, if (idx > s_idx) memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); - if (nf_tables_fill_chain_info(skb, NETLINK_CB(cb->skb).portid, + if (nf_tables_fill_chain_info(skb, net, + NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFT_MSG_NEWCHAIN, NLM_F_MULTI, @@ -1090,7 +1091,7 @@ static int nf_tables_getchain(struct sock *nlsk, struct sk_buff *skb, if (!skb2) return -ENOMEM; - err = nf_tables_fill_chain_info(skb2, NETLINK_CB(skb).portid, + err = nf_tables_fill_chain_info(skb2, net, NETLINK_CB(skb).portid, nlh->nlmsg_seq, NFT_MSG_NEWCHAIN, 0, family, table, chain); if (err < 0) @@ -1647,8 +1648,9 @@ static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = { .len = NFT_USERDATA_MAXLEN }, }; -static int nf_tables_fill_rule_info(struct sk_buff *skb, u32 portid, u32 seq, - int event, u32 flags, int family, +static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, + u32 portid, u32 seq, int event, + u32 flags, int family, const struct nft_table *table, const struct nft_chain *chain, const struct nft_rule *rule) @@ -1668,7 +1670,7 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, u32 portid, u32 seq, nfmsg = nlmsg_data(nlh); nfmsg->nfgen_family = family; nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; + nfmsg->res_id = htons(net->nft.base_seq & 0xffff); if (nla_put_string(skb, NFTA_RULE_TABLE, table->name)) goto nla_put_failure; @@ -1724,8 +1726,8 @@ static int nf_tables_rule_notify(const struct nft_ctx *ctx, if (skb == NULL) goto err; - err = nf_tables_fill_rule_info(skb, ctx->portid, ctx->seq, event, 0, - ctx->afi->family, ctx->table, + err = nf_tables_fill_rule_info(skb, ctx->net, ctx->portid, ctx->seq, + event, 0, ctx->afi->family, ctx->table, ctx->chain, rule); if (err < 0) { kfree_skb(skb); @@ -1771,7 +1773,7 @@ static int nf_tables_dump_rules(struct sk_buff *skb, if (idx > s_idx) memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); - if (nf_tables_fill_rule_info(skb, NETLINK_CB(cb->skb).portid, + if (nf_tables_fill_rule_info(skb, net, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFT_MSG_NEWRULE, NLM_F_MULTI | NLM_F_APPEND, @@ -1837,7 +1839,7 @@ static int nf_tables_getrule(struct sock *nlsk, struct sk_buff *skb, if (!skb2) return -ENOMEM; - err = nf_tables_fill_rule_info(skb2, NETLINK_CB(skb).portid, + err = nf_tables_fill_rule_info(skb2, net, NETLINK_CB(skb).portid, nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0, family, table, chain, rule); if (err < 0) @@ -2321,7 +2323,7 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, nfmsg = nlmsg_data(nlh); nfmsg->nfgen_family = ctx->afi->family; nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; + nfmsg->res_id = htons(ctx->net->nft.base_seq & 0xffff); if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name)) goto nla_put_failure; @@ -2925,7 +2927,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) nfmsg = nlmsg_data(nlh); nfmsg->nfgen_family = ctx.afi->family; nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; + nfmsg->res_id = htons(ctx.net->nft.base_seq & 0xffff); if (nla_put_string(skb, NFTA_SET_ELEM_LIST_TABLE, ctx.table->name)) goto nla_put_failure; @@ -3006,7 +3008,7 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb, nfmsg = nlmsg_data(nlh); nfmsg->nfgen_family = ctx->afi->family; nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; + nfmsg->res_id = htons(ctx->net->nft.base_seq & 0xffff); if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name)) goto nla_put_failure; @@ -3293,6 +3295,87 @@ static int nf_tables_delsetelem(struct sock *nlsk, struct sk_buff *skb, return err; } +static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net, + u32 portid, u32 seq) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + int event = (NFNL_SUBSYS_NFTABLES << 8) | NFT_MSG_NEWGEN; + + nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), 0); + if (nlh == NULL) + goto nla_put_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = AF_UNSPEC; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = htons(net->nft.base_seq & 0xffff); + + if (nla_put_be32(skb, NFTA_GEN_ID, htonl(net->nft.base_seq))) + goto nla_put_failure; + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_trim(skb, nlh); + return -EMSGSIZE; +} + +static int nf_tables_gen_notify(struct net *net, struct sk_buff *skb, int event) +{ + struct nlmsghdr *nlh = nlmsg_hdr(skb); + struct sk_buff *skb2; + int err; + + if (nlmsg_report(nlh) && + !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) + return 0; + + err = -ENOBUFS; + skb2 = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (skb2 == NULL) + goto err; + + err = nf_tables_fill_gen_info(skb2, net, NETLINK_CB(skb).portid, + nlh->nlmsg_seq); + if (err < 0) { + kfree_skb(skb2); + goto err; + } + + err = nfnetlink_send(skb2, net, NETLINK_CB(skb).portid, + NFNLGRP_NFTABLES, nlmsg_report(nlh), GFP_KERNEL); +err: + if (err < 0) { + nfnetlink_set_err(net, NETLINK_CB(skb).portid, NFNLGRP_NFTABLES, + err); + } + return err; +} + +static int nf_tables_getgen(struct sock *nlsk, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nla[]) +{ + struct net *net = sock_net(skb->sk); + struct sk_buff *skb2; + int err; + + skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (skb2 == NULL) + return -ENOMEM; + + err = nf_tables_fill_gen_info(skb2, net, NETLINK_CB(skb).portid, + nlh->nlmsg_seq); + if (err < 0) + goto err; + + return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); +err: + kfree_skb(skb2); + return err; +} + static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { [NFT_MSG_NEWTABLE] = { .call_batch = nf_tables_newtable, @@ -3369,6 +3452,9 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .attr_count = NFTA_SET_ELEM_LIST_MAX, .policy = nft_set_elem_list_policy, }, + [NFT_MSG_GETGEN] = { + .call = nf_tables_getgen, + }, }; static void nft_chain_commit_update(struct nft_trans *trans) @@ -3526,6 +3612,8 @@ static int nf_tables_commit(struct sk_buff *skb) call_rcu(&trans->rcu_head, nf_tables_commit_release_rcu); } + nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN); + return 0; } From 772476df7047db87ac4174d1ed396512912d23bf Mon Sep 17 00:00:00 2001 From: Rob Jones Date: Fri, 19 Sep 2014 11:27:51 +0100 Subject: [PATCH 30/34] net/netfilter/x_tables.c: use __seq_open_private() Reduce boilerplate code by using __seq_open_private() instead of seq_open() in xt_match_open() and xt_target_open(). Signed-off-by: Rob Jones Signed-off-by: Pablo Neira Ayuso --- net/netfilter/x_tables.c | 30 ++++-------------------------- 1 file changed, 4 insertions(+), 26 deletions(-) diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 272ae4d6fdf4..133eb4772f12 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1101,22 +1101,11 @@ static const struct seq_operations xt_match_seq_ops = { static int xt_match_open(struct inode *inode, struct file *file) { - struct seq_file *seq; struct nf_mttg_trav *trav; - int ret; - - trav = kmalloc(sizeof(*trav), GFP_KERNEL); - if (trav == NULL) + trav = __seq_open_private(file, &xt_match_seq_ops, sizeof(*trav)); + if (!trav) return -ENOMEM; - ret = seq_open(file, &xt_match_seq_ops); - if (ret < 0) { - kfree(trav); - return ret; - } - - seq = file->private_data; - seq->private = trav; trav->nfproto = (unsigned long)PDE_DATA(inode); return 0; } @@ -1165,22 +1154,11 @@ static const struct seq_operations xt_target_seq_ops = { static int xt_target_open(struct inode *inode, struct file *file) { - struct seq_file *seq; struct nf_mttg_trav *trav; - int ret; - - trav = kmalloc(sizeof(*trav), GFP_KERNEL); - if (trav == NULL) + trav = __seq_open_private(file, &xt_target_seq_ops, sizeof(*trav)); + if (!trav) return -ENOMEM; - ret = seq_open(file, &xt_target_seq_ops); - if (ret < 0) { - kfree(trav); - return ret; - } - - seq = file->private_data; - seq->private = trav; trav->nfproto = (unsigned long)PDE_DATA(inode); return 0; } From 7276ca3fa23864133f5ee7431c51546d9b7f695f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 22 Sep 2014 13:28:16 +0200 Subject: [PATCH 31/34] netfilter: bridge: nf_bridge_copy_header as static inline in header Move nf_bridge_copy_header() as static inline in netfilter_bridge.h header file. This patch prepares the modularization of the br_netfilter code. Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_bridge.h | 48 +++++++++++++++++++++++++------- net/bridge/br_netfilter.c | 28 ------------------- 2 files changed, 38 insertions(+), 38 deletions(-) diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h index 8ab1c278b66d..fe996d59de64 100644 --- a/include/linux/netfilter_bridge.h +++ b/include/linux/netfilter_bridge.h @@ -24,16 +24,6 @@ enum nf_br_hook_priorities { #define BRNF_8021Q 0x10 #define BRNF_PPPoE 0x20 -/* Only used in br_forward.c */ -int nf_bridge_copy_header(struct sk_buff *skb); -static inline int nf_bridge_maybe_copy_header(struct sk_buff *skb) -{ - if (skb->nf_bridge && - skb->nf_bridge->mask & (BRNF_BRIDGED | BRNF_BRIDGED_DNAT)) - return nf_bridge_copy_header(skb); - return 0; -} - static inline unsigned int nf_bridge_encap_header_len(const struct sk_buff *skb) { switch (skb->protocol) { @@ -46,6 +36,44 @@ static inline unsigned int nf_bridge_encap_header_len(const struct sk_buff *skb) } } +static inline void nf_bridge_update_protocol(struct sk_buff *skb) +{ + if (skb->nf_bridge->mask & BRNF_8021Q) + skb->protocol = htons(ETH_P_8021Q); + else if (skb->nf_bridge->mask & BRNF_PPPoE) + skb->protocol = htons(ETH_P_PPP_SES); +} + +/* Fill in the header for fragmented IP packets handled by + * the IPv4 connection tracking code. + * + * Only used in br_forward.c + */ +static inline int nf_bridge_copy_header(struct sk_buff *skb) +{ + int err; + unsigned int header_size; + + nf_bridge_update_protocol(skb); + header_size = ETH_HLEN + nf_bridge_encap_header_len(skb); + err = skb_cow_head(skb, header_size); + if (err) + return err; + + skb_copy_to_linear_data_offset(skb, -header_size, + skb->nf_bridge->data, header_size); + __skb_push(skb, nf_bridge_encap_header_len(skb)); + return 0; +} + +static inline int nf_bridge_maybe_copy_header(struct sk_buff *skb) +{ + if (skb->nf_bridge && + skb->nf_bridge->mask & (BRNF_BRIDGED | BRNF_BRIDGED_DNAT)) + return nf_bridge_copy_header(skb); + return 0; +} + static inline unsigned int nf_bridge_mtu_reduction(const struct sk_buff *skb) { if (unlikely(skb->nf_bridge->mask & BRNF_PPPoE)) diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index a615264cf01a..61929a7cd815 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -245,14 +245,6 @@ static inline void nf_bridge_save_header(struct sk_buff *skb) skb->nf_bridge->data, header_size); } -static inline void nf_bridge_update_protocol(struct sk_buff *skb) -{ - if (skb->nf_bridge->mask & BRNF_8021Q) - skb->protocol = htons(ETH_P_8021Q); - else if (skb->nf_bridge->mask & BRNF_PPPoE) - skb->protocol = htons(ETH_P_PPP_SES); -} - /* When handing a packet over to the IP layer * check whether we have a skb that is in the * expected format @@ -320,26 +312,6 @@ drop: return -1; } -/* Fill in the header for fragmented IP packets handled by - * the IPv4 connection tracking code. - */ -int nf_bridge_copy_header(struct sk_buff *skb) -{ - int err; - unsigned int header_size; - - nf_bridge_update_protocol(skb); - header_size = ETH_HLEN + nf_bridge_encap_header_len(skb); - err = skb_cow_head(skb, header_size); - if (err) - return err; - - skb_copy_to_linear_data_offset(skb, -header_size, - skb->nf_bridge->data, header_size); - __skb_push(skb, nf_bridge_encap_header_len(skb)); - return 0; -} - /* PF_BRIDGE/PRE_ROUTING *********************************************/ /* Undo the changes made for ip6tables PREROUTING and continue the * bridge PRE_ROUTING hook. */ From 34666d467cbf1e2e3c7bb15a63eccfb582cdd71f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 18 Sep 2014 11:29:03 +0200 Subject: [PATCH 32/34] netfilter: bridge: move br_netfilter out of the core Jesper reported that br_netfilter always registers the hooks since this is part of the bridge core. This harms performance for people that don't need this. This patch modularizes br_netfilter so it can be rmmod'ed, thus, the hooks can be unregistered. I think the bridge netfilter should have been a separated module since the beginning, Patrick agreed on that. Note that this is breaking compatibility for users that expect that bridge netfilter is going to be available after explicitly 'modprobe bridge' or via automatic load through brctl. However, the damage can be easily undone by modprobing br_netfilter. The bridge core also spots a message to provide a clue to people that didn't notice that this has been deprecated. On top of that, the plan is that nftables will not rely on this software layer, but integrate the connection tracking into the bridge layer to enable stateful filtering and NAT, which is was bridge netfilter users seem to require. This patch still keeps the fake_dst_ops in the bridge core, since this is required by when the bridge port is initialized. So we can safely modprobe/rmmod br_netfilter anytime. Signed-off-by: Pablo Neira Ayuso Acked-by: Florian Westphal --- include/linux/netfilter_bridge.h | 2 +- include/linux/skbuff.h | 12 ++-- include/net/neighbour.h | 2 +- include/net/netfilter/ipv4/nf_reject.h | 2 +- include/net/netfilter/ipv6/nf_reject.h | 2 +- net/Kconfig | 7 +- net/bridge/Makefile | 5 +- net/bridge/br.c | 14 ++-- net/bridge/br_device.c | 4 +- net/bridge/br_forward.c | 2 + net/bridge/br_input.c | 1 + net/bridge/br_netfilter.c | 88 +++++------------------ net/bridge/br_netlink.c | 2 +- net/bridge/br_nf_core.c | 96 ++++++++++++++++++++++++++ net/bridge/br_private.h | 12 ++-- net/bridge/br_sysfs_br.c | 4 +- 16 files changed, 151 insertions(+), 104 deletions(-) create mode 100644 net/bridge/br_nf_core.c diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h index fe996d59de64..c755e4971fa3 100644 --- a/include/linux/netfilter_bridge.h +++ b/include/linux/netfilter_bridge.h @@ -15,7 +15,7 @@ enum nf_br_hook_priorities { NF_BR_PRI_LAST = INT_MAX, }; -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) #define BRNF_PKT_TYPE 0x01 #define BRNF_BRIDGED_DNAT 0x02 diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 07c9fdd0c126..c4ff43f84573 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -156,7 +156,7 @@ struct nf_conntrack { }; #endif -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) struct nf_bridge_info { atomic_t use; unsigned int mask; @@ -560,7 +560,7 @@ struct sk_buff { #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) struct nf_conntrack *nfct; #endif -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) struct nf_bridge_info *nf_bridge; #endif @@ -2977,7 +2977,7 @@ static inline void nf_conntrack_get(struct nf_conntrack *nfct) atomic_inc(&nfct->use); } #endif -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) static inline void nf_bridge_put(struct nf_bridge_info *nf_bridge) { if (nf_bridge && atomic_dec_and_test(&nf_bridge->use)) @@ -2995,7 +2995,7 @@ static inline void nf_reset(struct sk_buff *skb) nf_conntrack_put(skb->nfct); skb->nfct = NULL; #endif -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) nf_bridge_put(skb->nf_bridge); skb->nf_bridge = NULL; #endif @@ -3016,7 +3016,7 @@ static inline void __nf_copy(struct sk_buff *dst, const struct sk_buff *src) nf_conntrack_get(src->nfct); dst->nfctinfo = src->nfctinfo; #endif -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) dst->nf_bridge = src->nf_bridge; nf_bridge_get(src->nf_bridge); #endif @@ -3030,7 +3030,7 @@ static inline void nf_copy(struct sk_buff *dst, const struct sk_buff *src) #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) nf_conntrack_put(dst->nfct); #endif -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) nf_bridge_put(dst->nf_bridge); #endif __nf_copy(dst, src); diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 47f425464f84..f60558d0254c 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -373,7 +373,7 @@ static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) return 0; } -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb) { unsigned int seq, hh_alen; diff --git a/include/net/netfilter/ipv4/nf_reject.h b/include/net/netfilter/ipv4/nf_reject.h index 931fbf812171..f713b5a31d62 100644 --- a/include/net/netfilter/ipv4/nf_reject.h +++ b/include/net/netfilter/ipv4/nf_reject.h @@ -98,7 +98,7 @@ static void nf_send_reset(struct sk_buff *oldskb, int hook) nf_ct_attach(nskb, oldskb); -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) /* If we use ip_local_out for bridged traffic, the MAC source on * the RST will be ours, instead of the destination's. This confuses * some routers/firewalls, and they drop the packet. So we need to diff --git a/include/net/netfilter/ipv6/nf_reject.h b/include/net/netfilter/ipv6/nf_reject.h index 710d17ed70b4..7a10cfcd8e33 100644 --- a/include/net/netfilter/ipv6/nf_reject.h +++ b/include/net/netfilter/ipv6/nf_reject.h @@ -147,7 +147,7 @@ static void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook) nf_ct_attach(nskb, oldskb); -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) /* If we use ip6_local_out for bridged traffic, the MAC source on * the RST will be ours, instead of the destination's. This confuses * some routers/firewalls, and they drop the packet. So we need to diff --git a/net/Kconfig b/net/Kconfig index 4051fdfa4367..dc5d700d05e7 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -176,10 +176,11 @@ config NETFILTER_ADVANCED If unsure, say Y. config BRIDGE_NETFILTER - bool "Bridged IP/ARP packets filtering" - depends on BRIDGE && NETFILTER && INET + tristate "Bridged IP/ARP packets filtering" + depends on (BRIDGE || BRIDGE=n) + depends on NETFILTER && INET depends on NETFILTER_ADVANCED - default y + default m ---help--- Enabling this option will let arptables resp. iptables see bridged ARP resp. IP traffic. If you want a bridging firewall, you probably diff --git a/net/bridge/Makefile b/net/bridge/Makefile index 8590b942bffa..5e3eac5dc8b9 100644 --- a/net/bridge/Makefile +++ b/net/bridge/Makefile @@ -6,11 +6,12 @@ obj-$(CONFIG_BRIDGE) += bridge.o bridge-y := br.o br_device.o br_fdb.o br_forward.o br_if.o br_input.o \ br_ioctl.o br_stp.o br_stp_bpdu.o \ - br_stp_if.o br_stp_timer.o br_netlink.o + br_stp_if.o br_stp_timer.o br_netlink.o \ + br_nf_core.o bridge-$(CONFIG_SYSFS) += br_sysfs_if.o br_sysfs_br.o -bridge-$(CONFIG_BRIDGE_NETFILTER) += br_netfilter.o +obj-$(CONFIG_BRIDGE_NETFILTER) += br_netfilter.o bridge-$(CONFIG_BRIDGE_IGMP_SNOOPING) += br_multicast.o br_mdb.o diff --git a/net/bridge/br.c b/net/bridge/br.c index 1a755a1e5410..44425aff7cba 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -161,7 +161,7 @@ static int __init br_init(void) if (err) goto err_out1; - err = br_netfilter_init(); + err = br_nf_core_init(); if (err) goto err_out2; @@ -179,11 +179,16 @@ static int __init br_init(void) br_fdb_test_addr_hook = br_fdb_test_addr; #endif + pr_info("bridge: automatic filtering via arp/ip/ip6tables has been " + "deprecated. Update your scripts to load br_netfilter if you " + "need this.\n"); + return 0; + err_out4: unregister_netdevice_notifier(&br_device_notifier); err_out3: - br_netfilter_fini(); + br_nf_core_fini(); err_out2: unregister_pernet_subsys(&br_net_ops); err_out1: @@ -196,20 +201,17 @@ err_out: static void __exit br_deinit(void) { stp_proto_unregister(&br_stp_proto); - br_netlink_fini(); unregister_netdevice_notifier(&br_device_notifier); brioctl_set(NULL); - unregister_pernet_subsys(&br_net_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ - br_netfilter_fini(); + br_nf_core_fini(); #if IS_ENABLED(CONFIG_ATM_LANE) br_fdb_test_addr_hook = NULL; #endif - br_fdb_fini(); } diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 568cccd39a3d..659cac15c0df 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -36,7 +36,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) u16 vid = 0; rcu_read_lock(); -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (skb->nf_bridge && (skb->nf_bridge->mask & BRNF_BRIDGED_DNAT)) { br_nf_pre_routing_finish_bridge_slow(skb); rcu_read_unlock(); @@ -167,7 +167,7 @@ static int br_change_mtu(struct net_device *dev, int new_mtu) dev->mtu = new_mtu; -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) /* remember the MTU in the rtable for PMTU */ dst_metric_set(&br->fake_rtable.dst, RTAX_MTU, new_mtu); #endif diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 056b67b0e277..992ec49a96aa 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -49,6 +49,7 @@ int br_dev_queue_push_xmit(struct sk_buff *skb) return 0; } +EXPORT_SYMBOL_GPL(br_dev_queue_push_xmit); int br_forward_finish(struct sk_buff *skb) { @@ -56,6 +57,7 @@ int br_forward_finish(struct sk_buff *skb) br_dev_queue_push_xmit); } +EXPORT_SYMBOL_GPL(br_forward_finish); static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb) { diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 366c43649079..6fd5522df696 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -140,6 +140,7 @@ drop: kfree_skb(skb); goto out; } +EXPORT_SYMBOL_GPL(br_handle_frame_finish); /* note: already called with rcu_read_lock */ static int br_handle_local_finish(struct sk_buff *skb) diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 61929a7cd815..97e43937aaca 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -111,66 +111,6 @@ static inline __be16 pppoe_proto(const struct sk_buff *skb) pppoe_proto(skb) == htons(PPP_IPV6) && \ brnf_filter_pppoe_tagged) -static void fake_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, u32 mtu) -{ -} - -static void fake_redirect(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb) -{ -} - -static u32 *fake_cow_metrics(struct dst_entry *dst, unsigned long old) -{ - return NULL; -} - -static struct neighbour *fake_neigh_lookup(const struct dst_entry *dst, - struct sk_buff *skb, - const void *daddr) -{ - return NULL; -} - -static unsigned int fake_mtu(const struct dst_entry *dst) -{ - return dst->dev->mtu; -} - -static struct dst_ops fake_dst_ops = { - .family = AF_INET, - .protocol = cpu_to_be16(ETH_P_IP), - .update_pmtu = fake_update_pmtu, - .redirect = fake_redirect, - .cow_metrics = fake_cow_metrics, - .neigh_lookup = fake_neigh_lookup, - .mtu = fake_mtu, -}; - -/* - * Initialize bogus route table used to keep netfilter happy. - * Currently, we fill in the PMTU entry because netfilter - * refragmentation needs it, and the rt_flags entry because - * ipt_REJECT needs it. Future netfilter modules might - * require us to fill additional fields. - */ -static const u32 br_dst_default_metrics[RTAX_MAX] = { - [RTAX_MTU - 1] = 1500, -}; - -void br_netfilter_rtable_init(struct net_bridge *br) -{ - struct rtable *rt = &br->fake_rtable; - - atomic_set(&rt->dst.__refcnt, 1); - rt->dst.dev = br->dev; - rt->dst.path = &rt->dst; - dst_init_metrics(&rt->dst, br_dst_default_metrics, true); - rt->dst.flags = DST_NOXFRM | DST_FAKE_RTABLE; - rt->dst.ops = &fake_dst_ops; -} - static inline struct rtable *bridge_parent_rtable(const struct net_device *dev) { struct net_bridge_port *port; @@ -1031,38 +971,42 @@ static struct ctl_table brnf_table[] = { }; #endif -int __init br_netfilter_init(void) +static int __init br_netfilter_init(void) { int ret; - ret = dst_entries_init(&fake_dst_ops); + ret = nf_register_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); if (ret < 0) return ret; - ret = nf_register_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); - if (ret < 0) { - dst_entries_destroy(&fake_dst_ops); - return ret; - } #ifdef CONFIG_SYSCTL brnf_sysctl_header = register_net_sysctl(&init_net, "net/bridge", brnf_table); if (brnf_sysctl_header == NULL) { printk(KERN_WARNING "br_netfilter: can't register to sysctl.\n"); - nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); - dst_entries_destroy(&fake_dst_ops); - return -ENOMEM; + ret = -ENOMEM; + goto err1; } #endif printk(KERN_NOTICE "Bridge firewalling registered\n"); return 0; +err1: + nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); + return ret; } -void br_netfilter_fini(void) +static void __exit br_netfilter_fini(void) { nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); #ifdef CONFIG_SYSCTL unregister_net_sysctl_table(brnf_sysctl_header); #endif - dst_entries_destroy(&fake_dst_ops); } + +module_init(br_netfilter_init); +module_exit(br_netfilter_fini); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Lennert Buytenhek "); +MODULE_AUTHOR("Bart De Schuymer "); +MODULE_DESCRIPTION("Linux ethernet netfilter firewall bridge"); diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 90a91e137acc..0fa66b83685f 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -602,7 +602,7 @@ out_af: return err; } -void __exit br_netlink_fini(void) +void br_netlink_fini(void) { br_mdb_uninit(); rtnl_af_unregister(&br_af_ops); diff --git a/net/bridge/br_nf_core.c b/net/bridge/br_nf_core.c new file mode 100644 index 000000000000..387cb3bd017c --- /dev/null +++ b/net/bridge/br_nf_core.c @@ -0,0 +1,96 @@ +/* + * Handle firewalling core + * Linux ethernet bridge + * + * Authors: + * Lennert Buytenhek + * Bart De Schuymer + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Lennert dedicates this file to Kerstin Wurdinger. + */ + +#include +#include +#include +#include +#include + +#include "br_private.h" +#ifdef CONFIG_SYSCTL +#include +#endif + +static void fake_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu) +{ +} + +static void fake_redirect(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb) +{ +} + +static u32 *fake_cow_metrics(struct dst_entry *dst, unsigned long old) +{ + return NULL; +} + +static struct neighbour *fake_neigh_lookup(const struct dst_entry *dst, + struct sk_buff *skb, + const void *daddr) +{ + return NULL; +} + +static unsigned int fake_mtu(const struct dst_entry *dst) +{ + return dst->dev->mtu; +} + +static struct dst_ops fake_dst_ops = { + .family = AF_INET, + .protocol = cpu_to_be16(ETH_P_IP), + .update_pmtu = fake_update_pmtu, + .redirect = fake_redirect, + .cow_metrics = fake_cow_metrics, + .neigh_lookup = fake_neigh_lookup, + .mtu = fake_mtu, +}; + +/* + * Initialize bogus route table used to keep netfilter happy. + * Currently, we fill in the PMTU entry because netfilter + * refragmentation needs it, and the rt_flags entry because + * ipt_REJECT needs it. Future netfilter modules might + * require us to fill additional fields. + */ +static const u32 br_dst_default_metrics[RTAX_MAX] = { + [RTAX_MTU - 1] = 1500, +}; + +void br_netfilter_rtable_init(struct net_bridge *br) +{ + struct rtable *rt = &br->fake_rtable; + + atomic_set(&rt->dst.__refcnt, 1); + rt->dst.dev = br->dev; + rt->dst.path = &rt->dst; + dst_init_metrics(&rt->dst, br_dst_default_metrics, true); + rt->dst.flags = DST_NOXFRM | DST_FAKE_RTABLE; + rt->dst.ops = &fake_dst_ops; +} + +int __init br_nf_core_init(void) +{ + return dst_entries_init(&fake_dst_ops); +} + +void br_nf_core_fini(void) +{ + dst_entries_destroy(&fake_dst_ops); +} diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 62a7fa2e3569..d304d752c091 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -221,7 +221,7 @@ struct net_bridge struct pcpu_sw_netstats __percpu *stats; spinlock_t hash_lock; struct hlist_head hash[BR_HASH_SIZE]; -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) struct rtable fake_rtable; bool nf_call_iptables; bool nf_call_ip6tables; @@ -751,13 +751,13 @@ static inline int br_vlan_enabled(struct net_bridge *br) #endif /* br_netfilter.c */ -#ifdef CONFIG_BRIDGE_NETFILTER -int br_netfilter_init(void); -void br_netfilter_fini(void); +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) +int br_nf_core_init(void); +void br_nf_core_fini(void); void br_netfilter_rtable_init(struct net_bridge *); #else -#define br_netfilter_init() (0) -#define br_netfilter_fini() do { } while (0) +static inline int br_nf_core_init(void) { return 0; } +static inline void br_nf_core_fini(void) {} #define br_netfilter_rtable_init(x) #endif diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index c9e2572b15f4..cb431c6016ee 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -629,7 +629,7 @@ static ssize_t multicast_startup_query_interval_store( } static DEVICE_ATTR_RW(multicast_startup_query_interval); #endif -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) static ssize_t nf_call_iptables_show( struct device *d, struct device_attribute *attr, char *buf) { @@ -763,7 +763,7 @@ static struct attribute *bridge_attrs[] = { &dev_attr_multicast_query_response_interval.attr, &dev_attr_multicast_startup_query_interval.attr, #endif -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) &dev_attr_nf_call_iptables.attr, &dev_attr_nf_call_ip6tables.attr, &dev_attr_nf_call_arptables.attr, From 9363dc4b599949bde338cdaba1cf7cac243e4e97 Mon Sep 17 00:00:00 2001 From: Arturo Borrero Date: Tue, 23 Sep 2014 13:30:41 +0200 Subject: [PATCH 33/34] netfilter: nf_tables: store and dump set policy We want to know in which cases the user explicitly sets the policy options. In that case, we also want to dump back the info. Signed-off-by: Arturo Borrero Gonzalez Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 ++ net/netfilter/nf_tables_api.c | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index c4d86198d3d6..3d7292392fac 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -241,6 +241,7 @@ void nft_unregister_set(struct nft_set_ops *ops); * @dtype: data type (verdict or numeric type defined by userspace) * @size: maximum set size * @nelems: number of elements + * @policy: set parameterization (see enum nft_set_policies) * @ops: set ops * @flags: set flags * @klen: key length @@ -255,6 +256,7 @@ struct nft_set { u32 dtype; u32 size; u32 nelems; + u16 policy; /* runtime data below here */ const struct nft_set_ops *ops ____cacheline_aligned; u16 flags; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index a476b9962155..19e79f0d9ad2 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2344,6 +2344,11 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, goto nla_put_failure; } + if (set->policy != NFT_SET_POL_PERFORMANCE) { + if (nla_put_be32(skb, NFTA_SET_POLICY, htonl(set->policy))) + goto nla_put_failure; + } + desc = nla_nest_start(skb, NFTA_SET_DESC); if (desc == NULL) goto nla_put_failure; @@ -2669,6 +2674,7 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, set->dlen = desc.dlen; set->flags = flags; set->size = desc.size; + set->policy = policy; err = ops->init(set, &desc, nla); if (err < 0) From db29a9508a9246e77087c5531e45b2c88ec6988b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 26 Sep 2014 11:35:42 +0200 Subject: [PATCH 34/34] netfilter: conntrack: disable generic tracking for known protocols Given following iptables ruleset: -P FORWARD DROP -A FORWARD -m sctp --dport 9 -j ACCEPT -A FORWARD -p tcp --dport 80 -j ACCEPT -A FORWARD -p tcp -m conntrack -m state ESTABLISHED,RELATED -j ACCEPT One would assume that this allows SCTP on port 9 and TCP on port 80. Unfortunately, if the SCTP conntrack module is not loaded, this allows *all* SCTP communication, to pass though, i.e. -p sctp -j ACCEPT, which we think is a security issue. This is because on the first SCTP packet on port 9, we create a dummy "generic l4" conntrack entry without any port information (since conntrack doesn't know how to extract this information). All subsequent packets that are unknown will then be in established state since they will fallback to proto_generic and will match the 'generic' entry. Our originally proposed version [1] completely disabled generic protocol tracking, but Jozsef suggests to not track protocols for which a more suitable helper is available, hence we now mitigate the issue for in tree known ct protocol helpers only, so that at least NAT and direction information will still be preserved for others. [1] http://www.spinics.net/lists/netfilter-devel/msg33430.html Joint work with Daniel Borkmann. Signed-off-by: Florian Westphal Signed-off-by: Daniel Borkmann Acked-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_proto_generic.c | 26 +++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c index d25f29377648..957c1db66652 100644 --- a/net/netfilter/nf_conntrack_proto_generic.c +++ b/net/netfilter/nf_conntrack_proto_generic.c @@ -14,6 +14,30 @@ static unsigned int nf_ct_generic_timeout __read_mostly = 600*HZ; +static bool nf_generic_should_process(u8 proto) +{ + switch (proto) { +#ifdef CONFIG_NF_CT_PROTO_SCTP_MODULE + case IPPROTO_SCTP: + return false; +#endif +#ifdef CONFIG_NF_CT_PROTO_DCCP_MODULE + case IPPROTO_DCCP: + return false; +#endif +#ifdef CONFIG_NF_CT_PROTO_GRE_MODULE + case IPPROTO_GRE: + return false; +#endif +#ifdef CONFIG_NF_CT_PROTO_UDPLITE_MODULE + case IPPROTO_UDPLITE: + return false; +#endif + default: + return true; + } +} + static inline struct nf_generic_net *generic_pernet(struct net *net) { return &net->ct.nf_ct_proto.generic; @@ -67,7 +91,7 @@ static int generic_packet(struct nf_conn *ct, static bool generic_new(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, unsigned int *timeouts) { - return true; + return nf_generic_should_process(nf_ct_protonum(ct)); } #if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)