81c3d5470e
Arnaldo and I agreed it could be applied now, because I have other pending patches depending on this one (Thank you Arnaldo) (The other important patch moves skc_refcnt in a separate cache line, so that the SMP/NUMA performance doesnt suffer from cache line ping pongs) 1) First some performance data : -------------------------------- tcp_v4_rcv() wastes a *lot* of time in __inet_lookup_established() The most time critical code is : sk_for_each(sk, node, &head->chain) { if (INET_MATCH(sk, acookie, saddr, daddr, ports, dif)) goto hit; /* You sunk my battleship! */ } The sk_for_each() does use prefetch() hints but only the begining of "struct sock" is prefetched. As INET_MATCH first comparison uses inet_sk(__sk)->daddr, wich is far away from the begining of "struct sock", it has to bring into CPU cache cold cache line. Each iteration has to use at least 2 cache lines. This can be problematic if some chains are very long. 2) The goal ----------- The idea I had is to change things so that INET_MATCH() may return FALSE in 99% of cases only using the data already in the CPU cache, using one cache line per iteration. 3) Description of the patch --------------------------- Adds a new 'unsigned int skc_hash' field in 'struct sock_common', filling a 32 bits hole on 64 bits platform. struct sock_common { unsigned short skc_family; volatile unsigned char skc_state; unsigned char skc_reuse; int skc_bound_dev_if; struct hlist_node skc_node; struct hlist_node skc_bind_node; atomic_t skc_refcnt; + unsigned int skc_hash; struct proto *skc_prot; }; Store in this 32 bits field the full hash, not masked by (ehash_size - 1) Using this full hash as the first comparison done in INET_MATCH permits us immediatly skip the element without touching a second cache line in case of a miss. Suppress the sk_hashent/tw_hashent fields since skc_hash (aliased to sk_hash and tw_hash) already contains the slot number if we mask with (ehash_size - 1) File include/net/inet_hashtables.h 64 bits platforms : #define INET_MATCH(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\ (((__sk)->sk_hash == (__hash)) ((*((__u64 *)&(inet_sk(__sk)->daddr)))== (__cookie)) && \ ((*((__u32 *)&(inet_sk(__sk)->dport))) == (__ports)) && \ (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) 32bits platforms: #define TCP_IPV4_MATCH(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\ (((__sk)->sk_hash == (__hash)) && \ (inet_sk(__sk)->daddr == (__saddr)) && \ (inet_sk(__sk)->rcv_saddr == (__daddr)) && \ (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) - Adds a prefetch(head->chain.first) in __inet_lookup_established()/__tcp_v4_check_established() and __inet6_lookup_established()/__tcp_v6_check_established() and __dccp_v4_check_established() to bring into cache the first element of the list, before the {read|write}_lock(&head->lock); Signed-off-by: Eric Dumazet <dada1@cosmosbay.com> Acked-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net> Signed-off-by: David S. Miller <davem@davemloft.net>
385 lines
8.3 KiB
C
385 lines
8.3 KiB
C
#ifndef _IPV6_H
|
|
#define _IPV6_H
|
|
|
|
#include <linux/config.h>
|
|
#include <linux/in6.h>
|
|
#include <asm/byteorder.h>
|
|
|
|
/* The latest drafts declared increase in minimal mtu up to 1280. */
|
|
|
|
#define IPV6_MIN_MTU 1280
|
|
|
|
/*
|
|
* Advanced API
|
|
* source interface/address selection, source routing, etc...
|
|
* *under construction*
|
|
*/
|
|
|
|
|
|
struct in6_pktinfo {
|
|
struct in6_addr ipi6_addr;
|
|
int ipi6_ifindex;
|
|
};
|
|
|
|
|
|
struct in6_ifreq {
|
|
struct in6_addr ifr6_addr;
|
|
__u32 ifr6_prefixlen;
|
|
int ifr6_ifindex;
|
|
};
|
|
|
|
#define IPV6_SRCRT_STRICT 0x01 /* this hop must be a neighbor */
|
|
#define IPV6_SRCRT_TYPE_0 0 /* IPv6 type 0 Routing Header */
|
|
|
|
/*
|
|
* routing header
|
|
*/
|
|
struct ipv6_rt_hdr {
|
|
__u8 nexthdr;
|
|
__u8 hdrlen;
|
|
__u8 type;
|
|
__u8 segments_left;
|
|
|
|
/*
|
|
* type specific data
|
|
* variable length field
|
|
*/
|
|
};
|
|
|
|
|
|
struct ipv6_opt_hdr {
|
|
__u8 nexthdr;
|
|
__u8 hdrlen;
|
|
/*
|
|
* TLV encoded option data follows.
|
|
*/
|
|
};
|
|
|
|
#define ipv6_destopt_hdr ipv6_opt_hdr
|
|
#define ipv6_hopopt_hdr ipv6_opt_hdr
|
|
|
|
#ifdef __KERNEL__
|
|
#define ipv6_optlen(p) (((p)->hdrlen+1) << 3)
|
|
#endif
|
|
|
|
/*
|
|
* routing header type 0 (used in cmsghdr struct)
|
|
*/
|
|
|
|
struct rt0_hdr {
|
|
struct ipv6_rt_hdr rt_hdr;
|
|
__u32 reserved;
|
|
struct in6_addr addr[0];
|
|
|
|
#define rt0_type rt_hdr.type
|
|
};
|
|
|
|
struct ipv6_auth_hdr {
|
|
__u8 nexthdr;
|
|
__u8 hdrlen; /* This one is measured in 32 bit units! */
|
|
__u16 reserved;
|
|
__u32 spi;
|
|
__u32 seq_no; /* Sequence number */
|
|
__u8 auth_data[0]; /* Length variable but >=4. Mind the 64 bit alignment! */
|
|
};
|
|
|
|
struct ipv6_esp_hdr {
|
|
__u32 spi;
|
|
__u32 seq_no; /* Sequence number */
|
|
__u8 enc_data[0]; /* Length variable but >=8. Mind the 64 bit alignment! */
|
|
};
|
|
|
|
struct ipv6_comp_hdr {
|
|
__u8 nexthdr;
|
|
__u8 flags;
|
|
__u16 cpi;
|
|
};
|
|
|
|
/*
|
|
* IPv6 fixed header
|
|
*
|
|
* BEWARE, it is incorrect. The first 4 bits of flow_lbl
|
|
* are glued to priority now, forming "class".
|
|
*/
|
|
|
|
struct ipv6hdr {
|
|
#if defined(__LITTLE_ENDIAN_BITFIELD)
|
|
__u8 priority:4,
|
|
version:4;
|
|
#elif defined(__BIG_ENDIAN_BITFIELD)
|
|
__u8 version:4,
|
|
priority:4;
|
|
#else
|
|
#error "Please fix <asm/byteorder.h>"
|
|
#endif
|
|
__u8 flow_lbl[3];
|
|
|
|
__u16 payload_len;
|
|
__u8 nexthdr;
|
|
__u8 hop_limit;
|
|
|
|
struct in6_addr saddr;
|
|
struct in6_addr daddr;
|
|
};
|
|
|
|
/*
|
|
* This structure contains configuration options per IPv6 link.
|
|
*/
|
|
struct ipv6_devconf {
|
|
__s32 forwarding;
|
|
__s32 hop_limit;
|
|
__s32 mtu6;
|
|
__s32 accept_ra;
|
|
__s32 accept_redirects;
|
|
__s32 autoconf;
|
|
__s32 dad_transmits;
|
|
__s32 rtr_solicits;
|
|
__s32 rtr_solicit_interval;
|
|
__s32 rtr_solicit_delay;
|
|
__s32 force_mld_version;
|
|
#ifdef CONFIG_IPV6_PRIVACY
|
|
__s32 use_tempaddr;
|
|
__s32 temp_valid_lft;
|
|
__s32 temp_prefered_lft;
|
|
__s32 regen_max_retry;
|
|
__s32 max_desync_factor;
|
|
#endif
|
|
__s32 max_addresses;
|
|
void *sysctl;
|
|
};
|
|
|
|
/* index values for the variables in ipv6_devconf */
|
|
enum {
|
|
DEVCONF_FORWARDING = 0,
|
|
DEVCONF_HOPLIMIT,
|
|
DEVCONF_MTU6,
|
|
DEVCONF_ACCEPT_RA,
|
|
DEVCONF_ACCEPT_REDIRECTS,
|
|
DEVCONF_AUTOCONF,
|
|
DEVCONF_DAD_TRANSMITS,
|
|
DEVCONF_RTR_SOLICITS,
|
|
DEVCONF_RTR_SOLICIT_INTERVAL,
|
|
DEVCONF_RTR_SOLICIT_DELAY,
|
|
DEVCONF_USE_TEMPADDR,
|
|
DEVCONF_TEMP_VALID_LFT,
|
|
DEVCONF_TEMP_PREFERED_LFT,
|
|
DEVCONF_REGEN_MAX_RETRY,
|
|
DEVCONF_MAX_DESYNC_FACTOR,
|
|
DEVCONF_MAX_ADDRESSES,
|
|
DEVCONF_FORCE_MLD_VERSION,
|
|
DEVCONF_MAX
|
|
};
|
|
|
|
#ifdef __KERNEL__
|
|
#include <linux/in6.h> /* struct sockaddr_in6 */
|
|
#include <linux/icmpv6.h>
|
|
#include <net/if_inet6.h> /* struct ipv6_mc_socklist */
|
|
#include <linux/tcp.h>
|
|
#include <linux/udp.h>
|
|
|
|
/*
|
|
This structure contains results of exthdrs parsing
|
|
as offsets from skb->nh.
|
|
*/
|
|
|
|
struct inet6_skb_parm {
|
|
int iif;
|
|
__u16 ra;
|
|
__u16 hop;
|
|
__u16 dst0;
|
|
__u16 srcrt;
|
|
__u16 dst1;
|
|
__u16 lastopt;
|
|
};
|
|
|
|
#define IP6CB(skb) ((struct inet6_skb_parm*)((skb)->cb))
|
|
|
|
static inline int inet6_iif(const struct sk_buff *skb)
|
|
{
|
|
return IP6CB(skb)->iif;
|
|
}
|
|
|
|
struct tcp6_request_sock {
|
|
struct tcp_request_sock req;
|
|
struct in6_addr loc_addr;
|
|
struct in6_addr rmt_addr;
|
|
struct sk_buff *pktopts;
|
|
int iif;
|
|
};
|
|
|
|
static inline struct tcp6_request_sock *tcp6_rsk(const struct request_sock *sk)
|
|
{
|
|
return (struct tcp6_request_sock *)sk;
|
|
}
|
|
|
|
/**
|
|
* struct ipv6_pinfo - ipv6 private area
|
|
*
|
|
* In the struct sock hierarchy (tcp6_sock, upd6_sock, etc)
|
|
* this _must_ be the last member, so that inet6_sk_generic
|
|
* is able to calculate its offset from the base struct sock
|
|
* by using the struct proto->slab_obj_size member. -acme
|
|
*/
|
|
struct ipv6_pinfo {
|
|
struct in6_addr saddr;
|
|
struct in6_addr rcv_saddr;
|
|
struct in6_addr daddr;
|
|
struct in6_addr *daddr_cache;
|
|
|
|
__u32 flow_label;
|
|
__u32 frag_size;
|
|
__s16 hop_limit;
|
|
__s16 mcast_hops;
|
|
int mcast_oif;
|
|
|
|
/* pktoption flags */
|
|
union {
|
|
struct {
|
|
__u16 srcrt:2,
|
|
osrcrt:2,
|
|
rxinfo:1,
|
|
rxoinfo:1,
|
|
rxhlim:1,
|
|
rxohlim:1,
|
|
hopopts:1,
|
|
ohopopts:1,
|
|
dstopts:1,
|
|
odstopts:1,
|
|
rxflow:1,
|
|
rxtclass:1;
|
|
} bits;
|
|
__u16 all;
|
|
} rxopt;
|
|
|
|
/* sockopt flags */
|
|
__u8 mc_loop:1,
|
|
recverr:1,
|
|
sndflow:1,
|
|
pmtudisc:2,
|
|
ipv6only:1;
|
|
__u8 tclass;
|
|
|
|
__u32 dst_cookie;
|
|
|
|
struct ipv6_mc_socklist *ipv6_mc_list;
|
|
struct ipv6_ac_socklist *ipv6_ac_list;
|
|
struct ipv6_fl_socklist *ipv6_fl_list;
|
|
|
|
struct ipv6_txoptions *opt;
|
|
struct sk_buff *pktoptions;
|
|
struct {
|
|
struct ipv6_txoptions *opt;
|
|
struct rt6_info *rt;
|
|
int hop_limit;
|
|
int tclass;
|
|
} cork;
|
|
};
|
|
|
|
/* WARNING: don't change the layout of the members in {raw,udp,tcp}6_sock! */
|
|
struct raw6_sock {
|
|
/* inet_sock has to be the first member of raw6_sock */
|
|
struct inet_sock inet;
|
|
__u32 checksum; /* perform checksum */
|
|
__u32 offset; /* checksum offset */
|
|
struct icmp6_filter filter;
|
|
/* ipv6_pinfo has to be the last member of raw6_sock, see inet6_sk_generic */
|
|
struct ipv6_pinfo inet6;
|
|
};
|
|
|
|
struct udp6_sock {
|
|
struct udp_sock udp;
|
|
/* ipv6_pinfo has to be the last member of udp6_sock, see inet6_sk_generic */
|
|
struct ipv6_pinfo inet6;
|
|
};
|
|
|
|
struct tcp6_sock {
|
|
struct tcp_sock tcp;
|
|
/* ipv6_pinfo has to be the last member of tcp6_sock, see inet6_sk_generic */
|
|
struct ipv6_pinfo inet6;
|
|
};
|
|
|
|
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
|
|
static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk)
|
|
{
|
|
return inet_sk(__sk)->pinet6;
|
|
}
|
|
|
|
static inline struct raw6_sock *raw6_sk(const struct sock *sk)
|
|
{
|
|
return (struct raw6_sock *)sk;
|
|
}
|
|
|
|
static inline void inet_sk_copy_descendant(struct sock *sk_to,
|
|
const struct sock *sk_from)
|
|
{
|
|
int ancestor_size = sizeof(struct inet_sock);
|
|
|
|
if (sk_from->sk_family == PF_INET6)
|
|
ancestor_size += sizeof(struct ipv6_pinfo);
|
|
|
|
__inet_sk_copy_descendant(sk_to, sk_from, ancestor_size);
|
|
}
|
|
|
|
#define __ipv6_only_sock(sk) (inet6_sk(sk)->ipv6only)
|
|
#define ipv6_only_sock(sk) ((sk)->sk_family == PF_INET6 && __ipv6_only_sock(sk))
|
|
|
|
#include <linux/tcp.h>
|
|
|
|
struct tcp6_timewait_sock {
|
|
struct tcp_timewait_sock tw_v6_sk;
|
|
struct in6_addr tw_v6_daddr;
|
|
struct in6_addr tw_v6_rcv_saddr;
|
|
};
|
|
|
|
static inline struct tcp6_timewait_sock *tcp6_twsk(const struct sock *sk)
|
|
{
|
|
return (struct tcp6_timewait_sock *)sk;
|
|
}
|
|
|
|
static inline struct in6_addr *__tcp_v6_rcv_saddr(const struct sock *sk)
|
|
{
|
|
return likely(sk->sk_state != TCP_TIME_WAIT) ?
|
|
&inet6_sk(sk)->rcv_saddr : &tcp6_twsk(sk)->tw_v6_rcv_saddr;
|
|
}
|
|
|
|
static inline struct in6_addr *tcp_v6_rcv_saddr(const struct sock *sk)
|
|
{
|
|
return sk->sk_family == AF_INET6 ? __tcp_v6_rcv_saddr(sk) : NULL;
|
|
}
|
|
|
|
static inline int inet_v6_ipv6only(const struct sock *sk)
|
|
{
|
|
return likely(sk->sk_state != TCP_TIME_WAIT) ?
|
|
ipv6_only_sock(sk) : inet_twsk(sk)->tw_ipv6only;
|
|
}
|
|
#else
|
|
#define __ipv6_only_sock(sk) 0
|
|
#define ipv6_only_sock(sk) 0
|
|
|
|
static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk)
|
|
{
|
|
return NULL;
|
|
}
|
|
|
|
static inline struct raw6_sock *raw6_sk(const struct sock *sk)
|
|
{
|
|
return NULL;
|
|
}
|
|
|
|
#define __tcp_v6_rcv_saddr(__sk) NULL
|
|
#define tcp_v6_rcv_saddr(__sk) NULL
|
|
#define tcp_twsk_ipv6only(__sk) 0
|
|
#define inet_v6_ipv6only(__sk) 0
|
|
#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */
|
|
|
|
#define INET6_MATCH(__sk, __hash, __saddr, __daddr, __ports, __dif)\
|
|
(((__sk)->sk_hash == (__hash)) && \
|
|
((*((__u32 *)&(inet_sk(__sk)->dport))) == (__ports)) && \
|
|
((__sk)->sk_family == AF_INET6) && \
|
|
ipv6_addr_equal(&inet6_sk(__sk)->daddr, (__saddr)) && \
|
|
ipv6_addr_equal(&inet6_sk(__sk)->rcv_saddr, (__daddr)) && \
|
|
(!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
|
|
|
|
#endif /* __KERNEL__ */
|
|
|
|
#endif /* _IPV6_H */
|