From 9846ada138accc63994b57ebdfa76e3e137729e2 Mon Sep 17 00:00:00 2001 From: Shan Wei Date: Tue, 8 Mar 2011 15:37:27 +0100 Subject: [PATCH 01/37] netfilter: ipset: fix the compile warning in ip_set_create MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit net/netfilter/ipset/ip_set_core.c:615: warning: ‘clash’ may be used uninitialized in this function Signed-off-by: Shan Wei Signed-off-by: Jozsef Kadlecsik Signed-off-by: Patrick McHardy --- net/netfilter/ipset/ip_set_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 8b1a54c1e400..618a615acc9d 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -612,7 +612,7 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const attr[]) { - struct ip_set *set, *clash; + struct ip_set *set, *clash = NULL; ip_set_id_t index = IPSET_INVALID_ID; struct nlattr *tb[IPSET_ATTR_CREATE_MAX+1] = {}; const char *name, *typename; From adb00ae2ea0ec65f9d3d06079950c0f0ade3b614 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 9 Mar 2011 14:14:26 +0100 Subject: [PATCH 02/37] netfilter: x_tables: misuse of try_then_request_module Since xt_find_match() returns ERR_PTR(xx) on error not NULL, the macro try_then_request_module won't work correctly here. The macro expects its first argument will be zero if condition fails. But ERR_PTR(-ENOENT) is not zero. The correct solution is to propagate the error value back. Found by inspection, and compile tested only. Signed-off-by: Stephen Hemminger Signed-off-by: Patrick McHardy --- net/netfilter/x_tables.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 0a77d2ff2154..271eed32a6a1 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -183,7 +183,7 @@ EXPORT_SYMBOL(xt_unregister_matches); /* * These are weird, but module loading must not be done with mutex * held (since they will register), and we have to have a single - * function to use try_then_request_module(). + * function to use. */ /* Find match, grabs ref. Returns ERR_PTR() on error. */ @@ -221,9 +221,13 @@ xt_request_find_match(uint8_t nfproto, const char *name, uint8_t revision) { struct xt_match *match; - match = try_then_request_module(xt_find_match(nfproto, name, revision), - "%st_%s", xt_prefix[nfproto], name); - return (match != NULL) ? match : ERR_PTR(-ENOENT); + match = xt_find_match(nfproto, name, revision); + if (IS_ERR(match)) { + request_module("%st_%s", xt_prefix[nfproto], name); + match = xt_find_match(nfproto, name, revision); + } + + return match; } EXPORT_SYMBOL_GPL(xt_request_find_match); @@ -261,9 +265,13 @@ struct xt_target *xt_request_find_target(u8 af, const char *name, u8 revision) { struct xt_target *target; - target = try_then_request_module(xt_find_target(af, name, revision), - "%st_%s", xt_prefix[af], name); - return (target != NULL) ? target : ERR_PTR(-ENOENT); + target = xt_find_target(af, name, revision); + if (IS_ERR(target)) { + request_module("%st_%s", xt_prefix[af], name); + target = xt_find_target(af, name, revision); + } + + return target; } EXPORT_SYMBOL_GPL(xt_request_find_target); From 42046e2e45c109ba703993c510401a11f716c8df Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 14 Mar 2011 19:11:44 +0100 Subject: [PATCH 03/37] netfilter: x_tables: return -ENOENT for non-existant matches/targets As Stephen correctly points out, we need to return -ENOENT in xt_find_match()/xt_find_target() after the patch "netfilter: x_tables: misuse of try_then_request_module" in order to properly indicate a non-existant module to the caller. Signed-off-by: Patrick McHardy --- net/netfilter/x_tables.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 271eed32a6a1..a9adf4c6b299 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -190,7 +190,7 @@ EXPORT_SYMBOL(xt_unregister_matches); struct xt_match *xt_find_match(u8 af, const char *name, u8 revision) { struct xt_match *m; - int err = 0; + int err = -ENOENT; if (mutex_lock_interruptible(&xt[af].mutex) != 0) return ERR_PTR(-EINTR); @@ -235,7 +235,7 @@ EXPORT_SYMBOL_GPL(xt_request_find_match); struct xt_target *xt_find_target(u8 af, const char *name, u8 revision) { struct xt_target *t; - int err = 0; + int err = -ENOENT; if (mutex_lock_interruptible(&xt[af].mutex) != 0) return ERR_PTR(-EINTR); From fe8f661f2c2bb058822f13f6f232e121bde1338f Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 14 Mar 2011 19:20:44 +0100 Subject: [PATCH 04/37] netfilter: nf_conntrack: fix sysctl memory leak Message in log because sysctl table was not empty at netns exit WARNING: at net/sysctl_net.c:84 sysctl_net_exit+0x2a/0x2c() Instrumenting showed that the nf_conntrack_timestamp was the entry that was being created but not cleared. Signed-off-by: Stephen Hemminger Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 1909311c392a..118123606a21 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1294,6 +1294,7 @@ static void nf_conntrack_cleanup_net(struct net *net) nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size); nf_conntrack_ecache_fini(net); + nf_conntrack_tstamp_fini(net); nf_conntrack_acct_fini(net); nf_conntrack_expect_fini(net); kmem_cache_destroy(net->ct.nf_conntrack_cachep); From 097fc76a0805bdca17baf12cad9d3bcb215716a9 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Fri, 4 Mar 2011 12:26:17 +0200 Subject: [PATCH 05/37] ipvs: avoid lookup for fwmark 0 Restore the previous behaviour to lookup for fwmark service only when fwmark is non-null. This saves only CPU. Signed-off-by: Julian Anastasov Signed-off-by: Hans Schillstrom Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_ctl.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index c73b0c831a2d..f0369d665088 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -411,9 +411,11 @@ ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol, /* * Check the table hashed by fwmark first */ - svc = __ip_vs_svc_fwm_find(net, af, fwmark); - if (fwmark && svc) - goto out; + if (fwmark) { + svc = __ip_vs_svc_fwm_find(net, af, fwmark); + if (svc) + goto out; + } /* * Check the table hashed by From 4a569c0c0f833adace1e3aadaa38780ec2fcdf9e Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Fri, 4 Mar 2011 12:28:20 +0200 Subject: [PATCH 06/37] ipvs: remove _bh from percpu stats reading ip_vs_read_cpu_stats is called only from timer, so no need for _bh locks. Signed-off-by: Julian Anastasov Signed-off-by: Hans Schillstrom Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_est.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index f560a05c965a..88bd71647bf5 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -69,10 +69,10 @@ static void ip_vs_read_cpu_stats(struct ip_vs_stats_user *sum, sum->inpkts += s->ustats.inpkts; sum->outpkts += s->ustats.outpkts; do { - start = u64_stats_fetch_begin_bh(&s->syncp); + start = u64_stats_fetch_begin(&s->syncp); inbytes = s->ustats.inbytes; outbytes = s->ustats.outbytes; - } while (u64_stats_fetch_retry_bh(&s->syncp, start)); + } while (u64_stats_fetch_retry(&s->syncp, start)); sum->inbytes += inbytes; sum->outbytes += outbytes; } else { @@ -80,10 +80,10 @@ static void ip_vs_read_cpu_stats(struct ip_vs_stats_user *sum, sum->inpkts = s->ustats.inpkts; sum->outpkts = s->ustats.outpkts; do { - start = u64_stats_fetch_begin_bh(&s->syncp); + start = u64_stats_fetch_begin(&s->syncp); sum->inbytes = s->ustats.inbytes; sum->outbytes = s->ustats.outbytes; - } while (u64_stats_fetch_retry_bh(&s->syncp, start)); + } while (u64_stats_fetch_retry(&s->syncp, start)); } } } From 6060c74a3de8ed142c78133e2829e74711f77387 Mon Sep 17 00:00:00 2001 From: Shan Wei Date: Mon, 7 Mar 2011 10:11:34 +0800 Subject: [PATCH 07/37] netfilter:ipvs: use kmemdup The semantic patch that makes this output is available in scripts/coccinelle/api/memdup.cocci. More information about semantic patching is available at http://coccinelle.lip6.fr/ Signed-off-by: Shan Wei Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_pe_sip.c | 9 ++++----- net/netfilter/ipvs/ip_vs_sync.c | 3 +-- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c index 0d83bc01fed4..13d607ae9c52 100644 --- a/net/netfilter/ipvs/ip_vs_pe_sip.c +++ b/net/netfilter/ipvs/ip_vs_pe_sip.c @@ -92,14 +92,13 @@ ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb) if (get_callid(dptr, dataoff, datalen, &matchoff, &matchlen)) return -EINVAL; - p->pe_data = kmalloc(matchlen, GFP_ATOMIC); - if (!p->pe_data) - return -ENOMEM; - /* N.B: pe_data is only set on success, * this allows fallback to the default persistence logic on failure */ - memcpy(p->pe_data, dptr + matchoff, matchlen); + p->pe_data = kmemdup(dptr + matchoff, matchlen, GFP_ATOMIC); + if (!p->pe_data) + return -ENOMEM; + p->pe_data_len = matchlen; return 0; diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index fecf24de4af3..c5d13b05275a 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -697,13 +697,12 @@ ip_vs_conn_fill_param_sync(struct net *net, int af, union ip_vs_sync_conn *sc, return 1; } - p->pe_data = kmalloc(pe_data_len, GFP_ATOMIC); + p->pe_data = kmemdup(pe_data, pe_data_len, GFP_ATOMIC); if (!p->pe_data) { if (p->pe->module) module_put(p->pe->module); return -ENOMEM; } - memcpy(p->pe_data, pe_data, pe_data_len); p->pe_data_len = pe_data_len; } return 0; From 06b69390a652bfe4fa7e18e27c938e75ffe86ba0 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Wed, 9 Mar 2011 22:55:05 +0100 Subject: [PATCH 08/37] IPVS: Fix variable assignment in ip_vs_notrack There's no sense to 'ct = ct = ' in ip_vs_notrack(). Just assign nf_ct_get()'s return value directly to the pointer variable 'ct' once. Signed-off-by: Jesper Juhl Signed-off-by: Simon Horman --- include/net/ip_vs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index e74da41ebd1b..1dcb75da313d 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -1190,7 +1190,7 @@ static inline void ip_vs_notrack(struct sk_buff *skb) { #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) enum ip_conntrack_info ctinfo; - struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); if (!ct || !nf_ct_is_untracked(ct)) { nf_reset(skb); From 2553d064ff4bf999f369c8c3dfacaa797dbef1d9 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Fri, 4 Mar 2011 12:18:07 +0200 Subject: [PATCH 09/37] ipvs: move struct netns_ipvs Remove include/net/netns/ip_vs.h because it depends on structures from include/net/ip_vs.h. As ipvs is pointer in struct net it is better to move struct netns_ipvs into include/net/ip_vs.h, so that we can easily use other structures in struct netns_ipvs. Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 122 ++++++++++++++++++++++++++++++ include/net/net_namespace.h | 2 +- include/net/netns/ip_vs.h | 143 ------------------------------------ 3 files changed, 123 insertions(+), 144 deletions(-) delete mode 100644 include/net/netns/ip_vs.h diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 1dcb75da313d..091ca1f76e0e 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -803,6 +803,128 @@ struct ip_vs_app { void (*timeout_change)(struct ip_vs_app *app, int flags); }; +/* IPVS in network namespace */ +struct netns_ipvs { + int gen; /* Generation */ + /* + * Hash table: for real service lookups + */ + #define IP_VS_RTAB_BITS 4 + #define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS) + #define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1) + + struct list_head rs_table[IP_VS_RTAB_SIZE]; + /* ip_vs_app */ + struct list_head app_list; + struct mutex app_mutex; + struct lock_class_key app_key; /* mutex debuging */ + + /* ip_vs_proto */ + #define IP_VS_PROTO_TAB_SIZE 32 /* must be power of 2 */ + struct ip_vs_proto_data *proto_data_table[IP_VS_PROTO_TAB_SIZE]; + /* ip_vs_proto_tcp */ +#ifdef CONFIG_IP_VS_PROTO_TCP + #define TCP_APP_TAB_BITS 4 + #define TCP_APP_TAB_SIZE (1 << TCP_APP_TAB_BITS) + #define TCP_APP_TAB_MASK (TCP_APP_TAB_SIZE - 1) + struct list_head tcp_apps[TCP_APP_TAB_SIZE]; + spinlock_t tcp_app_lock; +#endif + /* ip_vs_proto_udp */ +#ifdef CONFIG_IP_VS_PROTO_UDP + #define UDP_APP_TAB_BITS 4 + #define UDP_APP_TAB_SIZE (1 << UDP_APP_TAB_BITS) + #define UDP_APP_TAB_MASK (UDP_APP_TAB_SIZE - 1) + struct list_head udp_apps[UDP_APP_TAB_SIZE]; + spinlock_t udp_app_lock; +#endif + /* ip_vs_proto_sctp */ +#ifdef CONFIG_IP_VS_PROTO_SCTP + #define SCTP_APP_TAB_BITS 4 + #define SCTP_APP_TAB_SIZE (1 << SCTP_APP_TAB_BITS) + #define SCTP_APP_TAB_MASK (SCTP_APP_TAB_SIZE - 1) + /* Hash table for SCTP application incarnations */ + struct list_head sctp_apps[SCTP_APP_TAB_SIZE]; + spinlock_t sctp_app_lock; +#endif + /* ip_vs_conn */ + atomic_t conn_count; /* connection counter */ + + /* ip_vs_ctl */ + struct ip_vs_stats *tot_stats; /* Statistics & est. */ + struct ip_vs_cpu_stats __percpu *cpustats; /* Stats per cpu */ + seqcount_t *ustats_seq; /* u64 read retry */ + + int num_services; /* no of virtual services */ + /* 1/rate drop and drop-entry variables */ + struct delayed_work defense_work; /* Work handler */ + int drop_rate; + int drop_counter; + atomic_t dropentry; + /* locks in ctl.c */ + spinlock_t dropentry_lock; /* drop entry handling */ + spinlock_t droppacket_lock; /* drop packet handling */ + spinlock_t securetcp_lock; /* state and timeout tables */ + rwlock_t rs_lock; /* real services table */ + /* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */ + struct lock_class_key ctl_key; /* ctl_mutex debuging */ + /* Trash for destinations */ + struct list_head dest_trash; + /* Service counters */ + atomic_t ftpsvc_counter; + atomic_t nullsvc_counter; + + /* sys-ctl struct */ + struct ctl_table_header *sysctl_hdr; + struct ctl_table *sysctl_tbl; + /* sysctl variables */ + int sysctl_amemthresh; + int sysctl_am_droprate; + int sysctl_drop_entry; + int sysctl_drop_packet; + int sysctl_secure_tcp; +#ifdef CONFIG_IP_VS_NFCT + int sysctl_conntrack; +#endif + int sysctl_snat_reroute; + int sysctl_sync_ver; + int sysctl_cache_bypass; + int sysctl_expire_nodest_conn; + int sysctl_expire_quiescent_template; + int sysctl_sync_threshold[2]; + int sysctl_nat_icmp_send; + + /* ip_vs_lblc */ + int sysctl_lblc_expiration; + struct ctl_table_header *lblc_ctl_header; + struct ctl_table *lblc_ctl_table; + /* ip_vs_lblcr */ + int sysctl_lblcr_expiration; + struct ctl_table_header *lblcr_ctl_header; + struct ctl_table *lblcr_ctl_table; + /* ip_vs_est */ + struct list_head est_list; /* estimator list */ + spinlock_t est_lock; + struct timer_list est_timer; /* Estimation timer */ + /* ip_vs_sync */ + struct list_head sync_queue; + spinlock_t sync_lock; + struct ip_vs_sync_buff *sync_buff; + spinlock_t sync_buff_lock; + struct sockaddr_in sync_mcast_addr; + struct task_struct *master_thread; + struct task_struct *backup_thread; + int send_mesg_maxlen; + int recv_mesg_maxlen; + volatile int sync_state; + volatile int master_syncid; + volatile int backup_syncid; + /* multicast interface name */ + char master_mcast_ifn[IP_VS_IFNAME_MAXLEN]; + char backup_mcast_ifn[IP_VS_IFNAME_MAXLEN]; + /* net name space ptr */ + struct net *net; /* Needed by timer routines */ +}; /* * IPVS core functions diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index b3b4a34cb2cc..3ae491932bc8 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -20,7 +20,6 @@ #include #endif #include -#include struct proc_dir_entry; struct net_device; @@ -28,6 +27,7 @@ struct sock; struct ctl_table_header; struct net_generic; struct sock; +struct netns_ipvs; #define NETDEV_HASHBITS 8 diff --git a/include/net/netns/ip_vs.h b/include/net/netns/ip_vs.h deleted file mode 100644 index 259ebac904bf..000000000000 --- a/include/net/netns/ip_vs.h +++ /dev/null @@ -1,143 +0,0 @@ -/* - * IP Virtual Server - * Data structure for network namspace - * - */ - -#ifndef IP_VS_H_ -#define IP_VS_H_ - -#include -#include -#include -#include -#include -#include - -struct ip_vs_stats; -struct ip_vs_sync_buff; -struct ctl_table_header; - -struct netns_ipvs { - int gen; /* Generation */ - /* - * Hash table: for real service lookups - */ - #define IP_VS_RTAB_BITS 4 - #define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS) - #define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1) - - struct list_head rs_table[IP_VS_RTAB_SIZE]; - /* ip_vs_app */ - struct list_head app_list; - struct mutex app_mutex; - struct lock_class_key app_key; /* mutex debuging */ - - /* ip_vs_proto */ - #define IP_VS_PROTO_TAB_SIZE 32 /* must be power of 2 */ - struct ip_vs_proto_data *proto_data_table[IP_VS_PROTO_TAB_SIZE]; - /* ip_vs_proto_tcp */ -#ifdef CONFIG_IP_VS_PROTO_TCP - #define TCP_APP_TAB_BITS 4 - #define TCP_APP_TAB_SIZE (1 << TCP_APP_TAB_BITS) - #define TCP_APP_TAB_MASK (TCP_APP_TAB_SIZE - 1) - struct list_head tcp_apps[TCP_APP_TAB_SIZE]; - spinlock_t tcp_app_lock; -#endif - /* ip_vs_proto_udp */ -#ifdef CONFIG_IP_VS_PROTO_UDP - #define UDP_APP_TAB_BITS 4 - #define UDP_APP_TAB_SIZE (1 << UDP_APP_TAB_BITS) - #define UDP_APP_TAB_MASK (UDP_APP_TAB_SIZE - 1) - struct list_head udp_apps[UDP_APP_TAB_SIZE]; - spinlock_t udp_app_lock; -#endif - /* ip_vs_proto_sctp */ -#ifdef CONFIG_IP_VS_PROTO_SCTP - #define SCTP_APP_TAB_BITS 4 - #define SCTP_APP_TAB_SIZE (1 << SCTP_APP_TAB_BITS) - #define SCTP_APP_TAB_MASK (SCTP_APP_TAB_SIZE - 1) - /* Hash table for SCTP application incarnations */ - struct list_head sctp_apps[SCTP_APP_TAB_SIZE]; - spinlock_t sctp_app_lock; -#endif - /* ip_vs_conn */ - atomic_t conn_count; /* connection counter */ - - /* ip_vs_ctl */ - struct ip_vs_stats *tot_stats; /* Statistics & est. */ - struct ip_vs_cpu_stats __percpu *cpustats; /* Stats per cpu */ - seqcount_t *ustats_seq; /* u64 read retry */ - - int num_services; /* no of virtual services */ - /* 1/rate drop and drop-entry variables */ - struct delayed_work defense_work; /* Work handler */ - int drop_rate; - int drop_counter; - atomic_t dropentry; - /* locks in ctl.c */ - spinlock_t dropentry_lock; /* drop entry handling */ - spinlock_t droppacket_lock; /* drop packet handling */ - spinlock_t securetcp_lock; /* state and timeout tables */ - rwlock_t rs_lock; /* real services table */ - /* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */ - struct lock_class_key ctl_key; /* ctl_mutex debuging */ - /* Trash for destinations */ - struct list_head dest_trash; - /* Service counters */ - atomic_t ftpsvc_counter; - atomic_t nullsvc_counter; - - /* sys-ctl struct */ - struct ctl_table_header *sysctl_hdr; - struct ctl_table *sysctl_tbl; - /* sysctl variables */ - int sysctl_amemthresh; - int sysctl_am_droprate; - int sysctl_drop_entry; - int sysctl_drop_packet; - int sysctl_secure_tcp; -#ifdef CONFIG_IP_VS_NFCT - int sysctl_conntrack; -#endif - int sysctl_snat_reroute; - int sysctl_sync_ver; - int sysctl_cache_bypass; - int sysctl_expire_nodest_conn; - int sysctl_expire_quiescent_template; - int sysctl_sync_threshold[2]; - int sysctl_nat_icmp_send; - - /* ip_vs_lblc */ - int sysctl_lblc_expiration; - struct ctl_table_header *lblc_ctl_header; - struct ctl_table *lblc_ctl_table; - /* ip_vs_lblcr */ - int sysctl_lblcr_expiration; - struct ctl_table_header *lblcr_ctl_header; - struct ctl_table *lblcr_ctl_table; - /* ip_vs_est */ - struct list_head est_list; /* estimator list */ - spinlock_t est_lock; - struct timer_list est_timer; /* Estimation timer */ - /* ip_vs_sync */ - struct list_head sync_queue; - spinlock_t sync_lock; - struct ip_vs_sync_buff *sync_buff; - spinlock_t sync_buff_lock; - struct sockaddr_in sync_mcast_addr; - struct task_struct *master_thread; - struct task_struct *backup_thread; - int send_mesg_maxlen; - int recv_mesg_maxlen; - volatile int sync_state; - volatile int master_syncid; - volatile int backup_syncid; - /* multicast interface name */ - char master_mcast_ifn[IP_VS_IFNAME_MAXLEN]; - char backup_mcast_ifn[IP_VS_IFNAME_MAXLEN]; - /* net name space ptr */ - struct net *net; /* Needed by timer routines */ -}; - -#endif /* IP_VS_H_ */ From 2a0751af09c3099cf2837c623ca5d0436317d02d Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Fri, 4 Mar 2011 12:20:35 +0200 Subject: [PATCH 10/37] ipvs: reorganize tot_stats The global tot_stats contains cpustats field just like the stats for dest and svc, so better use it to simplify the usage in estimation_timer. As tot_stats is registered as estimator we can remove the special ip_vs_read_cpu_stats call for tot_stats. Fix ip_vs_read_cpu_stats to be called under stats lock because it is still used as synchronization between estimation timer and user context (the stats readers). Also, make sure ip_vs_stats_percpu_show reads properly the u64 stats from user context. Signed-off-by: Julian Anastasov Eric Dumazet Signed-off-by: Simon Horman --- include/net/ip_vs.h | 3 +-- net/netfilter/ipvs/ip_vs_core.c | 6 ++--- net/netfilter/ipvs/ip_vs_ctl.c | 45 ++++++++++++++++++--------------- net/netfilter/ipvs/ip_vs_est.c | 3 +-- 4 files changed, 29 insertions(+), 28 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 091ca1f76e0e..9db750d9082d 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -851,8 +851,7 @@ struct netns_ipvs { atomic_t conn_count; /* connection counter */ /* ip_vs_ctl */ - struct ip_vs_stats *tot_stats; /* Statistics & est. */ - struct ip_vs_cpu_stats __percpu *cpustats; /* Stats per cpu */ + struct ip_vs_stats tot_stats; /* Statistics & est. */ seqcount_t *ustats_seq; /* u64 read retry */ int num_services; /* no of virtual services */ diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 2d1f932add46..6f4940e86fc9 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -132,7 +132,7 @@ ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) s->ustats.inbytes += skb->len; u64_stats_update_end(&s->syncp); - s = this_cpu_ptr(ipvs->cpustats); + s = this_cpu_ptr(ipvs->tot_stats.cpustats); s->ustats.inpkts++; u64_stats_update_begin(&s->syncp); s->ustats.inbytes += skb->len; @@ -162,7 +162,7 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) s->ustats.outbytes += skb->len; u64_stats_update_end(&s->syncp); - s = this_cpu_ptr(ipvs->cpustats); + s = this_cpu_ptr(ipvs->tot_stats.cpustats); s->ustats.outpkts++; u64_stats_update_begin(&s->syncp); s->ustats.outbytes += skb->len; @@ -183,7 +183,7 @@ ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) s = this_cpu_ptr(svc->stats.cpustats); s->ustats.conns++; - s = this_cpu_ptr(ipvs->cpustats); + s = this_cpu_ptr(ipvs->tot_stats.cpustats); s->ustats.conns++; } diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index f0369d665088..a2a67ad7e094 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1481,7 +1481,7 @@ static int ip_vs_zero_all(struct net *net) } } - ip_vs_zero_stats(net_ipvs(net)->tot_stats); + ip_vs_zero_stats(&net_ipvs(net)->tot_stats); return 0; } @@ -1963,7 +1963,7 @@ static const struct file_operations ip_vs_info_fops = { static int ip_vs_stats_show(struct seq_file *seq, void *v) { struct net *net = seq_file_single_net(seq); - struct ip_vs_stats *tot_stats = net_ipvs(net)->tot_stats; + struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats; /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ seq_puts(seq, @@ -2007,7 +2007,8 @@ static const struct file_operations ip_vs_stats_fops = { static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) { struct net *net = seq_file_single_net(seq); - struct ip_vs_stats *tot_stats = net_ipvs(net)->tot_stats; + struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats; + struct ip_vs_cpu_stats *cpustats = tot_stats->cpustats; int i; /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ @@ -2017,11 +2018,20 @@ static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) "CPU Conns Packets Packets Bytes Bytes\n"); for_each_possible_cpu(i) { - struct ip_vs_cpu_stats *u = per_cpu_ptr(net->ipvs->cpustats, i); + struct ip_vs_cpu_stats *u = per_cpu_ptr(cpustats, i); + unsigned int start; + __u64 inbytes, outbytes; + + do { + start = u64_stats_fetch_begin_bh(&u->syncp); + inbytes = u->ustats.inbytes; + outbytes = u->ustats.outbytes; + } while (u64_stats_fetch_retry_bh(&u->syncp, start)); + seq_printf(seq, "%3X %8X %8X %8X %16LX %16LX\n", - i, u->ustats.conns, u->ustats.inpkts, - u->ustats.outpkts, (__u64)u->ustats.inbytes, - (__u64)u->ustats.outbytes); + i, u->ustats.conns, u->ustats.inpkts, + u->ustats.outpkts, (__u64)inbytes, + (__u64)outbytes); } spin_lock_bh(&tot_stats->lock); @@ -3505,17 +3515,12 @@ int __net_init __ip_vs_control_init(struct net *net) atomic_set(&ipvs->nullsvc_counter, 0); /* procfs stats */ - ipvs->tot_stats = kzalloc(sizeof(struct ip_vs_stats), GFP_KERNEL); - if (ipvs->tot_stats == NULL) { - pr_err("%s(): no memory.\n", __func__); - return -ENOMEM; - } - ipvs->cpustats = alloc_percpu(struct ip_vs_cpu_stats); - if (!ipvs->cpustats) { + ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); + if (!ipvs->tot_stats.cpustats) { pr_err("%s() alloc_percpu failed\n", __func__); goto err_alloc; } - spin_lock_init(&ipvs->tot_stats->lock); + spin_lock_init(&ipvs->tot_stats.lock); proc_net_fops_create(net, "ip_vs", 0, &ip_vs_info_fops); proc_net_fops_create(net, "ip_vs_stats", 0, &ip_vs_stats_fops); @@ -3563,7 +3568,7 @@ int __net_init __ip_vs_control_init(struct net *net) goto err_dup; } #endif - ip_vs_new_estimator(net, ipvs->tot_stats); + ip_vs_new_estimator(net, &ipvs->tot_stats); ipvs->sysctl_tbl = tbl; /* Schedule defense work */ INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler); @@ -3571,9 +3576,8 @@ int __net_init __ip_vs_control_init(struct net *net) return 0; err_dup: - free_percpu(ipvs->cpustats); + free_percpu(ipvs->tot_stats.cpustats); err_alloc: - kfree(ipvs->tot_stats); return -ENOMEM; } @@ -3582,7 +3586,7 @@ static void __net_exit __ip_vs_control_cleanup(struct net *net) struct netns_ipvs *ipvs = net_ipvs(net); ip_vs_trash_cleanup(net); - ip_vs_kill_estimator(net, ipvs->tot_stats); + ip_vs_kill_estimator(net, &ipvs->tot_stats); cancel_delayed_work_sync(&ipvs->defense_work); cancel_work_sync(&ipvs->defense_work.work); #ifdef CONFIG_SYSCTL @@ -3591,8 +3595,7 @@ static void __net_exit __ip_vs_control_cleanup(struct net *net) proc_net_remove(net, "ip_vs_stats_percpu"); proc_net_remove(net, "ip_vs_stats"); proc_net_remove(net, "ip_vs"); - free_percpu(ipvs->cpustats); - kfree(ipvs->tot_stats); + free_percpu(ipvs->tot_stats.cpustats); } static struct pernet_operations ipvs_control_ops = { diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index 88bd71647bf5..b3751cfede2c 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -101,13 +101,12 @@ static void estimation_timer(unsigned long arg) struct netns_ipvs *ipvs; ipvs = net_ipvs(net); - ip_vs_read_cpu_stats(&ipvs->tot_stats->ustats, ipvs->cpustats); spin_lock(&ipvs->est_lock); list_for_each_entry(e, &ipvs->est_list, list) { s = container_of(e, struct ip_vs_stats, est); - ip_vs_read_cpu_stats(&s->ustats, s->cpustats); spin_lock(&s->lock); + ip_vs_read_cpu_stats(&s->ustats, s->cpustats); n_conns = s->ustats.conns; n_inpkts = s->ustats.inpkts; n_outpkts = s->ustats.outpkts; From 55a3d4e15c7c953ecc55b96b83d2679abf8a7899 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Mon, 14 Mar 2011 01:37:49 +0200 Subject: [PATCH 11/37] ipvs: properly zero stats and rates Currently, the new percpu counters are not zeroed and the zero commands do not work as expected, we still show the old sum of percpu values. OTOH, we can not reset the percpu counters from user context without causing the incrementing to use old and bogus values. So, as Eric Dumazet suggested fix that by moving all overhead to stats reading in user context. Do not introduce overhead in timer context (estimator) and incrementing (packet handling in softirqs). The new ustats0 field holds the zero point for all counter values, the rates always use 0 as base value as before. When showing the values to user space just give the difference between counters and the base values. The only drawback is that percpu stats are not zeroed, they are accessible only from /proc and are new interface, so it should not be a compatibility problem as long as the sum stats are correct after zeroing. Signed-off-by: Julian Anastasov Acked-by: Eric Dumazet Signed-off-by: Simon Horman --- include/net/ip_vs.h | 1 + net/netfilter/ipvs/ip_vs_ctl.c | 96 +++++++++++++++++++++------------- net/netfilter/ipvs/ip_vs_est.c | 13 ++--- 3 files changed, 68 insertions(+), 42 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 9db750d9082d..06f5af4b626d 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -374,6 +374,7 @@ struct ip_vs_stats { struct ip_vs_estimator est; /* estimator */ struct ip_vs_cpu_stats *cpustats; /* per cpu counters */ spinlock_t lock; /* spin lock */ + struct ip_vs_stats_user ustats0; /* reset values */ }; /* diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index a2a67ad7e094..804fee7be694 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -711,13 +711,51 @@ static void ip_vs_trash_cleanup(struct net *net) } } +static void +ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src) +{ +#define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->ustats.c - src->ustats0.c +#define IP_VS_SHOW_STATS_RATE(r) dst->r = src->ustats.r + + spin_lock_bh(&src->lock); + + IP_VS_SHOW_STATS_COUNTER(conns); + IP_VS_SHOW_STATS_COUNTER(inpkts); + IP_VS_SHOW_STATS_COUNTER(outpkts); + IP_VS_SHOW_STATS_COUNTER(inbytes); + IP_VS_SHOW_STATS_COUNTER(outbytes); + + IP_VS_SHOW_STATS_RATE(cps); + IP_VS_SHOW_STATS_RATE(inpps); + IP_VS_SHOW_STATS_RATE(outpps); + IP_VS_SHOW_STATS_RATE(inbps); + IP_VS_SHOW_STATS_RATE(outbps); + + spin_unlock_bh(&src->lock); +} static void ip_vs_zero_stats(struct ip_vs_stats *stats) { spin_lock_bh(&stats->lock); - memset(&stats->ustats, 0, sizeof(stats->ustats)); + /* get current counters as zero point, rates are zeroed */ + +#define IP_VS_ZERO_STATS_COUNTER(c) stats->ustats0.c = stats->ustats.c +#define IP_VS_ZERO_STATS_RATE(r) stats->ustats.r = 0 + + IP_VS_ZERO_STATS_COUNTER(conns); + IP_VS_ZERO_STATS_COUNTER(inpkts); + IP_VS_ZERO_STATS_COUNTER(outpkts); + IP_VS_ZERO_STATS_COUNTER(inbytes); + IP_VS_ZERO_STATS_COUNTER(outbytes); + + IP_VS_ZERO_STATS_RATE(cps); + IP_VS_ZERO_STATS_RATE(inpps); + IP_VS_ZERO_STATS_RATE(outpps); + IP_VS_ZERO_STATS_RATE(inbps); + IP_VS_ZERO_STATS_RATE(outbps); + ip_vs_zero_estimator(stats); spin_unlock_bh(&stats->lock); @@ -1963,7 +2001,7 @@ static const struct file_operations ip_vs_info_fops = { static int ip_vs_stats_show(struct seq_file *seq, void *v) { struct net *net = seq_file_single_net(seq); - struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats; + struct ip_vs_stats_user show; /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ seq_puts(seq, @@ -1971,22 +2009,18 @@ static int ip_vs_stats_show(struct seq_file *seq, void *v) seq_printf(seq, " Conns Packets Packets Bytes Bytes\n"); - spin_lock_bh(&tot_stats->lock); - seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", tot_stats->ustats.conns, - tot_stats->ustats.inpkts, tot_stats->ustats.outpkts, - (unsigned long long) tot_stats->ustats.inbytes, - (unsigned long long) tot_stats->ustats.outbytes); + ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats); + seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", show.conns, + show.inpkts, show.outpkts, + (unsigned long long) show.inbytes, + (unsigned long long) show.outbytes); /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ seq_puts(seq, " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); - seq_printf(seq,"%8X %8X %8X %16X %16X\n", - tot_stats->ustats.cps, - tot_stats->ustats.inpps, - tot_stats->ustats.outpps, - tot_stats->ustats.inbps, - tot_stats->ustats.outbps); - spin_unlock_bh(&tot_stats->lock); + seq_printf(seq, "%8X %8X %8X %16X %16X\n", + show.cps, show.inpps, show.outpps, + show.inbps, show.outbps); return 0; } @@ -2297,14 +2331,6 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) } -static void -ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src) -{ - spin_lock_bh(&src->lock); - memcpy(dst, &src->ustats, sizeof(*dst)); - spin_unlock_bh(&src->lock); -} - static void ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) { @@ -2691,31 +2717,29 @@ static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = { static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type, struct ip_vs_stats *stats) { + struct ip_vs_stats_user ustats; struct nlattr *nl_stats = nla_nest_start(skb, container_type); if (!nl_stats) return -EMSGSIZE; - spin_lock_bh(&stats->lock); + ip_vs_copy_stats(&ustats, stats); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_CONNS, stats->ustats.conns); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPKTS, stats->ustats.inpkts); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPKTS, stats->ustats.outpkts); - NLA_PUT_U64(skb, IPVS_STATS_ATTR_INBYTES, stats->ustats.inbytes); - NLA_PUT_U64(skb, IPVS_STATS_ATTR_OUTBYTES, stats->ustats.outbytes); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_CPS, stats->ustats.cps); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPPS, stats->ustats.inpps); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPPS, stats->ustats.outpps); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_INBPS, stats->ustats.inbps); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTBPS, stats->ustats.outbps); - - spin_unlock_bh(&stats->lock); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_CONNS, ustats.conns); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPKTS, ustats.inpkts); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPKTS, ustats.outpkts); + NLA_PUT_U64(skb, IPVS_STATS_ATTR_INBYTES, ustats.inbytes); + NLA_PUT_U64(skb, IPVS_STATS_ATTR_OUTBYTES, ustats.outbytes); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_CPS, ustats.cps); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPPS, ustats.inpps); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPPS, ustats.outpps); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_INBPS, ustats.inbps); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTBPS, ustats.outbps); nla_nest_end(skb, nl_stats); return 0; nla_put_failure: - spin_unlock_bh(&stats->lock); nla_nest_cancel(skb, nl_stats); return -EMSGSIZE; } diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index b3751cfede2c..a85008796370 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -184,13 +184,14 @@ void ip_vs_kill_estimator(struct net *net, struct ip_vs_stats *stats) void ip_vs_zero_estimator(struct ip_vs_stats *stats) { struct ip_vs_estimator *est = &stats->est; + struct ip_vs_stats_user *u = &stats->ustats; - /* set counters zero, caller must hold the stats->lock lock */ - est->last_inbytes = 0; - est->last_outbytes = 0; - est->last_conns = 0; - est->last_inpkts = 0; - est->last_outpkts = 0; + /* reset counters, caller must hold the stats->lock lock */ + est->last_inbytes = u->inbytes; + est->last_outbytes = u->outbytes; + est->last_conns = u->conns; + est->last_inpkts = u->inpkts; + est->last_outpkts = u->outpkts; est->cps = 0; est->inpps = 0; est->outpps = 0; From 87d68a15e2d5a6bd08e59ec80c7a5073bcabb7c3 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Mon, 14 Mar 2011 01:39:18 +0200 Subject: [PATCH 12/37] ipvs: remove unused seqcount stats Remove ustats_seq, IPVS_STAT_INC and IPVS_STAT_ADD because they are not used. They were replaced with u64_stats. Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 06f5af4b626d..cf014abc23fd 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -377,22 +377,6 @@ struct ip_vs_stats { struct ip_vs_stats_user ustats0; /* reset values */ }; -/* - * Helper Macros for per cpu - * ipvs->tot_stats->ustats.count - */ -#define IPVS_STAT_INC(ipvs, count) \ - __this_cpu_inc((ipvs)->ustats->count) - -#define IPVS_STAT_ADD(ipvs, count, value) \ - do {\ - write_seqcount_begin(per_cpu_ptr((ipvs)->ustats_seq, \ - raw_smp_processor_id())); \ - __this_cpu_add((ipvs)->ustats->count, value); \ - write_seqcount_end(per_cpu_ptr((ipvs)->ustats_seq, \ - raw_smp_processor_id())); \ - } while (0) - struct dst_entry; struct iphdr; struct ip_vs_conn; @@ -853,7 +837,6 @@ struct netns_ipvs { /* ip_vs_ctl */ struct ip_vs_stats tot_stats; /* Statistics & est. */ - seqcount_t *ustats_seq; /* u64 read retry */ int num_services; /* no of virtual services */ /* 1/rate drop and drop-entry variables */ From ea9f22cce9c2530d659f9122819940b69506b2d9 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Mon, 14 Mar 2011 01:41:54 +0200 Subject: [PATCH 13/37] ipvs: optimize rates reading Move the estimator reading from estimation_timer to user context. ip_vs_read_estimator() will be used to decode the rate values. As the decoded rates are not set by estimation timer there is no need to reset them in ip_vs_zero_stats. There is no need ip_vs_new_estimator() to encode stats to rates, if the destination is in trash both the stats and the rates are inactive. Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 2 ++ net/netfilter/ipvs/ip_vs_ctl.c | 31 ++++++++++++------------------- net/netfilter/ipvs/ip_vs_est.c | 33 +++++++++++++-------------------- 3 files changed, 27 insertions(+), 39 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index cf014abc23fd..e4a39c4e263b 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -1179,6 +1179,8 @@ extern void ip_vs_estimator_cleanup(void); extern void ip_vs_new_estimator(struct net *net, struct ip_vs_stats *stats); extern void ip_vs_kill_estimator(struct net *net, struct ip_vs_stats *stats); extern void ip_vs_zero_estimator(struct ip_vs_stats *stats); +extern void ip_vs_read_estimator(struct ip_vs_stats_user *dst, + struct ip_vs_stats *stats); /* * Various IPVS packet transmitters (from ip_vs_xmit.c) diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 804fee7be694..c93d806e73bd 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -715,7 +715,6 @@ static void ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src) { #define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->ustats.c - src->ustats0.c -#define IP_VS_SHOW_STATS_RATE(r) dst->r = src->ustats.r spin_lock_bh(&src->lock); @@ -725,11 +724,7 @@ ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src) IP_VS_SHOW_STATS_COUNTER(inbytes); IP_VS_SHOW_STATS_COUNTER(outbytes); - IP_VS_SHOW_STATS_RATE(cps); - IP_VS_SHOW_STATS_RATE(inpps); - IP_VS_SHOW_STATS_RATE(outpps); - IP_VS_SHOW_STATS_RATE(inbps); - IP_VS_SHOW_STATS_RATE(outbps); + ip_vs_read_estimator(dst, src); spin_unlock_bh(&src->lock); } @@ -742,7 +737,6 @@ ip_vs_zero_stats(struct ip_vs_stats *stats) /* get current counters as zero point, rates are zeroed */ #define IP_VS_ZERO_STATS_COUNTER(c) stats->ustats0.c = stats->ustats.c -#define IP_VS_ZERO_STATS_RATE(r) stats->ustats.r = 0 IP_VS_ZERO_STATS_COUNTER(conns); IP_VS_ZERO_STATS_COUNTER(inpkts); @@ -750,12 +744,6 @@ ip_vs_zero_stats(struct ip_vs_stats *stats) IP_VS_ZERO_STATS_COUNTER(inbytes); IP_VS_ZERO_STATS_COUNTER(outbytes); - IP_VS_ZERO_STATS_RATE(cps); - IP_VS_ZERO_STATS_RATE(inpps); - IP_VS_ZERO_STATS_RATE(outpps); - IP_VS_ZERO_STATS_RATE(inbps); - IP_VS_ZERO_STATS_RATE(outbps); - ip_vs_zero_estimator(stats); spin_unlock_bh(&stats->lock); @@ -2043,6 +2031,7 @@ static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) struct net *net = seq_file_single_net(seq); struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats; struct ip_vs_cpu_stats *cpustats = tot_stats->cpustats; + struct ip_vs_stats_user rates; int i; /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ @@ -2069,22 +2058,26 @@ static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) } spin_lock_bh(&tot_stats->lock); + seq_printf(seq, " ~ %8X %8X %8X %16LX %16LX\n\n", tot_stats->ustats.conns, tot_stats->ustats.inpkts, tot_stats->ustats.outpkts, (unsigned long long) tot_stats->ustats.inbytes, (unsigned long long) tot_stats->ustats.outbytes); + ip_vs_read_estimator(&rates, tot_stats); + + spin_unlock_bh(&tot_stats->lock); + /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ seq_puts(seq, " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); seq_printf(seq, " %8X %8X %8X %16X %16X\n", - tot_stats->ustats.cps, - tot_stats->ustats.inpps, - tot_stats->ustats.outpps, - tot_stats->ustats.inbps, - tot_stats->ustats.outbps); - spin_unlock_bh(&tot_stats->lock); + rates.cps, + rates.inpps, + rates.outpps, + rates.inbps, + rates.outbps); return 0; } diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index a85008796370..fda75be231e8 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -117,27 +117,22 @@ static void estimation_timer(unsigned long arg) rate = (n_conns - e->last_conns) << 9; e->last_conns = n_conns; e->cps += ((long)rate - (long)e->cps) >> 2; - s->ustats.cps = (e->cps + 0x1FF) >> 10; rate = (n_inpkts - e->last_inpkts) << 9; e->last_inpkts = n_inpkts; e->inpps += ((long)rate - (long)e->inpps) >> 2; - s->ustats.inpps = (e->inpps + 0x1FF) >> 10; rate = (n_outpkts - e->last_outpkts) << 9; e->last_outpkts = n_outpkts; e->outpps += ((long)rate - (long)e->outpps) >> 2; - s->ustats.outpps = (e->outpps + 0x1FF) >> 10; rate = (n_inbytes - e->last_inbytes) << 4; e->last_inbytes = n_inbytes; e->inbps += ((long)rate - (long)e->inbps) >> 2; - s->ustats.inbps = (e->inbps + 0xF) >> 5; rate = (n_outbytes - e->last_outbytes) << 4; e->last_outbytes = n_outbytes; e->outbps += ((long)rate - (long)e->outbps) >> 2; - s->ustats.outbps = (e->outbps + 0xF) >> 5; spin_unlock(&s->lock); } spin_unlock(&ipvs->est_lock); @@ -151,21 +146,6 @@ void ip_vs_new_estimator(struct net *net, struct ip_vs_stats *stats) INIT_LIST_HEAD(&est->list); - est->last_conns = stats->ustats.conns; - est->cps = stats->ustats.cps<<10; - - est->last_inpkts = stats->ustats.inpkts; - est->inpps = stats->ustats.inpps<<10; - - est->last_outpkts = stats->ustats.outpkts; - est->outpps = stats->ustats.outpps<<10; - - est->last_inbytes = stats->ustats.inbytes; - est->inbps = stats->ustats.inbps<<5; - - est->last_outbytes = stats->ustats.outbytes; - est->outbps = stats->ustats.outbps<<5; - spin_lock_bh(&ipvs->est_lock); list_add(&est->list, &ipvs->est_list); spin_unlock_bh(&ipvs->est_lock); @@ -199,6 +179,19 @@ void ip_vs_zero_estimator(struct ip_vs_stats *stats) est->outbps = 0; } +/* Get decoded rates */ +void ip_vs_read_estimator(struct ip_vs_stats_user *dst, + struct ip_vs_stats *stats) +{ + struct ip_vs_estimator *e = &stats->est; + + dst->cps = (e->cps + 0x1FF) >> 10; + dst->inpps = (e->inpps + 0x1FF) >> 10; + dst->outpps = (e->outpps + 0x1FF) >> 10; + dst->inbps = (e->inbps + 0xF) >> 5; + dst->outbps = (e->outbps + 0xF) >> 5; +} + static int __net_init __ip_vs_estimator_init(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); From 6ef757f965c9133e82116475eab7f30df391c7fa Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Mon, 14 Mar 2011 01:44:28 +0200 Subject: [PATCH 14/37] ipvs: rename estimator functions Rename ip_vs_new_estimator to ip_vs_start_estimator and ip_vs_kill_estimator to ip_vs_stop_estimator to better match their logic. Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 4 ++-- net/netfilter/ipvs/ip_vs_ctl.c | 12 ++++++------ net/netfilter/ipvs/ip_vs_est.c | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index e4a39c4e263b..7ca5be247256 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -1176,8 +1176,8 @@ extern void ip_vs_sync_cleanup(void); */ extern int ip_vs_estimator_init(void); extern void ip_vs_estimator_cleanup(void); -extern void ip_vs_new_estimator(struct net *net, struct ip_vs_stats *stats); -extern void ip_vs_kill_estimator(struct net *net, struct ip_vs_stats *stats); +extern void ip_vs_start_estimator(struct net *net, struct ip_vs_stats *stats); +extern void ip_vs_stop_estimator(struct net *net, struct ip_vs_stats *stats); extern void ip_vs_zero_estimator(struct ip_vs_stats *stats); extern void ip_vs_read_estimator(struct ip_vs_stats_user *dst, struct ip_vs_stats *stats); diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index c93d806e73bd..c5b1234b630e 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -802,7 +802,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, spin_unlock(&dest->dst_lock); if (add) - ip_vs_new_estimator(svc->net, &dest->stats); + ip_vs_start_estimator(svc->net, &dest->stats); write_lock_bh(&__ip_vs_svc_lock); @@ -1008,7 +1008,7 @@ static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest) { struct netns_ipvs *ipvs = net_ipvs(net); - ip_vs_kill_estimator(net, &dest->stats); + ip_vs_stop_estimator(net, &dest->stats); /* * Remove it from the d-linked list with the real services. @@ -1201,7 +1201,7 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, else if (svc->port == 0) atomic_inc(&ipvs->nullsvc_counter); - ip_vs_new_estimator(net, &svc->stats); + ip_vs_start_estimator(net, &svc->stats); /* Count only IPv4 services for old get/setsockopt interface */ if (svc->af == AF_INET) @@ -1353,7 +1353,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc) if (svc->af == AF_INET) ipvs->num_services--; - ip_vs_kill_estimator(svc->net, &svc->stats); + ip_vs_stop_estimator(svc->net, &svc->stats); /* Unbind scheduler */ old_sched = svc->scheduler; @@ -3585,7 +3585,7 @@ int __net_init __ip_vs_control_init(struct net *net) goto err_dup; } #endif - ip_vs_new_estimator(net, &ipvs->tot_stats); + ip_vs_start_estimator(net, &ipvs->tot_stats); ipvs->sysctl_tbl = tbl; /* Schedule defense work */ INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler); @@ -3603,7 +3603,7 @@ static void __net_exit __ip_vs_control_cleanup(struct net *net) struct netns_ipvs *ipvs = net_ipvs(net); ip_vs_trash_cleanup(net); - ip_vs_kill_estimator(net, &ipvs->tot_stats); + ip_vs_stop_estimator(net, &ipvs->tot_stats); cancel_delayed_work_sync(&ipvs->defense_work); cancel_work_sync(&ipvs->defense_work.work); #ifdef CONFIG_SYSCTL diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index fda75be231e8..8c8766ca56ad 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -139,7 +139,7 @@ static void estimation_timer(unsigned long arg) mod_timer(&ipvs->est_timer, jiffies + 2*HZ); } -void ip_vs_new_estimator(struct net *net, struct ip_vs_stats *stats) +void ip_vs_start_estimator(struct net *net, struct ip_vs_stats *stats) { struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_estimator *est = &stats->est; @@ -151,7 +151,7 @@ void ip_vs_new_estimator(struct net *net, struct ip_vs_stats *stats) spin_unlock_bh(&ipvs->est_lock); } -void ip_vs_kill_estimator(struct net *net, struct ip_vs_stats *stats) +void ip_vs_stop_estimator(struct net *net, struct ip_vs_stats *stats) { struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_estimator *est = &stats->est; From ba4fd7e966fa837557a3ec846c5fd15328b212af Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Fri, 4 Feb 2011 18:33:01 +0900 Subject: [PATCH 15/37] IPVS: Add ip_vs_route_me_harder() Add ip_vs_route_me_harder() to avoid repeating the same code twice. Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_core.c | 48 +++++++++++++++------------------ 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 6f4940e86fc9..299c7f33e18b 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -631,6 +631,24 @@ static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user) } #endif +static int ip_vs_route_me_harder(int af, struct sk_buff *skb) +{ + struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (ipvs->sysctl_snat_reroute && ip6_route_me_harder(skb) != 0) + return 1; + } else +#endif + if ((ipvs->sysctl_snat_reroute || + skb_rtable(skb)->rt_flags & RTCF_LOCAL) && + ip_route_me_harder(skb, RTN_LOCAL) != 0) + return 1; + + return 0; +} + /* * Packet has been made sufficiently writable in caller * - inout: 1=in->out, 0=out->in @@ -737,7 +755,6 @@ static int handle_response_icmp(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, unsigned int offset, unsigned int ihl) { - struct netns_ipvs *ipvs; unsigned int verdict = NF_DROP; if (IP_VS_FWD_METHOD(cp) != 0) { @@ -759,8 +776,6 @@ static int handle_response_icmp(int af, struct sk_buff *skb, if (!skb_make_writable(skb, offset)) goto out; - ipvs = net_ipvs(skb_net(skb)); - #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) ip_vs_nat_icmp_v6(skb, pp, cp, 1); @@ -768,16 +783,8 @@ static int handle_response_icmp(int af, struct sk_buff *skb, #endif ip_vs_nat_icmp(skb, pp, cp, 1); -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) { - if (ipvs->sysctl_snat_reroute && ip6_route_me_harder(skb) != 0) - goto out; - } else -#endif - if ((ipvs->sysctl_snat_reroute || - skb_rtable(skb)->rt_flags & RTCF_LOCAL) && - ip_route_me_harder(skb, RTN_LOCAL) != 0) - goto out; + if (ip_vs_route_me_harder(af, skb)) + goto out; /* do the statistics and put it back */ ip_vs_out_stats(cp, skb); @@ -985,7 +992,6 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, int ihl) { struct ip_vs_protocol *pp = pd->pp; - struct netns_ipvs *ipvs; IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet"); @@ -1021,18 +1027,8 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, * if it came from this machine itself. So re-compute * the routing information. */ - ipvs = net_ipvs(skb_net(skb)); - -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) { - if (ipvs->sysctl_snat_reroute && ip6_route_me_harder(skb) != 0) - goto drop; - } else -#endif - if ((ipvs->sysctl_snat_reroute || - skb_rtable(skb)->rt_flags & RTCF_LOCAL) && - ip_route_me_harder(skb, RTN_LOCAL) != 0) - goto drop; + if (ip_vs_route_me_harder(af, skb)) + goto drop; IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT"); From 84b3cee39ff1ffc97f4f6fba8ad26786c1f6d8f5 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Fri, 4 Feb 2011 18:33:01 +0900 Subject: [PATCH 16/37] IPVS: Add sysctl_snat_reroute() In preparation for not including sysctl_snat_reroute in struct netns_ipvs when CONFIG_SYCTL is not defined. Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_core.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 299c7f33e18b..1d8a2a226151 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -599,6 +599,20 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, return NF_DROP; } +#ifdef CONFIG_SYSCTL + +static int sysctl_snat_reroute(struct sk_buff *skb) +{ + struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); + return ipvs->sysctl_snat_reroute; +} + +#else + +static int sysctl_snat_reroute(struct sk_buff *skb) { return 0; } + +#endif + __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset) { return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0)); @@ -633,15 +647,13 @@ static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user) static int ip_vs_route_me_harder(int af, struct sk_buff *skb) { - struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); - #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { - if (ipvs->sysctl_snat_reroute && ip6_route_me_harder(skb) != 0) + if (sysctl_snat_reroute(skb) && ip6_route_me_harder(skb) != 0) return 1; } else #endif - if ((ipvs->sysctl_snat_reroute || + if ((sysctl_snat_reroute(skb) || skb_rtable(skb)->rt_flags & RTCF_LOCAL) && ip_route_me_harder(skb, RTN_LOCAL) != 0) return 1; From 0cfa558e2c21644a0dd6c21cfadd8bbeaf9fe1a0 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Fri, 4 Feb 2011 18:33:01 +0900 Subject: [PATCH 17/37] IPVS: Add sysctl_nat_icmp_send() In preparation for not including sysctl_nat_icmp_send in struct netns_ipvs when CONFIG_SYCTL is not defined. Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_core.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 1d8a2a226151..c9b83728f3ce 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -607,9 +607,16 @@ static int sysctl_snat_reroute(struct sk_buff *skb) return ipvs->sysctl_snat_reroute; } +static int sysctl_nat_icmp_send(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + return ipvs->sysctl_nat_icmp_send; +} + #else static int sysctl_snat_reroute(struct sk_buff *skb) { return 0; } +static int sysctl_nat_icmp_send(struct net *net) { return 0; } #endif @@ -1074,7 +1081,6 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) struct ip_vs_protocol *pp; struct ip_vs_proto_data *pd; struct ip_vs_conn *cp; - struct netns_ipvs *ipvs; EnterFunction(11); @@ -1149,11 +1155,10 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) * Check if the packet belongs to an existing entry */ cp = pp->conn_out_get(af, skb, &iph, iph.len, 0); - ipvs = net_ipvs(net); if (likely(cp)) return handle_response(af, skb, pd, cp, iph.len); - if (ipvs->sysctl_nat_icmp_send && + if (sysctl_nat_icmp_send(net) && (pp->protocol == IPPROTO_TCP || pp->protocol == IPPROTO_UDP || pp->protocol == IPPROTO_SCTP)) { From 59e0350eada0516a810cb780db37746165f1d516 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Fri, 4 Feb 2011 18:33:01 +0900 Subject: [PATCH 18/37] IPVS: Add {sysctl_sync_threshold,period}() In preparation for not including sysctl_sync_threshold in struct netns_ipvs when CONFIG_SYCTL is not defined. Signed-off-by: Simon Horman --- include/net/ip_vs.h | 29 +++++++++++++++++++++++++++++ net/netfilter/ipvs/ip_vs_core.c | 10 +++++----- net/netfilter/ipvs/ip_vs_ctl.c | 4 ++-- net/netfilter/ipvs/ip_vs_sync.c | 4 ++-- 4 files changed, 38 insertions(+), 9 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 7ca5be247256..253736db476b 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -909,6 +909,35 @@ struct netns_ipvs { struct net *net; /* Needed by timer routines */ }; +#define DEFAULT_SYNC_THRESHOLD 3 +#define DEFAULT_SYNC_PERIOD 50 + +#ifdef CONFIG_SYSCTL + +static inline int sysctl_sync_threshold(struct netns_ipvs *ipvs) +{ + return ipvs->sysctl_sync_threshold[0]; +} + +static inline int sysctl_sync_period(struct netns_ipvs *ipvs) +{ + return ipvs->sysctl_sync_threshold[1]; +} + +#else + +static inline int sysctl_sync_threshold(struct netns_ipvs *ipvs) +{ + return DEFAULT_SYNC_THRESHOLD; +} + +static inline int sysctl_sync_period(struct netns_ipvs *ipvs) +{ + return DEFAULT_SYNC_PERIOD; +} + +#endif + /* * IPVS core functions * (from ip_vs_core.c) diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index c9b83728f3ce..6a0053d91741 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1613,15 +1613,15 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) */ if (cp->flags & IP_VS_CONN_F_ONE_PACKET) - pkts = ipvs->sysctl_sync_threshold[0]; + pkts = sysctl_sync_threshold(ipvs); else pkts = atomic_add_return(1, &cp->in_pkts); if ((ipvs->sync_state & IP_VS_STATE_MASTER) && cp->protocol == IPPROTO_SCTP) { if ((cp->state == IP_VS_SCTP_S_ESTABLISHED && - (pkts % ipvs->sysctl_sync_threshold[1] - == ipvs->sysctl_sync_threshold[0])) || + (pkts % sysctl_sync_period(ipvs) + == sysctl_sync_threshold(ipvs))) || (cp->old_state != cp->state && ((cp->state == IP_VS_SCTP_S_CLOSED) || (cp->state == IP_VS_SCTP_S_SHUT_ACK_CLI) || @@ -1635,8 +1635,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) else if ((ipvs->sync_state & IP_VS_STATE_MASTER) && (((cp->protocol != IPPROTO_TCP || cp->state == IP_VS_TCP_S_ESTABLISHED) && - (pkts % ipvs->sysctl_sync_threshold[1] - == ipvs->sysctl_sync_threshold[0])) || + (pkts % sysctl_sync_period(ipvs) + == sysctl_sync_threshold(ipvs))) || ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) && ((cp->state == IP_VS_TCP_S_FIN_WAIT) || (cp->state == IP_VS_TCP_S_CLOSE) || diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index c5b1234b630e..364520f66b7a 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -3569,8 +3569,8 @@ int __net_init __ip_vs_control_init(struct net *net) tbl[idx++].data = &ipvs->sysctl_cache_bypass; tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn; tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template; - ipvs->sysctl_sync_threshold[0] = 3; - ipvs->sysctl_sync_threshold[1] = 50; + ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD; + ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD; tbl[idx].data = &ipvs->sysctl_sync_threshold; tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold); tbl[idx++].data = &ipvs->sysctl_nat_icmp_send; diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index c5d13b05275a..e84987fa1bf8 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -650,7 +650,7 @@ control: if (cp->flags & IP_VS_CONN_F_TEMPLATE) { int pkts = atomic_add_return(1, &cp->in_pkts); - if (pkts % ipvs->sysctl_sync_threshold[1] != 1) + if (pkts % sysctl_sync_period(ipvs) != 1) return; } goto sloop; @@ -794,7 +794,7 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, if (opt) memcpy(&cp->in_seq, opt, sizeof(*opt)); - atomic_set(&cp->in_pkts, ipvs->sysctl_sync_threshold[0]); + atomic_set(&cp->in_pkts, sysctl_sync_threshold(ipvs)); cp->state = state; cp->old_state = cp->state; /* From 7532e8d40ccfdde6667169eeac4fd7778d6eb462 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Fri, 4 Feb 2011 18:33:01 +0900 Subject: [PATCH 19/37] IPVS: Add sysctl_sync_ver() In preparation for not including sysctl_sync_ver in struct netns_ipvs when CONFIG_SYCTL is not defined. Signed-off-by: Simon Horman --- include/net/ip_vs.h | 11 +++++++++++ net/netfilter/ipvs/ip_vs_sync.c | 4 ++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 253736db476b..687ef18227cd 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -911,6 +911,7 @@ struct netns_ipvs { #define DEFAULT_SYNC_THRESHOLD 3 #define DEFAULT_SYNC_PERIOD 50 +#define DEFAULT_SYNC_VER 1 #ifdef CONFIG_SYSCTL @@ -924,6 +925,11 @@ static inline int sysctl_sync_period(struct netns_ipvs *ipvs) return ipvs->sysctl_sync_threshold[1]; } +static inline int sysctl_sync_ver(struct netns_ipvs *ipvs) +{ + return ipvs->sysctl_sync_ver; +} + #else static inline int sysctl_sync_threshold(struct netns_ipvs *ipvs) @@ -936,6 +942,11 @@ static inline int sysctl_sync_period(struct netns_ipvs *ipvs) return DEFAULT_SYNC_PERIOD; } +static inline int sysctl_sync_ver(struct netns_ipvs *ipvs) +{ + return DEFAULT_SYNC_VER; +} + #endif /* diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index e84987fa1bf8..3e7961e85e9c 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -394,7 +394,7 @@ void ip_vs_sync_switch_mode(struct net *net, int mode) if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) return; - if (mode == ipvs->sysctl_sync_ver || !ipvs->sync_buff) + if (mode == sysctl_sync_ver(ipvs) || !ipvs->sync_buff) return; spin_lock_bh(&ipvs->sync_buff_lock); @@ -521,7 +521,7 @@ void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp) unsigned int len, pe_name_len, pad; /* Handle old version of the protocol */ - if (ipvs->sysctl_sync_ver == 0) { + if (sysctl_sync_ver(ipvs) == 0) { ip_vs_sync_conn_v0(net, cp); return; } From 71a8ab6cad63b4816711f2ea518755677a870f6f Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Fri, 4 Feb 2011 18:33:01 +0900 Subject: [PATCH 20/37] IPVS: Add sysctl_expire_nodest_conn() In preparation for not including sysctl_expire_nodest_conn in struct netns_ipvs when CONFIG_SYCTL is not defined. Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_core.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 6a0053d91741..d418bc60d00d 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -613,10 +613,16 @@ static int sysctl_nat_icmp_send(struct net *net) return ipvs->sysctl_nat_icmp_send; } +static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) +{ + return ipvs->sysctl_expire_nodest_conn; +} + #else static int sysctl_snat_reroute(struct sk_buff *skb) { return 0; } static int sysctl_nat_icmp_send(struct net *net) { return 0; } +static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) { return 0; } #endif @@ -1583,7 +1589,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { /* the destination server is not available */ - if (ipvs->sysctl_expire_nodest_conn) { + if (sysctl_expire_nodest_conn(ipvs)) { /* try to expire the connection immediately */ ip_vs_conn_expire_now(cp); } From 8e1b0b1b560019cafebe45a7d9e6ec1122fedc7b Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Fri, 4 Feb 2011 18:33:01 +0900 Subject: [PATCH 21/37] IPVS: Add expire_quiescent_template() In preparation for not including sysctl_expire_quiescent_template in struct netns_ipvs when CONFIG_SYCTL is not defined. Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_conn.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 9c2a517b69c8..f289306cbf12 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -680,6 +680,16 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) atomic_dec(&dest->refcnt); } +static int expire_quiescent_template(struct netns_ipvs *ipvs, + struct ip_vs_dest *dest) +{ +#ifdef CONFIG_SYSCTL + return ipvs->sysctl_expire_quiescent_template && + (atomic_read(&dest->weight) == 0); +#else + return 0; +#endif +} /* * Checking if the destination of a connection template is available. @@ -696,8 +706,7 @@ int ip_vs_check_template(struct ip_vs_conn *ct) */ if ((dest == NULL) || !(dest->flags & IP_VS_DEST_F_AVAILABLE) || - (ipvs->sysctl_expire_quiescent_template && - (atomic_read(&dest->weight) == 0))) { + expire_quiescent_template(ipvs, dest)) { IP_VS_DBG_BUF(9, "check_template: dest not available for " "protocol %s s:%s:%d v:%s:%d " "-> d:%s:%d\n", From b27d777ec54205eb56cf4e873d043a426881c629 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Fri, 4 Feb 2011 18:33:01 +0900 Subject: [PATCH 22/37] IPVS: Conditinally use sysctl_lblc{r}_expiration In preparation for not including sysctl_lblc{r}_expiration in struct netns_ipvs when CONFIG_SYCTL is not defined. Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_lblc.c | 16 +++++++++++++--- net/netfilter/ipvs/ip_vs_lblcr.c | 21 +++++++++++++++------ 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index 6bf7a807649c..51a27f58030d 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -63,6 +63,8 @@ #define CHECK_EXPIRE_INTERVAL (60*HZ) #define ENTRY_TIMEOUT (6*60*HZ) +#define DEFAULT_EXPIRATION (24*60*60*HZ) + /* * It is for full expiration check. * When there is no partial expiration check (garbage collection) @@ -238,6 +240,15 @@ static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl) } } +static int sysctl_lblc_expiration(struct ip_vs_service *svc) +{ +#ifdef CONFIG_SYSCTL + struct netns_ipvs *ipvs = net_ipvs(svc->net); + return ipvs->sysctl_lblc_expiration; +#else + return DEFAULT_EXPIRATION; +#endif +} static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc) { @@ -245,7 +256,6 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc) struct ip_vs_lblc_entry *en, *nxt; unsigned long now = jiffies; int i, j; - struct netns_ipvs *ipvs = net_ipvs(svc->net); for (i=0, j=tbl->rover; ibucket[j], list) { if (time_before(now, en->lastuse + - ipvs->sysctl_lblc_expiration)) + sysctl_lblc_expiration(svc))) continue; ip_vs_lblc_free(en); @@ -550,7 +560,7 @@ static int __net_init __ip_vs_lblc_init(struct net *net) return -ENOMEM; } else ipvs->lblc_ctl_table = vs_vars_table; - ipvs->sysctl_lblc_expiration = 24*60*60*HZ; + ipvs->sysctl_lblc_expiration = DEFAULT_EXPIRATION; ipvs->lblc_ctl_table[0].data = &ipvs->sysctl_lblc_expiration; #ifdef CONFIG_SYSCTL diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index 00631765b92a..7fb919061296 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -63,6 +63,8 @@ #define CHECK_EXPIRE_INTERVAL (60*HZ) #define ENTRY_TIMEOUT (6*60*HZ) +#define DEFAULT_EXPIRATION (24*60*60*HZ) + /* * It is for full expiration check. * When there is no partial expiration check (garbage collection) @@ -410,6 +412,15 @@ static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl) } } +static int sysctl_lblcr_expiration(struct ip_vs_service *svc) +{ +#ifdef CONFIG_SYSCTL + struct netns_ipvs *ipvs = net_ipvs(svc->net); + return ipvs->sysctl_lblcr_expiration; +#else + return DEFAULT_EXPIRATION; +#endif +} static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc) { @@ -417,15 +428,14 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc) unsigned long now = jiffies; int i, j; struct ip_vs_lblcr_entry *en, *nxt; - struct netns_ipvs *ipvs = net_ipvs(svc->net); for (i=0, j=tbl->rover; isched_lock); list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { - if (time_after(en->lastuse - + ipvs->sysctl_lblcr_expiration, now)) + if (time_after(en->lastuse + + sysctl_lblcr_expiration(svc), now)) continue; ip_vs_lblcr_free(en); @@ -650,7 +660,6 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) read_lock(&svc->sched_lock); en = ip_vs_lblcr_get(svc->af, tbl, &iph.daddr); if (en) { - struct netns_ipvs *ipvs = net_ipvs(svc->net); /* We only hold a read lock, but this is atomic */ en->lastuse = jiffies; @@ -662,7 +671,7 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) /* More than one destination + enough time passed by, cleanup */ if (atomic_read(&en->set.size) > 1 && time_after(jiffies, en->set.lastmod + - ipvs->sysctl_lblcr_expiration)) { + sysctl_lblcr_expiration(svc))) { struct ip_vs_dest *m; write_lock(&en->set.lock); @@ -746,7 +755,7 @@ static int __net_init __ip_vs_lblcr_init(struct net *net) return -ENOMEM; } else ipvs->lblcr_ctl_table = vs_vars_table; - ipvs->sysctl_lblcr_expiration = 24*60*60*HZ; + ipvs->sysctl_lblcr_expiration = DEFAULT_EXPIRATION; ipvs->lblcr_ctl_table[0].data = &ipvs->sysctl_lblcr_expiration; #ifdef CONFIG_SYSCTL From 3a1bbf1885e94ecedf1deaaab1ace8409330aa7e Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Fri, 4 Feb 2011 18:33:02 +0900 Subject: [PATCH 23/37] IPVS: ip_vs_todrop() becomes a noop when CONFIG_SYSCTL is undefined Signed-off-by: Simon Horman --- include/net/ip_vs.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 687ef18227cd..77ebece7e68c 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -1253,6 +1253,7 @@ extern int ip_vs_icmp_xmit_v6 int offset); #endif +#ifdef CONFIG_SYSCTL /* * This is a simple mechanism to ignore packets when * we are loaded. Just set ip_vs_drop_rate to 'n' and @@ -1268,6 +1269,9 @@ static inline int ip_vs_todrop(struct netns_ipvs *ipvs) ipvs->drop_counter = ipvs->drop_rate; return 1; } +#else +static inline int ip_vs_todrop(struct netns_ipvs *ipvs) { return 0; } +#endif /* * ip_vs_fwd_tag returns the forwarding tag of the connection From a4e2f5a700cb93448b2da0e158149d18dc5290ef Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Fri, 4 Feb 2011 18:33:02 +0900 Subject: [PATCH 24/37] IPVS: Conditional ip_vs_conntrack_enabled() ip_vs_conntrack_enabled() becomes a noop when CONFIG_SYSCTL is undefined. In preparation for not including sysctl_conntrack in struct netns_ipvs when CONFIG_SYCTL is not defined. Signed-off-by: Simon Horman --- include/net/ip_vs.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 77ebece7e68c..299aeb537899 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -1359,7 +1359,11 @@ static inline void ip_vs_notrack(struct sk_buff *skb) */ static inline int ip_vs_conntrack_enabled(struct netns_ipvs *ipvs) { +#ifdef CONFIG_SYSCTL return ipvs->sysctl_conntrack; +#else + return 0; +#endif } extern void ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, From a7a86b8616bc1595c4f5f109b7c39d4eb5d55e32 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Fri, 4 Feb 2011 18:33:02 +0900 Subject: [PATCH 25/37] IPVS: Minimise ip_vs_leave when CONFIG_SYSCTL is undefined Much of ip_vs_leave() is unnecessary if CONFIG_SYSCTL is undefined. I tried an approach of breaking the now #ifdef'ed portions out into a separate function. However this appeared to grow the compiled code on x86_64 by about 200 bytes in the case where CONFIG_SYSCTL is defined. So I have gone with the simpler though less elegant #ifdef'ed solution for now. Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_core.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index d418bc60d00d..07accf6b2401 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -499,11 +499,13 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, struct ip_vs_proto_data *pd) { - struct net *net; - struct netns_ipvs *ipvs; __be16 _ports[2], *pptr; struct ip_vs_iphdr iph; +#ifdef CONFIG_SYSCTL + struct net *net; + struct netns_ipvs *ipvs; int unicast; +#endif ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); @@ -512,6 +514,8 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, ip_vs_service_put(svc); return NF_DROP; } + +#ifdef CONFIG_SYSCTL net = skb_net(skb); #ifdef CONFIG_IP_VS_IPV6 @@ -563,6 +567,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, ip_vs_conn_put(cp); return ret; } +#endif /* * When the virtual ftp service is presented, packets destined From fb1de432c1c7c26afb2e86d166fc37888b6a4423 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Fri, 4 Feb 2011 18:33:02 +0900 Subject: [PATCH 26/37] IPVS: Conditionally define and use ip_vs_lblc{r}_table ip_vs_lblc_table and ip_vs_lblcr_table, and code that uses them are unnecessary when CONFIG_SYSCTL is undefined. Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_lblc.c | 15 ++++++++++----- net/netfilter/ipvs/ip_vs_lblcr.c | 14 ++++++++++---- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index 51a27f58030d..f276df9896b3 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -114,7 +114,7 @@ struct ip_vs_lblc_table { /* * IPVS LBLC sysctl table */ - +#ifdef CONFIG_SYSCTL static ctl_table vs_vars_table[] = { { .procname = "lblc_expiration", @@ -125,6 +125,7 @@ static ctl_table vs_vars_table[] = { }, { } }; +#endif static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en) { @@ -548,6 +549,7 @@ static struct ip_vs_scheduler ip_vs_lblc_scheduler = /* * per netns init. */ +#ifdef CONFIG_SYSCTL static int __net_init __ip_vs_lblc_init(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); @@ -563,7 +565,6 @@ static int __net_init __ip_vs_lblc_init(struct net *net) ipvs->sysctl_lblc_expiration = DEFAULT_EXPIRATION; ipvs->lblc_ctl_table[0].data = &ipvs->sysctl_lblc_expiration; -#ifdef CONFIG_SYSCTL ipvs->lblc_ctl_header = register_net_sysctl_table(net, net_vs_ctl_path, ipvs->lblc_ctl_table); @@ -572,7 +573,6 @@ static int __net_init __ip_vs_lblc_init(struct net *net) kfree(ipvs->lblc_ctl_table); return -ENOMEM; } -#endif return 0; } @@ -581,14 +581,19 @@ static void __net_exit __ip_vs_lblc_exit(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); -#ifdef CONFIG_SYSCTL unregister_net_sysctl_table(ipvs->lblc_ctl_header); -#endif if (!net_eq(net, &init_net)) kfree(ipvs->lblc_ctl_table); } +#else + +static int __net_init __ip_vs_lblc_init(struct net *net) { return 0; } +static void __net_exit __ip_vs_lblc_exit(struct net *net) { } + +#endif + static struct pernet_operations ip_vs_lblc_ops = { .init = __ip_vs_lblc_init, .exit = __ip_vs_lblc_exit, diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index 7fb919061296..cb1c9913d38b 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -285,6 +285,7 @@ struct ip_vs_lblcr_table { }; +#ifdef CONFIG_SYSCTL /* * IPVS LBLCR sysctl table */ @@ -299,6 +300,7 @@ static ctl_table vs_vars_table[] = { }, { } }; +#endif static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en) { @@ -743,6 +745,7 @@ static struct ip_vs_scheduler ip_vs_lblcr_scheduler = /* * per netns init. */ +#ifdef CONFIG_SYSCTL static int __net_init __ip_vs_lblcr_init(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); @@ -758,7 +761,6 @@ static int __net_init __ip_vs_lblcr_init(struct net *net) ipvs->sysctl_lblcr_expiration = DEFAULT_EXPIRATION; ipvs->lblcr_ctl_table[0].data = &ipvs->sysctl_lblcr_expiration; -#ifdef CONFIG_SYSCTL ipvs->lblcr_ctl_header = register_net_sysctl_table(net, net_vs_ctl_path, ipvs->lblcr_ctl_table); @@ -767,7 +769,6 @@ static int __net_init __ip_vs_lblcr_init(struct net *net) kfree(ipvs->lblcr_ctl_table); return -ENOMEM; } -#endif return 0; } @@ -776,14 +777,19 @@ static void __net_exit __ip_vs_lblcr_exit(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); -#ifdef CONFIG_SYSCTL unregister_net_sysctl_table(ipvs->lblcr_ctl_header); -#endif if (!net_eq(net, &init_net)) kfree(ipvs->lblcr_ctl_table); } +#else + +static int __net_init __ip_vs_lblcr_init(struct net *net) { return 0; } +static void __net_exit __ip_vs_lblcr_exit(struct net *net) { } + +#endif + static struct pernet_operations ip_vs_lblcr_ops = { .init = __ip_vs_lblcr_init, .exit = __ip_vs_lblcr_exit, From 14e405461e664b777e2a5636e10b2ebf36a686ec Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Fri, 4 Feb 2011 18:33:02 +0900 Subject: [PATCH 27/37] IPVS: Add __ip_vs_control_{init,cleanup}_sysctl() Break out the portions of __ip_vs_control_init() and __ip_vs_control_cleanup() where aren't necessary when CONFIG_SYSCTL is undefined. Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_ctl.c | 98 +++++++++++++++++++++------------- 1 file changed, 62 insertions(+), 36 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 364520f66b7a..fa6d44c62de3 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -88,6 +88,8 @@ static int __ip_vs_addr_is_local_v6(struct net *net, return 0; } #endif + +#ifdef CONFIG_SYSCTL /* * update_defense_level is called from keventd and from sysctl, * so it needs to protect itself from softirqs @@ -229,6 +231,7 @@ static void defense_work_handler(struct work_struct *work) ip_vs_random_dropentry(ipvs->net); schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD); } +#endif int ip_vs_use_count_inc(void) @@ -1511,7 +1514,7 @@ static int ip_vs_zero_all(struct net *net) return 0; } - +#ifdef CONFIG_SYSCTL static int proc_do_defense_mode(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -1533,7 +1536,6 @@ proc_do_defense_mode(ctl_table *table, int write, return rc; } - static int proc_do_sync_threshold(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -1767,6 +1769,7 @@ const struct ctl_path net_vs_ctl_path[] = { { } }; EXPORT_SYMBOL_GPL(net_vs_ctl_path); +#endif #ifdef CONFIG_PROC_FS @@ -3511,7 +3514,8 @@ static void ip_vs_genl_unregister(void) /* * per netns intit/exit func. */ -int __net_init __ip_vs_control_init(struct net *net) +#ifdef CONFIG_SYSCTL +int __net_init __ip_vs_control_init_sysctl(struct net *net) { int idx; struct netns_ipvs *ipvs = net_ipvs(net); @@ -3521,33 +3525,11 @@ int __net_init __ip_vs_control_init(struct net *net) spin_lock_init(&ipvs->dropentry_lock); spin_lock_init(&ipvs->droppacket_lock); spin_lock_init(&ipvs->securetcp_lock); - ipvs->rs_lock = __RW_LOCK_UNLOCKED(ipvs->rs_lock); - - /* Initialize rs_table */ - for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++) - INIT_LIST_HEAD(&ipvs->rs_table[idx]); - - INIT_LIST_HEAD(&ipvs->dest_trash); - atomic_set(&ipvs->ftpsvc_counter, 0); - atomic_set(&ipvs->nullsvc_counter, 0); - - /* procfs stats */ - ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); - if (!ipvs->tot_stats.cpustats) { - pr_err("%s() alloc_percpu failed\n", __func__); - goto err_alloc; - } - spin_lock_init(&ipvs->tot_stats.lock); - - proc_net_fops_create(net, "ip_vs", 0, &ip_vs_info_fops); - proc_net_fops_create(net, "ip_vs_stats", 0, &ip_vs_stats_fops); - proc_net_fops_create(net, "ip_vs_stats_percpu", 0, - &ip_vs_stats_percpu_fops); if (!net_eq(net, &init_net)) { tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL); if (tbl == NULL) - goto err_dup; + return -ENOMEM; } else tbl = vs_vars; /* Initialize sysctl defaults */ @@ -3576,25 +3558,73 @@ int __net_init __ip_vs_control_init(struct net *net) tbl[idx++].data = &ipvs->sysctl_nat_icmp_send; -#ifdef CONFIG_SYSCTL ipvs->sysctl_hdr = register_net_sysctl_table(net, net_vs_ctl_path, tbl); if (ipvs->sysctl_hdr == NULL) { if (!net_eq(net, &init_net)) kfree(tbl); - goto err_dup; + return -ENOMEM; } -#endif ip_vs_start_estimator(net, &ipvs->tot_stats); ipvs->sysctl_tbl = tbl; /* Schedule defense work */ INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler); schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD); + + return 0; +} + +void __net_init __ip_vs_control_cleanup_sysctl(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + cancel_delayed_work_sync(&ipvs->defense_work); + cancel_work_sync(&ipvs->defense_work.work); + unregister_net_sysctl_table(ipvs->sysctl_hdr); +} + +#else + +int __net_init __ip_vs_control_init_sysctl(struct net *net) { return 0; } +void __net_init __ip_vs_control_cleanup_sysctl(struct net *net) { } + +#endif + +int __net_init __ip_vs_control_init(struct net *net) +{ + int idx; + struct netns_ipvs *ipvs = net_ipvs(net); + + ipvs->rs_lock = __RW_LOCK_UNLOCKED(ipvs->rs_lock); + + /* Initialize rs_table */ + for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++) + INIT_LIST_HEAD(&ipvs->rs_table[idx]); + + INIT_LIST_HEAD(&ipvs->dest_trash); + atomic_set(&ipvs->ftpsvc_counter, 0); + atomic_set(&ipvs->nullsvc_counter, 0); + + /* procfs stats */ + ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); + if (ipvs->tot_stats.cpustats) { + pr_err("%s(): alloc_percpu.\n", __func__); + return -ENOMEM; + } + spin_lock_init(&ipvs->tot_stats.lock); + + proc_net_fops_create(net, "ip_vs", 0, &ip_vs_info_fops); + proc_net_fops_create(net, "ip_vs_stats", 0, &ip_vs_stats_fops); + proc_net_fops_create(net, "ip_vs_stats_percpu", 0, + &ip_vs_stats_percpu_fops); + + if (__ip_vs_control_init_sysctl(net)) + goto err; + return 0; -err_dup: +err: free_percpu(ipvs->tot_stats.cpustats); -err_alloc: return -ENOMEM; } @@ -3604,11 +3634,7 @@ static void __net_exit __ip_vs_control_cleanup(struct net *net) ip_vs_trash_cleanup(net); ip_vs_stop_estimator(net, &ipvs->tot_stats); - cancel_delayed_work_sync(&ipvs->defense_work); - cancel_work_sync(&ipvs->defense_work.work); -#ifdef CONFIG_SYSCTL - unregister_net_sysctl_table(ipvs->sysctl_hdr); -#endif + __ip_vs_control_cleanup_sysctl(net); proc_net_remove(net, "ip_vs_stats_percpu"); proc_net_remove(net, "ip_vs_stats"); proc_net_remove(net, "ip_vs"); From f2247fbdc41372d64c89505280419ceb45d80a31 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Fri, 4 Feb 2011 18:33:02 +0900 Subject: [PATCH 28/37] IPVS: Conditionally include sysctl members of struct netns_ipvs There is now no need to include sysctl members of struct netns_ipvs unless CONFIG_SYSCTL is defined. Signed-off-by: Simon Horman --- include/net/ip_vs.h | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 299aeb537899..272f59336b73 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -839,15 +839,7 @@ struct netns_ipvs { struct ip_vs_stats tot_stats; /* Statistics & est. */ int num_services; /* no of virtual services */ - /* 1/rate drop and drop-entry variables */ - struct delayed_work defense_work; /* Work handler */ - int drop_rate; - int drop_counter; - atomic_t dropentry; - /* locks in ctl.c */ - spinlock_t dropentry_lock; /* drop entry handling */ - spinlock_t droppacket_lock; /* drop packet handling */ - spinlock_t securetcp_lock; /* state and timeout tables */ + rwlock_t rs_lock; /* real services table */ /* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */ struct lock_class_key ctl_key; /* ctl_mutex debuging */ @@ -857,9 +849,22 @@ struct netns_ipvs { atomic_t ftpsvc_counter; atomic_t nullsvc_counter; +#ifdef CONFIG_SYSCTL + /* 1/rate drop and drop-entry variables */ + struct delayed_work defense_work; /* Work handler */ + int drop_rate; + int drop_counter; + atomic_t dropentry; + /* locks in ctl.c */ + spinlock_t dropentry_lock; /* drop entry handling */ + spinlock_t droppacket_lock; /* drop packet handling */ + spinlock_t securetcp_lock; /* state and timeout tables */ + /* sys-ctl struct */ struct ctl_table_header *sysctl_hdr; struct ctl_table *sysctl_tbl; +#endif + /* sysctl variables */ int sysctl_amemthresh; int sysctl_am_droprate; From 8183e3a88aced228ab9770762692be6cc3786e80 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Tue, 15 Mar 2011 13:23:28 +0100 Subject: [PATCH 29/37] netfilter: xt_connlimit: fix daddr connlimit in SNAT scenario We use the reply tuples when limiting the connections by the destination addresses, however, in SNAT scenario, the final reply tuples won't be ready until SNAT is done in POSTROUING or INPUT chain, and the following nf_conntrack_find_get() in count_tem() will get nothing, so connlimit can't work as expected. In this patch, the original tuples are always used, and an additional member addr is appended to save the address in either end. Signed-off-by: Changli Gao Signed-off-by: Patrick McHardy --- net/netfilter/xt_connlimit.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index e029c4807404..1f4b9f9da496 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -33,8 +33,9 @@ /* we will save the tuples of all connections we care about */ struct xt_connlimit_conn { - struct list_head list; - struct nf_conntrack_tuple tuple; + struct list_head list; + struct nf_conntrack_tuple tuple; + union nf_inet_addr addr; }; struct xt_connlimit_data { @@ -151,7 +152,7 @@ static int count_them(struct net *net, continue; } - if (same_source_net(addr, mask, &conn->tuple.src.u3, family)) + if (same_source_net(addr, mask, &conn->addr, family)) /* same source network -> be counted! */ ++matches; nf_ct_put(found_ct); @@ -165,6 +166,7 @@ static int count_them(struct net *net, if (conn == NULL) return -ENOMEM; conn->tuple = *tuple; + conn->addr = *addr; list_add(&conn->list, hash); ++matches; } @@ -185,15 +187,11 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) int connections; ct = nf_ct_get(skb, &ctinfo); - if (ct != NULL) { - if (info->flags & XT_CONNLIMIT_DADDR) - tuple_ptr = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; - else - tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; - } else if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), - par->family, &tuple)) { + if (ct != NULL) + tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + else if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), + par->family, &tuple)) goto hotdrop; - } if (par->family == NFPROTO_IPV6) { const struct ipv6hdr *iph = ipv6_hdr(skb); From 0e23ca14f8e76091b402c01e2b169aba3d187b98 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Tue, 15 Mar 2011 13:24:56 +0100 Subject: [PATCH 30/37] netfilter: xt_connlimit: use kmalloc() instead of kzalloc() All the members are initialized after kzalloc(). Signed-off-by: Changli Gao Signed-off-by: Patrick McHardy --- net/netfilter/xt_connlimit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index 1f4b9f9da496..ade2a806a48e 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -162,7 +162,7 @@ static int count_them(struct net *net, if (addit) { /* save the new connection in our list */ - conn = kzalloc(sizeof(*conn), GFP_ATOMIC); + conn = kmalloc(sizeof(*conn), GFP_ATOMIC); if (conn == NULL) return -ENOMEM; conn->tuple = *tuple; From 3e0d5149e6dcbe7111a63773a07c5b33f7ca7236 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Tue, 15 Mar 2011 13:25:42 +0100 Subject: [PATCH 31/37] netfilter: xt_connlimit: use hlist instead The header of hlist is smaller than list. Signed-off-by: Changli Gao Signed-off-by: Patrick McHardy --- net/netfilter/xt_connlimit.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index ade2a806a48e..da56d6ec13c9 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -33,14 +33,14 @@ /* we will save the tuples of all connections we care about */ struct xt_connlimit_conn { - struct list_head list; + struct hlist_node node; struct nf_conntrack_tuple tuple; union nf_inet_addr addr; }; struct xt_connlimit_data { - struct list_head iphash[256]; - spinlock_t lock; + struct hlist_head iphash[256]; + spinlock_t lock; }; static u_int32_t connlimit_rnd __read_mostly; @@ -102,9 +102,9 @@ static int count_them(struct net *net, { const struct nf_conntrack_tuple_hash *found; struct xt_connlimit_conn *conn; - struct xt_connlimit_conn *tmp; + struct hlist_node *pos, *n; struct nf_conn *found_ct; - struct list_head *hash; + struct hlist_head *hash; bool addit = true; int matches = 0; @@ -116,7 +116,7 @@ static int count_them(struct net *net, rcu_read_lock(); /* check the saved connections */ - list_for_each_entry_safe(conn, tmp, hash, list) { + hlist_for_each_entry_safe(conn, pos, n, hash, node) { found = nf_conntrack_find_get(net, NF_CT_DEFAULT_ZONE, &conn->tuple); found_ct = NULL; @@ -136,7 +136,7 @@ static int count_them(struct net *net, if (found == NULL) { /* this one is gone */ - list_del(&conn->list); + hlist_del(&conn->node); kfree(conn); continue; } @@ -147,7 +147,7 @@ static int count_them(struct net *net, * closed already -> ditch it */ nf_ct_put(found_ct); - list_del(&conn->list); + hlist_del(&conn->node); kfree(conn); continue; } @@ -167,7 +167,7 @@ static int count_them(struct net *net, return -ENOMEM; conn->tuple = *tuple; conn->addr = *addr; - list_add(&conn->list, hash); + hlist_add_head(&conn->node, hash); ++matches; } @@ -246,7 +246,7 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par) spin_lock_init(&info->data->lock); for (i = 0; i < ARRAY_SIZE(info->data->iphash); ++i) - INIT_LIST_HEAD(&info->data->iphash[i]); + INIT_HLIST_HEAD(&info->data->iphash[i]); return 0; } @@ -255,15 +255,15 @@ static void connlimit_mt_destroy(const struct xt_mtdtor_param *par) { const struct xt_connlimit_info *info = par->matchinfo; struct xt_connlimit_conn *conn; - struct xt_connlimit_conn *tmp; - struct list_head *hash = info->data->iphash; + struct hlist_node *pos, *n; + struct hlist_head *hash = info->data->iphash; unsigned int i; nf_ct_l3proto_module_put(par->family); for (i = 0; i < ARRAY_SIZE(info->data->iphash); ++i) { - list_for_each_entry_safe(conn, tmp, &hash[i], list) { - list_del(&conn->list); + hlist_for_each_entry_safe(conn, pos, n, &hash[i], node) { + hlist_del(&conn->node); kfree(conn); } } From 4656c4d61adb8dc3ee04c08f57a5cc7598814420 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Tue, 15 Mar 2011 13:26:32 +0100 Subject: [PATCH 32/37] netfilter: xt_connlimit: remove connlimit_rnd_inited A potential race condition when generating connlimit_rnd is also fixed. Signed-off-by: Changli Gao Signed-off-by: Patrick McHardy --- net/netfilter/xt_connlimit.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index da56d6ec13c9..c6d5a83450c9 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -44,7 +44,6 @@ struct xt_connlimit_data { }; static u_int32_t connlimit_rnd __read_mostly; -static bool connlimit_rnd_inited __read_mostly; static inline unsigned int connlimit_iphash(__be32 addr) { @@ -226,9 +225,13 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par) unsigned int i; int ret; - if (unlikely(!connlimit_rnd_inited)) { - get_random_bytes(&connlimit_rnd, sizeof(connlimit_rnd)); - connlimit_rnd_inited = true; + if (unlikely(!connlimit_rnd)) { + u_int32_t rand; + + do { + get_random_bytes(&rand, sizeof(rand)); + } while (!rand); + cmpxchg(&connlimit_rnd, 0, rand); } ret = nf_ct_l3proto_try_module_get(par->family); if (ret < 0) { From 42eab94fff18cb1091d3501cd284d6bd6cc9c143 Mon Sep 17 00:00:00 2001 From: Vasiliy Kulikov Date: Tue, 15 Mar 2011 13:35:21 +0100 Subject: [PATCH 33/37] netfilter: arp_tables: fix infoleak to userspace Structures ipt_replace, compat_ipt_replace, and xt_get_revision are copied from userspace. Fields of these structs that are zero-terminated strings are not checked. When they are used as argument to a format string containing "%s" in request_module(), some sensitive information is leaked to userspace via argument of spawned modprobe process. The first bug was introduced before the git epoch; the second is introduced by 6b7d31fc (v2.6.15-rc1); the third is introduced by 6b7d31fc (v2.6.15-rc1). To trigger the bug one should have CAP_NET_ADMIN. Signed-off-by: Vasiliy Kulikov Signed-off-by: Patrick McHardy --- net/ipv4/netfilter/arp_tables.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index e95054c690c6..4b5d457c2d76 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1066,6 +1066,7 @@ static int do_replace(struct net *net, const void __user *user, /* overflow check */ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; + tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); if (!newinfo) @@ -1488,6 +1489,7 @@ static int compat_do_replace(struct net *net, void __user *user, return -ENOMEM; if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; + tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); if (!newinfo) @@ -1740,6 +1742,7 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len ret = -EFAULT; break; } + rev.name[sizeof(rev.name)-1] = 0; try_then_request_module(xt_find_revision(NFPROTO_ARP, rev.name, rev.revision, 1, &ret), From 78b79876761b86653df89c48a7010b5cbd41a84a Mon Sep 17 00:00:00 2001 From: Vasiliy Kulikov Date: Tue, 15 Mar 2011 13:36:05 +0100 Subject: [PATCH 34/37] netfilter: ip_tables: fix infoleak to userspace Structures ipt_replace, compat_ipt_replace, and xt_get_revision are copied from userspace. Fields of these structs that are zero-terminated strings are not checked. When they are used as argument to a format string containing "%s" in request_module(), some sensitive information is leaked to userspace via argument of spawned modprobe process. The first and the third bugs were introduced before the git epoch; the second was introduced in 2722971c (v2.6.17-rc1). To trigger the bug one should have CAP_NET_ADMIN. Signed-off-by: Vasiliy Kulikov Signed-off-by: Patrick McHardy --- net/ipv4/netfilter/ip_tables.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index ef7d7b9680ea..b09ed0d080f9 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1262,6 +1262,7 @@ do_replace(struct net *net, const void __user *user, unsigned int len) /* overflow check */ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; + tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); if (!newinfo) @@ -1807,6 +1808,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) return -ENOMEM; if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; + tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); if (!newinfo) @@ -2036,6 +2038,7 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) ret = -EFAULT; break; } + rev.name[sizeof(rev.name)-1] = 0; if (cmd == IPT_SO_GET_REVISION_TARGET) target = 1; From 6a8ab060779779de8aea92ce3337ca348f973f54 Mon Sep 17 00:00:00 2001 From: Vasiliy Kulikov Date: Tue, 15 Mar 2011 13:37:13 +0100 Subject: [PATCH 35/37] ipv6: netfilter: ip6_tables: fix infoleak to userspace Structures ip6t_replace, compat_ip6t_replace, and xt_get_revision are copied from userspace. Fields of these structs that are zero-terminated strings are not checked. When they are used as argument to a format string containing "%s" in request_module(), some sensitive information is leaked to userspace via argument of spawned modprobe process. The first bug was introduced before the git epoch; the second was introduced in 3bc3fe5e (v2.6.25-rc1); the third is introduced by 6b7d31fc (v2.6.15-rc1). To trigger the bug one should have CAP_NET_ADMIN. Signed-off-by: Vasiliy Kulikov Signed-off-by: Patrick McHardy --- net/ipv6/netfilter/ip6_tables.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 47b7b8df7fac..c9598a9067d7 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1275,6 +1275,7 @@ do_replace(struct net *net, const void __user *user, unsigned int len) /* overflow check */ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; + tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); if (!newinfo) @@ -1822,6 +1823,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) return -ENOMEM; if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; + tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); if (!newinfo) @@ -2051,6 +2053,7 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) ret = -EFAULT; break; } + rev.name[sizeof(rev.name)-1] = 0; if (cmd == IP6T_SO_GET_REVISION_TARGET) target = 1; From de81bbea17650769882bc625d6b5df11ee7c4b24 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 15 Mar 2011 20:16:20 +0100 Subject: [PATCH 36/37] netfilter: ipt_addrtype: rename to xt_addrtype Followup patch will add ipv6 support. ipt_addrtype.h is retained for compatibility reasons, but no longer used by the kernel. Signed-off-by: Florian Westphal Signed-off-by: Patrick McHardy --- Documentation/feature-removal-schedule.txt | 8 +++++ include/linux/netfilter/Kbuild | 1 + include/linux/netfilter/xt_addrtype.h | 27 ++++++++++++++++ net/ipv4/netfilter/Kconfig | 10 ------ net/ipv4/netfilter/Makefile | 1 - net/netfilter/Kconfig | 10 ++++++ net/netfilter/Makefile | 1 + .../xt_addrtype.c} | 31 ++++++++++--------- 8 files changed, 63 insertions(+), 26 deletions(-) create mode 100644 include/linux/netfilter/xt_addrtype.h rename net/{ipv4/netfilter/ipt_addrtype.c => netfilter/xt_addrtype.c} (79%) diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 05b248aa91f1..a7ee7cf5009e 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -618,3 +618,11 @@ Who: Jan Engelhardt Files: net/netfilter/xt_connlimit.c ---------------------------- + +What: ipt_addrtype match include file +When: 2012 +Why: superseded by xt_addrtype +Who: Florian Westphal +Files: include/linux/netfilter_ipv4/ipt_addrtype.h + +---------------------------- diff --git a/include/linux/netfilter/Kbuild b/include/linux/netfilter/Kbuild index 15e83bf3dd58..a1b410c76fc3 100644 --- a/include/linux/netfilter/Kbuild +++ b/include/linux/netfilter/Kbuild @@ -29,6 +29,7 @@ header-y += xt_TCPMSS.h header-y += xt_TCPOPTSTRIP.h header-y += xt_TEE.h header-y += xt_TPROXY.h +header-y += xt_addrtype.h header-y += xt_cluster.h header-y += xt_comment.h header-y += xt_connbytes.h diff --git a/include/linux/netfilter/xt_addrtype.h b/include/linux/netfilter/xt_addrtype.h new file mode 100644 index 000000000000..b492fc84f737 --- /dev/null +++ b/include/linux/netfilter/xt_addrtype.h @@ -0,0 +1,27 @@ +#ifndef _XT_ADDRTYPE_H +#define _XT_ADDRTYPE_H + +#include + +enum { + XT_ADDRTYPE_INVERT_SOURCE = 0x0001, + XT_ADDRTYPE_INVERT_DEST = 0x0002, + XT_ADDRTYPE_LIMIT_IFACE_IN = 0x0004, + XT_ADDRTYPE_LIMIT_IFACE_OUT = 0x0008, +}; + +struct xt_addrtype_info_v1 { + __u16 source; /* source-type mask */ + __u16 dest; /* dest-type mask */ + __u32 flags; +}; + +/* revision 0 */ +struct xt_addrtype_info { + __u16 source; /* source-type mask */ + __u16 dest; /* dest-type mask */ + __u32 invert_source; + __u32 invert_dest; +}; + +#endif diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index f926a310075d..1dfc18a03fd4 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -64,16 +64,6 @@ config IP_NF_IPTABLES if IP_NF_IPTABLES # The matches. -config IP_NF_MATCH_ADDRTYPE - tristate '"addrtype" address type match support' - depends on NETFILTER_ADVANCED - help - This option allows you to match what routing thinks of an address, - eg. UNICAST, LOCAL, BROADCAST, ... - - If you want to compile it as a module, say M here and read - . If unsure, say `N'. - config IP_NF_MATCH_AH tristate '"ah" match support' depends on NETFILTER_ADVANCED diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 19eb59d01037..dca2082ec683 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -48,7 +48,6 @@ obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o # matches -obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 82a6e0d80f05..32bff6d86cb2 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -649,6 +649,16 @@ config NETFILTER_XT_TARGET_TCPOPTSTRIP comment "Xtables matches" +config NETFILTER_XT_MATCH_ADDRTYPE + tristate '"addrtype" address type match support' + depends on NETFILTER_ADVANCED + ---help--- + This option allows you to match what routing thinks of an address, + eg. UNICAST, LOCAL, BROADCAST, ... + + If you want to compile it as a module, say M here and read + . If unsure, say `N'. + config NETFILTER_XT_MATCH_CLUSTER tristate '"cluster" match support' depends on NF_CONNTRACK diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index d57a890eaee5..1a02853df863 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -70,6 +70,7 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_TRACE) += xt_TRACE.o obj-$(CONFIG_NETFILTER_XT_TARGET_IDLETIMER) += xt_IDLETIMER.o # matches +obj-$(CONFIG_NETFILTER_XT_MATCH_ADDRTYPE) += xt_addrtype.o obj-$(CONFIG_NETFILTER_XT_MATCH_CLUSTER) += xt_cluster.o obj-$(CONFIG_NETFILTER_XT_MATCH_COMMENT) += xt_comment.o obj-$(CONFIG_NETFILTER_XT_MATCH_CONNBYTES) += xt_connbytes.o diff --git a/net/ipv4/netfilter/ipt_addrtype.c b/net/netfilter/xt_addrtype.c similarity index 79% rename from net/ipv4/netfilter/ipt_addrtype.c rename to net/netfilter/xt_addrtype.c index db8bff0fb86d..e89c0b84583c 100644 --- a/net/ipv4/netfilter/ipt_addrtype.c +++ b/net/netfilter/xt_addrtype.c @@ -16,12 +16,13 @@ #include #include -#include +#include #include MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy "); -MODULE_DESCRIPTION("Xtables: address type match for IPv4"); +MODULE_DESCRIPTION("Xtables: address type match"); +MODULE_ALIAS("ipt_addrtype"); static inline bool match_type(struct net *net, const struct net_device *dev, __be32 addr, u_int16_t mask) @@ -33,7 +34,7 @@ static bool addrtype_mt_v0(const struct sk_buff *skb, struct xt_action_param *par) { struct net *net = dev_net(par->in ? par->in : par->out); - const struct ipt_addrtype_info *info = par->matchinfo; + const struct xt_addrtype_info *info = par->matchinfo; const struct iphdr *iph = ip_hdr(skb); bool ret = true; @@ -51,31 +52,31 @@ static bool addrtype_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) { struct net *net = dev_net(par->in ? par->in : par->out); - const struct ipt_addrtype_info_v1 *info = par->matchinfo; + const struct xt_addrtype_info_v1 *info = par->matchinfo; const struct iphdr *iph = ip_hdr(skb); const struct net_device *dev = NULL; bool ret = true; - if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN) + if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN) dev = par->in; - else if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) + else if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT) dev = par->out; if (info->source) ret &= match_type(net, dev, iph->saddr, info->source) ^ - (info->flags & IPT_ADDRTYPE_INVERT_SOURCE); + (info->flags & XT_ADDRTYPE_INVERT_SOURCE); if (ret && info->dest) ret &= match_type(net, dev, iph->daddr, info->dest) ^ - !!(info->flags & IPT_ADDRTYPE_INVERT_DEST); + !!(info->flags & XT_ADDRTYPE_INVERT_DEST); return ret; } static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par) { - struct ipt_addrtype_info_v1 *info = par->matchinfo; + struct xt_addrtype_info_v1 *info = par->matchinfo; - if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN && - info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) { + if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN && + info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT) { pr_info("both incoming and outgoing " "interface limitation cannot be selected\n"); return -EINVAL; @@ -83,7 +84,7 @@ static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par) if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN)) && - info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) { + info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT) { pr_info("output interface limitation " "not valid in PREROUTING and INPUT\n"); return -EINVAL; @@ -91,7 +92,7 @@ static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par) if (par->hook_mask & ((1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_OUT)) && - info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN) { + info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN) { pr_info("input interface limitation " "not valid in POSTROUTING and OUTPUT\n"); return -EINVAL; @@ -105,7 +106,7 @@ static struct xt_match addrtype_mt_reg[] __read_mostly = { .name = "addrtype", .family = NFPROTO_IPV4, .match = addrtype_mt_v0, - .matchsize = sizeof(struct ipt_addrtype_info), + .matchsize = sizeof(struct xt_addrtype_info), .me = THIS_MODULE }, { @@ -114,7 +115,7 @@ static struct xt_match addrtype_mt_reg[] __read_mostly = { .revision = 1, .match = addrtype_mt_v1, .checkentry = addrtype_mt_checkentry_v1, - .matchsize = sizeof(struct ipt_addrtype_info_v1), + .matchsize = sizeof(struct xt_addrtype_info_v1), .me = THIS_MODULE } }; From 2f5dc63123905a89d4260ab8ee08d19ec104db04 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 15 Mar 2011 20:17:44 +0100 Subject: [PATCH 37/37] netfilter: xt_addrtype: ipv6 support The kernel will refuse certain types that do not work in ipv6 mode. We can then add these features incrementally without risk of userspace breakage. Signed-off-by: Florian Westphal Signed-off-by: Patrick McHardy --- include/linux/netfilter/xt_addrtype.h | 17 +++++ net/netfilter/Kconfig | 1 + net/netfilter/xt_addrtype.c | 98 ++++++++++++++++++++++++++- 3 files changed, 114 insertions(+), 2 deletions(-) diff --git a/include/linux/netfilter/xt_addrtype.h b/include/linux/netfilter/xt_addrtype.h index b492fc84f737..b156baa9d55e 100644 --- a/include/linux/netfilter/xt_addrtype.h +++ b/include/linux/netfilter/xt_addrtype.h @@ -10,6 +10,23 @@ enum { XT_ADDRTYPE_LIMIT_IFACE_OUT = 0x0008, }; + +/* rtn_type enum values from rtnetlink.h, but shifted */ +enum { + XT_ADDRTYPE_UNSPEC = 1 << 0, + XT_ADDRTYPE_UNICAST = 1 << 1, /* 1 << RTN_UNICAST */ + XT_ADDRTYPE_LOCAL = 1 << 2, /* 1 << RTN_LOCAL, etc */ + XT_ADDRTYPE_BROADCAST = 1 << 3, + XT_ADDRTYPE_ANYCAST = 1 << 4, + XT_ADDRTYPE_MULTICAST = 1 << 5, + XT_ADDRTYPE_BLACKHOLE = 1 << 6, + XT_ADDRTYPE_UNREACHABLE = 1 << 7, + XT_ADDRTYPE_PROHIBIT = 1 << 8, + XT_ADDRTYPE_THROW = 1 << 9, + XT_ADDRTYPE_NAT = 1 << 10, + XT_ADDRTYPE_XRESOLVE = 1 << 11, +}; + struct xt_addrtype_info_v1 { __u16 source; /* source-type mask */ __u16 dest; /* dest-type mask */ diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 32bff6d86cb2..c3f988aa1152 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -652,6 +652,7 @@ comment "Xtables matches" config NETFILTER_XT_MATCH_ADDRTYPE tristate '"addrtype" address type match support' depends on NETFILTER_ADVANCED + depends on (IPV6 || IPV6=n) ---help--- This option allows you to match what routing thinks of an address, eg. UNICAST, LOCAL, BROADCAST, ... diff --git a/net/netfilter/xt_addrtype.c b/net/netfilter/xt_addrtype.c index e89c0b84583c..2220b85e9519 100644 --- a/net/netfilter/xt_addrtype.c +++ b/net/netfilter/xt_addrtype.c @@ -16,6 +16,12 @@ #include #include +#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) +#include +#include +#include +#endif + #include #include @@ -23,6 +29,73 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy "); MODULE_DESCRIPTION("Xtables: address type match"); MODULE_ALIAS("ipt_addrtype"); +MODULE_ALIAS("ip6t_addrtype"); + +#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) +static u32 xt_addrtype_rt6_to_type(const struct rt6_info *rt) +{ + u32 ret; + + if (!rt) + return XT_ADDRTYPE_UNREACHABLE; + + if (rt->rt6i_flags & RTF_REJECT) + ret = XT_ADDRTYPE_UNREACHABLE; + else + ret = 0; + + if (rt->rt6i_flags & RTF_LOCAL) + ret |= XT_ADDRTYPE_LOCAL; + if (rt->rt6i_flags & RTF_ANYCAST) + ret |= XT_ADDRTYPE_ANYCAST; + return ret; +} + +static bool match_type6(struct net *net, const struct net_device *dev, + const struct in6_addr *addr, u16 mask) +{ + int addr_type = ipv6_addr_type(addr); + + if ((mask & XT_ADDRTYPE_MULTICAST) && + !(addr_type & IPV6_ADDR_MULTICAST)) + return false; + if ((mask & XT_ADDRTYPE_UNICAST) && !(addr_type & IPV6_ADDR_UNICAST)) + return false; + if ((mask & XT_ADDRTYPE_UNSPEC) && addr_type != IPV6_ADDR_ANY) + return false; + + if ((XT_ADDRTYPE_LOCAL | XT_ADDRTYPE_ANYCAST | + XT_ADDRTYPE_UNREACHABLE) & mask) { + struct rt6_info *rt; + u32 type; + int ifindex = dev ? dev->ifindex : 0; + + rt = rt6_lookup(net, addr, NULL, ifindex, !!dev); + + type = xt_addrtype_rt6_to_type(rt); + + dst_release(&rt->dst); + return !!(mask & type); + } + return true; +} + +static bool +addrtype_mt6(struct net *net, const struct net_device *dev, + const struct sk_buff *skb, const struct xt_addrtype_info_v1 *info) +{ + const struct ipv6hdr *iph = ipv6_hdr(skb); + bool ret = true; + + if (info->source) + ret &= match_type6(net, dev, &iph->saddr, info->source) ^ + (info->flags & XT_ADDRTYPE_INVERT_SOURCE); + if (ret && info->dest) + ret &= match_type6(net, dev, &iph->daddr, info->dest) ^ + !!(info->flags & XT_ADDRTYPE_INVERT_DEST); + return ret; +} +#endif static inline bool match_type(struct net *net, const struct net_device *dev, __be32 addr, u_int16_t mask) @@ -53,7 +126,7 @@ addrtype_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) { struct net *net = dev_net(par->in ? par->in : par->out); const struct xt_addrtype_info_v1 *info = par->matchinfo; - const struct iphdr *iph = ip_hdr(skb); + const struct iphdr *iph; const struct net_device *dev = NULL; bool ret = true; @@ -62,6 +135,11 @@ addrtype_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) else if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT) dev = par->out; +#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) + if (par->family == NFPROTO_IPV6) + return addrtype_mt6(net, dev, skb, info); +#endif + iph = ip_hdr(skb); if (info->source) ret &= match_type(net, dev, iph->saddr, info->source) ^ (info->flags & XT_ADDRTYPE_INVERT_SOURCE); @@ -98,6 +176,22 @@ static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par) return -EINVAL; } +#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) + if (par->family == NFPROTO_IPV6) { + if ((info->source | info->dest) & XT_ADDRTYPE_BLACKHOLE) { + pr_err("ipv6 BLACKHOLE matching not supported\n"); + return -EINVAL; + } + if ((info->source | info->dest) >= XT_ADDRTYPE_PROHIBIT) { + pr_err("ipv6 PROHIBT (THROW, NAT ..) matching not supported\n"); + return -EINVAL; + } + if ((info->source | info->dest) & XT_ADDRTYPE_BROADCAST) { + pr_err("ipv6 does not support BROADCAST matching\n"); + return -EINVAL; + } + } +#endif return 0; } @@ -111,7 +205,7 @@ static struct xt_match addrtype_mt_reg[] __read_mostly = { }, { .name = "addrtype", - .family = NFPROTO_IPV4, + .family = NFPROTO_UNSPEC, .revision = 1, .match = addrtype_mt_v1, .checkentry = addrtype_mt_checkentry_v1,