Commit 2fa7cc9c authored by Julian Anastasov, committed by Florian Westphal
Browse files

ipvs: switch to per-net connection table



Use per-net resizable hash table for connections. The global table
is slow to walk when using many namespaces.

The table can be resized in the range of [256 - ip_vs_conn_tab_size].
The table is attached only while services are present. Resizing is done
by delayed work based on load (the number of connections).

Add a hash_key field into the connection to store the table ID in
the highest bit and the entry's hash value in the lowest bits. The
lowest part of the hash value is used as bucket ID, the remaining
part is used to filter the entries in the bucket before matching
the keys and, as a result, helps the lookup operation to access only
one cache line. By knowing the table ID and bucket ID for an entry,
we can unlink it without calculating the hash value and doing
lookup by keys. We need only to validate the saved hash_key under
lock.

For better security, switch from jhash to siphash for the default
connection hashing; the persistence engines may still use their own
function. Keeping the hash table loaded with entries well below its
size (12%) avoids collisions for 96+% of the conns.

ip_vs_conn_fill_cport() will now rehash the connection with proper
locking because unhash+hash is not safe for RCU readers.

To invalidate the templates, setting just dport to 0xffff is enough;
there is no need to rehash them. As a result, ip_vs_conn_unhash() is
now unused and removed.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Florian Westphal <fw@strlen.de>
parent 840aac3d
Loading
Loading
Loading
Loading
+29 −5
Original line number Diff line number Diff line
@@ -36,6 +36,14 @@
#define IP_VS_HDR_INVERSE	1
#define IP_VS_HDR_ICMP		2

/* conn_tab limits (as per Kconfig)
 *
 * Both limits are bit counts. MIN_BITS of 8 presumably corresponds to the
 * 256-bucket lower bound of the resizable connection table (2^8 = 256).
 * The upper limit depends on the word size: 27 bits when BITS_PER_LONG
 * is greater than 32, 20 bits otherwise.
 */
#define IP_VS_CONN_TAB_MIN_BITS	8
#if BITS_PER_LONG > 32
#define IP_VS_CONN_TAB_MAX_BITS	27
#else
#define IP_VS_CONN_TAB_MAX_BITS	20
#endif

/* svc_table limits */
#define IP_VS_SVC_TAB_MIN_BITS	4
#define IP_VS_SVC_TAB_MAX_BITS	20
@@ -289,6 +297,7 @@ static inline int ip_vs_af_index(int af)
/* Identifiers for the resize work items.
 * NOTE(review): presumably used as bit numbers in a per-netns work flags
 * word - confirm against the users of these values.
 */
enum {
	IP_VS_WORK_SVC_RESIZE,		/* Schedule svc_resize_work */
	IP_VS_WORK_SVC_NORESIZE,	/* Stopping svc_resize_work */
	IP_VS_WORK_CONN_RESIZE,		/* Schedule conn_resize_work */
};

/* The port number of FTP service (in network order). */
@@ -779,18 +788,19 @@ struct ip_vs_conn_param {

/* IP_VS structure allocated for each dynamically scheduled connection */
struct ip_vs_conn {
	struct hlist_node	c_list;         /* hashed list heads */
	struct hlist_bl_node	c_list;         /* node in conn_tab */
	__u32			hash_key;	/* Key for the hash table */
	/* Protocol, addresses and port numbers */
	__be16                  cport;
	__be16                  dport;
	__be16                  vport;
	u16			af;		/* address family */
	__u16                   protocol;       /* Which protocol (TCP/UDP) */
	__u16			daf;		/* Address family of the dest */
	union nf_inet_addr      caddr;          /* client address */
	union nf_inet_addr      vaddr;          /* virtual address */
	union nf_inet_addr      daddr;          /* destination address */
	volatile __u32          flags;          /* status flags */
	__u16                   protocol;       /* Which protocol (TCP/UDP) */
	__u16			daf;		/* Address family of the dest */
	struct netns_ipvs	*ipvs;

	/* counter and timer */
@@ -1009,8 +1019,8 @@ struct ip_vs_pe {
	int (*fill_param)(struct ip_vs_conn_param *p, struct sk_buff *skb);
	bool (*ct_match)(const struct ip_vs_conn_param *p,
			 struct ip_vs_conn *ct);
	u32 (*hashkey_raw)(const struct ip_vs_conn_param *p, u32 initval,
			   bool inverse);
	u32 (*hashkey_raw)(const struct ip_vs_conn_param *p,
			   struct ip_vs_rht *t, bool inverse);
	int (*show_pe_data)(const struct ip_vs_conn *cp, char *buf);
	/* create connections for real-server outgoing packets */
	struct ip_vs_conn* (*conn_out)(struct ip_vs_service *svc,
@@ -1150,6 +1160,7 @@ struct netns_ipvs {
	/* ip_vs_conn */
	atomic_t		conn_count;      /* connection counter */
	atomic_t		no_cport_conns[IP_VS_AF_MAX];
	struct delayed_work	conn_resize_work;/* resize conn_tab */

	/* ip_vs_ctl */
	struct ip_vs_stats_rcu	*tot_stats;      /* Statistics & est. */
@@ -1226,6 +1237,7 @@ struct netns_ipvs {
	int			sysctl_est_nice;	/* kthread nice */
	int			est_stopped;		/* stop tasks */
#endif
	int			sysctl_conn_lfactor;
	int			sysctl_svc_lfactor;

	/* ip_vs_lblc */
@@ -1269,6 +1281,8 @@ struct netns_ipvs {
	unsigned int		hooks_afmask;	/* &1=AF_INET, &2=AF_INET6 */

	struct ip_vs_rht __rcu	*svc_table;	/* Services */
	struct ip_vs_rht __rcu	*conn_tab;	/* Connections */
	atomic_t		conn_tab_changes;/* ++ on new table */
};

#define DEFAULT_SYNC_THRESHOLD	3
@@ -1518,6 +1532,12 @@ static inline int sysctl_est_nice(struct netns_ipvs *ipvs)

#endif

/* Load factor used to map conn_count/u_thresh to t->size.
 * The per-netns value is sampled exactly once with READ_ONCE().
 */
static inline int sysctl_conn_lfactor(struct netns_ipvs *ipvs)
{
	int lfactor = READ_ONCE(ipvs->sysctl_conn_lfactor);

	return lfactor;
}

/* Get load factor to map num_services/u_thresh to t->size
 * Smaller value decreases u_thresh to reduce collisions but increases
 * the table size
@@ -1603,6 +1623,10 @@ static inline void __ip_vs_conn_put(struct ip_vs_conn *cp)
}
void ip_vs_conn_put(struct ip_vs_conn *cp);
void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport);
/* Compute the desired conn_tab size for the given load factor.
 * @t may be NULL: callers that create the first table pass NULL here and
 * feed the result to ip_vs_conn_tab_alloc() as the bucket count.
 */
int ip_vs_conn_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t,
			    int lfactor);
/* Allocate a connection table with @buckets buckets for load factor
 * @lfactor. Returns NULL on failure, which callers report as -ENOMEM.
 * A table that does not get attached to ipvs->conn_tab is released by
 * the caller with ip_vs_rht_free().
 */
struct ip_vs_rht *ip_vs_conn_tab_alloc(struct netns_ipvs *ipvs, int buckets,
				       int lfactor);

struct ip_vs_conn *ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af,
				  const union nf_inet_addr *daddr,
+595 −254

File changed.

Preview size limit exceeded, changes collapsed.

+18 −0
Original line number Diff line number Diff line
@@ -1643,6 +1643,7 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
		  struct ip_vs_service **svc_p)
{
	struct ip_vs_scheduler *sched = NULL;
	struct ip_vs_rht *tc_new = NULL;
	struct ip_vs_rht *t, *t_new = NULL;
	int af_id = ip_vs_af_index(u->af);
	struct ip_vs_service *svc = NULL;
@@ -1702,6 +1703,17 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
		}
	}

	if (!rcu_dereference_protected(ipvs->conn_tab, 1)) {
		int lfactor = sysctl_conn_lfactor(ipvs);
		int new_size = ip_vs_conn_desired_size(ipvs, NULL, lfactor);

		tc_new = ip_vs_conn_tab_alloc(ipvs, new_size, lfactor);
		if (!tc_new) {
			ret = -ENOMEM;
			goto out_err;
		}
	}

	if (!atomic_read(&ipvs->num_services[af_id])) {
		ret = ip_vs_register_hooks(ipvs, u->af);
		if (ret < 0)
@@ -1752,6 +1764,10 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
		rcu_assign_pointer(ipvs->svc_table, t_new);
		t_new = NULL;
	}
	if (tc_new) {
		rcu_assign_pointer(ipvs->conn_tab, tc_new);
		tc_new = NULL;
	}

	/* Update the virtual service counters */
	if (svc->port == FTPPORT)
@@ -1794,6 +1810,8 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,


 out_err:
	if (tc_new)
		ip_vs_rht_free(tc_new);
	if (t_new)
		ip_vs_rht_free(t_new);
	if (ret_hooks >= 0)
+2 −2
Original line number Diff line number Diff line
@@ -132,9 +132,9 @@ static bool ip_vs_sip_ct_match(const struct ip_vs_conn_param *p,
}

/* Raw hash for the SIP persistence engine: jhash over the PE data
 * (p->pe_data, p->pe_data_len).
 *
 * This hunk shows both sides of the change: the seed was the
 * caller-supplied initval and becomes the table's hash_key.key[0],
 * truncated to u32. The inverse argument is not used by the body.
 */
static u32 ip_vs_sip_hashkey_raw(const struct ip_vs_conn_param *p,
				 u32 initval, bool inverse)
				 struct ip_vs_rht *t, bool inverse)
{
	return jhash(p->pe_data, p->pe_data_len, initval);
	return jhash(p->pe_data, p->pe_data_len, (u32)t->hash_key.key[0]);
}

static int ip_vs_sip_show_pe_data(const struct ip_vs_conn *cp, char *buf)
+23 −0
Original line number Diff line number Diff line
@@ -1755,6 +1755,28 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
	if (!ip_vs_use_count_inc())
		return -ENOPROTOOPT;

	/* Backup server can be started without services just to sync conns,
	 * make sure conn_tab is created even if ipvs->enable is 0.
	 */
	if (state == IP_VS_STATE_BACKUP) {
		mutex_lock(&ipvs->service_mutex);
		if (!rcu_dereference_protected(ipvs->conn_tab, 1)) {
			int lfactor = sysctl_conn_lfactor(ipvs);
			int new_size = ip_vs_conn_desired_size(ipvs, NULL,
							       lfactor);
			struct ip_vs_rht *tc_new;

			tc_new = ip_vs_conn_tab_alloc(ipvs, new_size, lfactor);
			if (!tc_new) {
				mutex_unlock(&ipvs->service_mutex);
				result = -ENOMEM;
				goto out_module;
			}
			rcu_assign_pointer(ipvs->conn_tab, tc_new);
		}
		mutex_unlock(&ipvs->service_mutex);
	}

	/* Do not hold one mutex and then to block on another */
	for (;;) {
		rtnl_lock();
@@ -1922,6 +1944,7 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
	mutex_unlock(&ipvs->sync_mutex);
	rtnl_unlock();

out_module:
	/* decrease the module use count */
	ip_vs_use_count_dec();
	return result;