Merge git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-next

Pablo Neira Ayuso says:

====================
Netfilter/IPVS updates for net-next

The following patchset contains Netfilter/IPVS updates for your net-next tree
in this 4.4 development cycle, they are:

1) Schedule ICMP traffic to IPVS instances, this introduces a new schedule_icmp
   proc knob to enable/disable it. By default is off to retain the old
   behaviour. Patchset from Alex Gartrell.

I'm also including what Alex originally said for the record:

"The configuration of ipvs at Facebook is relatively straightforward.  All
ipvs instances bgp advertise a set of VIPs and the network prefers the
nearest one or uses ECMP in the event of a tie.  For the uninitiated, ECMP
deterministically and statelessly load balances by hashing the packet
(usually a 5-tuple of protocol, saddr, daddr, sport, and dport) and using
that number as an index (basic hash table type logic).

The problem is that ICMP packets (which contain really important
information like whether or not an MTU has been exceeded) will get a
different hash value and may end up at a different ipvs instance.  With no
information about where to route these packets, they are dropped, creating
ICMP black holes and breaking Path MTU discovery.  Suddenly, my mom's
pictures can't load and I'm fielding midday calls that I want nothing to do
with.

To address this, this patch set introduces the ability to schedule icmp
packets which is gated by a sysctl net.ipv4.vs.schedule_icmp.  If set to 0,
the old behavior is maintained -- otherwise ICMP packets are scheduled."

2) Add another proc entry to ignore tunneled packets to avoid routing loops
   from IPVS, also from Alex.

3) Fifteen patches from Eric Biederman to:

* Stop passing nf_hook_ops as parameter to the hook and use the state hook
  object instead all around the netfilter code, so only the private data
  pointer is passed to the registered hook function.

* Now that we've got state->net, propagate the netns pointer to netfilter hook
  clients to avoid its computation over and over again. A good example of how
  this has been simplified is the former TEE target (now nf_dup infrastructure)
  since it has killed the ugly pick_net() function.

There's another round of netns updates from Eric Biederman making the line. To
avoid the patchbomb again to almost all the networking mailing list (that is 84
patches) I'd suggest we send you a pull request with no patches or let me know
if you prefer a better way.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
David S. Miller
2015-09-22 13:11:43 -07:00
120 changed files with 816 additions and 653 deletions
+10
View File
@@ -157,6 +157,16 @@ expire_quiescent_template - BOOLEAN
persistence template if it is to be used to schedule a new
connection and the destination server is quiescent.
ignore_tunneled - BOOLEAN
0 - disabled (default)
not 0 - enabled
If set, ipvs will set the ipvs_property on all packets which are of
unrecognized protocols. This prevents us from routing tunneled
protocols like ipip, which is useful to prevent rescheduling
packets that have been tunneled to the ipvs host (i.e. to prevent
ipvs routing loops when ipvs is also acting as a real server).
nat_icmp_send - BOOLEAN
0 - disabled (default)
not 0 - enabled
+1 -1
View File
@@ -80,7 +80,7 @@ static inline void nf_hook_state_init(struct nf_hook_state *p,
p->okfn = okfn;
}
typedef unsigned int nf_hookfn(const struct nf_hook_ops *ops,
typedef unsigned int nf_hookfn(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state);
+2 -1
View File
@@ -13,6 +13,7 @@
* @target: the target extension
* @matchinfo: per-match data
* @targetinfo: per-target data
* @net network namespace through which the action was invoked
* @in: input netdevice
* @out: output netdevice
* @fragoff: packet is a fragment, this is the data offset
@@ -24,7 +25,6 @@
* Fields written to by extensions:
*
* @hotdrop: drop packet if we had inspection problems
* Network namespace obtainable using dev_net(in/out)
*/
struct xt_action_param {
union {
@@ -34,6 +34,7 @@ struct xt_action_param {
union {
const void *matchinfo, *targinfo;
};
struct net *net;
const struct net_device *in, *out;
int fragoff;
unsigned int thoff;
-1
View File
@@ -53,7 +53,6 @@ extern struct xt_table *arpt_register_table(struct net *net,
const struct arpt_replace *repl);
extern void arpt_unregister_table(struct xt_table *table);
extern unsigned int arpt_do_table(struct sk_buff *skb,
unsigned int hook,
const struct nf_hook_state *state,
struct xt_table *table);
+3 -3
View File
@@ -111,9 +111,9 @@ struct ebt_table {
extern struct ebt_table *ebt_register_table(struct net *net,
const struct ebt_table *table);
extern void ebt_unregister_table(struct net *net, struct ebt_table *table);
extern unsigned int ebt_do_table(unsigned int hook, struct sk_buff *skb,
const struct net_device *in, const struct net_device *out,
struct ebt_table *table);
extern unsigned int ebt_do_table(struct sk_buff *skb,
const struct nf_hook_state *state,
struct ebt_table *table);
/* Used in the kernel match() functions */
#define FWINV(bool,invflg) ((bool) ^ !!(info->invflags & invflg))
-1
View File
@@ -64,7 +64,6 @@ struct ipt_error {
extern void *ipt_alloc_initial_table(const struct xt_table *);
extern unsigned int ipt_do_table(struct sk_buff *skb,
unsigned int hook,
const struct nf_hook_state *state,
struct xt_table *table);
@@ -30,7 +30,6 @@ extern struct xt_table *ip6t_register_table(struct net *net,
const struct ip6t_replace *repl);
extern void ip6t_unregister_table(struct net *net, struct xt_table *table);
extern unsigned int ip6t_do_table(struct sk_buff *skb,
unsigned int hook,
const struct nf_hook_state *state,
struct xt_table *table);
+92 -28
View File
@@ -29,6 +29,9 @@
#endif
#include <net/net_namespace.h> /* Netw namespace */
#define IP_VS_HDR_INVERSE 1
#define IP_VS_HDR_ICMP 2
/* Generic access of ipvs struct */
static inline struct netns_ipvs *net_ipvs(struct net* net)
{
@@ -104,6 +107,8 @@ static inline struct net *seq_file_single_net(struct seq_file *seq)
extern int ip_vs_conn_tab_size;
struct ip_vs_iphdr {
int hdr_flags; /* ipvs flags */
__u32 off; /* Where IP or IPv4 header starts */
__u32 len; /* IPv4 simply where L4 starts
* IPv6 where L4 Transport Header starts */
__u16 fragoffs; /* IPv6 fragment offset, 0 if first frag (or not frag)*/
@@ -120,48 +125,89 @@ static inline void *frag_safe_skb_hp(const struct sk_buff *skb, int offset,
return skb_header_pointer(skb, offset, len, buffer);
}
static inline void
ip_vs_fill_ip4hdr(const void *nh, struct ip_vs_iphdr *iphdr)
{
const struct iphdr *iph = nh;
iphdr->len = iph->ihl * 4;
iphdr->fragoffs = 0;
iphdr->protocol = iph->protocol;
iphdr->saddr.ip = iph->saddr;
iphdr->daddr.ip = iph->daddr;
}
/* This function handles filling *ip_vs_iphdr, both for IPv4 and IPv6.
* IPv6 requires some extra work, as finding proper header position,
* depend on the IPv6 extension headers.
*/
static inline void
ip_vs_fill_iph_skb(int af, const struct sk_buff *skb, struct ip_vs_iphdr *iphdr)
static inline int
ip_vs_fill_iph_skb_off(int af, const struct sk_buff *skb, int offset,
int hdr_flags, struct ip_vs_iphdr *iphdr)
{
iphdr->hdr_flags = hdr_flags;
iphdr->off = offset;
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
const struct ipv6hdr *iph =
(struct ipv6hdr *)skb_network_header(skb);
struct ipv6hdr _iph;
const struct ipv6hdr *iph = skb_header_pointer(
skb, offset, sizeof(_iph), &_iph);
if (!iph)
return 0;
iphdr->saddr.in6 = iph->saddr;
iphdr->daddr.in6 = iph->daddr;
/* ipv6_find_hdr() updates len, flags */
iphdr->len = 0;
iphdr->len = offset;
iphdr->flags = 0;
iphdr->protocol = ipv6_find_hdr(skb, &iphdr->len, -1,
&iphdr->fragoffs,
&iphdr->flags);
if (iphdr->protocol < 0)
return 0;
} else
#endif
{
const struct iphdr *iph =
(struct iphdr *)skb_network_header(skb);
iphdr->len = iph->ihl * 4;
struct iphdr _iph;
const struct iphdr *iph = skb_header_pointer(
skb, offset, sizeof(_iph), &_iph);
if (!iph)
return 0;
iphdr->len = offset + iph->ihl * 4;
iphdr->fragoffs = 0;
iphdr->protocol = iph->protocol;
iphdr->saddr.ip = iph->saddr;
iphdr->daddr.ip = iph->daddr;
}
return 1;
}
static inline int
ip_vs_fill_iph_skb_icmp(int af, const struct sk_buff *skb, int offset,
bool inverse, struct ip_vs_iphdr *iphdr)
{
int hdr_flags = IP_VS_HDR_ICMP;
if (inverse)
hdr_flags |= IP_VS_HDR_INVERSE;
return ip_vs_fill_iph_skb_off(af, skb, offset, hdr_flags, iphdr);
}
static inline int
ip_vs_fill_iph_skb(int af, const struct sk_buff *skb, bool inverse,
struct ip_vs_iphdr *iphdr)
{
int hdr_flags = 0;
if (inverse)
hdr_flags |= IP_VS_HDR_INVERSE;
return ip_vs_fill_iph_skb_off(af, skb, skb_network_offset(skb),
hdr_flags, iphdr);
}
static inline bool
ip_vs_iph_inverse(const struct ip_vs_iphdr *iph)
{
return !!(iph->hdr_flags & IP_VS_HDR_INVERSE);
}
static inline bool
ip_vs_iph_icmp(const struct ip_vs_iphdr *iph)
{
return !!(iph->hdr_flags & IP_VS_HDR_ICMP);
}
static inline void ip_vs_addr_copy(int af, union nf_inet_addr *dst,
@@ -449,14 +495,12 @@ struct ip_vs_protocol {
struct ip_vs_conn *
(*conn_in_get)(int af,
const struct sk_buff *skb,
const struct ip_vs_iphdr *iph,
int inverse);
const struct ip_vs_iphdr *iph);
struct ip_vs_conn *
(*conn_out_get)(int af,
const struct sk_buff *skb,
const struct ip_vs_iphdr *iph,
int inverse);
const struct ip_vs_iphdr *iph);
int (*snat_handler)(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct ip_vs_conn *cp, struct ip_vs_iphdr *iph);
@@ -953,6 +997,8 @@ struct netns_ipvs {
int sysctl_pmtu_disc;
int sysctl_backup_only;
int sysctl_conn_reuse_mode;
int sysctl_schedule_icmp;
int sysctl_ignore_tunneled;
/* ip_vs_lblc */
int sysctl_lblc_expiration;
@@ -1071,6 +1117,16 @@ static inline int sysctl_conn_reuse_mode(struct netns_ipvs *ipvs)
return ipvs->sysctl_conn_reuse_mode;
}
static inline int sysctl_schedule_icmp(struct netns_ipvs *ipvs)
{
return ipvs->sysctl_schedule_icmp;
}
static inline int sysctl_ignore_tunneled(struct netns_ipvs *ipvs)
{
return ipvs->sysctl_ignore_tunneled;
}
#else
static inline int sysctl_sync_threshold(struct netns_ipvs *ipvs)
@@ -1143,6 +1199,16 @@ static inline int sysctl_conn_reuse_mode(struct netns_ipvs *ipvs)
return 1;
}
static inline int sysctl_schedule_icmp(struct netns_ipvs *ipvs)
{
return 0;
}
static inline int sysctl_ignore_tunneled(struct netns_ipvs *ipvs)
{
return 0;
}
#endif
/* IPVS core functions
@@ -1186,14 +1252,12 @@ struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p);
struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p);
struct ip_vs_conn * ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb,
const struct ip_vs_iphdr *iph,
int inverse);
const struct ip_vs_iphdr *iph);
struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p);
struct ip_vs_conn * ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb,
const struct ip_vs_iphdr *iph,
int inverse);
const struct ip_vs_iphdr *iph);
/* Get reference to gain full access to conn.
* By default, RCU read-side critical sections have access only to
+1 -1
View File
@@ -46,7 +46,7 @@ void br_netfilter_enable(void);
#if IS_ENABLED(CONFIG_IPV6)
int br_validate_ipv6(struct sk_buff *skb);
unsigned int br_nf_pre_routing_ipv6(const struct nf_hook_ops *ops,
unsigned int br_nf_pre_routing_ipv6(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state);
#else
+1 -1
View File
@@ -1,7 +1,7 @@
#ifndef _NF_DUP_IPV4_H_
#define _NF_DUP_IPV4_H_
void nf_dup_ipv4(struct sk_buff *skb, unsigned int hooknum,
void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum,
const struct in_addr *gw, int oif);
#endif /* _NF_DUP_IPV4_H_ */
+1 -1
View File
@@ -1,7 +1,7 @@
#ifndef _NF_DUP_IPV6_H_
#define _NF_DUP_IPV6_H_
void nf_dup_ipv6(struct sk_buff *skb, unsigned int hooknum,
void nf_dup_ipv6(struct net *net, struct sk_buff *skb, unsigned int hooknum,
const struct in6_addr *gw, int oif);
#endif /* _NF_DUP_IPV6_H_ */
+2 -1
View File
@@ -191,7 +191,8 @@ int nf_conntrack_hash_check_insert(struct nf_conn *ct);
bool nf_ct_delete(struct nf_conn *ct, u32 pid, int report);
bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
u_int16_t l3num, struct nf_conntrack_tuple *tuple);
u_int16_t l3num, struct net *net,
struct nf_conntrack_tuple *tuple);
bool nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
const struct nf_conntrack_tuple *orig);
@@ -41,6 +41,7 @@ void nf_conntrack_cleanup_end(void);
bool nf_ct_get_tuple(const struct sk_buff *skb, unsigned int nhoff,
unsigned int dataoff, u_int16_t l3num, u_int8_t protonum,
struct net *net,
struct nf_conntrack_tuple *tuple,
const struct nf_conntrack_l3proto *l3proto,
const struct nf_conntrack_l4proto *l4proto);
+1 -1
View File
@@ -26,7 +26,7 @@ struct nf_conntrack_l4proto {
/* Try to fill in the third arg: dataoff is offset past network protocol
hdr. Return true if possible. */
bool (*pkt_to_tuple)(const struct sk_buff *skb, unsigned int dataoff,
struct nf_conntrack_tuple *tuple);
struct net *net, struct nf_conntrack_tuple *tuple);
/* Invert the per-proto part of the tuple: ie. turn xmit into reply.
* Some packets can't be inverted: return 0 in that case.
+1 -1
View File
@@ -10,7 +10,7 @@
unsigned int nf_nat_packet(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
unsigned int hooknum, struct sk_buff *skb);
int nf_xfrm_me_harder(struct sk_buff *skb, unsigned int family);
int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family);
static inline int nf_nat_initialized(struct nf_conn *ct,
enum nf_nat_manip_type manip)
+16 -16
View File
@@ -43,31 +43,31 @@ int nf_nat_icmp_reply_translation(struct sk_buff *skb, struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
unsigned int hooknum);
unsigned int nf_nat_ipv4_in(const struct nf_hook_ops *ops, struct sk_buff *skb,
unsigned int nf_nat_ipv4_in(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state,
unsigned int (*do_chain)(const struct nf_hook_ops *ops,
unsigned int (*do_chain)(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state,
struct nf_conn *ct));
unsigned int nf_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb,
unsigned int nf_nat_ipv4_out(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state,
unsigned int (*do_chain)(const struct nf_hook_ops *ops,
unsigned int (*do_chain)(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state,
struct nf_conn *ct));
unsigned int nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops,
unsigned int nf_nat_ipv4_local_fn(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state,
unsigned int (*do_chain)(const struct nf_hook_ops *ops,
unsigned int (*do_chain)(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state,
struct nf_conn *ct));
unsigned int nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
unsigned int nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state,
unsigned int (*do_chain)(const struct nf_hook_ops *ops,
unsigned int (*do_chain)(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state,
struct nf_conn *ct));
@@ -76,31 +76,31 @@ int nf_nat_icmpv6_reply_translation(struct sk_buff *skb, struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
unsigned int hooknum, unsigned int hdrlen);
unsigned int nf_nat_ipv6_in(const struct nf_hook_ops *ops, struct sk_buff *skb,
unsigned int nf_nat_ipv6_in(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state,
unsigned int (*do_chain)(const struct nf_hook_ops *ops,
unsigned int (*do_chain)(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state,
struct nf_conn *ct));
unsigned int nf_nat_ipv6_out(const struct nf_hook_ops *ops, struct sk_buff *skb,
unsigned int nf_nat_ipv6_out(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state,
unsigned int (*do_chain)(const struct nf_hook_ops *ops,
unsigned int (*do_chain)(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state,
struct nf_conn *ct));
unsigned int nf_nat_ipv6_local_fn(const struct nf_hook_ops *ops,
unsigned int nf_nat_ipv6_local_fn(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state,
unsigned int (*do_chain)(const struct nf_hook_ops *ops,
unsigned int (*do_chain)(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state,
struct nf_conn *ct));
unsigned int nf_nat_ipv6_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
unsigned int nf_nat_ipv6_fn(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state,
unsigned int (*do_chain)(const struct nf_hook_ops *ops,
unsigned int (*do_chain)(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state,
struct nf_conn *ct));
+7 -7
View File
@@ -14,9 +14,11 @@
struct nft_pktinfo {
struct sk_buff *skb;
struct net *net;
const struct net_device *in;
const struct net_device *out;
const struct nf_hook_ops *ops;
u8 pf;
u8 hook;
u8 nhoff;
u8 thoff;
u8 tprot;
@@ -25,16 +27,15 @@ struct nft_pktinfo {
};
static inline void nft_set_pktinfo(struct nft_pktinfo *pkt,
const struct nf_hook_ops *ops,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
pkt->skb = skb;
pkt->net = pkt->xt.net = state->net;
pkt->in = pkt->xt.in = state->in;
pkt->out = pkt->xt.out = state->out;
pkt->ops = ops;
pkt->xt.hooknum = ops->hooknum;
pkt->xt.family = ops->pf;
pkt->hook = pkt->xt.hooknum = state->hook;
pkt->pf = pkt->xt.family = state->pf;
}
/**
@@ -815,8 +816,7 @@ int nft_register_basechain(struct nft_base_chain *basechain,
void nft_unregister_basechain(struct nft_base_chain *basechain,
unsigned int hook_nops);
unsigned int nft_do_chain(struct nft_pktinfo *pkt,
const struct nf_hook_ops *ops);
unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv);
/**
* struct nft_table - nf_tables table
+1 -2
View File
@@ -6,13 +6,12 @@
static inline void
nft_set_pktinfo_ipv4(struct nft_pktinfo *pkt,
const struct nf_hook_ops *ops,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
struct iphdr *ip;
nft_set_pktinfo(pkt, ops, skb, state);
nft_set_pktinfo(pkt, skb, state);
ip = ip_hdr(pkt->skb);
pkt->tprot = ip->protocol;
+1 -2
View File
@@ -6,14 +6,13 @@
static inline int
nft_set_pktinfo_ipv6(struct nft_pktinfo *pkt,
const struct nf_hook_ops *ops,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
int protohdr, thoff = 0;
unsigned short frag_off;
nft_set_pktinfo(pkt, ops, skb, state);
nft_set_pktinfo(pkt, skb, state);
protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, NULL);
/* If malformed, drop it */
+1
View File
@@ -5,6 +5,7 @@
struct tcf_connmark_info {
struct tcf_common common;
struct net *net;
u16 zone;
};

Some files were not shown because too many files have changed in this diff Show More