Merge branch 'net-fix-lwtunnel-reentry-loops'

Justin Iurman says:

====================
net: fix lwtunnel reentry loops

When the destination is the same after the transformation, we enter an
lwtunnel loop. This is true for most lwt users: ioam6, rpl, seg6,
seg6_local, ila_lwt, and lwt_bpf. It can happen in their input() and
output() handlers, where dst_input() or dst_output(), respectively,
is called at the end. It can also happen in xmit() handlers.

Here is an example for rpl_input():

dump_stack_lvl+0x60/0x80
rpl_input+0x9d/0x320
lwtunnel_input+0x64/0xa0
lwtunnel_input+0x64/0xa0
lwtunnel_input+0x64/0xa0
lwtunnel_input+0x64/0xa0
lwtunnel_input+0x64/0xa0
[...]
lwtunnel_input+0x64/0xa0
lwtunnel_input+0x64/0xa0
lwtunnel_input+0x64/0xa0
lwtunnel_input+0x64/0xa0
lwtunnel_input+0x64/0xa0
ip6_sublist_rcv_finish+0x85/0x90
ip6_sublist_rcv+0x236/0x2f0

... until rpl_do_srh() fails, which means skb_cow_head() failed.

This series provides a fix at the core level of lwtunnel to catch such
loops when they're not caught by the respective lwtunnel users, and
handle the loop case in ioam6 which is one of the users. This series
also comes with a new selftest to detect some dst cache reference loops
in lwtunnel users.
====================

Link: https://patch.msgid.link/20250314120048.12569-1-justin.iurman@uliege.be
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
This commit is contained in:
Paolo Abeni
2025-03-20 11:25:55 +01:00
5 changed files with 306 additions and 16 deletions

View File

@@ -23,6 +23,8 @@
#include <net/ip6_fib.h>
#include <net/rtnh.h>
#include "dev.h"
DEFINE_STATIC_KEY_FALSE(nf_hooks_lwtunnel_enabled);
EXPORT_SYMBOL_GPL(nf_hooks_lwtunnel_enabled);
@@ -325,13 +327,23 @@ EXPORT_SYMBOL_GPL(lwtunnel_cmp_encap);
int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
const struct lwtunnel_encap_ops *ops;
struct lwtunnel_state *lwtstate;
int ret = -EINVAL;
struct dst_entry *dst;
int ret;
if (!dst)
if (dev_xmit_recursion()) {
net_crit_ratelimited("%s(): recursion limit reached on datapath\n",
__func__);
ret = -ENETDOWN;
goto drop;
}
dst = skb_dst(skb);
if (!dst) {
ret = -EINVAL;
goto drop;
}
lwtstate = dst->lwtstate;
if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
@@ -341,8 +353,11 @@ int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb)
ret = -EOPNOTSUPP;
rcu_read_lock();
ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
if (likely(ops && ops->output))
if (likely(ops && ops->output)) {
dev_xmit_recursion_inc();
ret = ops->output(net, sk, skb);
dev_xmit_recursion_dec();
}
rcu_read_unlock();
if (ret == -EOPNOTSUPP)
@@ -359,13 +374,23 @@ EXPORT_SYMBOL_GPL(lwtunnel_output);
int lwtunnel_xmit(struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
const struct lwtunnel_encap_ops *ops;
struct lwtunnel_state *lwtstate;
int ret = -EINVAL;
struct dst_entry *dst;
int ret;
if (!dst)
if (dev_xmit_recursion()) {
net_crit_ratelimited("%s(): recursion limit reached on datapath\n",
__func__);
ret = -ENETDOWN;
goto drop;
}
dst = skb_dst(skb);
if (!dst) {
ret = -EINVAL;
goto drop;
}
lwtstate = dst->lwtstate;
@@ -376,8 +401,11 @@ int lwtunnel_xmit(struct sk_buff *skb)
ret = -EOPNOTSUPP;
rcu_read_lock();
ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
if (likely(ops && ops->xmit))
if (likely(ops && ops->xmit)) {
dev_xmit_recursion_inc();
ret = ops->xmit(skb);
dev_xmit_recursion_dec();
}
rcu_read_unlock();
if (ret == -EOPNOTSUPP)
@@ -394,13 +422,23 @@ EXPORT_SYMBOL_GPL(lwtunnel_xmit);
int lwtunnel_input(struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
const struct lwtunnel_encap_ops *ops;
struct lwtunnel_state *lwtstate;
int ret = -EINVAL;
struct dst_entry *dst;
int ret;
if (!dst)
if (dev_xmit_recursion()) {
net_crit_ratelimited("%s(): recursion limit reached on datapath\n",
__func__);
ret = -ENETDOWN;
goto drop;
}
dst = skb_dst(skb);
if (!dst) {
ret = -EINVAL;
goto drop;
}
lwtstate = dst->lwtstate;
if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
@@ -410,8 +448,11 @@ int lwtunnel_input(struct sk_buff *skb)
ret = -EOPNOTSUPP;
rcu_read_lock();
ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
if (likely(ops && ops->input))
if (likely(ops && ops->input)) {
dev_xmit_recursion_inc();
ret = ops->input(skb);
dev_xmit_recursion_dec();
}
rcu_read_unlock();
if (ret == -EOPNOTSUPP)

View File

@@ -337,7 +337,6 @@ static int ioam6_do_encap(struct net *net, struct sk_buff *skb,
static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb), *cache_dst = NULL;
struct in6_addr orig_daddr;
struct ioam6_lwt *ilwt;
int err = -EINVAL;
u32 pkt_cnt;
@@ -352,8 +351,6 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
if (pkt_cnt % ilwt->freq.n >= ilwt->freq.k)
goto out;
orig_daddr = ipv6_hdr(skb)->daddr;
local_bh_disable();
cache_dst = dst_cache_get(&ilwt->cache);
local_bh_enable();
@@ -422,7 +419,10 @@ do_encap:
goto drop;
}
if (!ipv6_addr_equal(&orig_daddr, &ipv6_hdr(skb)->daddr)) {
/* avoid lwtunnel_output() reentry loop when destination is the same
* after transformation (e.g., with the inline mode)
*/
if (dst->lwtstate != cache_dst->lwtstate) {
skb_dst_drop(skb);
skb_dst_set(skb, cache_dst);
return dst_output(net, sk, skb);

View File

@@ -101,6 +101,7 @@ TEST_PROGS += vlan_bridge_binding.sh
TEST_PROGS += bpf_offload.py
TEST_PROGS += ipv6_route_update_soft_lockup.sh
TEST_PROGS += busy_poll_test.sh
TEST_PROGS += lwt_dst_cache_ref_loop.sh
# YNL files, must be before "include ..lib.mk"
YNL_GEN_FILES := busy_poller netlink-dumps

View File

@@ -107,3 +107,5 @@ CONFIG_XFRM_INTERFACE=m
CONFIG_XFRM_USER=m
CONFIG_IP_NF_MATCH_RPFILTER=m
CONFIG_IP6_NF_MATCH_RPFILTER=m
CONFIG_IPV6_ILA=m
CONFIG_IPV6_RPL_LWTUNNEL=y

View File

@@ -0,0 +1,246 @@
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0+
#
# Author: Justin Iurman <justin.iurman@uliege.be>
#
# WARNING
# -------
# This is just a dummy script that triggers encap cases with possible dst cache
# reference loops in affected lwt users (see list below). Some cases are
# pathological configurations for simplicity, others are valid. Overall, we
# don't want this issue to happen, no matter what. In order to catch any
# reference loops, kmemleak MUST be used. The results alone are always blindly
# successful, don't rely on them. Note that the following tests may crash the
# kernel if the fix to prevent lwtunnel_{input|output|xmit}() reentry loops is
# not present.
#
# Affected lwt users so far (please update accordingly if needed):
# - ila_lwt (output only)
# - ioam6_iptunnel (output only)
# - rpl_iptunnel (both input and output)
# - seg6_iptunnel (both input and output)
source lib.sh
# Probe which lwtunnel encap types can be configured on this system.
#
# A throwaway netns with a veth pair is created, one route per encap type
# (ila, ioam6, rpl, seg6) is installed, and the routing table is then
# inspected. Results are published through globals:
#   skip_ila, skip_ioam6, skip_rpl, skip_seg6
#       0 when the matching "encap <type>" route is present, non-zero
#       otherwise (the corresponding run_*() test is then skipped).
#   ila_lsmod
#       0 when the "ila" module was already loaded before this script ran;
#       cleanup() only unloads the module when this is non-zero.
#
# Exits with $ksft_skip when the netns or the veth links cannot be set up.
check_compatibility()
{
	setup_ns tmp_node &>/dev/null
	if [ $? != 0 ]; then
		echo "SKIP: Cannot create netns."
		exit $ksft_skip
	fi

	# Run all three link commands and accumulate their exit statuses so
	# that a single check below covers any failure.
	ip link add name veth0 netns $tmp_node type veth \
		peer name veth1 netns $tmp_node &>/dev/null
	local ret=$?

	ip -netns $tmp_node link set veth0 up &>/dev/null
	ret=$((ret + $?))

	ip -netns $tmp_node link set veth1 up &>/dev/null
	ret=$((ret + $?))

	if [ $ret != 0 ]; then
		echo "SKIP: Cannot configure links."
		cleanup_ns $tmp_node
		exit $ksft_skip
	fi

	# Anchor the match to the module-name column: a bare substring match
	# on "ila" could hit an unrelated line of the lsmod output, in which
	# case the module would neither be loaded here nor unloaded on cleanup.
	lsmod 2>/dev/null | grep -qE '^ila[[:space:]]'
	ila_lsmod=$?
	[ $ila_lsmod != 0 ] && modprobe ila &>/dev/null

	# One candidate route per encap type; failures are deliberately
	# silent, the outcome is read back from the routing table below.
	ip -netns $tmp_node route add 2001:db8:1::/64 \
		encap ila 1:2:3:4 csum-mode no-action ident-type luid \
			hook-type output \
		dev veth0 &>/dev/null

	ip -netns $tmp_node route add 2001:db8:2::/64 \
		encap ioam6 trace prealloc type 0x800000 ns 0 size 4 \
		dev veth0 &>/dev/null

	ip -netns $tmp_node route add 2001:db8:3::/64 \
		encap rpl segs 2001:db8:3::1 dev veth0 &>/dev/null

	ip -netns $tmp_node route add 2001:db8:4::/64 \
		encap seg6 mode inline segs 2001:db8:4::1 dev veth0 &>/dev/null

	# grep -q exits 0 on a match, so each skip_* is 0 when supported.
	ip -netns $tmp_node -6 route 2>/dev/null | grep -q "encap ila"
	skip_ila=$?

	ip -netns $tmp_node -6 route 2>/dev/null | grep -q "encap ioam6"
	skip_ioam6=$?

	ip -netns $tmp_node -6 route 2>/dev/null | grep -q "encap rpl"
	skip_rpl=$?

	ip -netns $tmp_node -6 route 2>/dev/null | grep -q "encap seg6"
	skip_seg6=$?

	cleanup_ns $tmp_node
}
# Build the three-namespace topology used by every test case and verify
# that it forwards traffic before any encap route is installed:
#
#   alpha [veth0] <----> [veth0] beta [veth1] <----> [veth0] gamma
#   2001:db8:1::2     2001:db8:1::1  2001:db8:2::1     2001:db8:2::2
#
# beta has IPv6 forwarding enabled; alpha and gamma reach each other's
# prefix via beta. Exits with $ksft_skip when alpha cannot ping gamma.
setup()
{
	local net_ab=2001:db8:1
	local net_bg=2001:db8:2

	setup_ns alpha beta gamma >/dev/null 2>&1

	# Create both veth pairs under temporary names, then rename each end
	# to veth0/veth1 inside its own namespace.
	ip link add name veth-alpha netns $alpha type veth \
		peer name veth-betaL netns $beta >/dev/null 2>&1
	ip link add name veth-betaR netns $beta type veth \
		peer name veth-gamma netns $gamma >/dev/null 2>&1

	ip -netns $alpha link set veth-alpha name veth0 >/dev/null 2>&1
	ip -netns $beta link set veth-betaL name veth0 >/dev/null 2>&1
	ip -netns $beta link set veth-betaR name veth1 >/dev/null 2>&1
	ip -netns $gamma link set veth-gamma name veth0 >/dev/null 2>&1

	# alpha: single interface, route to gamma's prefix through beta.
	ip -netns $alpha addr add "${net_ab}::2/64" dev veth0 >/dev/null 2>&1
	ip -netns $alpha link set veth0 up >/dev/null 2>&1
	ip -netns $alpha link set lo up >/dev/null 2>&1
	ip -netns $alpha route add "${net_bg}::/64" \
		via "${net_ab}::1" dev veth0 >/dev/null 2>&1

	# beta: one interface on each side, acting as the router.
	ip -netns $beta addr add "${net_ab}::1/64" dev veth0 >/dev/null 2>&1
	ip -netns $beta addr add "${net_bg}::1/64" dev veth1 >/dev/null 2>&1
	ip -netns $beta link set veth0 up >/dev/null 2>&1
	ip -netns $beta link set veth1 up >/dev/null 2>&1
	ip -netns $beta link set lo up >/dev/null 2>&1
	# Delete and re-add the route for gamma's prefix; the test cases
	# later replace this entry with encap routes.
	ip -netns $beta route del "${net_bg}::/64"
	ip -netns $beta route add "${net_bg}::/64" dev veth1
	ip netns exec $beta \
		sysctl -wq net.ipv6.conf.all.forwarding=1 >/dev/null 2>&1

	# gamma: single interface, route back to alpha's prefix through beta.
	ip -netns $gamma addr add "${net_bg}::2/64" dev veth0 >/dev/null 2>&1
	ip -netns $gamma link set veth0 up >/dev/null 2>&1
	ip -netns $gamma link set lo up >/dev/null 2>&1
	ip -netns $gamma route add "${net_ab}::/64" \
		via "${net_bg}::1" dev veth0 >/dev/null 2>&1

	sleep 1

	# Sanity check: alpha must reach gamma through beta.
	if ! ip netns exec $alpha ping6 -c 5 -W 1 "${net_bg}::2" >/dev/null 2>&1; then
		echo "SKIP: Setup failed."
		exit $ksft_skip
	fi

	sleep 1
}
# Tear down the test namespaces and, if check_compatibility() recorded that
# the "ila" module was not loaded beforehand (ila_lsmod != 0), unload it.
cleanup()
{
	cleanup_ns $alpha $beta $gamma

	# Nothing more to do when the module was already loaded; a bare
	# "return" propagates the status of the test above unchanged.
	[ $ila_lsmod != 0 ] || return

	modprobe -r ila &>/dev/null
}
# ILA test case (output hook only).
#
# Replaces beta's plain route for 2001:db8:2::/64 with an ILA-encapsulated
# /128 route towards gamma's address, pings that address from beta, then
# restores the plain route. The ping result is intentionally ignored.
run_ila()
{
	local subnet=2001:db8:2::/64
	local host=2001:db8:2:0:0:0:0:2/128

	if [ "$skip_ila" -ne 0 ]; then
		echo "SKIP: ila (output)"
		return
	fi

	ip -netns $beta route del "$subnet"
	ip -netns $beta route add "$host" \
		encap ila 2001:db8:2:0 csum-mode no-action ident-type luid \
			hook-type output \
		dev veth1 >/dev/null 2>&1
	sleep 1

	echo "TEST: ila (output)"
	ip netns exec $beta ping6 -c 2 -W 1 2001:db8:2::2 >/dev/null 2>&1
	sleep 1

	# Put the original route back for the test cases that follow.
	ip -netns $beta route del "$host"
	ip -netns $beta route add "$subnet" dev veth1
	sleep 1
}
# IOAM6 test case (output only).
#
# Changes beta's route for 2001:db8:2::/64 into an ioam6 pre-allocated
# trace encap route and pings gamma from beta. The route is not restored
# here and the ping result is intentionally ignored.
run_ioam6()
{
	local subnet=2001:db8:2::/64

	if [ "$skip_ioam6" -ne 0 ]; then
		echo "SKIP: ioam6 (output)"
		return
	fi

	ip -netns $beta route change "$subnet" \
		encap ioam6 trace prealloc type 0x800000 ns 1 size 4 \
		dev veth1 >/dev/null 2>&1
	sleep 1

	echo "TEST: ioam6 (output)"
	ip netns exec $beta ping6 -c 2 -W 1 2001:db8:2::2 >/dev/null 2>&1
	sleep 1
}
# RPL test cases (input and output).
#
# Changes beta's route for 2001:db8:2::/64 into an rpl encap route whose
# single segment is gamma's address, then pings gamma twice: once from
# alpha (reported as the input case) and once from beta itself (reported
# as the output case). Ping results are intentionally ignored.
run_rpl()
{
	local subnet=2001:db8:2::/64
	local target=2001:db8:2::2

	if [ "$skip_rpl" -ne 0 ]; then
		echo "SKIP: rpl (input)"
		echo "SKIP: rpl (output)"
		return
	fi

	ip -netns $beta route change "$subnet" \
		encap rpl segs "$target" \
		dev veth1 >/dev/null 2>&1
	sleep 1

	echo "TEST: rpl (input)"
	ip netns exec $alpha ping6 -c 2 -W 1 "$target" >/dev/null 2>&1
	sleep 1

	echo "TEST: rpl (output)"
	ip netns exec $beta ping6 -c 2 -W 1 "$target" >/dev/null 2>&1
	sleep 1
}
# SEG6 test cases (input and output).
#
# Changes beta's route for 2001:db8:2::/64 into a seg6 inline-mode encap
# route whose single segment is gamma's address, then pings gamma twice:
# once from alpha (reported as the input case) and once from beta itself
# (reported as the output case). Ping results are intentionally ignored.
run_seg6()
{
	local subnet=2001:db8:2::/64
	local target=2001:db8:2::2

	if [ "$skip_seg6" -ne 0 ]; then
		echo "SKIP: seg6 (input)"
		echo "SKIP: seg6 (output)"
		return
	fi

	ip -netns $beta route change "$subnet" \
		encap seg6 mode inline segs "$target" \
		dev veth1 >/dev/null 2>&1
	sleep 1

	echo "TEST: seg6 (input)"
	ip netns exec $alpha ping6 -c 2 -W 1 "$target" >/dev/null 2>&1
	sleep 1

	echo "TEST: seg6 (output)"
	ip netns exec $beta ping6 -c 2 -W 1 "$target" >/dev/null 2>&1
	sleep 1
}
# Run every test case in a fixed order: ila, ioam6, rpl, seg6.
# Each case decides for itself whether to skip.
run()
{
	local testcase

	for testcase in run_ila run_ioam6 run_rpl run_seg6; do
		$testcase
	done
}
# Entry point: check prerequisites, probe encap support, build the
# topology, run the test cases and always report success (see the WARNING
# in the file header).

[ "$(id -u)" -ne 0 ] && {
	echo "SKIP: Need root privileges."
	exit $ksft_skip
}

[ ! -x "$(command -v ip)" ] && {
	echo "SKIP: Could not run test without ip tool."
	exit $ksft_skip
}

check_compatibility

# Installed only after the probe, so cleanup() runs on every later exit
# path, including the skip taken inside setup().
trap cleanup EXIT

setup
run

exit $ksft_pass