From 2c87309ea741341c6722efdf1fb3f50dd427c823 Mon Sep 17 00:00:00 2001 From: Keisuke Nishimura Date: Tue, 29 Oct 2024 19:27:12 +0100 Subject: [PATCH 01/57] ieee802154: ca8210: Add missing check for kfifo_alloc() in ca8210_probe() ca8210_test_interface_init() returns the result of kfifo_alloc(), which can be non-zero in case of an error. The caller, ca8210_probe(), should check the return value and do error-handling if it fails. Fixes: ded845a781a5 ("ieee802154: Add CA8210 IEEE 802.15.4 device driver") Signed-off-by: Keisuke Nishimura Reviewed-by: Simon Horman Reviewed-by: Miquel Raynal Link: https://lore.kernel.org/20241029182712.318271-1-keisuke.nishimura@inria.fr Signed-off-by: Stefan Schmidt --- drivers/net/ieee802154/ca8210.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ieee802154/ca8210.c b/drivers/net/ieee802154/ca8210.c index e685a7f946f0..753215ebc67c 100644 --- a/drivers/net/ieee802154/ca8210.c +++ b/drivers/net/ieee802154/ca8210.c @@ -3072,7 +3072,11 @@ static int ca8210_probe(struct spi_device *spi_device) spi_set_drvdata(priv->spi, priv); if (IS_ENABLED(CONFIG_IEEE802154_CA8210_DEBUGFS)) { cascoda_api_upstream = ca8210_test_int_driver_write; - ca8210_test_interface_init(priv); + ret = ca8210_test_interface_init(priv); + if (ret) { + dev_crit(&spi_device->dev, "ca8210_test_interface_init failed\n"); + goto error; + } } else { cascoda_api_upstream = NULL; } From eb09fbeb48709fe66c0d708aed81e910a577a30a Mon Sep 17 00:00:00 2001 From: Lizhi Xu Date: Wed, 13 Nov 2024 17:51:29 +0800 Subject: [PATCH 02/57] mac802154: check local interfaces before deleting sdata list syzkaller reported a corrupted list in ieee802154_if_remove. [1] Remove an IEEE 802.15.4 network interface after unregister an IEEE 802.15.4 hardware device from the system. 
CPU0 CPU1 ==== ==== genl_family_rcv_msg_doit ieee802154_unregister_hw ieee802154_del_iface ieee802154_remove_interfaces rdev_del_virtual_intf_deprecated list_del(&sdata->list) ieee802154_if_remove list_del_rcu The net device has been unregistered, since the rcu grace period, unregistration must be run before ieee802154_if_remove. To avoid this issue, add a check for local->interfaces before deleting sdata list. [1] kernel BUG at lib/list_debug.c:58! Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI CPU: 0 UID: 0 PID: 6277 Comm: syz-executor157 Not tainted 6.12.0-rc6-syzkaller-00005-g557329bcecc2 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 RIP: 0010:__list_del_entry_valid_or_report+0xf4/0x140 lib/list_debug.c:56 Code: e8 a1 7e 00 07 90 0f 0b 48 c7 c7 e0 37 60 8c 4c 89 fe e8 8f 7e 00 07 90 0f 0b 48 c7 c7 40 38 60 8c 4c 89 fe e8 7d 7e 00 07 90 <0f> 0b 48 c7 c7 a0 38 60 8c 4c 89 fe e8 6b 7e 00 07 90 0f 0b 48 c7 RSP: 0018:ffffc9000490f3d0 EFLAGS: 00010246 RAX: 000000000000004e RBX: dead000000000122 RCX: d211eee56bb28d00 RDX: 0000000000000000 RSI: 0000000080000000 RDI: 0000000000000000 RBP: ffff88805b278dd8 R08: ffffffff8174a12c R09: 1ffffffff2852f0d R10: dffffc0000000000 R11: fffffbfff2852f0e R12: dffffc0000000000 R13: dffffc0000000000 R14: dead000000000100 R15: ffff88805b278cc0 FS: 0000555572f94380(0000) GS:ffff8880b8600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000056262e4a3000 CR3: 0000000078496000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __list_del_entry_valid include/linux/list.h:124 [inline] __list_del_entry include/linux/list.h:215 [inline] list_del_rcu include/linux/rculist.h:157 [inline] ieee802154_if_remove+0x86/0x1e0 net/mac802154/iface.c:687 rdev_del_virtual_intf_deprecated net/ieee802154/rdev-ops.h:24 [inline] 
ieee802154_del_iface+0x2c0/0x5c0 net/ieee802154/nl-phy.c:323 genl_family_rcv_msg_doit net/netlink/genetlink.c:1115 [inline] genl_family_rcv_msg net/netlink/genetlink.c:1195 [inline] genl_rcv_msg+0xb14/0xec0 net/netlink/genetlink.c:1210 netlink_rcv_skb+0x1e3/0x430 net/netlink/af_netlink.c:2551 genl_rcv+0x28/0x40 net/netlink/genetlink.c:1219 netlink_unicast_kernel net/netlink/af_netlink.c:1331 [inline] netlink_unicast+0x7f6/0x990 net/netlink/af_netlink.c:1357 netlink_sendmsg+0x8e4/0xcb0 net/netlink/af_netlink.c:1901 sock_sendmsg_nosec net/socket.c:729 [inline] __sock_sendmsg+0x221/0x270 net/socket.c:744 ____sys_sendmsg+0x52a/0x7e0 net/socket.c:2607 ___sys_sendmsg net/socket.c:2661 [inline] __sys_sendmsg+0x292/0x380 net/socket.c:2690 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Reported-and-tested-by: syzbot+985f827280dc3a6e7e92@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=985f827280dc3a6e7e92 Signed-off-by: Lizhi Xu Reviewed-by: Miquel Raynal Link: https://lore.kernel.org/20241113095129.1457225-1-lizhi.xu@windriver.com Signed-off-by: Stefan Schmidt --- net/mac802154/iface.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c index c0e2da5072be..9e4631fade90 100644 --- a/net/mac802154/iface.c +++ b/net/mac802154/iface.c @@ -684,6 +684,10 @@ void ieee802154_if_remove(struct ieee802154_sub_if_data *sdata) ASSERT_RTNL(); mutex_lock(&sdata->local->iflist_mtx); + if (list_empty(&sdata->local->interfaces)) { + mutex_unlock(&sdata->local->iflist_mtx); + return; + } list_del_rcu(&sdata->list); mutex_unlock(&sdata->local->iflist_mtx); From 1e9b0e1c550c42c13c111d1a31e822057232abc4 Mon Sep 17 00:00:00 2001 From: Antonio Pastor Date: Thu, 2 Jan 2025 20:23:00 -0500 Subject: [PATCH 03/57] net: 802: LLC+SNAP OID:PID lookup on start of skb data 802.2+LLC+SNAP frames received by napi_complete_done() with GRO and 
DSA have skb->transport_header set two bytes short, or pointing 2 bytes before network_header & skb->data. This was an issue as snap_rcv() expected offset to point to SNAP header (OID:PID), causing packet to be dropped. A fix at llc_fixup_skb() (a024e377efed) resets transport_header for any LLC consumers that may care about it, and stops SNAP packets from being dropped, but doesn't fix the problem which is that LLC and SNAP should not use transport_header offset. This patch eliminates the use of transport_header offset for SNAP lookup of OID:PID so that SNAP does not rely on the offset at all. The offset is reset after pull for any SNAP packet consumers that may (but shouldn't) use it. Fixes: fda55eca5a33 ("net: introduce skb_transport_header_was_set()") Signed-off-by: Antonio Pastor Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250103012303.746521-1-antonio.pastor@gmail.com Signed-off-by: Jakub Kicinski --- net/802/psnap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/802/psnap.c b/net/802/psnap.c index fca9d454905f..389df460c8c4 100644 --- a/net/802/psnap.c +++ b/net/802/psnap.c @@ -55,11 +55,11 @@ static int snap_rcv(struct sk_buff *skb, struct net_device *dev, goto drop; rcu_read_lock(); - proto = find_snap_client(skb_transport_header(skb)); + proto = find_snap_client(skb->data); if (proto) { /* Pass the frame on. */ - skb->transport_header += 5; skb_pull_rcsum(skb, 5); + skb_reset_transport_header(skb); rc = proto->rcvfunc(skb, dev, &snap_packet_type, orig_dev); } rcu_read_unlock(); From 3479c7549fb1dfa7a1db4efb7347c7b8ef50de4b Mon Sep 17 00:00:00 2001 From: Zhongqiu Duan Date: Thu, 2 Jan 2025 17:14:26 +0000 Subject: [PATCH 04/57] tcp/dccp: allow a connection when sk_max_ack_backlog is zero If the backlog of listen() is set to zero, sk_acceptq_is_full() allows one connection to be made, but inet_csk_reqsk_queue_is_full() does not.
When net.ipv4.tcp_syncookies is zero, inet_csk_reqsk_queue_is_full() will cause an immediate drop before the sk_acceptq_is_full() check in tcp_conn_request(), so that no connection can be made. This patch keeps the behavior consistent with 64a146513f8f ("[NET]: Revert incorrect accept queue backlog changes."). Link: https://lore.kernel.org/netdev/20250102080258.53858-1-kuniyu@amazon.com/ Fixes: ef547f2ac16b ("tcp: remove max_qlen_log") Signed-off-by: Zhongqiu Duan Reviewed-by: Kuniyuki Iwashima Reviewed-by: Jason Xing Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250102171426.915276-1-dzq.aishenghu0@gmail.com Signed-off-by: Jakub Kicinski --- include/net/inet_connection_sock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 3c82fad904d4..c7f42844c79a 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -282,7 +282,7 @@ static inline int inet_csk_reqsk_queue_len(const struct sock *sk) static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk) { - return inet_csk_reqsk_queue_len(sk) >= READ_ONCE(sk->sk_max_ack_backlog); + return inet_csk_reqsk_queue_len(sk) > READ_ONCE(sk->sk_max_ack_backlog); } bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req); From a039e54397c6a75b713b9ce7894a62e06956aa92 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 3 Jan 2025 10:45:46 +0000 Subject: [PATCH 05/57] net_sched: cls_flow: validate TCA_FLOW_RSHIFT attribute syzbot found that TCA_FLOW_RSHIFT attribute was not validated. Right shifting a 32bit integer is undefined for large shift values.
UBSAN: shift-out-of-bounds in net/sched/cls_flow.c:329:23 shift exponent 9445 is too large for 32-bit type 'u32' (aka 'unsigned int') CPU: 1 UID: 0 PID: 54 Comm: kworker/u8:3 Not tainted 6.13.0-rc3-syzkaller-00180-g4f619d518db9 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 Workqueue: ipv6_addrconf addrconf_dad_work Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120 ubsan_epilogue lib/ubsan.c:231 [inline] __ubsan_handle_shift_out_of_bounds+0x3c8/0x420 lib/ubsan.c:468 flow_classify+0x24d5/0x25b0 net/sched/cls_flow.c:329 tc_classify include/net/tc_wrapper.h:197 [inline] __tcf_classify net/sched/cls_api.c:1771 [inline] tcf_classify+0x420/0x1160 net/sched/cls_api.c:1867 sfb_classify net/sched/sch_sfb.c:260 [inline] sfb_enqueue+0x3ad/0x18b0 net/sched/sch_sfb.c:318 dev_qdisc_enqueue+0x4b/0x290 net/core/dev.c:3793 __dev_xmit_skb net/core/dev.c:3889 [inline] __dev_queue_xmit+0xf0e/0x3f50 net/core/dev.c:4400 dev_queue_xmit include/linux/netdevice.h:3168 [inline] neigh_hh_output include/net/neighbour.h:523 [inline] neigh_output include/net/neighbour.h:537 [inline] ip_finish_output2+0xd41/0x1390 net/ipv4/ip_output.c:236 iptunnel_xmit+0x55d/0x9b0 net/ipv4/ip_tunnel_core.c:82 udp_tunnel_xmit_skb+0x262/0x3b0 net/ipv4/udp_tunnel_core.c:173 geneve_xmit_skb drivers/net/geneve.c:916 [inline] geneve_xmit+0x21dc/0x2d00 drivers/net/geneve.c:1039 __netdev_start_xmit include/linux/netdevice.h:5002 [inline] netdev_start_xmit include/linux/netdevice.h:5011 [inline] xmit_one net/core/dev.c:3590 [inline] dev_hard_start_xmit+0x27a/0x7d0 net/core/dev.c:3606 __dev_queue_xmit+0x1b73/0x3f50 net/core/dev.c:4434 Fixes: e5dfb815181f ("[NET_SCHED]: Add flow classifier") Reported-by: syzbot+1dbb57d994e54aaa04d2@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/6777bf49.050a0220.178762.0040.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Link: 
https://patch.msgid.link/20250103104546.3714168-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/sched/cls_flow.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 5502998aace7..5c2580a07530 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -356,7 +356,8 @@ static const struct nla_policy flow_policy[TCA_FLOW_MAX + 1] = { [TCA_FLOW_KEYS] = { .type = NLA_U32 }, [TCA_FLOW_MODE] = { .type = NLA_U32 }, [TCA_FLOW_BASECLASS] = { .type = NLA_U32 }, - [TCA_FLOW_RSHIFT] = { .type = NLA_U32 }, + [TCA_FLOW_RSHIFT] = NLA_POLICY_MAX(NLA_U32, + 31 /* BITS_PER_U32 - 1 */), [TCA_FLOW_ADDEND] = { .type = NLA_U32 }, [TCA_FLOW_MASK] = { .type = NLA_U32 }, [TCA_FLOW_XOR] = { .type = NLA_U32 }, From e95274dfe86490ec2a5633035c24b2de6722841f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 3 Jan 2025 10:24:58 -0800 Subject: [PATCH 06/57] selftests: tc-testing: reduce rshift value After previous change rshift >= 32 is no longer allowed. Modify the test to use 31, the test doesn't seem to send any traffic so the exact value shouldn't matter. 
Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250103182458.1213486-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/tc-testing/tc-tests/filters/flow.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/tc-testing/tc-tests/filters/flow.json b/tools/testing/selftests/tc-testing/tc-tests/filters/flow.json index 996448afe31b..91d120548bf5 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/filters/flow.json +++ b/tools/testing/selftests/tc-testing/tc-tests/filters/flow.json @@ -78,10 +78,10 @@ "setup": [ "$TC qdisc add dev $DEV1 ingress" ], - "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 prio 1 protocol ip flow map key dst rshift 0xff", + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 prio 1 protocol ip flow map key dst rshift 0x1f", "expExitCode": "0", "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 protocol ip prio 1 flow", - "matchPattern": "filter parent ffff: protocol ip pref 1 flow chain [0-9]+ handle 0x1 map keys dst rshift 255 baseclass", + "matchPattern": "filter parent ffff: protocol ip pref 1 flow chain [0-9]+ handle 0x1 map keys dst rshift 31 baseclass", "matchCount": "1", "teardown": [ "$TC qdisc del dev $DEV1 ingress" From 8ce4f287524c74a118b0af1eebd4b24a8efca57a Mon Sep 17 00:00:00 2001 From: Jiawen Wu Date: Fri, 3 Jan 2025 16:10:13 +0800 Subject: [PATCH 07/57] net: libwx: fix firmware mailbox abnormal return The existing SW-FW interaction flow in the driver is wrong. Following this wrong flow, the driver would never return an error if there is an unknown command, because the firmware writes back both 'firmware ready' and 'unknown command' in the mailbox message when the driver sends an unknown command. So reading 'firmware ready' does not time out, and the driver would mistakenly believe that the interaction has completed successfully. It tends to happen with the use of custom firmware.
Move the check for 'unknown command' out of the poll timeout for 'firmware ready'. And adjust the debug log so that mailbox messages are always printed when commands timeout. Fixes: 1efa9bfe58c5 ("net: libwx: Implement interaction with firmware") Signed-off-by: Jiawen Wu Link: https://patch.msgid.link/20250103081013.1995939-1-jiawenwu@trustnetic.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/wangxun/libwx/wx_hw.c | 24 ++++++++++------------ 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/wangxun/libwx/wx_hw.c b/drivers/net/ethernet/wangxun/libwx/wx_hw.c index 1bf9c38e4125..deaf670c160e 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_hw.c +++ b/drivers/net/ethernet/wangxun/libwx/wx_hw.c @@ -334,27 +334,25 @@ int wx_host_interface_command(struct wx *wx, u32 *buffer, status = read_poll_timeout(rd32, hicr, hicr & WX_MNG_MBOX_CTL_FWRDY, 1000, timeout * 1000, false, wx, WX_MNG_MBOX_CTL); + buf[0] = rd32(wx, WX_MNG_MBOX); + if ((buf[0] & 0xff0000) >> 16 == 0x80) { + wx_err(wx, "Unknown FW command: 0x%x\n", buffer[0] & 0xff); + status = -EINVAL; + goto rel_out; + } + /* Check command completion */ if (status) { - wx_dbg(wx, "Command has failed with no status valid.\n"); - - buf[0] = rd32(wx, WX_MNG_MBOX); - if ((buffer[0] & 0xff) != (~buf[0] >> 24)) { - status = -EINVAL; - goto rel_out; - } - if ((buf[0] & 0xff0000) >> 16 == 0x80) { - wx_dbg(wx, "It's unknown cmd.\n"); - status = -EINVAL; - goto rel_out; - } - + wx_err(wx, "Command has failed with no status valid.\n"); wx_dbg(wx, "write value:\n"); for (i = 0; i < dword_len; i++) wx_dbg(wx, "%x ", buffer[i]); wx_dbg(wx, "read value:\n"); for (i = 0; i < dword_len; i++) wx_dbg(wx, "%x ", buf[i]); + wx_dbg(wx, "\ncheck: %x %x\n", buffer[0] & 0xff, ~buf[0] >> 24); + + goto rel_out; } if (!return_data) From 8c817eb26230dc0ae553cee16ff43a4a895f6756 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Fri, 3 Jan 2025 11:51:47 -0800 Subject: [PATCH 08/57] pds_core: limit loop over 
fw name list Add an array size limit to the for-loop to be sure we don't try to reference a fw_version string off the end of the fw info names array. We know that our firmware only has a limited number of firmware slot names, but we shouldn't leave this unchecked. Fixes: 45d76f492938 ("pds_core: set up device and adminq") Signed-off-by: Shannon Nelson Reviewed-by: Simon Horman Reviewed-by: Brett Creeley Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20250103195147.7408-1-shannon.nelson@amd.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/amd/pds_core/devlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/amd/pds_core/devlink.c b/drivers/net/ethernet/amd/pds_core/devlink.c index 2681889162a2..44971e71991f 100644 --- a/drivers/net/ethernet/amd/pds_core/devlink.c +++ b/drivers/net/ethernet/amd/pds_core/devlink.c @@ -118,7 +118,7 @@ int pdsc_dl_info_get(struct devlink *dl, struct devlink_info_req *req, if (err && err != -EIO) return err; - listlen = fw_list.num_fw_slots; + listlen = min(fw_list.num_fw_slots, ARRAY_SIZE(fw_list.fw_names)); for (i = 0; i < listlen; i++) { if (i < ARRAY_SIZE(fw_slotnames)) strscpy(buf, fw_slotnames[i], sizeof(buf)); From c8dafb0e4398dacc362832098a04b97da3b0395b Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Fri, 3 Jan 2025 20:38:47 -0800 Subject: [PATCH 09/57] bnxt_en: Fix possible memory leak when hwrm_req_replace fails When hwrm_req_replace() fails, the driver is not invoking bnxt_req_drop() which could cause a memory leak. 
Fixes: bbf33d1d9805 ("bnxt_en: update all firmware calls to use the new APIs") Reviewed-by: Pavan Chebbi Signed-off-by: Kalesh AP Signed-off-by: Michael Chan Link: https://patch.msgid.link/20250104043849.3482067-2-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c index b771c84cdd89..0ed26e3a28f4 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c @@ -208,7 +208,7 @@ int bnxt_send_msg(struct bnxt_en_dev *edev, rc = hwrm_req_replace(bp, req, fw_msg->msg, fw_msg->msg_len); if (rc) - return rc; + goto drop_req; hwrm_req_timeout(bp, req, fw_msg->timeout); resp = hwrm_req_hold(bp, req); @@ -220,6 +220,7 @@ int bnxt_send_msg(struct bnxt_en_dev *edev, memcpy(fw_msg->resp, resp, resp_len); } +drop_req: hwrm_req_drop(bp, req); return rc; } From 40452969a50652e3cbf89dac83d54eebf2206d27 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Fri, 3 Jan 2025 20:38:48 -0800 Subject: [PATCH 10/57] bnxt_en: Fix DIM shutdown DIM work will call the firmware to adjust the coalescing parameters on the RX rings. We should cancel DIM work before we call the firmware to free the RX rings. Otherwise, FW will reject the call from DIM work if the RX ring has been freed. This will generate an error message like this: bnxt_en 0000:21:00.1 ens2f1np1: hwrm req_type 0x53 seq id 0x6fca error 0x2 and cause unnecessary concern for the user. It is also possible to modify the coalescing parameters of the wrong ring if the ring has been re-allocated. To prevent this, cancel DIM work right before freeing the RX rings. We also have to add a check in NAPI poll to not schedule DIM if the RX rings are shutting down. Check that the VNIC is active before we schedule DIM. The VNIC is always disabled before we free the RX rings. 
Fixes: 0bc0b97fca73 ("bnxt_en: cleanup DIM work on device shutdown") Reviewed-by: Hongguang Gao Reviewed-by: Kalesh AP Reviewed-by: Somnath Kotur Signed-off-by: Michael Chan Link: https://patch.msgid.link/20250104043849.3482067-3-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 38 ++++++++++++++++++++--- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index b86f980fa7ea..aeaa74f03046 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -2897,6 +2897,13 @@ static int bnxt_hwrm_handler(struct bnxt *bp, struct tx_cmp *txcmp) return 0; } +static bool bnxt_vnic_is_active(struct bnxt *bp) +{ + struct bnxt_vnic_info *vnic = &bp->vnic_info[0]; + + return vnic->fw_vnic_id != INVALID_HW_RING_ID && vnic->mru > 0; +} + static irqreturn_t bnxt_msix(int irq, void *dev_instance) { struct bnxt_napi *bnapi = dev_instance; @@ -3164,7 +3171,7 @@ static int bnxt_poll(struct napi_struct *napi, int budget) break; } } - if (bp->flags & BNXT_FLAG_DIM) { + if ((bp->flags & BNXT_FLAG_DIM) && bnxt_vnic_is_active(bp)) { struct dim_sample dim_sample = {}; dim_update_sample(cpr->event_ctr, @@ -3295,7 +3302,7 @@ static int bnxt_poll_p5(struct napi_struct *napi, int budget) poll_done: cpr_rx = &cpr->cp_ring_arr[0]; if (cpr_rx->cp_ring_type == BNXT_NQ_HDL_TYPE_RX && - (bp->flags & BNXT_FLAG_DIM)) { + (bp->flags & BNXT_FLAG_DIM) && bnxt_vnic_is_active(bp)) { struct dim_sample dim_sample = {}; dim_update_sample(cpr->event_ctr, @@ -7266,6 +7273,26 @@ err_out: return rc; } +static void bnxt_cancel_dim(struct bnxt *bp) +{ + int i; + + /* DIM work is initialized in bnxt_enable_napi(). Proceed only + * if NAPI is enabled. 
+ */ + if (!bp->bnapi || test_bit(BNXT_STATE_NAPI_DISABLED, &bp->state)) + return; + + /* Make sure NAPI sees that the VNIC is disabled */ + synchronize_net(); + for (i = 0; i < bp->rx_nr_rings; i++) { + struct bnxt_rx_ring_info *rxr = &bp->rx_ring[i]; + struct bnxt_napi *bnapi = rxr->bnapi; + + cancel_work_sync(&bnapi->cp_ring.dim.work); + } +} + static int hwrm_ring_free_send_msg(struct bnxt *bp, struct bnxt_ring_struct *ring, u32 ring_type, int cmpl_ring_id) @@ -7366,6 +7393,7 @@ static void bnxt_hwrm_ring_free(struct bnxt *bp, bool close_path) } } + bnxt_cancel_dim(bp); for (i = 0; i < bp->rx_nr_rings; i++) { bnxt_hwrm_rx_ring_free(bp, &bp->rx_ring[i], close_path); bnxt_hwrm_rx_agg_ring_free(bp, &bp->rx_ring[i], close_path); @@ -11309,8 +11337,6 @@ static void bnxt_disable_napi(struct bnxt *bp) if (bnapi->in_reset) cpr->sw_stats->rx.rx_resets++; napi_disable(&bnapi->napi); - if (bnapi->rx_ring) - cancel_work_sync(&cpr->dim.work); } } @@ -15572,8 +15598,10 @@ static int bnxt_queue_stop(struct net_device *dev, void *qmem, int idx) bnxt_hwrm_vnic_update(bp, vnic, VNIC_UPDATE_REQ_ENABLES_MRU_VALID); } - + /* Make sure NAPI sees that the VNIC is disabled */ + synchronize_net(); rxr = &bp->rx_ring[idx]; + cancel_work_sync(&rxr->bnapi->cp_ring.dim.work); bnxt_hwrm_rx_ring_free(bp, rxr, false); bnxt_hwrm_rx_agg_ring_free(bp, rxr, false); rxr->rx_next_cons = 0; From 4c1224501e9d6c5fd12d83752f1c1b444e0e3418 Mon Sep 17 00:00:00 2001 From: Anumula Murali Mohan Reddy Date: Fri, 3 Jan 2025 14:53:27 +0530 Subject: [PATCH 11/57] cxgb4: Avoid removal of uninserted tid During ARP failure, tid is not inserted but _c4iw_free_ep() attempts to remove tid which results in error. This patch fixes the issue by avoiding removal of uninserted tid. 
Fixes: 59437d78f088 ("cxgb4/chtls: fix ULD connection failures due to wrong TID base") Signed-off-by: Anumula Murali Mohan Reddy Signed-off-by: Potnuri Bharat Teja Link: https://patch.msgid.link/20250103092327.1011925-1-anumula@chelsio.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index bc3af0054406..604dcfd49aa4 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -1799,7 +1799,10 @@ void cxgb4_remove_tid(struct tid_info *t, unsigned int chan, unsigned int tid, struct adapter *adap = container_of(t, struct adapter, tids); struct sk_buff *skb; - WARN_ON(tid_out_of_range(&adap->tids, tid)); + if (tid_out_of_range(&adap->tids, tid)) { + dev_err(adap->pdev_dev, "tid %d out of range\n", tid); + return; + } if (t->tid_tab[tid - adap->tids.tid_base]) { t->tid_tab[tid - adap->tids.tid_base] = NULL; From fd48f071a3d6d51e737e953bb43fe69785cf59a9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 3 Jan 2025 10:32:07 -0800 Subject: [PATCH 12/57] net: don't dump Tx and uninitialized NAPIs We use NAPI ID as the key for continuing dumps. We also depend on the NAPIs being sorted by ID within the driver list. Tx NAPIs (which don't have an ID assigned) break this expectation, it's not currently possible to dump them reliably. Since Tx NAPIs are relatively rare, and can't be used in doit (GET or SET) hide them from the dump API as well. 
Fixes: 27f91aaf49b3 ("netdev-genl: Add netlink framework functions for napi") Signed-off-by: Jakub Kicinski Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250103183207.1216004-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- net/core/netdev-genl.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index b0772d135efb..125b660004d3 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -176,8 +176,7 @@ netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, if (!hdr) return -EMSGSIZE; - if (napi->napi_id >= MIN_NAPI_ID && - nla_put_u32(rsp, NETDEV_A_NAPI_ID, napi->napi_id)) + if (nla_put_u32(rsp, NETDEV_A_NAPI_ID, napi->napi_id)) goto nla_put_failure; if (nla_put_u32(rsp, NETDEV_A_NAPI_IFINDEX, napi->dev->ifindex)) @@ -272,6 +271,8 @@ netdev_nl_napi_dump_one(struct net_device *netdev, struct sk_buff *rsp, return err; list_for_each_entry(napi, &netdev->napi_list, dev_list) { + if (napi->napi_id < MIN_NAPI_ID) + continue; if (ctx->napi_id && napi->napi_id >= ctx->napi_id) continue; From 65104599b3a8ed42d85b3f8f27be650afe1f3a7e Mon Sep 17 00:00:00 2001 From: Arkadiusz Kubalewski Date: Wed, 20 Nov 2024 08:51:12 +0100 Subject: [PATCH 13/57] ice: fix max values for dpll pin phase adjust Mask admin command returned max phase adjust value for both input and output pins. Only 31 bits are relevant, last released data sheet wrongly points that 32 bits are valid - see [1] 3.2.6.4.1 Get CCU Capabilities Command for reference. Fix of the datasheet itself is in progress. Fix the min/max assignment logic, previously the value was wrongly considered as negative value due to most significant bit being set. 
Example of previous broken behavior: $ ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/dpll.yaml \ --do pin-get --json '{"id":1}'| grep phase-adjust 'phase-adjust': 0, 'phase-adjust-max': 16723, 'phase-adjust-min': -16723, Correct behavior with the fix: $ ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/dpll.yaml \ --do pin-get --json '{"id":1}'| grep phase-adjust 'phase-adjust': 0, 'phase-adjust-max': 2147466925, 'phase-adjust-min': -2147466925, [1] https://cdrdv2.intel.com/v1/dl/getContent/613875?explicitVersion=true Fixes: 90e1c90750d7 ("ice: dpll: implement phase related callbacks") Reviewed-by: Przemek Kitszel Signed-off-by: Arkadiusz Kubalewski Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- .../net/ethernet/intel/ice/ice_adminq_cmd.h | 2 ++ drivers/net/ethernet/intel/ice/ice_dpll.c | 35 ++++++++++++------- 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h index 1489a8ceec51..ef14cff9a333 100644 --- a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h +++ b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h @@ -2264,6 +2264,8 @@ struct ice_aqc_get_pkg_info_resp { struct ice_aqc_get_pkg_info pkg_info[]; }; +#define ICE_AQC_GET_CGU_MAX_PHASE_ADJ GENMASK(30, 0) + /* Get CGU abilities command response data structure (indirect 0x0C61) */ struct ice_aqc_get_cgu_abilities { u8 num_inputs; diff --git a/drivers/net/ethernet/intel/ice/ice_dpll.c b/drivers/net/ethernet/intel/ice/ice_dpll.c index d5ad6d84007c..38e151c7ea23 100644 --- a/drivers/net/ethernet/intel/ice/ice_dpll.c +++ b/drivers/net/ethernet/intel/ice/ice_dpll.c @@ -2064,6 +2064,18 @@ static int ice_dpll_init_worker(struct ice_pf *pf) return 0; } +/** + * ice_dpll_phase_range_set - initialize phase adjust range helper + * @range: pointer to phase adjust range struct to be initialized + * @phase_adj: a value to be used as min(-)/max(+) 
boundary + */ +static void ice_dpll_phase_range_set(struct dpll_pin_phase_adjust_range *range, + u32 phase_adj) +{ + range->min = -phase_adj; + range->max = phase_adj; +} + /** * ice_dpll_init_info_pins_generic - initializes generic pins info * @pf: board private structure @@ -2105,8 +2117,8 @@ static int ice_dpll_init_info_pins_generic(struct ice_pf *pf, bool input) for (i = 0; i < pin_num; i++) { pins[i].idx = i; pins[i].prop.board_label = labels[i]; - pins[i].prop.phase_range.min = phase_adj_max; - pins[i].prop.phase_range.max = -phase_adj_max; + ice_dpll_phase_range_set(&pins[i].prop.phase_range, + phase_adj_max); pins[i].prop.capabilities = cap; pins[i].pf = pf; ret = ice_dpll_pin_state_update(pf, &pins[i], pin_type, NULL); @@ -2152,6 +2164,7 @@ ice_dpll_init_info_direct_pins(struct ice_pf *pf, struct ice_hw *hw = &pf->hw; struct ice_dpll_pin *pins; unsigned long caps; + u32 phase_adj_max; u8 freq_supp_num; bool input; @@ -2159,11 +2172,13 @@ ice_dpll_init_info_direct_pins(struct ice_pf *pf, case ICE_DPLL_PIN_TYPE_INPUT: pins = pf->dplls.inputs; num_pins = pf->dplls.num_inputs; + phase_adj_max = pf->dplls.input_phase_adj_max; input = true; break; case ICE_DPLL_PIN_TYPE_OUTPUT: pins = pf->dplls.outputs; num_pins = pf->dplls.num_outputs; + phase_adj_max = pf->dplls.output_phase_adj_max; input = false; break; default: @@ -2188,19 +2203,13 @@ ice_dpll_init_info_direct_pins(struct ice_pf *pf, return ret; caps |= (DPLL_PIN_CAPABILITIES_PRIORITY_CAN_CHANGE | DPLL_PIN_CAPABILITIES_STATE_CAN_CHANGE); - pins[i].prop.phase_range.min = - pf->dplls.input_phase_adj_max; - pins[i].prop.phase_range.max = - -pf->dplls.input_phase_adj_max; } else { - pins[i].prop.phase_range.min = - pf->dplls.output_phase_adj_max; - pins[i].prop.phase_range.max = - -pf->dplls.output_phase_adj_max; ret = ice_cgu_get_output_pin_state_caps(hw, i, &caps); if (ret) return ret; } + ice_dpll_phase_range_set(&pins[i].prop.phase_range, + phase_adj_max); pins[i].prop.capabilities = caps; ret = 
ice_dpll_pin_state_update(pf, &pins[i], pin_type, NULL); if (ret) @@ -2308,8 +2317,10 @@ static int ice_dpll_init_info(struct ice_pf *pf, bool cgu) dp->dpll_idx = abilities.pps_dpll_idx; d->num_inputs = abilities.num_inputs; d->num_outputs = abilities.num_outputs; - d->input_phase_adj_max = le32_to_cpu(abilities.max_in_phase_adj); - d->output_phase_adj_max = le32_to_cpu(abilities.max_out_phase_adj); + d->input_phase_adj_max = le32_to_cpu(abilities.max_in_phase_adj) & + ICE_AQC_GET_CGU_MAX_PHASE_ADJ; + d->output_phase_adj_max = le32_to_cpu(abilities.max_out_phase_adj) & + ICE_AQC_GET_CGU_MAX_PHASE_ADJ; alloc_size = sizeof(*d->inputs) * d->num_inputs; d->inputs = kzalloc(alloc_size, GFP_KERNEL); From 6c5b989116083a98f45aada548ff54e7a83a9c2d Mon Sep 17 00:00:00 2001 From: Przemyslaw Korba Date: Wed, 4 Dec 2024 14:22:18 +0100 Subject: [PATCH 14/57] ice: fix incorrect PHY settings for 100 GB/s ptp4l application reports too high offset when ran on E823 device with a 100GB/s link. Those values cannot go under 100ns, like in a working case when using 100 GB/s cable. This is due to incorrect frequency settings on the PHY clocks for 100 GB/s speed. Changes are introduced to align with the internal hardware documentation, and correctly initialize frequency in PHY clocks with the frequency values that are in our HW spec. To reproduce the issue run ptp4l as a Time Receiver on E823 device, and observe the offset, which will never approach values seen in the PTP working case. 
Reproduction output: ptp4l -i enp137s0f3 -m -2 -s -f /etc/ptp4l_8275.conf ptp4l[5278.775]: master offset 12470 s2 freq +41288 path delay -3002 ptp4l[5278.837]: master offset 10525 s2 freq +39202 path delay -3002 ptp4l[5278.900]: master offset -24840 s2 freq -20130 path delay -3002 ptp4l[5278.963]: master offset 10597 s2 freq +37908 path delay -3002 ptp4l[5279.025]: master offset 8883 s2 freq +36031 path delay -3002 ptp4l[5279.088]: master offset 7267 s2 freq +34151 path delay -3002 ptp4l[5279.150]: master offset 5771 s2 freq +32316 path delay -3002 ptp4l[5279.213]: master offset 4388 s2 freq +30526 path delay -3002 ptp4l[5279.275]: master offset -30434 s2 freq -28485 path delay -3002 ptp4l[5279.338]: master offset -28041 s2 freq -27412 path delay -3002 ptp4l[5279.400]: master offset 7870 s2 freq +31118 path delay -3002 Fixes: 3a7496234d17 ("ice: implement basic E822 PTP support") Reviewed-by: Milena Olech Signed-off-by: Przemyslaw Korba Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_ptp_consts.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_ptp_consts.h b/drivers/net/ethernet/intel/ice/ice_ptp_consts.h index 585ce200c60f..d75f0eddd631 100644 --- a/drivers/net/ethernet/intel/ice/ice_ptp_consts.h +++ b/drivers/net/ethernet/intel/ice/ice_ptp_consts.h @@ -761,9 +761,9 @@ const struct ice_vernier_info_e82x e822_vernier[NUM_ICE_PTP_LNK_SPD] = { /* rx_desk_rsgb_par */ 644531250, /* 644.53125 MHz Reed Solomon gearbox */ /* tx_desk_rsgb_pcs */ - 644531250, /* 644.53125 MHz Reed Solomon gearbox */ + 390625000, /* 390.625 MHz Reed Solomon gearbox */ /* rx_desk_rsgb_pcs */ - 644531250, /* 644.53125 MHz Reed Solomon gearbox */ + 390625000, /* 390.625 MHz Reed Solomon gearbox */ /* tx_fixed_delay */ 1620, /* pmd_adj_divisor */ From bd2776e39c2a82ef4681d02678bb77b3d41e79be Mon Sep 17 00:00:00 2001 From: En-Wei Wu Date: Wed, 18 Dec 2024 10:37:42 +0800 
Subject: [PATCH 15/57] igc: return early when failing to read EECD register When booting with a dock connected, the igc driver may get stuck for ~40 seconds if the PCIe link is lost during initialization. This happens because the driver accesses the device after EECD register reads return all F's, indicating failed reads. Consequently, hw->hw_addr is set to NULL, which impacts subsequent rd32() reads. This leads to the driver hanging in igc_get_hw_semaphore_i225(), as the invalid hw->hw_addr prevents retrieving the expected value. To address this, a validation check and a corresponding return value catch are added for the EECD register read result. If all F's are returned, indicating PCIe link loss, the driver will return -ENXIO immediately. This avoids the 40-second hang and significantly improves boot time when using a dock with an igc NIC. Log before the patch: [ 0.911913] igc 0000:70:00.0: enabling device (0000 -> 0002) [ 0.912386] igc 0000:70:00.0: PTM enabled, 4ns granularity [ 1.571098] igc 0000:70:00.0 (unnamed net_device) (uninitialized): PCIe link lost, device now detached [ 43.449095] igc_get_hw_semaphore_i225: igc 0000:70:00.0 (unnamed net_device) (uninitialized): Driver can't access device - SMBI bit is set.
[ 43.449186] igc 0000:70:00.0: probe with driver igc failed with error -13 [ 46.345701] igc 0000:70:00.0: enabling device (0000 -> 0002) [ 46.345777] igc 0000:70:00.0: PTM enabled, 4ns granularity Log after the patch: [ 1.031000] igc 0000:70:00.0: enabling device (0000 -> 0002) [ 1.032097] igc 0000:70:00.0: PTM enabled, 4ns granularity [ 1.642291] igc 0000:70:00.0 (unnamed net_device) (uninitialized): PCIe link lost, device now detached [ 5.480490] igc 0000:70:00.0: enabling device (0000 -> 0002) [ 5.480516] igc 0000:70:00.0: PTM enabled, 4ns granularity Fixes: ab4056126813 ("igc: Add NVM support") Cc: Chia-Lin Kao (AceLan) Signed-off-by: En-Wei Wu Reviewed-by: Vitaly Lifshits Tested-by: Mor Bar-Gabay Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_base.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/intel/igc/igc_base.c b/drivers/net/ethernet/intel/igc/igc_base.c index 9fae8bdec2a7..1613b562d17c 100644 --- a/drivers/net/ethernet/intel/igc/igc_base.c +++ b/drivers/net/ethernet/intel/igc/igc_base.c @@ -68,6 +68,10 @@ static s32 igc_init_nvm_params_base(struct igc_hw *hw) u32 eecd = rd32(IGC_EECD); u16 size; + /* failed to read reg and got all F's */ + if (!(~eecd)) + return -ENXIO; + size = FIELD_GET(IGC_EECD_SIZE_EX_MASK, eecd); /* Added to a constant, "size" becomes the left-shift value @@ -221,6 +225,8 @@ static s32 igc_get_invariants_base(struct igc_hw *hw) /* NVM initialization */ ret_val = igc_init_nvm_params_base(hw); + if (ret_val) + goto out; switch (hw->mac.type) { case igc_i225: ret_val = igc_init_nvm_params_i225(hw); From 95978931d55fb7685f8c0b2598d6c12a9b6bc82a Mon Sep 17 00:00:00 2001 From: Su Hui Date: Mon, 6 Jan 2025 10:36:48 +0800 Subject: [PATCH 16/57] eth: fbnic: Revert "eth: fbnic: Add hardware monitoring support via HWMON interface" There is a garbage value problem in fbnic_mac_get_sensor_asic(). 'fw_cmpl' is uninitialized which makes 'sensor' and '*val' to be stored garbage value. 
Revert commit d85ebade02e8 ("eth: fbnic: Add hardware monitoring support via HWMON interface") to avoid this problem. Fixes: d85ebade02e8 ("eth: fbnic: Add hardware monitoring support via HWMON interface") Signed-off-by: Su Hui Suggested-by: Jakub Kicinski Suggested-by: Michal Swiatkowski Link: https://patch.msgid.link/20250106023647.47756-1-suhui@nfschina.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/meta/fbnic/Makefile | 1 - drivers/net/ethernet/meta/fbnic/fbnic.h | 5 -- drivers/net/ethernet/meta/fbnic/fbnic_fw.h | 7 -- drivers/net/ethernet/meta/fbnic/fbnic_hwmon.c | 81 ------------------- drivers/net/ethernet/meta/fbnic/fbnic_mac.c | 22 ----- drivers/net/ethernet/meta/fbnic/fbnic_mac.h | 7 -- drivers/net/ethernet/meta/fbnic/fbnic_pci.c | 3 - 7 files changed, 126 deletions(-) delete mode 100644 drivers/net/ethernet/meta/fbnic/fbnic_hwmon.c diff --git a/drivers/net/ethernet/meta/fbnic/Makefile b/drivers/net/ethernet/meta/fbnic/Makefile index 239b2258ec65..ea6214ca48e7 100644 --- a/drivers/net/ethernet/meta/fbnic/Makefile +++ b/drivers/net/ethernet/meta/fbnic/Makefile @@ -13,7 +13,6 @@ fbnic-y := fbnic_csr.o \ fbnic_ethtool.o \ fbnic_fw.o \ fbnic_hw_stats.o \ - fbnic_hwmon.o \ fbnic_irq.o \ fbnic_mac.o \ fbnic_netdev.o \ diff --git a/drivers/net/ethernet/meta/fbnic/fbnic.h b/drivers/net/ethernet/meta/fbnic/fbnic.h index 706ae6104c8e..744eb0d95449 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic.h +++ b/drivers/net/ethernet/meta/fbnic/fbnic.h @@ -20,7 +20,6 @@ struct fbnic_dev { struct device *dev; struct net_device *netdev; struct dentry *dbg_fbd; - struct device *hwmon; u32 __iomem *uc_addr0; u32 __iomem *uc_addr4; @@ -33,7 +32,6 @@ struct fbnic_dev { struct fbnic_fw_mbx mbx[FBNIC_IPC_MBX_INDICES]; struct fbnic_fw_cap fw_cap; - struct fbnic_fw_completion *cmpl_data; /* Lock protecting Tx Mailbox queue to prevent possible races */ spinlock_t fw_tx_lock; @@ -142,9 +140,6 @@ void fbnic_devlink_unregister(struct fbnic_dev *fbd); int 
fbnic_fw_enable_mbx(struct fbnic_dev *fbd); void fbnic_fw_disable_mbx(struct fbnic_dev *fbd); -void fbnic_hwmon_register(struct fbnic_dev *fbd); -void fbnic_hwmon_unregister(struct fbnic_dev *fbd); - int fbnic_pcs_irq_enable(struct fbnic_dev *fbd); void fbnic_pcs_irq_disable(struct fbnic_dev *fbd); diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_fw.h b/drivers/net/ethernet/meta/fbnic/fbnic_fw.h index 7cd8841920e4..221faf8c6756 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_fw.h +++ b/drivers/net/ethernet/meta/fbnic/fbnic_fw.h @@ -44,13 +44,6 @@ struct fbnic_fw_cap { u8 link_fec; }; -struct fbnic_fw_completion { - struct { - s32 millivolts; - s32 millidegrees; - } tsene; -}; - void fbnic_mbx_init(struct fbnic_dev *fbd); void fbnic_mbx_clean(struct fbnic_dev *fbd); void fbnic_mbx_poll(struct fbnic_dev *fbd); diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_hwmon.c b/drivers/net/ethernet/meta/fbnic/fbnic_hwmon.c deleted file mode 100644 index bcd1086e3768..000000000000 --- a/drivers/net/ethernet/meta/fbnic/fbnic_hwmon.c +++ /dev/null @@ -1,81 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) Meta Platforms, Inc. and affiliates. */ - -#include - -#include "fbnic.h" -#include "fbnic_mac.h" - -static int fbnic_hwmon_sensor_id(enum hwmon_sensor_types type) -{ - if (type == hwmon_temp) - return FBNIC_SENSOR_TEMP; - if (type == hwmon_in) - return FBNIC_SENSOR_VOLTAGE; - - return -EOPNOTSUPP; -} - -static umode_t fbnic_hwmon_is_visible(const void *drvdata, - enum hwmon_sensor_types type, - u32 attr, int channel) -{ - if (type == hwmon_temp && attr == hwmon_temp_input) - return 0444; - if (type == hwmon_in && attr == hwmon_in_input) - return 0444; - - return 0; -} - -static int fbnic_hwmon_read(struct device *dev, enum hwmon_sensor_types type, - u32 attr, int channel, long *val) -{ - struct fbnic_dev *fbd = dev_get_drvdata(dev); - const struct fbnic_mac *mac = fbd->mac; - int id; - - id = fbnic_hwmon_sensor_id(type); - return id < 0 ? 
id : mac->get_sensor(fbd, id, val); -} - -static const struct hwmon_ops fbnic_hwmon_ops = { - .is_visible = fbnic_hwmon_is_visible, - .read = fbnic_hwmon_read, -}; - -static const struct hwmon_channel_info *fbnic_hwmon_info[] = { - HWMON_CHANNEL_INFO(temp, HWMON_T_INPUT), - HWMON_CHANNEL_INFO(in, HWMON_I_INPUT), - NULL -}; - -static const struct hwmon_chip_info fbnic_chip_info = { - .ops = &fbnic_hwmon_ops, - .info = fbnic_hwmon_info, -}; - -void fbnic_hwmon_register(struct fbnic_dev *fbd) -{ - if (!IS_REACHABLE(CONFIG_HWMON)) - return; - - fbd->hwmon = hwmon_device_register_with_info(fbd->dev, "fbnic", - fbd, &fbnic_chip_info, - NULL); - if (IS_ERR(fbd->hwmon)) { - dev_notice(fbd->dev, - "Failed to register hwmon device %pe\n", - fbd->hwmon); - fbd->hwmon = NULL; - } -} - -void fbnic_hwmon_unregister(struct fbnic_dev *fbd) -{ - if (!IS_REACHABLE(CONFIG_HWMON) || !fbd->hwmon) - return; - - hwmon_device_unregister(fbd->hwmon); - fbd->hwmon = NULL; -} diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_mac.c b/drivers/net/ethernet/meta/fbnic/fbnic_mac.c index 80b82ff12c4d..7b654d0a6dac 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_mac.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_mac.c @@ -686,27 +686,6 @@ fbnic_mac_get_eth_mac_stats(struct fbnic_dev *fbd, bool reset, MAC_STAT_TX_BROADCAST); } -static int fbnic_mac_get_sensor_asic(struct fbnic_dev *fbd, int id, long *val) -{ - struct fbnic_fw_completion fw_cmpl; - s32 *sensor; - - switch (id) { - case FBNIC_SENSOR_TEMP: - sensor = &fw_cmpl.tsene.millidegrees; - break; - case FBNIC_SENSOR_VOLTAGE: - sensor = &fw_cmpl.tsene.millivolts; - break; - default: - return -EINVAL; - } - - *val = *sensor; - - return 0; -} - static const struct fbnic_mac fbnic_mac_asic = { .init_regs = fbnic_mac_init_regs, .pcs_enable = fbnic_pcs_enable_asic, @@ -716,7 +695,6 @@ static const struct fbnic_mac fbnic_mac_asic = { .get_eth_mac_stats = fbnic_mac_get_eth_mac_stats, .link_down = fbnic_mac_link_down_asic, .link_up = 
fbnic_mac_link_up_asic, - .get_sensor = fbnic_mac_get_sensor_asic, }; /** diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_mac.h b/drivers/net/ethernet/meta/fbnic/fbnic_mac.h index 05a591653e09..476239a9d381 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_mac.h +++ b/drivers/net/ethernet/meta/fbnic/fbnic_mac.h @@ -47,11 +47,6 @@ enum { #define FBNIC_LINK_MODE_PAM4 (FBNIC_LINK_50R1) #define FBNIC_LINK_MODE_MASK (FBNIC_LINK_AUTO - 1) -enum fbnic_sensor_id { - FBNIC_SENSOR_TEMP, /* Temp in millidegrees Centigrade */ - FBNIC_SENSOR_VOLTAGE, /* Voltage in millivolts */ -}; - /* This structure defines the interface hooks for the MAC. The MAC hooks * will be configured as a const struct provided with a set of function * pointers. @@ -88,8 +83,6 @@ struct fbnic_mac { void (*link_down)(struct fbnic_dev *fbd); void (*link_up)(struct fbnic_dev *fbd, bool tx_pause, bool rx_pause); - - int (*get_sensor)(struct fbnic_dev *fbd, int id, long *val); }; int fbnic_mac_init(struct fbnic_dev *fbd); diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_pci.c b/drivers/net/ethernet/meta/fbnic/fbnic_pci.c index 32702dc4a066..7ccf192f13d5 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_pci.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_pci.c @@ -296,8 +296,6 @@ static int fbnic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) /* Capture snapshot of hardware stats so netdev can calculate delta */ fbnic_reset_hw_stats(fbd); - fbnic_hwmon_register(fbd); - if (!fbd->dsn) { dev_warn(&pdev->dev, "Reading serial number failed\n"); goto init_failure_mode; @@ -360,7 +358,6 @@ static void fbnic_remove(struct pci_dev *pdev) fbnic_netdev_free(fbd); } - fbnic_hwmon_unregister(fbd); fbnic_dbg_fbd_exit(fbd); fbnic_devlink_unregister(fbd); fbnic_fw_disable_mbx(fbd); From b341ca51d2679829d26a3f6a4aa9aee9abd94f92 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Sat, 4 Jan 2025 10:29:45 -0500 Subject: [PATCH 17/57] tls: Fix tls_sw_sendmsg error handling We've noticed that NFS can hang 
when using RPC over TLS on an unstable connection, and investigation shows that the RPC layer is stuck in a tight loop attempting to transmit, but forever getting -EBADMSG back from the underlying network. The loop begins when tcp_sendmsg_locked() returns -EPIPE to tls_tx_records(), but that error is converted to -EBADMSG when calling the socket's error reporting handler. Instead of converting errors from tcp_sendmsg_locked(), let's pass them along in this path. The RPC layer handles -EPIPE by reconnecting the transport, which prevents the endless attempts to transmit on a broken connection. Signed-off-by: Benjamin Coddington Fixes: a42055e8d2c3 ("net/tls: Add support for async encryption of records for performance") Link: https://patch.msgid.link/9594185559881679d81f071b181a10eb07cd079f.1736004079.git.bcodding@redhat.com Signed-off-by: Jakub Kicinski --- net/tls/tls_sw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index bbf26cc4f6ee..7bcc9b4408a2 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -458,7 +458,7 @@ int tls_tx_records(struct sock *sk, int flags) tx_err: if (rc < 0 && rc != -EAGAIN) - tls_err_abort(sk, -EBADMSG); + tls_err_abort(sk, rc); return rc; } From cb358ff94154774d031159b018adf45e17673941 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 6 Jan 2025 16:19:11 +0900 Subject: [PATCH 18/57] ipvlan: Fix use-after-free in ipvlan_get_iflink(). syzbot presented a use-after-free report [0] regarding ipvlan and linkwatch. ipvlan does not hold a refcnt of the lower device unlike vlan and macvlan. If the linkwatch work is triggered for the ipvlan dev, the lower dev might have already been freed, resulting in UAF of ipvlan->phy_dev in ipvlan_get_iflink(). We can delay the lower dev unregistration like vlan and macvlan by holding the lower dev's refcnt in dev->netdev_ops->ndo_init() and releasing it in dev->priv_destructor().
Jakub pointed out calling .ndo_XXX after unregister_netdevice() has returned is error prone and suggested [1] addressing this UAF in the core by taking commit 750e51603395 ("net: avoid potential UAF in default_operstate()") further. Let's assume unregistering devices DOWN and use RCU protection in default_operstate() not to race with the device unregistration. [0]: BUG: KASAN: slab-use-after-free in ipvlan_get_iflink+0x84/0x88 drivers/net/ipvlan/ipvlan_main.c:353 Read of size 4 at addr ffff0000d768c0e0 by task kworker/u8:35/6944 CPU: 0 UID: 0 PID: 6944 Comm: kworker/u8:35 Not tainted 6.13.0-rc2-g9bc5c9515b48 #12 4c3cb9e8b4565456f6a355f312ff91f4f29b3c47 Hardware name: linux,dummy-virt (DT) Workqueue: events_unbound linkwatch_event Call trace: show_stack+0x38/0x50 arch/arm64/kernel/stacktrace.c:484 (C) __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0xbc/0x108 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0x16c/0x6f0 mm/kasan/report.c:489 kasan_report+0xc0/0x120 mm/kasan/report.c:602 __asan_report_load4_noabort+0x20/0x30 mm/kasan/report_generic.c:380 ipvlan_get_iflink+0x84/0x88 drivers/net/ipvlan/ipvlan_main.c:353 dev_get_iflink+0x7c/0xd8 net/core/dev.c:674 default_operstate net/core/link_watch.c:45 [inline] rfc2863_policy+0x144/0x360 net/core/link_watch.c:72 linkwatch_do_dev+0x60/0x228 net/core/link_watch.c:175 __linkwatch_run_queue+0x2f4/0x5b8 net/core/link_watch.c:239 linkwatch_event+0x64/0xa8 net/core/link_watch.c:282 process_one_work+0x700/0x1398 kernel/workqueue.c:3229 process_scheduled_works kernel/workqueue.c:3310 [inline] worker_thread+0x8c4/0xe10 kernel/workqueue.c:3391 kthread+0x2b0/0x360 kernel/kthread.c:389 ret_from_fork+0x10/0x20 arch/arm64/kernel/entry.S:862 Allocated by task 9303: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x30/0x68 mm/kasan/common.c:68 kasan_save_alloc_info+0x44/0x58 mm/kasan/generic.c:568 poison_kmalloc_redzone mm/kasan/common.c:377 [inline] 
__kasan_kmalloc+0x84/0xa0 mm/kasan/common.c:394 kasan_kmalloc include/linux/kasan.h:260 [inline] __do_kmalloc_node mm/slub.c:4283 [inline] __kmalloc_node_noprof+0x2a0/0x560 mm/slub.c:4289 __kvmalloc_node_noprof+0x9c/0x230 mm/util.c:650 alloc_netdev_mqs+0xb4/0x1118 net/core/dev.c:11209 rtnl_create_link+0x2b8/0xb60 net/core/rtnetlink.c:3595 rtnl_newlink_create+0x19c/0x868 net/core/rtnetlink.c:3771 __rtnl_newlink net/core/rtnetlink.c:3896 [inline] rtnl_newlink+0x122c/0x15c0 net/core/rtnetlink.c:4011 rtnetlink_rcv_msg+0x61c/0x918 net/core/rtnetlink.c:6901 netlink_rcv_skb+0x1dc/0x398 net/netlink/af_netlink.c:2542 rtnetlink_rcv+0x34/0x50 net/core/rtnetlink.c:6928 netlink_unicast_kernel net/netlink/af_netlink.c:1321 [inline] netlink_unicast+0x618/0x838 net/netlink/af_netlink.c:1347 netlink_sendmsg+0x5fc/0x8b0 net/netlink/af_netlink.c:1891 sock_sendmsg_nosec net/socket.c:711 [inline] __sock_sendmsg net/socket.c:726 [inline] __sys_sendto+0x2ec/0x438 net/socket.c:2197 __do_sys_sendto net/socket.c:2204 [inline] __se_sys_sendto net/socket.c:2200 [inline] __arm64_sys_sendto+0xe4/0x110 net/socket.c:2200 __invoke_syscall arch/arm64/kernel/syscall.c:35 [inline] invoke_syscall+0x90/0x278 arch/arm64/kernel/syscall.c:49 el0_svc_common+0x13c/0x250 arch/arm64/kernel/syscall.c:132 do_el0_svc+0x54/0x70 arch/arm64/kernel/syscall.c:151 el0_svc+0x4c/0xa8 arch/arm64/kernel/entry-common.c:744 el0t_64_sync_handler+0x78/0x108 arch/arm64/kernel/entry-common.c:762 el0t_64_sync+0x198/0x1a0 arch/arm64/kernel/entry.S:600 Freed by task 10200: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x30/0x68 mm/kasan/common.c:68 kasan_save_free_info+0x58/0x70 mm/kasan/generic.c:582 poison_slab_object mm/kasan/common.c:247 [inline] __kasan_slab_free+0x48/0x68 mm/kasan/common.c:264 kasan_slab_free include/linux/kasan.h:233 [inline] slab_free_hook mm/slub.c:2338 [inline] slab_free mm/slub.c:4598 [inline] kfree+0x140/0x420 mm/slub.c:4746 kvfree+0x4c/0x68 mm/util.c:693 netdev_release+0x94/0xc8 
net/core/net-sysfs.c:2034 device_release+0x98/0x1c0 kobject_cleanup lib/kobject.c:689 [inline] kobject_release lib/kobject.c:720 [inline] kref_put include/linux/kref.h:65 [inline] kobject_put+0x2b0/0x438 lib/kobject.c:737 netdev_run_todo+0xdd8/0xf48 net/core/dev.c:10924 rtnl_unlock net/core/rtnetlink.c:152 [inline] rtnl_net_unlock net/core/rtnetlink.c:209 [inline] rtnl_dellink+0x484/0x680 net/core/rtnetlink.c:3526 rtnetlink_rcv_msg+0x61c/0x918 net/core/rtnetlink.c:6901 netlink_rcv_skb+0x1dc/0x398 net/netlink/af_netlink.c:2542 rtnetlink_rcv+0x34/0x50 net/core/rtnetlink.c:6928 netlink_unicast_kernel net/netlink/af_netlink.c:1321 [inline] netlink_unicast+0x618/0x838 net/netlink/af_netlink.c:1347 netlink_sendmsg+0x5fc/0x8b0 net/netlink/af_netlink.c:1891 sock_sendmsg_nosec net/socket.c:711 [inline] __sock_sendmsg net/socket.c:726 [inline] ____sys_sendmsg+0x410/0x708 net/socket.c:2583 ___sys_sendmsg+0x178/0x1d8 net/socket.c:2637 __sys_sendmsg net/socket.c:2669 [inline] __do_sys_sendmsg net/socket.c:2674 [inline] __se_sys_sendmsg net/socket.c:2672 [inline] __arm64_sys_sendmsg+0x12c/0x1c8 net/socket.c:2672 __invoke_syscall arch/arm64/kernel/syscall.c:35 [inline] invoke_syscall+0x90/0x278 arch/arm64/kernel/syscall.c:49 el0_svc_common+0x13c/0x250 arch/arm64/kernel/syscall.c:132 do_el0_svc+0x54/0x70 arch/arm64/kernel/syscall.c:151 el0_svc+0x4c/0xa8 arch/arm64/kernel/entry-common.c:744 el0t_64_sync_handler+0x78/0x108 arch/arm64/kernel/entry-common.c:762 el0t_64_sync+0x198/0x1a0 arch/arm64/kernel/entry.S:600 The buggy address belongs to the object at ffff0000d768c000 which belongs to the cache kmalloc-cg-4k of size 4096 The buggy address is located 224 bytes inside of freed 4096-byte region [ffff0000d768c000, ffff0000d768d000) The buggy address belongs to the physical page: page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x117688 head: order:3 mapcount:0 entire_mapcount:0 nr_pages_mapped:0 pincount:0 memcg:ffff0000c77ef981 flags: 
0xbfffe0000000040(head|node=0|zone=2|lastcpupid=0x1ffff) page_type: f5(slab) raw: 0bfffe0000000040 ffff0000c000f500 dead000000000100 dead000000000122 raw: 0000000000000000 0000000000040004 00000001f5000000 ffff0000c77ef981 head: 0bfffe0000000040 ffff0000c000f500 dead000000000100 dead000000000122 head: 0000000000000000 0000000000040004 00000001f5000000 ffff0000c77ef981 head: 0bfffe0000000003 fffffdffc35da201 ffffffffffffffff 0000000000000000 head: 0000000000000008 0000000000000000 00000000ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff0000d768bf80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff0000d768c000: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb >ffff0000d768c080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff0000d768c100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff0000d768c180: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb Fixes: 8c55facecd7a ("net: linkwatch: only report IF_OPER_LOWERLAYERDOWN if iflink is actually down") Reported-by: syzkaller Suggested-by: Jakub Kicinski Link: https://lore.kernel.org/netdev/20250102174400.085fd8ac@kernel.org/ [1] Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250106071911.64355-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/core/link_watch.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 1b4d39e38084..cb04ef2b9807 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -42,14 +42,18 @@ static unsigned int default_operstate(const struct net_device *dev) * first check whether lower is indeed the source of its down state. */ if (!netif_carrier_ok(dev)) { - int iflink = dev_get_iflink(dev); struct net_device *peer; + int iflink; /* If called from netdev_run_todo()/linkwatch_sync_dev(), * dev_net(dev) can be already freed, and RTNL is not held. 
*/ - if (dev->reg_state == NETREG_UNREGISTERED || - iflink == dev->ifindex) + if (dev->reg_state <= NETREG_REGISTERED) + iflink = dev_get_iflink(dev); + else + iflink = dev->ifindex; + + if (iflink == dev->ifindex) return IF_OPER_DOWN; ASSERT_RTNL(); From db78475ba0d3c66d430f7ded2388cc041078a542 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 6 Jan 2025 10:02:10 -0800 Subject: [PATCH 19/57] eth: gve: use appropriate helper to set xdp_features Commit f85949f98206 ("xdp: add xdp_set_features_flag utility routine") added routines to inform the core about XDP flag changes. GVE support was added around the same time and missed using them. GVE only changes the flags on error recover or resume. Presumably the flags may change during resume if VM migrated. User would not get the notification and upper devices would not get a chance to recalculate their flags. Fixes: 75eaae158b1b ("gve: Add XDP DROP and TX support for GQI-QPL format") Reviewed-By: Jeroen de Borst Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250106180210.1861784-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/google/gve/gve_main.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index 8a8f6ab12a98..533e659b15b3 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -2241,14 +2241,18 @@ static void gve_service_task(struct work_struct *work) static void gve_set_netdev_xdp_features(struct gve_priv *priv) { + xdp_features_t xdp_features; + if (priv->queue_format == GVE_GQI_QPL_FORMAT) { - priv->dev->xdp_features = NETDEV_XDP_ACT_BASIC; - priv->dev->xdp_features |= NETDEV_XDP_ACT_REDIRECT; - priv->dev->xdp_features |= NETDEV_XDP_ACT_NDO_XMIT; - priv->dev->xdp_features |= NETDEV_XDP_ACT_XSK_ZEROCOPY; + xdp_features = NETDEV_XDP_ACT_BASIC; + xdp_features |= NETDEV_XDP_ACT_REDIRECT; + 
xdp_features |= NETDEV_XDP_ACT_NDO_XMIT; + xdp_features |= NETDEV_XDP_ACT_XSK_ZEROCOPY; } else { - priv->dev->xdp_features = 0; + xdp_features = 0; } + + xdp_set_features_flag(priv->dev, xdp_features); } static int gve_init_priv(struct gve_priv *priv, bool skip_describe_device) From c2994b008492db033d40bd767be1620229a3035e Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 25 Nov 2024 15:42:09 -0500 Subject: [PATCH 20/57] Bluetooth: hci_sync: Fix not setting Random Address when required This fixes errors such as the following when Own address type is set to Random Address but it has not been programmed yet because the controller is either advertising or connecting: < HCI Command: LE Set Exte.. (0x08|0x0041) plen 13 Own address type: Random (0x03) Filter policy: Ignore not in accept list (0x01) PHYs: 0x05 Entry 0: LE 1M Type: Passive (0x00) Interval: 60.000 msec (0x0060) Window: 30.000 msec (0x0030) Entry 1: LE Coded Type: Passive (0x00) Interval: 180.000 msec (0x0120) Window: 90.000 msec (0x0090) > HCI Event: Command Complete (0x0e) plen 4 LE Set Extended Scan Parameters (0x08|0x0041) ncmd 1 Status: Success (0x00) < HCI Command: LE Set Exten..
(0x08|0x0042) plen 6 Extended scan: Enabled (0x01) Filter duplicates: Enabled (0x01) Duration: 0 msec (0x0000) Period: 0.00 sec (0x0000) > HCI Event: Command Complete (0x0e) plen 4 LE Set Extended Scan Enable (0x08|0x0042) ncmd 1 Status: Invalid HCI Command Parameters (0x12) Fixes: c45074d68a9b ("Bluetooth: Fix not generating RPA when required") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sync.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index c86f4e42e69c..7b2b04d6b856 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -1031,9 +1031,9 @@ static bool adv_use_rpa(struct hci_dev *hdev, uint32_t flags) static int hci_set_random_addr_sync(struct hci_dev *hdev, bdaddr_t *rpa) { - /* If we're advertising or initiating an LE connection we can't - * go ahead and change the random address at this time. This is - * because the eventual initiator address used for the + /* If a random_addr has been set we're advertising or initiating an LE + * connection we can't go ahead and change the random address at this + * time. This is because the eventual initiator address used for the * subsequently created connection will be undefined (some * controllers use the new address and others the one we had * when the operation started). @@ -1041,8 +1041,9 @@ static int hci_set_random_addr_sync(struct hci_dev *hdev, bdaddr_t *rpa) * In this kind of scenario skip the update and let the random * address be updated at the next cycle. 
*/ - if (hci_dev_test_flag(hdev, HCI_LE_ADV) || - hci_lookup_le_connect(hdev)) { + if (bacmp(&hdev->random_addr, BDADDR_ANY) && + (hci_dev_test_flag(hdev, HCI_LE_ADV) || + hci_lookup_le_connect(hdev))) { bt_dev_dbg(hdev, "Deferring random address update"); hci_dev_set_flag(hdev, HCI_RPA_EXPIRED); return 0; From a182d9c84f9c52fb5db895ecceeee8b3a1bf661e Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 25 Nov 2024 15:42:10 -0500 Subject: [PATCH 21/57] Bluetooth: MGMT: Fix Add Device to responding before completing Add Device with LE type requires updating resolving/accept list which requires quite a number of commands to complete and each of them may fail, so instead of pretending it would always work this checks the return of hci_update_passive_scan_sync which indicates if everything worked as intended. Fixes: e8907f76544f ("Bluetooth: hci_sync: Make use of hci_cmd_sync_queue set 3") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/mgmt.c | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index b31192d473d0..de47ad999d7b 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -7655,6 +7655,24 @@ static void device_added(struct sock *sk, struct hci_dev *hdev, mgmt_event(MGMT_EV_DEVICE_ADDED, hdev, &ev, sizeof(ev), sk); } +static void add_device_complete(struct hci_dev *hdev, void *data, int err) +{ + struct mgmt_pending_cmd *cmd = data; + struct mgmt_cp_add_device *cp = cmd->param; + + if (!err) { + device_added(cmd->sk, hdev, &cp->addr.bdaddr, cp->addr.type, + cp->action); + device_flags_changed(NULL, hdev, &cp->addr.bdaddr, + cp->addr.type, hdev->conn_flags, + PTR_UINT(cmd->user_data)); + } + + mgmt_cmd_complete(cmd->sk, hdev->id, MGMT_OP_ADD_DEVICE, + mgmt_status(err), &cp->addr, sizeof(cp->addr)); + mgmt_pending_free(cmd); +} + static int add_device_sync(struct hci_dev *hdev, void *data) { return hci_update_passive_scan_sync(hdev); @@ 
-7663,6 +7681,7 @@ static int add_device_sync(struct hci_dev *hdev, void *data) static int add_device(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { + struct mgmt_pending_cmd *cmd; struct mgmt_cp_add_device *cp = data; u8 auto_conn, addr_type; struct hci_conn_params *params; @@ -7743,9 +7762,24 @@ static int add_device(struct sock *sk, struct hci_dev *hdev, current_flags = params->flags; } - err = hci_cmd_sync_queue(hdev, add_device_sync, NULL, NULL); - if (err < 0) + cmd = mgmt_pending_new(sk, MGMT_OP_ADD_DEVICE, hdev, data, len); + if (!cmd) { + err = -ENOMEM; goto unlock; + } + + cmd->user_data = UINT_PTR(current_flags); + + err = hci_cmd_sync_queue(hdev, add_device_sync, cmd, + add_device_complete); + if (err < 0) { + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_DEVICE, + MGMT_STATUS_FAILED, &cp->addr, + sizeof(cp->addr)); + mgmt_pending_free(cmd); + } + + goto unlock; added: device_added(sk, hdev, &cp->addr.bdaddr, cp->addr.type, cp->action); From 8023dd2204254a70887f5ee58d914bf70a060b9d Mon Sep 17 00:00:00 2001 From: Neeraj Sanjay Kale Date: Fri, 20 Dec 2024 18:32:52 +0530 Subject: [PATCH 22/57] Bluetooth: btnxpuart: Fix driver sending truncated data This fixes the apparent controller hang issue seen during stress test where the host sends a truncated payload, followed by HCI commands. The controller treats these HCI commands as a part of previously truncated payload, leading to command timeouts. Adding a serdev_device_wait_until_sent() call after serdev_device_write_buf() fixed the issue. 
Fixes: 689ca16e5232 ("Bluetooth: NXP: Add protocol support for NXP Bluetooth chipsets") Signed-off-by: Neeraj Sanjay Kale Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btnxpuart.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/bluetooth/btnxpuart.c b/drivers/bluetooth/btnxpuart.c index 569f5b7d6e46..1230045d78a5 100644 --- a/drivers/bluetooth/btnxpuart.c +++ b/drivers/bluetooth/btnxpuart.c @@ -1381,6 +1381,7 @@ static void btnxpuart_tx_work(struct work_struct *work) while ((skb = nxp_dequeue(nxpdev))) { len = serdev_device_write_buf(serdev, skb->data, skb->len); + serdev_device_wait_until_sent(serdev, 0); hdev->stat.byte_tx += len; skb_pull(skb, len); From 67dba2c28fe0af7e25ea1aeade677162ed05310a Mon Sep 17 00:00:00 2001 From: Chris Lu Date: Wed, 8 Jan 2025 17:50:28 +0800 Subject: [PATCH 23/57] Bluetooth: btmtk: Fix failed to send func ctrl for MediaTek devices. Use usb_autopm_get_interface() and usb_autopm_put_interface() in btmtk_usb_shutdown(), it could send func ctrl after enabling autosuspend. 
Bluetooth: btmtk_usb_hci_wmt_sync() hci0: Execution of wmt command timed out Bluetooth: btmtk_usb_shutdown() hci0: Failed to send wmt func ctrl (-110) Fixes: 5c5e8c52e3ca ("Bluetooth: btmtk: move btusb_mtk_[setup, shutdown] to btmtk.c") Signed-off-by: Chris Lu Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btmtk.c | 7 +++++++ net/bluetooth/rfcomm/tty.c | 4 ++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/bluetooth/btmtk.c b/drivers/bluetooth/btmtk.c index 7fd9d5ddce02..224eafc27dbe 100644 --- a/drivers/bluetooth/btmtk.c +++ b/drivers/bluetooth/btmtk.c @@ -1472,10 +1472,15 @@ EXPORT_SYMBOL_GPL(btmtk_usb_setup); int btmtk_usb_shutdown(struct hci_dev *hdev) { + struct btmtk_data *data = hci_get_priv(hdev); struct btmtk_hci_wmt_params wmt_params; u8 param = 0; int err; + err = usb_autopm_get_interface(data->intf); + if (err < 0) + return err; + /* Disable the device */ wmt_params.op = BTMTK_WMT_FUNC_CTRL; wmt_params.flag = 0; @@ -1486,9 +1491,11 @@ int btmtk_usb_shutdown(struct hci_dev *hdev) err = btmtk_usb_hci_wmt_sync(hdev, &wmt_params); if (err < 0) { bt_dev_err(hdev, "Failed to send wmt func ctrl (%d)", err); + usb_autopm_put_interface(data->intf); return err; } + usb_autopm_put_interface(data->intf); return 0; } EXPORT_SYMBOL_GPL(btmtk_usb_shutdown); diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c index af80d599c337..21a5b5535ebc 100644 --- a/net/bluetooth/rfcomm/tty.c +++ b/net/bluetooth/rfcomm/tty.c @@ -201,14 +201,14 @@ static ssize_t address_show(struct device *tty_dev, struct device_attribute *attr, char *buf) { struct rfcomm_dev *dev = dev_get_drvdata(tty_dev); - return sprintf(buf, "%pMR\n", &dev->dst); + return sysfs_emit(buf, "%pMR\n", &dev->dst); } static ssize_t channel_show(struct device *tty_dev, struct device_attribute *attr, char *buf) { struct rfcomm_dev *dev = dev_get_drvdata(tty_dev); - return sprintf(buf, "%d\n", dev->channel); + return sysfs_emit(buf, "%d\n", dev->channel); } static 
DEVICE_ATTR_RO(address); From d1cacd74776895f6435941f86a1130e58f6dd226 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 6 Jan 2025 10:01:36 -0800 Subject: [PATCH 24/57] netdev: prevent accessing NAPI instances from another namespace The NAPI IDs were not fully exposed to user space prior to the netlink API, so they were never namespaced. The netlink API must ensure that at the very least NAPI instance belongs to the same netns as the owner of the genl sock. napi_by_id() can become static now, but it needs to move because of dev_get_by_napi_id(). Cc: stable@vger.kernel.org Fixes: 1287c1ae0fc2 ("netdev-genl: Support setting per-NAPI config values") Fixes: 27f91aaf49b3 ("netdev-genl: Add netlink framework functions for napi") Reviewed-by: Sridhar Samudrala Reviewed-by: Joe Damato Link: https://patch.msgid.link/20250106180137.1861472-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/dev.c | 43 +++++++++++++++++++++++++++++------------- net/core/dev.h | 3 ++- net/core/netdev-genl.c | 6 ++---- 3 files changed, 34 insertions(+), 18 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index faa23042df38..a9f62f5aeb84 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -753,6 +753,36 @@ int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr, } EXPORT_SYMBOL_GPL(dev_fill_forward_path); +/* must be called under rcu_read_lock(), as we dont take a reference */ +static struct napi_struct *napi_by_id(unsigned int napi_id) +{ + unsigned int hash = napi_id % HASH_SIZE(napi_hash); + struct napi_struct *napi; + + hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) + if (napi->napi_id == napi_id) + return napi; + + return NULL; +} + +/* must be called under rcu_read_lock(), as we dont take a reference */ +struct napi_struct *netdev_napi_by_id(struct net *net, unsigned int napi_id) +{ + struct napi_struct *napi; + + napi = napi_by_id(napi_id); + if (!napi) + return NULL; + + if (WARN_ON_ONCE(!napi->dev)) + return NULL; + if 
(!net_eq(net, dev_net(napi->dev))) + return NULL; + + return napi; +} + /** * __dev_get_by_name - find a device by its name * @net: the applicable net namespace @@ -6293,19 +6323,6 @@ bool napi_complete_done(struct napi_struct *n, int work_done) } EXPORT_SYMBOL(napi_complete_done); -/* must be called under rcu_read_lock(), as we dont take a reference */ -struct napi_struct *napi_by_id(unsigned int napi_id) -{ - unsigned int hash = napi_id % HASH_SIZE(napi_hash); - struct napi_struct *napi; - - hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) - if (napi->napi_id == napi_id) - return napi; - - return NULL; -} - static void skb_defer_free_flush(struct softnet_data *sd) { struct sk_buff *skb, *next; diff --git a/net/core/dev.h b/net/core/dev.h index d043dee25a68..deb5eae5749f 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -22,6 +22,8 @@ struct sd_flow_limit { extern int netdev_flow_limit_table_len; +struct napi_struct *netdev_napi_by_id(struct net *net, unsigned int napi_id); + #ifdef CONFIG_PROC_FS int __init dev_proc_init(void); #else @@ -269,7 +271,6 @@ void xdp_do_check_flushed(struct napi_struct *napi); static inline void xdp_do_check_flushed(struct napi_struct *napi) { } #endif -struct napi_struct *napi_by_id(unsigned int napi_id); void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu); #define XMIT_RECURSION_LIMIT 8 diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 125b660004d3..a3bdaf075b6b 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -167,8 +167,6 @@ netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, void *hdr; pid_t pid; - if (WARN_ON_ONCE(!napi->dev)) - return -EINVAL; if (!(napi->dev->flags & IFF_UP)) return 0; @@ -234,7 +232,7 @@ int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info) rtnl_lock(); rcu_read_lock(); - napi = napi_by_id(napi_id); + napi = netdev_napi_by_id(genl_info_net(info), napi_id); if (napi) { err = 
netdev_nl_napi_fill_one(rsp, napi, info); } else { @@ -355,7 +353,7 @@ int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info) rtnl_lock(); rcu_read_lock(); - napi = napi_by_id(napi_id); + napi = netdev_napi_by_id(genl_info_net(info), napi_id); if (napi) { err = netdev_nl_napi_set_config(napi, info); } else { From 80fb40baba19e25a1b6f3ecff6fc5c0171806bde Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 7 Jan 2025 11:14:39 +0100 Subject: [PATCH 25/57] tcp: Annotate data-race around sk->sk_mark in tcp_v4_send_reset This is a follow-up to 3c5b4d69c358 ("net: annotate data-races around sk->sk_mark"). sk->sk_mark can be read and written without holding the socket lock. IPv6 equivalent is already covered with READ_ONCE() annotation in tcp_v6_send_response(). Fixes: 3c5b4d69c358 ("net: annotate data-races around sk->sk_mark") Signed-off-by: Daniel Borkmann Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/f459d1fc44f205e13f6d8bdca2c8bfb9902ffac9.1736244569.git.daniel@iogearbox.net Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_ipv4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a38c8b1f44db..c26f6c4b7bb4 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -896,7 +896,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, sock_net_set(ctl_sk, net); if (sk) { ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? - inet_twsk(sk)->tw_mark : sk->sk_mark; + inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark); ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 
inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority); transmit_time = tcp_transmit_time(sk); From 5a4b584c67699a69981f0740618a144965a63237 Mon Sep 17 00:00:00 2001 From: Hao Lan Date: Mon, 6 Jan 2025 22:36:36 +0800 Subject: [PATCH 26/57] net: hns3: fixed reset failure issues caused by the incorrect reset type When a reset type that is not supported by the driver is input, a reset pending flag bit of the HNAE3_NONE_RESET type is generated in reset_pending. The driver does not have a mechanism to clear this type of error. As a result, the driver considers that the reset is not complete. This patch provides a mechanism to clear the HNAE3_NONE_RESET flag and the parameter of hnae3_ae_ops.set_default_reset_request is verified. The error message: hns3 0000:39:01.0: cmd failed -16 hns3 0000:39:01.0: hclge device re-init failed, VF is disabled! hns3 0000:39:01.0: failed to reset VF stack hns3 0000:39:01.0: failed to reset VF(4) hns3 0000:39:01.0: prepare reset(2) wait done hns3 0000:39:01.0 eth4: already uninitialized Use the crash tool to view struct hclgevf_dev: struct hclgevf_dev { ... default_reset_request = 0x20, reset_level = HNAE3_NONE_RESET, reset_pending = 0x100, reset_type = HNAE3_NONE_RESET, ... 
}; Fixes: 720bd5837e37 ("net: hns3: add set_default_reset_request in the hnae3_ae_ops") Signed-off-by: Hao Lan Signed-off-by: Jijie Shao Link: https://patch.msgid.link/20250106143642.539698-2-shaojijie@huawei.com Signed-off-by: Jakub Kicinski --- .../hisilicon/hns3/hns3pf/hclge_main.c | 33 ++++++++++++++-- .../hisilicon/hns3/hns3vf/hclgevf_main.c | 38 ++++++++++++++++--- 2 files changed, 61 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 05942fa78b11..7d44dc777dc5 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -3574,6 +3574,17 @@ static int hclge_set_vf_link_state(struct hnae3_handle *handle, int vf, return ret; } +static void hclge_set_reset_pending(struct hclge_dev *hdev, + enum hnae3_reset_type reset_type) +{ + /* When an incorrect reset type is executed, the get_reset_level + * function generates the HNAE3_NONE_RESET flag. As a result, this + * type do not need to pending. 
+ */ + if (reset_type != HNAE3_NONE_RESET) + set_bit(reset_type, &hdev->reset_pending); +} + static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval) { u32 cmdq_src_reg, msix_src_reg, hw_err_src_reg; @@ -3594,7 +3605,7 @@ static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval) */ if (BIT(HCLGE_VECTOR0_IMPRESET_INT_B) & msix_src_reg) { dev_info(&hdev->pdev->dev, "IMP reset interrupt\n"); - set_bit(HNAE3_IMP_RESET, &hdev->reset_pending); + hclge_set_reset_pending(hdev, HNAE3_IMP_RESET); set_bit(HCLGE_COMM_STATE_CMD_DISABLE, &hdev->hw.hw.comm_state); *clearval = BIT(HCLGE_VECTOR0_IMPRESET_INT_B); hdev->rst_stats.imp_rst_cnt++; @@ -3604,7 +3615,7 @@ static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval) if (BIT(HCLGE_VECTOR0_GLOBALRESET_INT_B) & msix_src_reg) { dev_info(&hdev->pdev->dev, "global reset interrupt\n"); set_bit(HCLGE_COMM_STATE_CMD_DISABLE, &hdev->hw.hw.comm_state); - set_bit(HNAE3_GLOBAL_RESET, &hdev->reset_pending); + hclge_set_reset_pending(hdev, HNAE3_GLOBAL_RESET); *clearval = BIT(HCLGE_VECTOR0_GLOBALRESET_INT_B); hdev->rst_stats.global_rst_cnt++; return HCLGE_VECTOR0_EVENT_RST; @@ -4052,7 +4063,7 @@ static void hclge_do_reset(struct hclge_dev *hdev) case HNAE3_FUNC_RESET: dev_info(&pdev->dev, "PF reset requested\n"); /* schedule again to check later */ - set_bit(HNAE3_FUNC_RESET, &hdev->reset_pending); + hclge_set_reset_pending(hdev, HNAE3_FUNC_RESET); hclge_reset_task_schedule(hdev); break; default: @@ -4086,6 +4097,8 @@ static enum hnae3_reset_type hclge_get_reset_level(struct hnae3_ae_dev *ae_dev, clear_bit(HNAE3_FLR_RESET, addr); } + clear_bit(HNAE3_NONE_RESET, addr); + if (hdev->reset_type != HNAE3_NONE_RESET && rst_level < hdev->reset_type) return HNAE3_NONE_RESET; @@ -4227,7 +4240,7 @@ static bool hclge_reset_err_handle(struct hclge_dev *hdev) return false; } else if (hdev->rst_stats.reset_fail_cnt < MAX_RESET_FAIL_CNT) { hdev->rst_stats.reset_fail_cnt++; - set_bit(hdev->reset_type, 
&hdev->reset_pending); + hclge_set_reset_pending(hdev, hdev->reset_type); dev_info(&hdev->pdev->dev, "re-schedule reset task(%u)\n", hdev->rst_stats.reset_fail_cnt); @@ -4470,8 +4483,20 @@ static void hclge_reset_event(struct pci_dev *pdev, struct hnae3_handle *handle) static void hclge_set_def_reset_request(struct hnae3_ae_dev *ae_dev, enum hnae3_reset_type rst_type) { +#define HCLGE_SUPPORT_RESET_TYPE \ + (BIT(HNAE3_FLR_RESET) | BIT(HNAE3_FUNC_RESET) | \ + BIT(HNAE3_GLOBAL_RESET) | BIT(HNAE3_IMP_RESET)) + struct hclge_dev *hdev = ae_dev->priv; + if (!(BIT(rst_type) & HCLGE_SUPPORT_RESET_TYPE)) { + /* To prevent reset triggered by hclge_reset_event */ + set_bit(HNAE3_NONE_RESET, &hdev->default_reset_request); + dev_warn(&hdev->pdev->dev, "unsupported reset type %d\n", + rst_type); + return; + } + set_bit(rst_type, &hdev->default_reset_request); } diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c index 2f6ffb88e700..fd0abe37fdd7 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c @@ -1393,6 +1393,17 @@ static int hclgevf_notify_roce_client(struct hclgevf_dev *hdev, return ret; } +static void hclgevf_set_reset_pending(struct hclgevf_dev *hdev, + enum hnae3_reset_type reset_type) +{ + /* When an incorrect reset type is executed, the get_reset_level + * function generates the HNAE3_NONE_RESET flag. As a result, this + * type do not need to pending. 
+ */ + if (reset_type != HNAE3_NONE_RESET) + set_bit(reset_type, &hdev->reset_pending); +} + static int hclgevf_reset_wait(struct hclgevf_dev *hdev) { #define HCLGEVF_RESET_WAIT_US 20000 @@ -1542,7 +1553,7 @@ static void hclgevf_reset_err_handle(struct hclgevf_dev *hdev) hdev->rst_stats.rst_fail_cnt); if (hdev->rst_stats.rst_fail_cnt < HCLGEVF_RESET_MAX_FAIL_CNT) - set_bit(hdev->reset_type, &hdev->reset_pending); + hclgevf_set_reset_pending(hdev, hdev->reset_type); if (hclgevf_is_reset_pending(hdev)) { set_bit(HCLGEVF_RESET_PENDING, &hdev->reset_state); @@ -1662,6 +1673,8 @@ static enum hnae3_reset_type hclgevf_get_reset_level(unsigned long *addr) clear_bit(HNAE3_FLR_RESET, addr); } + clear_bit(HNAE3_NONE_RESET, addr); + return rst_level; } @@ -1671,14 +1684,15 @@ static void hclgevf_reset_event(struct pci_dev *pdev, struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); struct hclgevf_dev *hdev = ae_dev->priv; - dev_info(&hdev->pdev->dev, "received reset request from VF enet\n"); - if (hdev->default_reset_request) hdev->reset_level = hclgevf_get_reset_level(&hdev->default_reset_request); else hdev->reset_level = HNAE3_VF_FUNC_RESET; + dev_info(&hdev->pdev->dev, "received reset request from VF enet, reset level is %d\n", + hdev->reset_level); + /* reset of this VF requested */ set_bit(HCLGEVF_RESET_REQUESTED, &hdev->reset_state); hclgevf_reset_task_schedule(hdev); @@ -1689,8 +1703,20 @@ static void hclgevf_reset_event(struct pci_dev *pdev, static void hclgevf_set_def_reset_request(struct hnae3_ae_dev *ae_dev, enum hnae3_reset_type rst_type) { +#define HCLGEVF_SUPPORT_RESET_TYPE \ + (BIT(HNAE3_VF_RESET) | BIT(HNAE3_VF_FUNC_RESET) | \ + BIT(HNAE3_VF_PF_FUNC_RESET) | BIT(HNAE3_VF_FULL_RESET) | \ + BIT(HNAE3_FLR_RESET) | BIT(HNAE3_VF_EXP_RESET)) + struct hclgevf_dev *hdev = ae_dev->priv; + if (!(BIT(rst_type) & HCLGEVF_SUPPORT_RESET_TYPE)) { + /* To prevent reset triggered by hclge_reset_event */ + set_bit(HNAE3_NONE_RESET, &hdev->default_reset_request); + 
dev_info(&hdev->pdev->dev, "unsupported reset type %d\n", + rst_type); + return; + } set_bit(rst_type, &hdev->default_reset_request); } @@ -1847,14 +1873,14 @@ static void hclgevf_reset_service_task(struct hclgevf_dev *hdev) */ if (hdev->reset_attempts > HCLGEVF_MAX_RESET_ATTEMPTS_CNT) { /* prepare for full reset of stack + pcie interface */ - set_bit(HNAE3_VF_FULL_RESET, &hdev->reset_pending); + hclgevf_set_reset_pending(hdev, HNAE3_VF_FULL_RESET); /* "defer" schedule the reset task again */ set_bit(HCLGEVF_RESET_PENDING, &hdev->reset_state); } else { hdev->reset_attempts++; - set_bit(hdev->reset_level, &hdev->reset_pending); + hclgevf_set_reset_pending(hdev, hdev->reset_level); set_bit(HCLGEVF_RESET_PENDING, &hdev->reset_state); } hclgevf_reset_task_schedule(hdev); @@ -1977,7 +2003,7 @@ static enum hclgevf_evt_cause hclgevf_check_evt_cause(struct hclgevf_dev *hdev, rst_ing_reg = hclgevf_read_dev(&hdev->hw, HCLGEVF_RST_ING); dev_info(&hdev->pdev->dev, "receive reset interrupt 0x%x!\n", rst_ing_reg); - set_bit(HNAE3_VF_RESET, &hdev->reset_pending); + hclgevf_set_reset_pending(hdev, HNAE3_VF_RESET); set_bit(HCLGEVF_RESET_PENDING, &hdev->reset_state); set_bit(HCLGE_COMM_STATE_CMD_DISABLE, &hdev->hw.hw.comm_state); *clearval = ~(1U << HCLGEVF_VECTOR0_RST_INT_B); From ac1e2836fe294c2007ca81cf7006862c3bdf0510 Mon Sep 17 00:00:00 2001 From: Hao Lan Date: Mon, 6 Jan 2025 22:36:37 +0800 Subject: [PATCH 27/57] net: hns3: fix missing features due to dev->features configuration too early Currently, the netdev->features is configured in hns3_nic_set_features. As a result, __netdev_update_features considers that there is no feature difference, and the procedures of the real features are missing. 
Fixes: 2a7556bb2b73 ("net: hns3: implement ndo_features_check ops for hns3 driver") Signed-off-by: Hao Lan Signed-off-by: Jian Shen Signed-off-by: Jijie Shao Reviewed-by: Michal Swiatkowski Link: https://patch.msgid.link/20250106143642.539698-3-shaojijie@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 43377a7b2426..a7e3b22f641c 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -2452,7 +2452,6 @@ static int hns3_nic_set_features(struct net_device *netdev, return ret; } - netdev->features = features; return 0; } From 5191a8d3c2ab5bc01930ea3425e06a739af5b0e9 Mon Sep 17 00:00:00 2001 From: Hao Lan Date: Mon, 6 Jan 2025 22:36:38 +0800 Subject: [PATCH 28/57] net: hns3: Resolved the issue that the debugfs query result is inconsistent. This patch modifies the implementation of debugfs: When the user process stops unexpectedly, not all data of the file system is read. In this case, the save_buf pointer is not released. When the user process is called next time, save_buf is used to copy the cached data to the user space. As a result, the queried data is stale. To solve this problem, this patch implements .open() and .release() handlers for debugfs file_operations, moving buffer allocation and execution of the cmd to the .open() handler and freeing of the buffer to the .release() handler. A separate buffer is allocated for each reader and associated with the file pointer. When different user read processes no longer share the buffer, the stale data problem is fixed.
Fixes: 5e69ea7ee2a6 ("net: hns3: refactor the debugfs process") Signed-off-by: Hao Lan Signed-off-by: Guangwei Zhang Signed-off-by: Jijie Shao Reviewed-by: Michal Swiatkowski Link: https://patch.msgid.link/20250106143642.539698-4-shaojijie@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/hisilicon/hns3/hnae3.h | 3 - .../ethernet/hisilicon/hns3/hns3_debugfs.c | 96 ++++++------------- 2 files changed, 31 insertions(+), 68 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index 710a8f9f2248..12ba380eb701 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -916,9 +916,6 @@ struct hnae3_handle { u8 netdev_flags; struct dentry *hnae3_dbgfs; - /* protects concurrent contention between debugfs commands */ - struct mutex dbgfs_lock; - char **dbgfs_buf; /* Network interface message level enabled bits */ u32 msg_enable; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c index 807eb3bbb11c..9bbece25552b 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c @@ -1260,69 +1260,55 @@ static int hns3_dbg_read_cmd(struct hns3_dbg_data *dbg_data, static ssize_t hns3_dbg_read(struct file *filp, char __user *buffer, size_t count, loff_t *ppos) { - struct hns3_dbg_data *dbg_data = filp->private_data; + char *buf = filp->private_data; + + return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf)); +} + +static int hns3_dbg_open(struct inode *inode, struct file *filp) +{ + struct hns3_dbg_data *dbg_data = inode->i_private; struct hnae3_handle *handle = dbg_data->handle; struct hns3_nic_priv *priv = handle->priv; - ssize_t size = 0; - char **save_buf; - char *read_buf; u32 index; + char *buf; int ret; + if (!test_bit(HNS3_NIC_STATE_INITED, &priv->state) || + test_bit(HNS3_NIC_STATE_RESETTING, &priv->state)) + 
return -EBUSY; + ret = hns3_dbg_get_cmd_index(dbg_data, &index); if (ret) return ret; - mutex_lock(&handle->dbgfs_lock); - save_buf = &handle->dbgfs_buf[index]; + buf = kvzalloc(hns3_dbg_cmd[index].buf_len, GFP_KERNEL); + if (!buf) + return -ENOMEM; - if (!test_bit(HNS3_NIC_STATE_INITED, &priv->state) || - test_bit(HNS3_NIC_STATE_RESETTING, &priv->state)) { - ret = -EBUSY; - goto out; + ret = hns3_dbg_read_cmd(dbg_data, hns3_dbg_cmd[index].cmd, + buf, hns3_dbg_cmd[index].buf_len); + if (ret) { + kvfree(buf); + return ret; } - if (*save_buf) { - read_buf = *save_buf; - } else { - read_buf = kvzalloc(hns3_dbg_cmd[index].buf_len, GFP_KERNEL); - if (!read_buf) { - ret = -ENOMEM; - goto out; - } + filp->private_data = buf; + return 0; +} - /* save the buffer addr until the last read operation */ - *save_buf = read_buf; - - /* get data ready for the first time to read */ - ret = hns3_dbg_read_cmd(dbg_data, hns3_dbg_cmd[index].cmd, - read_buf, hns3_dbg_cmd[index].buf_len); - if (ret) - goto out; - } - - size = simple_read_from_buffer(buffer, count, ppos, read_buf, - strlen(read_buf)); - if (size > 0) { - mutex_unlock(&handle->dbgfs_lock); - return size; - } - -out: - /* free the buffer for the last read operation */ - if (*save_buf) { - kvfree(*save_buf); - *save_buf = NULL; - } - - mutex_unlock(&handle->dbgfs_lock); - return ret; +static int hns3_dbg_release(struct inode *inode, struct file *filp) +{ + kvfree(filp->private_data); + filp->private_data = NULL; + return 0; } static const struct file_operations hns3_dbg_fops = { .owner = THIS_MODULE, - .open = simple_open, + .open = hns3_dbg_open, .read = hns3_dbg_read, + .release = hns3_dbg_release, }; static int hns3_dbg_bd_file_init(struct hnae3_handle *handle, u32 cmd) @@ -1379,13 +1365,6 @@ int hns3_dbg_init(struct hnae3_handle *handle) int ret; u32 i; - handle->dbgfs_buf = devm_kcalloc(&handle->pdev->dev, - ARRAY_SIZE(hns3_dbg_cmd), - sizeof(*handle->dbgfs_buf), - GFP_KERNEL); - if (!handle->dbgfs_buf) - return 
-ENOMEM; - hns3_dbg_dentry[HNS3_DBG_DENTRY_COMMON].dentry = debugfs_create_dir(name, hns3_dbgfs_root); handle->hnae3_dbgfs = hns3_dbg_dentry[HNS3_DBG_DENTRY_COMMON].dentry; @@ -1395,8 +1374,6 @@ int hns3_dbg_init(struct hnae3_handle *handle) debugfs_create_dir(hns3_dbg_dentry[i].name, handle->hnae3_dbgfs); - mutex_init(&handle->dbgfs_lock); - for (i = 0; i < ARRAY_SIZE(hns3_dbg_cmd); i++) { if ((hns3_dbg_cmd[i].cmd == HNAE3_DBG_CMD_TM_NODES && ae_dev->dev_version <= HNAE3_DEVICE_VERSION_V2) || @@ -1425,24 +1402,13 @@ int hns3_dbg_init(struct hnae3_handle *handle) out: debugfs_remove_recursive(handle->hnae3_dbgfs); handle->hnae3_dbgfs = NULL; - mutex_destroy(&handle->dbgfs_lock); return ret; } void hns3_dbg_uninit(struct hnae3_handle *handle) { - u32 i; - debugfs_remove_recursive(handle->hnae3_dbgfs); handle->hnae3_dbgfs = NULL; - - for (i = 0; i < ARRAY_SIZE(hns3_dbg_cmd); i++) - if (handle->dbgfs_buf[i]) { - kvfree(handle->dbgfs_buf[i]); - handle->dbgfs_buf[i] = NULL; - } - - mutex_destroy(&handle->dbgfs_lock); } void hns3_dbg_register_debugfs(const char *debugfs_dir_name) From 98b1e3b27734139c76295754b6c317aa4df6d32e Mon Sep 17 00:00:00 2001 From: Jian Shen Date: Mon, 6 Jan 2025 22:36:39 +0800 Subject: [PATCH 29/57] net: hns3: don't auto enable misc vector Currently, there is a time window between misc irq enabled and service task inited. 
If an interrupt is reported at this time, it will cause a warning like the one below: [ 16.324639] Call trace: [ 16.324641] __queue_delayed_work+0xb8/0xe0 [ 16.324643] mod_delayed_work_on+0x78/0xd0 [ 16.324655] hclge_errhand_task_schedule+0x58/0x90 [hclge] [ 16.324662] hclge_misc_irq_handle+0x168/0x240 [hclge] [ 16.324666] __handle_irq_event_percpu+0x64/0x1e0 [ 16.324667] handle_irq_event+0x80/0x170 [ 16.324670] handle_fasteoi_edge_irq+0x110/0x2bc [ 16.324671] __handle_domain_irq+0x84/0xfc [ 16.324673] gic_handle_irq+0x88/0x2c0 [ 16.324674] el1_irq+0xb8/0x140 [ 16.324677] arch_cpu_idle+0x18/0x40 [ 16.324679] default_idle_call+0x5c/0x1bc [ 16.324682] cpuidle_idle_call+0x18c/0x1c4 [ 16.324684] do_idle+0x174/0x17c [ 16.324685] cpu_startup_entry+0x30/0x6c [ 16.324687] secondary_start_kernel+0x1a4/0x280 [ 16.324688] ---[ end trace 6aa0bff672a964aa ]--- So don't auto enable the misc vector when requesting the irq. Fixes: 7be1b9f3e99f ("net: hns3: make hclge_service use delayed workqueue") Signed-off-by: Jian Shen Signed-off-by: Jijie Shao Link: https://patch.msgid.link/20250106143642.539698-5-shaojijie@huawei.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 7d44dc777dc5..db7845009252 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -3770,7 +3771,7 @@ static int hclge_misc_irq_init(struct hclge_dev *hdev) snprintf(hdev->misc_vector.name, HNAE3_INT_NAME_LEN, "%s-misc-%s", HCLGE_NAME, pci_name(hdev->pdev)); ret = request_irq(hdev->misc_vector.vector_irq, hclge_misc_irq_handle, - 0, hdev->misc_vector.name, hdev); + IRQF_NO_AUTOEN, hdev->misc_vector.name, hdev); if (ret) { hclge_free_vector(hdev,
0); dev_err(&hdev->pdev->dev, "request misc irq(%d) fail\n", @@ -11906,9 +11907,6 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev) hclge_init_rxd_adv_layout(hdev); - /* Enable MISC vector(vector0) */ - hclge_enable_vector(&hdev->misc_vector, true); - ret = hclge_init_wol(hdev); if (ret) dev_warn(&pdev->dev, @@ -11921,6 +11919,10 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev) hclge_state_init(hdev); hdev->last_reset_time = jiffies; + /* Enable MISC vector(vector0) */ + enable_irq(hdev->misc_vector.vector_irq); + hclge_enable_vector(&hdev->misc_vector, true); + dev_info(&hdev->pdev->dev, "%s driver initialization finished.\n", HCLGE_DRIVER_NAME); @@ -12326,7 +12328,7 @@ static void hclge_uninit_ae_dev(struct hnae3_ae_dev *ae_dev) /* Disable MISC vector(vector0) */ hclge_enable_vector(&hdev->misc_vector, false); - synchronize_irq(hdev->misc_vector.vector_irq); + disable_irq(hdev->misc_vector.vector_irq); /* Disable all hw interrupts */ hclge_config_mac_tnl_int(hdev, false); From 247fd1e33e1cd156aabe444e932d2648d33f1245 Mon Sep 17 00:00:00 2001 From: Jian Shen Date: Mon, 6 Jan 2025 22:36:40 +0800 Subject: [PATCH 30/57] net: hns3: initialize reset_timer before hclgevf_misc_irq_init() Currently the misc irq is initialized before reset_timer setup. But it will access the reset_timer in the irq handler. So initialize the reset_timer earlier. 
Fixes: ff200099d271 ("net: hns3: remove unnecessary work in hclgevf_main") Signed-off-by: Jian Shen Signed-off-by: Jijie Shao Link: https://patch.msgid.link/20250106143642.539698-6-shaojijie@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c index fd0abe37fdd7..163c6e59ea4c 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c @@ -2313,6 +2313,8 @@ static void hclgevf_state_init(struct hclgevf_dev *hdev) clear_bit(HCLGEVF_STATE_RST_FAIL, &hdev->state); INIT_DELAYED_WORK(&hdev->service_task, hclgevf_service_task); + /* timer needs to be initialized before misc irq */ + timer_setup(&hdev->reset_timer, hclgevf_reset_timer, 0); mutex_init(&hdev->mbx_resp.mbx_mutex); sema_init(&hdev->reset_sem, 1); @@ -3012,7 +3014,6 @@ static int hclgevf_init_hdev(struct hclgevf_dev *hdev) HCLGEVF_DRIVER_NAME); hclgevf_task_schedule(hdev, round_jiffies_relative(HZ)); - timer_setup(&hdev->reset_timer, hclgevf_reset_timer, 0); return 0; From 7997ddd46c54408bcba5e37fe18b4d832e45d4d4 Mon Sep 17 00:00:00 2001 From: Hao Lan Date: Mon, 6 Jan 2025 22:36:41 +0800 Subject: [PATCH 31/57] net: hns3: fixed hclge_fetch_pf_reg accesses bar space out of bounds issue The TQP BAR space is divided into two segments. TQPs 0-1023 and TQPs 1024-1279 are in different BAR space addresses. However, hclge_fetch_pf_reg does not distinguish the tqp space information when reading the tqp space information. When the number of TQPs is greater than 1024, access bar space overwriting occurs. The problem of different segments has been considered during the initialization of tqp.io_base. Therefore, tqp.io_base is directly used when the queue is read in hclge_fetch_pf_reg. 
The error message: Unable to handle kernel paging request at virtual address ffff800037200000 pc : hclge_fetch_pf_reg+0x138/0x250 [hclge] lr : hclge_get_regs+0x84/0x1d0 [hclge] Call trace: hclge_fetch_pf_reg+0x138/0x250 [hclge] hclge_get_regs+0x84/0x1d0 [hclge] hns3_get_regs+0x2c/0x50 [hns3] ethtool_get_regs+0xf4/0x270 dev_ethtool+0x674/0x8a0 dev_ioctl+0x270/0x36c sock_do_ioctl+0x110/0x2a0 sock_ioctl+0x2ac/0x530 __arm64_sys_ioctl+0xa8/0x100 invoke_syscall+0x4c/0x124 el0_svc_common.constprop.0+0x140/0x15c do_el0_svc+0x30/0xd0 el0_svc+0x1c/0x2c el0_sync_handler+0xb0/0xb4 el0_sync+0x168/0x180 Fixes: 939ccd107ffc ("net: hns3: move dump regs function to a separate file") Signed-off-by: Hao Lan Signed-off-by: Jijie Shao Link: https://patch.msgid.link/20250106143642.539698-7-shaojijie@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_regs.c | 9 +++++---- .../net/ethernet/hisilicon/hns3/hns3vf/hclgevf_regs.c | 9 +++++---- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_regs.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_regs.c index 43c1c18fa81f..8c057192aae6 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_regs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_regs.c @@ -510,9 +510,9 @@ out: static int hclge_fetch_pf_reg(struct hclge_dev *hdev, void *data, struct hnae3_knic_private_info *kinfo) { -#define HCLGE_RING_REG_OFFSET 0x200 #define HCLGE_RING_INT_REG_OFFSET 0x4 + struct hnae3_queue *tqp; int i, j, reg_num; int data_num_sum; u32 *reg = data; @@ -533,10 +533,11 @@ static int hclge_fetch_pf_reg(struct hclge_dev *hdev, void *data, reg_num = ARRAY_SIZE(ring_reg_addr_list); for (j = 0; j < kinfo->num_tqps; j++) { reg += hclge_reg_get_tlv(HCLGE_REG_TAG_RING, reg_num, reg); + tqp = kinfo->tqp[j]; for (i = 0; i < reg_num; i++) - *reg++ = hclge_read_dev(&hdev->hw, - ring_reg_addr_list[i] + - HCLGE_RING_REG_OFFSET * j); + *reg++ = 
readl_relaxed(tqp->io_base - + HCLGE_TQP_REG_OFFSET + + ring_reg_addr_list[i]); } data_num_sum += (reg_num + HCLGE_REG_TLV_SPACE) * kinfo->num_tqps; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_regs.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_regs.c index 6db415d8b917..7d9d9dbc7560 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_regs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_regs.c @@ -123,10 +123,10 @@ int hclgevf_get_regs_len(struct hnae3_handle *handle) void hclgevf_get_regs(struct hnae3_handle *handle, u32 *version, void *data) { -#define HCLGEVF_RING_REG_OFFSET 0x200 #define HCLGEVF_RING_INT_REG_OFFSET 0x4 struct hclgevf_dev *hdev = hclgevf_ae_get_hdev(handle); + struct hnae3_queue *tqp; int i, j, reg_um; u32 *reg = data; @@ -147,10 +147,11 @@ void hclgevf_get_regs(struct hnae3_handle *handle, u32 *version, reg_um = ARRAY_SIZE(ring_reg_addr_list); for (j = 0; j < hdev->num_tqps; j++) { reg += hclgevf_reg_get_tlv(HCLGEVF_REG_TAG_RING, reg_um, reg); + tqp = &hdev->htqp[j].q; for (i = 0; i < reg_um; i++) - *reg++ = hclgevf_read_dev(&hdev->hw, - ring_reg_addr_list[i] + - HCLGEVF_RING_REG_OFFSET * j); + *reg++ = readl_relaxed(tqp->io_base - + HCLGEVF_TQP_REG_OFFSET + + ring_reg_addr_list[i]); } reg_um = ARRAY_SIZE(tqp_intr_reg_addr_list); From 9741e72b2286de8b38de9db685588ac421a95c87 Mon Sep 17 00:00:00 2001 From: Jie Wang Date: Mon, 6 Jan 2025 22:36:42 +0800 Subject: [PATCH 32/57] net: hns3: fix kernel crash when 1588 is sent on HIP08 devices Currently, HIP08 devices does not register the ptp devices, so the hdev->ptp is NULL. But the tx process would still try to set hardware time stamp info with SKBTX_HW_TSTAMP flag and cause a kernel crash. [ 128.087798] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000018 ... 
[ 128.280251] pc : hclge_ptp_set_tx_info+0x2c/0x140 [hclge] [ 128.286600] lr : hclge_ptp_set_tx_info+0x20/0x140 [hclge] [ 128.292938] sp : ffff800059b93140 [ 128.297200] x29: ffff800059b93140 x28: 0000000000003280 [ 128.303455] x27: ffff800020d48280 x26: ffff0cb9dc814080 [ 128.309715] x25: ffff0cb9cde93fa0 x24: 0000000000000001 [ 128.315969] x23: 0000000000000000 x22: 0000000000000194 [ 128.322219] x21: ffff0cd94f986000 x20: 0000000000000000 [ 128.328462] x19: ffff0cb9d2a166c0 x18: 0000000000000000 [ 128.334698] x17: 0000000000000000 x16: ffffcf1fc523ed24 [ 128.340934] x15: 0000ffffd530a518 x14: 0000000000000000 [ 128.347162] x13: ffff0cd6bdb31310 x12: 0000000000000368 [ 128.353388] x11: ffff0cb9cfbc7070 x10: ffff2cf55dd11e02 [ 128.359606] x9 : ffffcf1f85a212b4 x8 : ffff0cd7cf27dab0 [ 128.365831] x7 : 0000000000000a20 x6 : ffff0cd7cf27d000 [ 128.372040] x5 : 0000000000000000 x4 : 000000000000ffff [ 128.378243] x3 : 0000000000000400 x2 : ffffcf1f85a21294 [ 128.384437] x1 : ffff0cb9db520080 x0 : ffff0cb9db500080 [ 128.390626] Call trace: [ 128.393964] hclge_ptp_set_tx_info+0x2c/0x140 [hclge] [ 128.399893] hns3_nic_net_xmit+0x39c/0x4c4 [hns3] [ 128.405468] xmit_one.constprop.0+0xc4/0x200 [ 128.410600] dev_hard_start_xmit+0x54/0xf0 [ 128.415556] sch_direct_xmit+0xe8/0x634 [ 128.420246] __dev_queue_xmit+0x224/0xc70 [ 128.425101] dev_queue_xmit+0x1c/0x40 [ 128.429608] ovs_vport_send+0xac/0x1a0 [openvswitch] [ 128.435409] do_output+0x60/0x17c [openvswitch] [ 128.440770] do_execute_actions+0x898/0x8c4 [openvswitch] [ 128.446993] ovs_execute_actions+0x64/0xf0 [openvswitch] [ 128.453129] ovs_dp_process_packet+0xa0/0x224 [openvswitch] [ 128.459530] ovs_vport_receive+0x7c/0xfc [openvswitch] [ 128.465497] internal_dev_xmit+0x34/0xb0 [openvswitch] [ 128.471460] xmit_one.constprop.0+0xc4/0x200 [ 128.476561] dev_hard_start_xmit+0x54/0xf0 [ 128.481489] __dev_queue_xmit+0x968/0xc70 [ 128.486330] dev_queue_xmit+0x1c/0x40 [ 128.490856] ip_finish_output2+0x250/0x570 [ 128.495810] 
__ip_finish_output+0x170/0x1e0 [ 128.500832] ip_finish_output+0x3c/0xf0 [ 128.505504] ip_output+0xbc/0x160 [ 128.509654] ip_send_skb+0x58/0xd4 [ 128.513892] udp_send_skb+0x12c/0x354 [ 128.518387] udp_sendmsg+0x7a8/0x9c0 [ 128.522793] inet_sendmsg+0x4c/0x8c [ 128.527116] __sock_sendmsg+0x48/0x80 [ 128.531609] __sys_sendto+0x124/0x164 [ 128.536099] __arm64_sys_sendto+0x30/0x5c [ 128.540935] invoke_syscall+0x50/0x130 [ 128.545508] el0_svc_common.constprop.0+0x10c/0x124 [ 128.551205] do_el0_svc+0x34/0xdc [ 128.555347] el0_svc+0x20/0x30 [ 128.559227] el0_sync_handler+0xb8/0xc0 [ 128.563883] el0_sync+0x160/0x180 Fixes: 0bf5eb788512 ("net: hns3: add support for PTP") Signed-off-by: Jie Wang Signed-off-by: Jijie Shao Reviewed-by: Michal Swiatkowski Link: https://patch.msgid.link/20250106143642.539698-8-shaojijie@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_ptp.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_ptp.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_ptp.c index 5505caea88e9..bab16c2191b2 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_ptp.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_ptp.c @@ -58,6 +58,9 @@ bool hclge_ptp_set_tx_info(struct hnae3_handle *handle, struct sk_buff *skb) struct hclge_dev *hdev = vport->back; struct hclge_ptp *ptp = hdev->ptp; + if (!ptp) + return false; + if (!test_bit(HCLGE_PTP_FLAG_TX_EN, &ptp->flags) || test_and_set_bit(HCLGE_STATE_PTP_TX_HANDLING, &hdev->state)) { ptp->tx_skipped++; From d1bf27c4e1768d4733143f26962a5c68ea8bd03c Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Tue, 7 Jan 2025 15:26:59 +0100 Subject: [PATCH 33/57] dt-bindings: net: pse-pd: Fix unusual character in documentation The documentation contained an unusual character due to an issue in my personal b4 setup. Fix the problem by providing the correct PSE Pinout Alternatives table number description. 
Signed-off-by: Kory Maincent Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250107142659.425877-1-kory.maincent@bootlin.com Signed-off-by: Jakub Kicinski --- .../devicetree/bindings/net/pse-pd/pse-controller.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/net/pse-pd/pse-controller.yaml b/Documentation/devicetree/bindings/net/pse-pd/pse-controller.yaml index a12cda8aa764..cd09560e0aea 100644 --- a/Documentation/devicetree/bindings/net/pse-pd/pse-controller.yaml +++ b/Documentation/devicetree/bindings/net/pse-pd/pse-controller.yaml @@ -81,7 +81,7 @@ properties: List of phandles, each pointing to the power supply for the corresponding pairset named in 'pairset-names'. This property aligns with IEEE 802.3-2022, Section 33.2.3 and 145.2.4. - PSE Pinout Alternatives (as per IEEE 802.3-2022 Table 145\u20133) + PSE Pinout Alternatives (as per IEEE 802.3-2022 Table 145-3) |-----------|---------------|---------------|---------------|---------------| | Conductor | Alternative A | Alternative A | Alternative B | Alternative B | | | (MDI-X) | (MDI) | (X) | (S) | From 2d2d4f60ed266a8f340a721102d035252606980b Mon Sep 17 00:00:00 2001 From: Leo Yang Date: Tue, 7 Jan 2025 11:15:30 +0800 Subject: [PATCH 34/57] mctp i3c: fix MCTP I3C driver multi-thread issue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We found a timeout problem with the pldm command on our system. The reason is that the MCTP-I3C driver has a race condition when receiving multiple-packet messages in multi-thread, resulting in a wrong packet order problem. We identified this problem by adding a debug message to the mctp_i3c_read function. According to the MCTP spec, a multiple-packet message must be composed in sequence, and if there is a wrong sequence, the whole message will be discarded and wait for the next SOM. For example, SOM → Pkt Seq #2 → Pkt Seq #1 → Pkt Seq #3 → EOM. 
Therefore, we try to solve this problem by adding a mutex to the mctp_i3c_read function. Before the modification, when a command requesting a multiple-packet message response is sent consecutively, an error usually occurs within 100 loops. After the mutex, it can go through 40000 loops without any error, and it seems to run well. Fixes: c8755b29b58e ("mctp i3c: MCTP I3C driver") Signed-off-by: Leo Yang Link: https://patch.msgid.link/20250107031529.3296094-1-Leo-Yang@quantatw.com [pabeni@redhat.com: dropped already answered question from changelog] Signed-off-by: Paolo Abeni --- drivers/net/mctp/mctp-i3c.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/mctp/mctp-i3c.c b/drivers/net/mctp/mctp-i3c.c index 9adad59b8676..d247fe483c58 100644 --- a/drivers/net/mctp/mctp-i3c.c +++ b/drivers/net/mctp/mctp-i3c.c @@ -125,6 +125,8 @@ static int mctp_i3c_read(struct mctp_i3c_device *mi) xfer.data.in = skb_put(skb, mi->mrl); + /* Make sure netif_rx() is read in the same order as i3c. */ + mutex_lock(&mi->lock); rc = i3c_device_do_priv_xfers(mi->i3c, &xfer, 1); if (rc < 0) goto err; @@ -166,8 +168,10 @@ static int mctp_i3c_read(struct mctp_i3c_device *mi) stats->rx_dropped++; } + mutex_unlock(&mi->lock); return 0; err: + mutex_unlock(&mi->lock); kfree_skb(skb); return rc; } From 13210fc63f353fe78584048079343413a3cdf819 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 2 Jan 2025 13:01:13 +0100 Subject: [PATCH 35/57] netfilter: nf_tables: imbalance in flowtable binding All these cases cause imbalance between BIND and UNBIND calls: - Delete an interface from a flowtable with multiple interfaces - Add a (device to a) flowtable with --check flag - Delete a netns containing a flowtable - In an interactive nft session, create a table with owner flag and flowtable inside, then quit. Fix it by calling FLOW_BLOCK_UNBIND when unregistering hooks, then remove late FLOW_BLOCK_UNBIND call when destroying flowtable. 
Fixes: ff4bf2f42a40 ("netfilter: nf_tables: add nft_unregister_flowtable_hook()") Reported-by: Phil Sutter Tested-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 0b9f1e8dfe49..c4af283356e7 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -8822,6 +8822,7 @@ static void nft_unregister_flowtable_hook(struct net *net, } static void __nft_unregister_flowtable_net_hooks(struct net *net, + struct nft_flowtable *flowtable, struct list_head *hook_list, bool release_netdev) { @@ -8829,6 +8830,8 @@ static void __nft_unregister_flowtable_net_hooks(struct net *net, list_for_each_entry_safe(hook, next, hook_list, list) { nf_unregister_net_hook(net, &hook->ops); + flowtable->data.type->setup(&flowtable->data, hook->ops.dev, + FLOW_BLOCK_UNBIND); if (release_netdev) { list_del(&hook->list); kfree_rcu(hook, rcu); @@ -8837,9 +8840,10 @@ static void __nft_unregister_flowtable_net_hooks(struct net *net, } static void nft_unregister_flowtable_net_hooks(struct net *net, + struct nft_flowtable *flowtable, struct list_head *hook_list) { - __nft_unregister_flowtable_net_hooks(net, hook_list, false); + __nft_unregister_flowtable_net_hooks(net, flowtable, hook_list, false); } static int nft_register_flowtable_net_hooks(struct net *net, @@ -9481,8 +9485,6 @@ static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable) flowtable->data.type->free(&flowtable->data); list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { - flowtable->data.type->setup(&flowtable->data, hook->ops.dev, - FLOW_BLOCK_UNBIND); list_del_rcu(&hook->list); kfree_rcu(hook, rcu); } @@ -10870,6 +10872,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) &nft_trans_flowtable_hooks(trans), trans->msg_type); nft_unregister_flowtable_net_hooks(net, + 
nft_trans_flowtable(trans), &nft_trans_flowtable_hooks(trans)); } else { list_del_rcu(&nft_trans_flowtable(trans)->list); @@ -10878,6 +10881,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) NULL, trans->msg_type); nft_unregister_flowtable_net_hooks(net, + nft_trans_flowtable(trans), &nft_trans_flowtable(trans)->hook_list); } break; @@ -11140,11 +11144,13 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) case NFT_MSG_NEWFLOWTABLE: if (nft_trans_flowtable_update(trans)) { nft_unregister_flowtable_net_hooks(net, + nft_trans_flowtable(trans), &nft_trans_flowtable_hooks(trans)); } else { nft_use_dec_restore(&table->use); list_del_rcu(&nft_trans_flowtable(trans)->list); nft_unregister_flowtable_net_hooks(net, + nft_trans_flowtable(trans), &nft_trans_flowtable(trans)->hook_list); } break; @@ -11737,7 +11743,8 @@ static void __nft_release_hook(struct net *net, struct nft_table *table) list_for_each_entry(chain, &table->chains, list) __nf_tables_unregister_hook(net, table, chain, true); list_for_each_entry(flowtable, &table->flowtables, list) - __nft_unregister_flowtable_net_hooks(net, &flowtable->hook_list, + __nft_unregister_flowtable_net_hooks(net, flowtable, + &flowtable->hook_list, true); } From b541ba7d1f5a5b7b3e2e22dc9e40e18a7d6dbc13 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 8 Jan 2025 22:56:33 +0100 Subject: [PATCH 36/57] netfilter: conntrack: clamp maximum hashtable size to INT_MAX Use INT_MAX as maximum size for the conntrack hashtable. Otherwise, it is possible to hit WARN_ON_ONCE in __kvmalloc_node_noprof() when resizing hashtable because __GFP_NOWARN is unset. See: 0708a0afe291 ("mm: Consider __GFP_NOWARN flag for oversized kvmalloc() calls") Note: hashtable resize is only possible from init_netns. 
Fixes: 9cc1c73ad666 ("netfilter: conntrack: avoid integer overflow when resizing") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 9db3e2b0b1c3..456446d7af20 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -2517,12 +2517,15 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls) struct hlist_nulls_head *hash; unsigned int nr_slots, i; - if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head))) + if (*sizep > (INT_MAX / sizeof(struct hlist_nulls_head))) return NULL; BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head)); nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head)); + if (nr_slots > (INT_MAX / sizeof(struct hlist_nulls_head))) + return NULL; + hash = kvcalloc(nr_slots, sizeof(struct hlist_nulls_head), GFP_KERNEL); if (hash && nulls) From 737d4d91d35b5f7fa5bb442651472277318b0bfd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Tue, 7 Jan 2025 13:01:05 +0100 Subject: [PATCH 37/57] sched: sch_cake: add bounds checks to host bulk flow fairness counts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Even though we fixed a logic error in the commit cited below, syzbot still managed to trigger an underflow of the per-host bulk flow counters, leading to an out of bounds memory access. To avoid any such logic errors causing out of bounds memory accesses, this commit factors out all accesses to the per-host bulk flow counters to a series of helpers that perform bounds-checking before any increments and decrements. 
This also has the benefit of improving readability by moving the conditional checks for the flow mode into these helpers, instead of having them spread out throughout the code (which was the cause of the original logic error). As part of this change, the flow quantum calculation is consolidated into a helper function, which means that the dithering applied to the host load scaling is now applied both in the DRR rotation and when a sparse flow's quantum is first initiated. The only user-visible effect of this is that the maximum packet size that can be sent while a flow stays sparse will now vary with +/- one byte in some cases. This should not make a noticeable difference in practice, and thus it's not worth complicating the code to preserve the old behaviour. Fixes: 546ea84d07e3 ("sched: sch_cake: fix bulk flow accounting logic for host fairness") Reported-by: syzbot+f63600d288bfb7057424@syzkaller.appspotmail.com Signed-off-by: Toke Høiland-Jørgensen Acked-by: Dave Taht Link: https://patch.msgid.link/20250107120105.70685-1-toke@redhat.com Signed-off-by: Jakub Kicinski --- net/sched/sch_cake.c | 140 +++++++++++++++++++++++-------------------- 1 file changed, 75 insertions(+), 65 deletions(-) diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 8d8b2db4653c..2c2e2a67f3b2 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -627,6 +627,63 @@ static bool cake_ddst(int flow_mode) return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST; } +static void cake_dec_srchost_bulk_flow_count(struct cake_tin_data *q, + struct cake_flow *flow, + int flow_mode) +{ + if (likely(cake_dsrc(flow_mode) && + q->hosts[flow->srchost].srchost_bulk_flow_count)) + q->hosts[flow->srchost].srchost_bulk_flow_count--; +} + +static void cake_inc_srchost_bulk_flow_count(struct cake_tin_data *q, + struct cake_flow *flow, + int flow_mode) +{ + if (likely(cake_dsrc(flow_mode) && + q->hosts[flow->srchost].srchost_bulk_flow_count < CAKE_QUEUES)) + 
q->hosts[flow->srchost].srchost_bulk_flow_count++; +} + +static void cake_dec_dsthost_bulk_flow_count(struct cake_tin_data *q, + struct cake_flow *flow, + int flow_mode) +{ + if (likely(cake_ddst(flow_mode) && + q->hosts[flow->dsthost].dsthost_bulk_flow_count)) + q->hosts[flow->dsthost].dsthost_bulk_flow_count--; +} + +static void cake_inc_dsthost_bulk_flow_count(struct cake_tin_data *q, + struct cake_flow *flow, + int flow_mode) +{ + if (likely(cake_ddst(flow_mode) && + q->hosts[flow->dsthost].dsthost_bulk_flow_count < CAKE_QUEUES)) + q->hosts[flow->dsthost].dsthost_bulk_flow_count++; +} + +static u16 cake_get_flow_quantum(struct cake_tin_data *q, + struct cake_flow *flow, + int flow_mode) +{ + u16 host_load = 1; + + if (cake_dsrc(flow_mode)) + host_load = max(host_load, + q->hosts[flow->srchost].srchost_bulk_flow_count); + + if (cake_ddst(flow_mode)) + host_load = max(host_load, + q->hosts[flow->dsthost].dsthost_bulk_flow_count); + + /* The get_random_u16() is a way to apply dithering to avoid + * accumulating roundoff errors + */ + return (q->flow_quantum * quantum_div[host_load] + + get_random_u16()) >> 16; +} + static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb, int flow_mode, u16 flow_override, u16 host_override) { @@ -773,10 +830,8 @@ skip_hash: allocate_dst = cake_ddst(flow_mode); if (q->flows[outer_hash + k].set == CAKE_SET_BULK) { - if (allocate_src) - q->hosts[q->flows[reduced_hash].srchost].srchost_bulk_flow_count--; - if (allocate_dst) - q->hosts[q->flows[reduced_hash].dsthost].dsthost_bulk_flow_count--; + cake_dec_srchost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode); + cake_dec_dsthost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode); } found: /* reserve queue for future packets in same flow */ @@ -801,9 +856,10 @@ found: q->hosts[outer_hash + k].srchost_tag = srchost_hash; found_src: srchost_idx = outer_hash + k; - if (q->flows[reduced_hash].set == CAKE_SET_BULK) - 
q->hosts[srchost_idx].srchost_bulk_flow_count++; q->flows[reduced_hash].srchost = srchost_idx; + + if (q->flows[reduced_hash].set == CAKE_SET_BULK) + cake_inc_srchost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode); } if (allocate_dst) { @@ -824,9 +880,10 @@ found_src: q->hosts[outer_hash + k].dsthost_tag = dsthost_hash; found_dst: dsthost_idx = outer_hash + k; - if (q->flows[reduced_hash].set == CAKE_SET_BULK) - q->hosts[dsthost_idx].dsthost_bulk_flow_count++; q->flows[reduced_hash].dsthost = dsthost_idx; + + if (q->flows[reduced_hash].set == CAKE_SET_BULK) + cake_inc_dsthost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode); } } @@ -1839,10 +1896,6 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, /* flowchain */ if (!flow->set || flow->set == CAKE_SET_DECAYING) { - struct cake_host *srchost = &b->hosts[flow->srchost]; - struct cake_host *dsthost = &b->hosts[flow->dsthost]; - u16 host_load = 1; - if (!flow->set) { list_add_tail(&flow->flowchain, &b->new_flows); } else { @@ -1852,18 +1905,8 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, flow->set = CAKE_SET_SPARSE; b->sparse_flow_count++; - if (cake_dsrc(q->flow_mode)) - host_load = max(host_load, srchost->srchost_bulk_flow_count); - - if (cake_ddst(q->flow_mode)) - host_load = max(host_load, dsthost->dsthost_bulk_flow_count); - - flow->deficit = (b->flow_quantum * - quantum_div[host_load]) >> 16; + flow->deficit = cake_get_flow_quantum(b, flow, q->flow_mode); } else if (flow->set == CAKE_SET_SPARSE_WAIT) { - struct cake_host *srchost = &b->hosts[flow->srchost]; - struct cake_host *dsthost = &b->hosts[flow->dsthost]; - /* this flow was empty, accounted as a sparse flow, but actually * in the bulk rotation. 
*/ @@ -1871,12 +1914,8 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, b->sparse_flow_count--; b->bulk_flow_count++; - if (cake_dsrc(q->flow_mode)) - srchost->srchost_bulk_flow_count++; - - if (cake_ddst(q->flow_mode)) - dsthost->dsthost_bulk_flow_count++; - + cake_inc_srchost_bulk_flow_count(b, flow, q->flow_mode); + cake_inc_dsthost_bulk_flow_count(b, flow, q->flow_mode); } if (q->buffer_used > q->buffer_max_used) @@ -1933,13 +1972,11 @@ static struct sk_buff *cake_dequeue(struct Qdisc *sch) { struct cake_sched_data *q = qdisc_priv(sch); struct cake_tin_data *b = &q->tins[q->cur_tin]; - struct cake_host *srchost, *dsthost; ktime_t now = ktime_get(); struct cake_flow *flow; struct list_head *head; bool first_flow = true; struct sk_buff *skb; - u16 host_load; u64 delay; u32 len; @@ -2039,11 +2076,6 @@ retry: q->cur_flow = flow - b->flows; first_flow = false; - /* triple isolation (modified DRR++) */ - srchost = &b->hosts[flow->srchost]; - dsthost = &b->hosts[flow->dsthost]; - host_load = 1; - /* flow isolation (DRR++) */ if (flow->deficit <= 0) { /* Keep all flows with deficits out of the sparse and decaying @@ -2055,11 +2087,8 @@ retry: b->sparse_flow_count--; b->bulk_flow_count++; - if (cake_dsrc(q->flow_mode)) - srchost->srchost_bulk_flow_count++; - - if (cake_ddst(q->flow_mode)) - dsthost->dsthost_bulk_flow_count++; + cake_inc_srchost_bulk_flow_count(b, flow, q->flow_mode); + cake_inc_dsthost_bulk_flow_count(b, flow, q->flow_mode); flow->set = CAKE_SET_BULK; } else { @@ -2071,19 +2100,7 @@ retry: } } - if (cake_dsrc(q->flow_mode)) - host_load = max(host_load, srchost->srchost_bulk_flow_count); - - if (cake_ddst(q->flow_mode)) - host_load = max(host_load, dsthost->dsthost_bulk_flow_count); - - WARN_ON(host_load > CAKE_QUEUES); - - /* The get_random_u16() is a way to apply dithering to avoid - * accumulating roundoff errors - */ - flow->deficit += (b->flow_quantum * quantum_div[host_load] + - get_random_u16()) >> 16; + flow->deficit += 
cake_get_flow_quantum(b, flow, q->flow_mode); list_move_tail(&flow->flowchain, &b->old_flows); goto retry; @@ -2107,11 +2124,8 @@ retry: if (flow->set == CAKE_SET_BULK) { b->bulk_flow_count--; - if (cake_dsrc(q->flow_mode)) - srchost->srchost_bulk_flow_count--; - - if (cake_ddst(q->flow_mode)) - dsthost->dsthost_bulk_flow_count--; + cake_dec_srchost_bulk_flow_count(b, flow, q->flow_mode); + cake_dec_dsthost_bulk_flow_count(b, flow, q->flow_mode); b->decaying_flow_count++; } else if (flow->set == CAKE_SET_SPARSE || @@ -2129,12 +2143,8 @@ retry: else if (flow->set == CAKE_SET_BULK) { b->bulk_flow_count--; - if (cake_dsrc(q->flow_mode)) - srchost->srchost_bulk_flow_count--; - - if (cake_ddst(q->flow_mode)) - dsthost->dsthost_bulk_flow_count--; - + cake_dec_srchost_bulk_flow_count(b, flow, q->flow_mode); + cake_dec_dsthost_bulk_flow_count(b, flow, q->flow_mode); } else b->decaying_flow_count--; From 426046e2d62dd19533808661e912b8e8a9eaec16 Mon Sep 17 00:00:00 2001 From: Parker Newman Date: Tue, 7 Jan 2025 16:24:59 -0500 Subject: [PATCH 38/57] net: stmmac: dwmac-tegra: Read iommu stream id from device tree Nvidia's Tegra MGBE controllers require the IOMMU "Stream ID" (SID) to be written to the MGBE_WRAP_AXI_ASID0_CTRL register. The current driver is hard coded to use MGBE0's SID for all controllers. This causes softirq time outs and kernel panics when using controllers other than MGBE0. Example dmesg errors when an ethernet cable is connected to MGBE1: [ 116.133290] tegra-mgbe 6910000.ethernet eth1: Link is Up - 1Gbps/Full - flow control rx/tx [ 121.851283] tegra-mgbe 6910000.ethernet eth1: NETDEV WATCHDOG: CPU: 5: transmit queue 0 timed out 5690 ms [ 121.851782] tegra-mgbe 6910000.ethernet eth1: Reset adapter. 
[ 121.892464] tegra-mgbe 6910000.ethernet eth1: Register MEM_TYPE_PAGE_POOL RxQ-0 [ 121.905920] tegra-mgbe 6910000.ethernet eth1: PHY [stmmac-1:00] driver [Aquantia AQR113] (irq=171) [ 121.907356] tegra-mgbe 6910000.ethernet eth1: Enabling Safety Features [ 121.907578] tegra-mgbe 6910000.ethernet eth1: IEEE 1588-2008 Advanced Timestamp supported [ 121.908399] tegra-mgbe 6910000.ethernet eth1: registered PTP clock [ 121.908582] tegra-mgbe 6910000.ethernet eth1: configuring for phy/10gbase-r link mode [ 125.961292] tegra-mgbe 6910000.ethernet eth1: Link is Up - 1Gbps/Full - flow control rx/tx [ 181.921198] rcu: INFO: rcu_preempt detected stalls on CPUs/tasks: [ 181.921404] rcu: 7-....: (1 GPs behind) idle=540c/1/0x4000000000000002 softirq=1748/1749 fqs=2337 [ 181.921684] rcu: (detected by 4, t=6002 jiffies, g=1357, q=1254 ncpus=8) [ 181.921878] Sending NMI from CPU 4 to CPUs 7: [ 181.921886] NMI backtrace for cpu 7 [ 181.922131] CPU: 7 UID: 0 PID: 0 Comm: swapper/7 Kdump: loaded Not tainted 6.13.0-rc3+ #6 [ 181.922390] Hardware name: NVIDIA CTI Forge + Orin AGX/Jetson, BIOS 202402.1-Unknown 10/28/2024 [ 181.922658] pstate: 40400009 (nZcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 181.922847] pc : handle_softirqs+0x98/0x368 [ 181.922978] lr : __do_softirq+0x18/0x20 [ 181.923095] sp : ffff80008003bf50 [ 181.923189] x29: ffff80008003bf50 x28: 0000000000000008 x27: 0000000000000000 [ 181.923379] x26: ffffce78ea277000 x25: 0000000000000000 x24: 0000001c61befda0 [ 181.924486] x23: 0000000060400009 x22: ffffce78e99918bc x21: ffff80008018bd70 [ 181.925568] x20: ffffce78e8bb00d8 x19: ffff80008018bc20 x18: 0000000000000000 [ 181.926655] x17: ffff318ebe7d3000 x16: ffff800080038000 x15: 0000000000000000 [ 181.931455] x14: ffff000080816680 x13: ffff318ebe7d3000 x12: 000000003464d91d [ 181.938628] x11: 0000000000000040 x10: ffff000080165a70 x9 : ffffce78e8bb0160 [ 181.945804] x8 : ffff8000827b3160 x7 : f9157b241586f343 x6 : eeb6502a01c81c74 [ 181.953068] x5 : a4acfcdd2e8096bb x4 : 
ffffce78ea277340 x3 : 00000000ffffd1e1 [ 181.960329] x2 : 0000000000000101 x1 : ffffce78ea277340 x0 : ffff318ebe7d3000 [ 181.967591] Call trace: [ 181.970043] handle_softirqs+0x98/0x368 (P) [ 181.974240] __do_softirq+0x18/0x20 [ 181.977743] ____do_softirq+0x14/0x28 [ 181.981415] call_on_irq_stack+0x24/0x30 [ 181.985180] do_softirq_own_stack+0x20/0x30 [ 181.989379] __irq_exit_rcu+0x114/0x140 [ 181.993142] irq_exit_rcu+0x14/0x28 [ 181.996816] el1_interrupt+0x44/0xb8 [ 182.000316] el1h_64_irq_handler+0x14/0x20 [ 182.004343] el1h_64_irq+0x80/0x88 [ 182.007755] cpuidle_enter_state+0xc4/0x4a8 (P) [ 182.012305] cpuidle_enter+0x3c/0x58 [ 182.015980] cpuidle_idle_call+0x128/0x1c0 [ 182.020005] do_idle+0xe0/0xf0 [ 182.023155] cpu_startup_entry+0x3c/0x48 [ 182.026917] secondary_start_kernel+0xdc/0x120 [ 182.031379] __secondary_switched+0x74/0x78 [ 212.971162] rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: { 7-.... } 6103 jiffies s: 417 root: 0x80/. [ 212.985935] rcu: blocking rcu_node structures (internal RCU debug): [ 212.992758] Sending NMI from CPU 0 to CPUs 7: [ 212.998539] NMI backtrace for cpu 7 [ 213.004304] CPU: 7 UID: 0 PID: 0 Comm: swapper/7 Kdump: loaded Not tainted 6.13.0-rc3+ #6 [ 213.016116] Hardware name: NVIDIA CTI Forge + Orin AGX/Jetson, BIOS 202402.1-Unknown 10/28/2024 [ 213.030817] pstate: 40400009 (nZcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 213.040528] pc : handle_softirqs+0x98/0x368 [ 213.046563] lr : __do_softirq+0x18/0x20 [ 213.051293] sp : ffff80008003bf50 [ 213.055839] x29: ffff80008003bf50 x28: 0000000000000008 x27: 0000000000000000 [ 213.067304] x26: ffffce78ea277000 x25: 0000000000000000 x24: 0000001c61befda0 [ 213.077014] x23: 0000000060400009 x22: ffffce78e99918bc x21: ffff80008018bd70 [ 213.087339] x20: ffffce78e8bb00d8 x19: ffff80008018bc20 x18: 0000000000000000 [ 213.097313] x17: ffff318ebe7d3000 x16: ffff800080038000 x15: 0000000000000000 [ 213.107201] x14: ffff000080816680 x13: ffff318ebe7d3000 x12: 000000003464d91d [ 
213.116651] x11: 0000000000000040 x10: ffff000080165a70 x9 : ffffce78e8bb0160 [ 213.127500] x8 : ffff8000827b3160 x7 : 0a37b344852820af x6 : 3f049caedd1ff608 [ 213.138002] x5 : cff7cfdbfaf31291 x4 : ffffce78ea277340 x3 : 00000000ffffde04 [ 213.150428] x2 : 0000000000000101 x1 : ffffce78ea277340 x0 : ffff318ebe7d3000 [ 213.162063] Call trace: [ 213.165494] handle_softirqs+0x98/0x368 (P) [ 213.171256] __do_softirq+0x18/0x20 [ 213.177291] ____do_softirq+0x14/0x28 [ 213.182017] call_on_irq_stack+0x24/0x30 [ 213.186565] do_softirq_own_stack+0x20/0x30 [ 213.191815] __irq_exit_rcu+0x114/0x140 [ 213.196891] irq_exit_rcu+0x14/0x28 [ 213.202401] el1_interrupt+0x44/0xb8 [ 213.207741] el1h_64_irq_handler+0x14/0x20 [ 213.213519] el1h_64_irq+0x80/0x88 [ 213.217541] cpuidle_enter_state+0xc4/0x4a8 (P) [ 213.224364] cpuidle_enter+0x3c/0x58 [ 213.228653] cpuidle_idle_call+0x128/0x1c0 [ 213.233993] do_idle+0xe0/0xf0 [ 213.237928] cpu_startup_entry+0x3c/0x48 [ 213.243791] secondary_start_kernel+0xdc/0x120 [ 213.249830] __secondary_switched+0x74/0x78 This bug has existed since the dwmac-tegra driver was added in Dec 2022 (See Fixes tag below for commit hash). The Tegra234 SOC has 4 MGBE controllers, however Nvidia's Developer Kit only uses MGBE0 which is why the bug was not found previously. Connect Tech has many products that use 2 (or more) MGBE controllers. The solution is to read the controller's SID from the existing "iommus" device tree property. The 2nd field of the "iommus" device tree property is the controller's SID. Device tree snippet from tegra234.dtsi showing MGBE1's "iommus" property: smmu_niso0: iommu@12000000 { compatible = "nvidia,tegra234-smmu", "nvidia,smmu-500"; ... } /* MGBE1 */ ethernet@6900000 { compatible = "nvidia,tegra234-mgbe"; ... iommus = <&smmu_niso0 TEGRA234_SID_MGBE_VF1>; ... } Nvidia's arm-smmu driver reads the "iommus" property and stores the SID in the MGBE device's "fwspec" struct. 
The dwmac-tegra driver can access the SID using the tegra_dev_iommu_get_stream_id() helper function found in linux/iommu.h. Calling tegra_dev_iommu_get_stream_id() should not fail unless the "iommus" property is removed from the device tree or the IOMMU is disabled. While the Tegra234 SOC technically supports bypassing the IOMMU, it is not supported by the current firmware, has not been tested and not recommended. More detailed discussion with Thierry Reding from Nvidia linked below. Fixes: d8ca113724e7 ("net: stmmac: tegra: Add MGBE support") Link: https://lore.kernel.org/netdev/cover.1731685185.git.pnewman@connecttech.com Signed-off-by: Parker Newman Reviewed-by: Andrew Lunn Acked-by: Thierry Reding Link: https://patch.msgid.link/6fb97f32cf4accb4f7cf92846f6b60064ba0a3bd.1736284360.git.pnewman@connecttech.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c index 3827997d2132..dc903b846b1b 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only +#include #include #include #include @@ -19,6 +20,8 @@ struct tegra_mgbe { struct reset_control *rst_mac; struct reset_control *rst_pcs; + u32 iommu_sid; + void __iomem *hv; void __iomem *regs; void __iomem *xpcs; @@ -50,7 +53,6 @@ struct tegra_mgbe { #define MGBE_WRAP_COMMON_INTR_ENABLE 0x8704 #define MAC_SBD_INTR BIT(2) #define MGBE_WRAP_AXI_ASID0_CTRL 0x8400 -#define MGBE_SID 0x6 static int __maybe_unused tegra_mgbe_suspend(struct device *dev) { @@ -84,7 +86,7 @@ static int __maybe_unused tegra_mgbe_resume(struct device *dev) writel(MAC_SBD_INTR, mgbe->regs + MGBE_WRAP_COMMON_INTR_ENABLE); /* Program SID */ - writel(MGBE_SID, mgbe->hv + MGBE_WRAP_AXI_ASID0_CTRL); + writel(mgbe->iommu_sid, 
mgbe->hv + MGBE_WRAP_AXI_ASID0_CTRL); value = readl(mgbe->xpcs + XPCS_WRAP_UPHY_STATUS); if ((value & XPCS_WRAP_UPHY_STATUS_TX_P_UP) == 0) { @@ -241,6 +243,12 @@ static int tegra_mgbe_probe(struct platform_device *pdev) if (IS_ERR(mgbe->xpcs)) return PTR_ERR(mgbe->xpcs); + /* get controller's stream id from iommu property in device tree */ + if (!tegra_dev_iommu_get_stream_id(mgbe->dev, &mgbe->iommu_sid)) { + dev_err(mgbe->dev, "failed to get iommu stream id\n"); + return -EINVAL; + } + res.addr = mgbe->regs; res.irq = irq; @@ -346,7 +354,7 @@ static int tegra_mgbe_probe(struct platform_device *pdev) writel(MAC_SBD_INTR, mgbe->regs + MGBE_WRAP_COMMON_INTR_ENABLE); /* Program SID */ - writel(MGBE_SID, mgbe->hv + MGBE_WRAP_AXI_ASID0_CTRL); + writel(mgbe->iommu_sid, mgbe->hv + MGBE_WRAP_AXI_ASID0_CTRL); plat->flags |= STMMAC_FLAG_SERDES_UP_AFTER_PHY_LINKUP; From 2055272e3ae01a954e41a5afb437c5d76f758e0b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 8 Jan 2025 12:15:53 +0300 Subject: [PATCH 39/57] rtase: Fix a check for error in rtase_alloc_msix() The pci_irq_vector() function never returns zero. It returns negative error codes or a positive non-zero IRQ number. Fix the error checking to test for negatives. 
Fixes: a36e9f5cfe9e ("rtase: Add support for a pci table in this module") Signed-off-by: Dan Carpenter Reviewed-by: Simon Horman Reviewed-by: Kalesh AP Link: https://patch.msgid.link/f2ecc88d-af13-4651-9820-7cc665230019@stanley.mountain Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/rtase/rtase_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/realtek/rtase/rtase_main.c b/drivers/net/ethernet/realtek/rtase/rtase_main.c index de7f11232593..c42c0516656b 100644 --- a/drivers/net/ethernet/realtek/rtase/rtase_main.c +++ b/drivers/net/ethernet/realtek/rtase/rtase_main.c @@ -1827,7 +1827,7 @@ static int rtase_alloc_msix(struct pci_dev *pdev, struct rtase_private *tp) for (i = 0; i < tp->int_nums; i++) { irq = pci_irq_vector(pdev, i); - if (!irq) { + if (irq < 0) { pci_disable_msix(pdev); return irq; } From 0e2909c6bec9048f49d0c8e16887c63b50b14647 Mon Sep 17 00:00:00 2001 From: Chenguang Zhao Date: Wed, 8 Jan 2025 11:00:09 +0800 Subject: [PATCH 40/57] net/mlx5: Fix variable not being completed when function returns When cmd_alloc_index() fails, cmd_work_handler() needs to complete ent->slotted before returning early. Otherwise the task which issued the command may hang: mlx5_core 0000:01:00.0: cmd_work_handler:877:(pid 3880418): failed to allocate command entry INFO: task kworker/13:2:4055883 blocked for more than 120 seconds. Not tainted 4.19.90-25.44.v2101.ky10.aarch64 #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. 
kworker/13:2 D 0 4055883 2 0x00000228 Workqueue: events mlx5e_tx_dim_work [mlx5_core] Call trace: __switch_to+0xe8/0x150 __schedule+0x2a8/0x9b8 schedule+0x2c/0x88 schedule_timeout+0x204/0x478 wait_for_common+0x154/0x250 wait_for_completion+0x28/0x38 cmd_exec+0x7a0/0xa00 [mlx5_core] mlx5_cmd_exec+0x54/0x80 [mlx5_core] mlx5_core_modify_cq+0x6c/0x80 [mlx5_core] mlx5_core_modify_cq_moderation+0xa0/0xb8 [mlx5_core] mlx5e_tx_dim_work+0x54/0x68 [mlx5_core] process_one_work+0x1b0/0x448 worker_thread+0x54/0x468 kthread+0x134/0x138 ret_from_fork+0x10/0x18 Fixes: 485d65e13571 ("net/mlx5: Add a timeout to acquire the command queue semaphore") Signed-off-by: Chenguang Zhao Reviewed-by: Moshe Shemesh Acked-by: Tariq Toukan Link: https://patch.msgid.link/20250108030009.68520-1-zhaochenguang@kylinos.cn Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index 6bd8a18e3af3..e733b81e18a2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -1013,6 +1013,7 @@ static void cmd_work_handler(struct work_struct *work) complete(&ent->done); } up(&cmd->vars.sem); + complete(&ent->slotted); return; } } else { From d58200966ed7985be48d342e99a5e81bc481821c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 8 Jan 2025 07:52:35 -0800 Subject: [PATCH 41/57] MAINTAINERS: mark Synopsys DW XPCS as Orphan There's not much review support from Jose; there was a sharp drop in his participation around 4 years ago. The DW XPCS IP is very popular and the driver requires active maintenance. 
gitdm missingmaints says: Subsystem SYNOPSYS DESIGNWARE ETHERNET XPCS DRIVER Changes 33 / 94 (35%) (No activity) Top reviewers: [16]: andrew@lunn.ch [12]: vladimir.oltean@nxp.com [2]: f.fainelli@gmail.com INACTIVE MAINTAINER Jose Abreu Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250108155242.2575530-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- CREDITS | 4 ++++ MAINTAINERS | 3 +-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/CREDITS b/CREDITS index b1777b53c63a..2a5f5f49269f 100644 --- a/CREDITS +++ b/CREDITS @@ -20,6 +20,10 @@ N: Thomas Abraham E: thomas.ab@samsung.com D: Samsung pin controller driver +N: Jose Abreu +E: jose.abreu@synopsys.com +D: Synopsys DesignWare XPCS MDIO/PCS driver. + N: Dragos Acostachioaie E: dragos@iname.com W: http://www.arbornet.org/~dragos diff --git a/MAINTAINERS b/MAINTAINERS index 52378f994294..d92c8cd9f805 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -22737,9 +22737,8 @@ S: Supported F: drivers/net/ethernet/synopsys/ SYNOPSYS DESIGNWARE ETHERNET XPCS DRIVER -M: Jose Abreu L: netdev@vger.kernel.org -S: Supported +S: Orphan F: drivers/net/pcs/pcs-xpcs.c F: drivers/net/pcs/pcs-xpcs.h F: include/linux/pcs/pcs-xpcs.h From b506668613ef9138cac7479a5dd47559835b6552 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 8 Jan 2025 07:52:36 -0800 Subject: [PATCH 42/57] MAINTAINERS: update maintainers for Microchip LAN78xx Woojung Huh seems to have only replied to the list 35 times in the last 5 years, and didn't provide any reviews in 3 years. The LAN78XX driver has seen quite a bit of activity lately. gitdm missingmaints says: Subsystem USB LAN78XX ETHERNET DRIVER Changes 35 / 91 (38%) (No activity) Top reviewers: [23]: andrew@lunn.ch [3]: horms@kernel.org [2]: mateusz.polchlopek@intel.com INACTIVE MAINTAINER Woojung Huh Move Woojung to CREDITS and add new maintainers who are more likely to review LAN78xx patches. 
Acked-by: Woojung Huh Acked-by: Rengarajan Sundararajan Link: https://patch.msgid.link/20250108155242.2575530-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- CREDITS | 4 ++++ MAINTAINERS | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CREDITS b/CREDITS index 2a5f5f49269f..7a5332907ef0 100644 --- a/CREDITS +++ b/CREDITS @@ -1816,6 +1816,10 @@ D: Author/maintainer of most DRM drivers (especially ATI, MGA) D: Core DRM templates, general DRM and 3D-related hacking S: No fixed address +N: Woojung Huh +E: woojung.huh@microchip.com +D: Microchip LAN78XX USB Ethernet driver + N: Kenn Humborg E: kenn@wombat.ie D: Mods to loop device to support sparse backing files diff --git a/MAINTAINERS b/MAINTAINERS index d92c8cd9f805..58fb3c81c735 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -24252,7 +24252,8 @@ F: Documentation/devicetree/bindings/usb/nxp,isp1760.yaml F: drivers/usb/isp1760/* USB LAN78XX ETHERNET DRIVER -M: Woojung Huh +M: Thangaraj Samynathan +M: Rengarajan Sundararajan M: UNGLinuxDriver@microchip.com L: netdev@vger.kernel.org S: Maintained From e049fb86d39139050bb792b17ef86c3918cc8068 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 8 Jan 2025 07:52:37 -0800 Subject: [PATCH 43/57] MAINTAINERS: remove Andy Gospodarek from bonding Andy does not participate much in bonding reviews, unfortunately. Move him to CREDITS. 
gitdm missingmaints says: Subsystem BONDING DRIVER Changes 149 / 336 (44%) Last activity: 2024-09-05 Jay Vosburgh : Tags 68db604e16d5 2024-09-05 00:00:00 8 Andy Gospodarek : Top reviewers: [65]: jay.vosburgh@canonical.com [23]: liuhangbin@gmail.com [16]: razor@blackwall.org INACTIVE MAINTAINER Andy Gospodarek Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20250108155242.2575530-4-kuba@kernel.org Signed-off-by: Jakub Kicinski --- CREDITS | 4 ++++ MAINTAINERS | 1 - 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CREDITS b/CREDITS index 7a5332907ef0..cda68f04d5f1 100644 --- a/CREDITS +++ b/CREDITS @@ -1432,6 +1432,10 @@ S: 8124 Constitution Apt. 7 S: Sterling Heights, Michigan 48313 S: USA +N: Andy Gospodarek +E: andy@greyhouse.net +D: Maintenance and contributions to the network interface bonding driver. + N: Wolfgang Grandegger E: wg@grandegger.com D: Controller Area Network (device drivers) diff --git a/MAINTAINERS b/MAINTAINERS index 58fb3c81c735..c518bda0215f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4058,7 +4058,6 @@ F: net/bluetooth/ BONDING DRIVER M: Jay Vosburgh -M: Andy Gospodarek L: netdev@vger.kernel.org S: Maintained F: Documentation/networking/bonding.rst From 03868822c553e549ac5c28781c29f80bddee5487 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 8 Jan 2025 07:52:38 -0800 Subject: [PATCH 44/57] MAINTAINERS: mark stmmac ethernet as an Orphan I tried a couple of things to reinvigorate the stmmac maintainers over the last few years but with little effect. The maintainers are not active; let the MAINTAINERS file reflect reality. The Synopsys IP this driver supports is very popular, so we need a solid maintainer to deal with the complexity of the driver. 
gitdm missingmaints says: Subsystem STMMAC ETHERNET DRIVER Changes 344 / 978 (35%) Last activity: 2020-05-01 Alexandre Torgue : Tags 1bb694e20839 2020-05-01 00:00:00 1 Jose Abreu : Top reviewers: [75]: horms@kernel.org [49]: andrew@lunn.ch [46]: fancer.lancer@gmail.com INACTIVE MAINTAINER Jose Abreu Acked-by: Alexandre Torgue Link: https://patch.msgid.link/20250108155242.2575530-5-kuba@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index c518bda0215f..955092ed27d6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -22500,11 +22500,8 @@ F: Documentation/devicetree/bindings/phy/st,stm32mp25-combophy.yaml F: drivers/phy/st/phy-stm32-combophy.c STMMAC ETHERNET DRIVER -M: Alexandre Torgue -M: Jose Abreu L: netdev@vger.kernel.org -S: Supported -W: http://www.stlinux.com +S: Orphan F: Documentation/networking/device_drivers/ethernet/stmicro/ F: drivers/net/ethernet/stmicro/stmmac/ From 9d7b1191d030bb0f6932722755b1103a2207421d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 8 Jan 2025 07:52:39 -0800 Subject: [PATCH 45/57] MAINTAINERS: remove Mark Lee from MediaTek Ethernet The mailing lists have seen no email from Mark Lee in the last 4 years. 
gitdm missingmaints says: Subsystem MEDIATEK ETHERNET DRIVER Changes 103 / 400 (25%) Last activity: 2024-12-19 Felix Fietkau : Author 88806efc034a 2024-10-17 00:00:00 44 Tags 88806efc034a 2024-10-17 00:00:00 51 Sean Wang : Tags a5d75538295b 2020-04-07 00:00:00 1 Mark Lee : Lorenzo Bianconi : Author 0c7469ee718e 2024-12-19 00:00:00 123 Tags 0c7469ee718e 2024-12-19 00:00:00 139 Top reviewers: [32]: horms@kernel.org [15]: leonro@nvidia.com [9]: andrew@lunn.ch INACTIVE MAINTAINER Mark Lee Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250108155242.2575530-6-kuba@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 955092ed27d6..82157f7e01e4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14565,7 +14565,6 @@ F: drivers/dma/mediatek/ MEDIATEK ETHERNET DRIVER M: Felix Fietkau M: Sean Wang -M: Mark Lee M: Lorenzo Bianconi L: netdev@vger.kernel.org S: Maintained From d4782fbab1c06fe1a3b1e064d2d6efd3e281e805 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 8 Jan 2025 07:52:40 -0800 Subject: [PATCH 46/57] MAINTAINERS: remove Ying Xue from TIPC There is a steady stream of fixes for TIPC, even though the development has slowed down a lot. 
Over the last 2 years we have merged almost 70 TIPC patches, but we haven't heard from Ying Xue once: Subsystem TIPC NETWORK LAYER Changes 42 / 69 (60%) Last activity: 2023-10-04 Jon Maloy : Tags 08e50cf07184 2023-10-04 00:00:00 6 Ying Xue : Top reviewers: [9]: horms@kernel.org [8]: tung.q.nguyen@dektech.com.au [4]: jiri@nvidia.com [3]: tung.q.nguyen@endava.com [2]: kuniyu@amazon.com INACTIVE MAINTAINER Ying Xue Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250108155242.2575530-7-kuba@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 82157f7e01e4..b060b1fe8762 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -23641,7 +23641,6 @@ F: tools/testing/selftests/timers/ TIPC NETWORK LAYER M: Jon Maloy -M: Ying Xue L: netdev@vger.kernel.org (core kernel code) L: tipc-discussion@lists.sourceforge.net (user apps, general discussion) S: Maintained From d95e2cc737017de537fc07cfc7d59307182bd0bc Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 8 Jan 2025 07:52:41 -0800 Subject: [PATCH 47/57] MAINTAINERS: remove Noam Dagan from AMAZON ETHERNET Noam Dagan was added to ENA reviewers in 2021, but we have not seen a single email from this person to any list, ever (according to lore). Git history mentions the name in 2 SoB tags from 2020. 
Acked-by: Arthur Kiyanovski Link: https://patch.msgid.link/20250108155242.2575530-8-kuba@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index b060b1fe8762..4996219d95b2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -949,7 +949,6 @@ AMAZON ETHERNET DRIVERS M: Shay Agroskin M: Arthur Kiyanovski R: David Arinzon -R: Noam Dagan R: Saeed Bishara L: netdev@vger.kernel.org S: Supported From d9e03c6ffc4cd92c99418afc970ea8c8c53c66a8 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 8 Jan 2025 07:52:42 -0800 Subject: [PATCH 48/57] MAINTAINERS: remove Lars Povlsen from Microchip Sparx5 SoC We have not seen emails or tags from Lars in almost 4 years. Steen and Daniel are pretty active, but the review coverage isn't stellar (35% of changes go in without a review tag). Subsystem ARM/Microchip Sparx5 SoC support Changes 28 / 79 (35%) Last activity: 2024-11-24 Lars Povlsen : Steen Hegelund : Tags 6c7c4b91aa43 2024-04-08 00:00:00 15 Daniel Machon : Author 48ba00da2eb4 2024-04-09 00:00:00 2 Tags f164b296638d 2024-11-24 00:00:00 6 Top reviewers: [7]: horms@kernel.org [1]: jacob.e.keller@intel.com [1]: jensemil.schulzostergaard@microchip.com [1]: horatiu.vultur@microchip.com INACTIVE MAINTAINER Lars Povlsen Acked-by: Daniel Machon Link: https://patch.msgid.link/20250108155242.2575530-9-kuba@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 4996219d95b2..4e93a00df185 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2689,7 +2689,6 @@ N: at91 N: atmel ARM/Microchip Sparx5 SoC support -M: Lars Povlsen M: Steen Hegelund M: Daniel Machon M: UNGLinuxDriver@microchip.com From 771ec78dc8b48d562e6015bb535ed3cd37043d78 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 8 Jan 2025 16:34:29 +0100 Subject: [PATCH 49/57] mptcp: sysctl: avail sched: remove write access 'net.mptcp.available_schedulers' sysctl 
knob is there to list available schedulers, not to modify this list. There are then no reasons to give write access to it. Nothing would have been written anyway, but no errors would have been returned, which is unexpected. Fixes: 73c900aa3660 ("mptcp: add net.mptcp.available_schedulers") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250108-net-sysctl-current-nsproxy-v1-1-5df34b2083e8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/ctrl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index 38d8121331d4..d9b57fab2a13 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -228,7 +228,7 @@ static struct ctl_table mptcp_sysctl_table[] = { { .procname = "available_schedulers", .maxlen = MPTCP_SCHED_BUF_MAX, - .mode = 0644, + .mode = 0444, .proc_handler = proc_available_schedulers, }, { From d38e26e36206ae3d544d496513212ae931d1da0a Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 8 Jan 2025 16:34:30 +0100 Subject: [PATCH 50/57] mptcp: sysctl: sched: avoid using current->nsproxy Using the 'net' structure via 'current' is not recommended for different reasons. First, if the goal is to use it to read or write per-netns data, this is inconsistent with how the "generic" sysctl entries are doing: directly by only using pointers set to the table entry, e.g. table->data. Linked to that, the per-netns data should always be obtained from the table linked to the netns it had been created for, which may not coincide with the reader's or writer's netns. Another reason is that access to current->nsproxy->netns can oops if attempted when current->nsproxy had been dropped when the current task is exiting. 
This is what syzbot found, when using acct(2): Oops: general protection fault, probably for non-canonical address 0xdffffc0000000005: 0000 [#1] PREEMPT SMP KASAN PTI KASAN: null-ptr-deref in range [0x0000000000000028-0x000000000000002f] CPU: 1 UID: 0 PID: 5924 Comm: syz-executor Not tainted 6.13.0-rc5-syzkaller-00004-gccb98ccef0e5 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 RIP: 0010:proc_scheduler+0xc6/0x3c0 net/mptcp/ctrl.c:125 Code: 03 42 80 3c 38 00 0f 85 fe 02 00 00 4d 8b a4 24 08 09 00 00 48 b8 00 00 00 00 00 fc ff df 49 8d 7c 24 28 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 cc 02 00 00 4d 8b 7c 24 28 48 8d 84 24 c8 00 00 RSP: 0018:ffffc900034774e8 EFLAGS: 00010206 RAX: dffffc0000000000 RBX: 1ffff9200068ee9e RCX: ffffc90003477620 RDX: 0000000000000005 RSI: ffffffff8b08f91e RDI: 0000000000000028 RBP: 0000000000000001 R08: ffffc90003477710 R09: 0000000000000040 R10: 0000000000000040 R11: 00000000726f7475 R12: 0000000000000000 R13: ffffc90003477620 R14: ffffc90003477710 R15: dffffc0000000000 FS: 0000000000000000(0000) GS:ffff8880b8700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fee3cd452d8 CR3: 000000007d116000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: proc_sys_call_handler+0x403/0x5d0 fs/proc/proc_sysctl.c:601 __kernel_write_iter+0x318/0xa80 fs/read_write.c:612 __kernel_write+0xf6/0x140 fs/read_write.c:632 do_acct_process+0xcb0/0x14a0 kernel/acct.c:539 acct_pin_kill+0x2d/0x100 kernel/acct.c:192 pin_kill+0x194/0x7c0 fs/fs_pin.c:44 mnt_pin_kill+0x61/0x1e0 fs/fs_pin.c:81 cleanup_mnt+0x3ac/0x450 fs/namespace.c:1366 task_work_run+0x14e/0x250 kernel/task_work.c:239 exit_task_work include/linux/task_work.h:43 [inline] do_exit+0xad8/0x2d70 kernel/exit.c:938 do_group_exit+0xd3/0x2a0 kernel/exit.c:1087 get_signal+0x2576/0x2610 kernel/signal.c:3017 
arch_do_signal_or_restart+0x90/0x7e0 arch/x86/kernel/signal.c:337 exit_to_user_mode_loop kernel/entry/common.c:111 [inline] exit_to_user_mode_prepare include/linux/entry-common.h:329 [inline] __syscall_exit_to_user_mode_work kernel/entry/common.c:207 [inline] syscall_exit_to_user_mode+0x150/0x2a0 kernel/entry/common.c:218 do_syscall_64+0xda/0x250 arch/x86/entry/common.c:89 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7fee3cb87a6a Code: Unable to access opcode bytes at 0x7fee3cb87a40. RSP: 002b:00007fffcccac688 EFLAGS: 00000202 ORIG_RAX: 0000000000000037 RAX: 0000000000000000 RBX: 00007fffcccac710 RCX: 00007fee3cb87a6a RDX: 0000000000000041 RSI: 0000000000000000 RDI: 0000000000000003 RBP: 0000000000000003 R08: 00007fffcccac6ac R09: 00007fffcccacac7 R10: 00007fffcccac710 R11: 0000000000000202 R12: 00007fee3cd49500 R13: 00007fffcccac6ac R14: 0000000000000000 R15: 00007fee3cd4b000 Modules linked in: ---[ end trace 0000000000000000 ]--- RIP: 0010:proc_scheduler+0xc6/0x3c0 net/mptcp/ctrl.c:125 Code: 03 42 80 3c 38 00 0f 85 fe 02 00 00 4d 8b a4 24 08 09 00 00 48 b8 00 00 00 00 00 fc ff df 49 8d 7c 24 28 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 cc 02 00 00 4d 8b 7c 24 28 48 8d 84 24 c8 00 00 RSP: 0018:ffffc900034774e8 EFLAGS: 00010206 RAX: dffffc0000000000 RBX: 1ffff9200068ee9e RCX: ffffc90003477620 RDX: 0000000000000005 RSI: ffffffff8b08f91e RDI: 0000000000000028 RBP: 0000000000000001 R08: ffffc90003477710 R09: 0000000000000040 R10: 0000000000000040 R11: 00000000726f7475 R12: 0000000000000000 R13: ffffc90003477620 R14: ffffc90003477710 R15: dffffc0000000000 FS: 0000000000000000(0000) GS:ffff8880b8700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fee3cd452d8 CR3: 000000007d116000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 ---------------- Code disassembly (best guess), 1 bytes skipped: 0: 42 80 3c 38 00 cmpb 
$0x0,(%rax,%r15,1) 5: 0f 85 fe 02 00 00 jne 0x309 b: 4d 8b a4 24 08 09 00 mov 0x908(%r12),%r12 12: 00 13: 48 b8 00 00 00 00 00 movabs $0xdffffc0000000000,%rax 1a: fc ff df 1d: 49 8d 7c 24 28 lea 0x28(%r12),%rdi 22: 48 89 fa mov %rdi,%rdx 25: 48 c1 ea 03 shr $0x3,%rdx * 29: 80 3c 02 00 cmpb $0x0,(%rdx,%rax,1) <-- trapping instruction 2d: 0f 85 cc 02 00 00 jne 0x2ff 33: 4d 8b 7c 24 28 mov 0x28(%r12),%r15 38: 48 rex.W 39: 8d .byte 0x8d 3a: 84 24 c8 test %ah,(%rax,%rcx,8) Here with 'net.mptcp.scheduler', the 'net' structure is not really needed, because the table->data already has a pointer to the current scheduler, the only thing needed from the per-netns data. Simply use 'data', instead of getting (most of the time) the same thing, but from a longer and indirect way. Fixes: 6963c508fd7a ("mptcp: only allow set existing scheduler for net.mptcp.scheduler") Cc: stable@vger.kernel.org Reported-by: syzbot+e364f774c6f57f2c86d1@syzkaller.appspotmail.com Closes: https://lore.kernel.org/67769ecb.050a0220.3a8527.003f.GAE@google.com Suggested-by: Al Viro Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250108-net-sysctl-current-nsproxy-v1-2-5df34b2083e8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/ctrl.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index d9b57fab2a13..81c30aa02196 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -102,16 +102,15 @@ static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet) } #ifdef CONFIG_SYSCTL -static int mptcp_set_scheduler(const struct net *net, const char *name) +static int mptcp_set_scheduler(char *scheduler, const char *name) { - struct mptcp_pernet *pernet = mptcp_get_pernet(net); struct mptcp_sched_ops *sched; int ret = 0; rcu_read_lock(); sched = mptcp_sched_find(name); if (sched) - strscpy(pernet->scheduler, name, MPTCP_SCHED_NAME_MAX); + strscpy(scheduler, name, MPTCP_SCHED_NAME_MAX); else ret 
= -ENOENT; rcu_read_unlock(); @@ -122,7 +121,7 @@ static int mptcp_set_scheduler(const struct net *net, const char *name) static int proc_scheduler(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { - const struct net *net = current->nsproxy->net_ns; + char (*scheduler)[MPTCP_SCHED_NAME_MAX] = ctl->data; char val[MPTCP_SCHED_NAME_MAX]; struct ctl_table tbl = { .data = val, @@ -130,11 +129,11 @@ static int proc_scheduler(const struct ctl_table *ctl, int write, }; int ret; - strscpy(val, mptcp_get_scheduler(net), MPTCP_SCHED_NAME_MAX); + strscpy(val, *scheduler, MPTCP_SCHED_NAME_MAX); ret = proc_dostring(&tbl, write, buffer, lenp, ppos); if (write && ret == 0) - ret = mptcp_set_scheduler(net, val); + ret = mptcp_set_scheduler(*scheduler, val); return ret; } From 92cf7a51bdae24a32c592adcdd59a773ae149289 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 8 Jan 2025 16:34:31 +0100 Subject: [PATCH 51/57] mptcp: sysctl: blackhole timeout: avoid using current->nsproxy As mentioned in the previous commit, using the 'net' structure via 'current' is not recommended for different reasons: - Inconsistency: getting info from the reader's/writer's netns vs only from the opener's netns. - current->nsproxy can be NULL in some cases, resulting in an 'Oops' (null-ptr-deref), e.g. when the current task is exiting, as spotted by syzbot [1] using acct(2). The 'pernet' structure can be obtained from the table->data using container_of(). 
Fixes: 27069e7cb3d1 ("mptcp: disable active MPTCP in case of blackhole") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/67769ecb.050a0220.3a8527.003f.GAE@google.com [1] Suggested-by: Al Viro Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250108-net-sysctl-current-nsproxy-v1-3-5df34b2083e8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/ctrl.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index 81c30aa02196..b0dd008e2114 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -160,7 +160,9 @@ static int proc_blackhole_detect_timeout(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - struct mptcp_pernet *pernet = mptcp_get_pernet(current->nsproxy->net_ns); + struct mptcp_pernet *pernet = container_of(table->data, + struct mptcp_pernet, + blackhole_timeout); int ret; ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); From ea62dd1383913b5999f3d16ae99d411f41b528d4 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 8 Jan 2025 16:34:32 +0100 Subject: [PATCH 52/57] sctp: sysctl: cookie_hmac_alg: avoid using current->nsproxy As mentioned in a previous commit of this series, using the 'net' structure via 'current' is not recommended for different reasons: - Inconsistency: getting info from the reader's/writer's netns vs only from the opener's netns. - current->nsproxy can be NULL in some cases, resulting in an 'Oops' (null-ptr-deref), e.g. when the current task is exiting, as spotted by syzbot [1] using acct(2). The 'net' structure can be obtained from the table->data using container_of(). Note that table->data could also be used directly, as this is the only member needed from the 'net' structure, but that would increase the size of this fix, to use '*data' everywhere 'net->sctp.sctp_hmac_alg' is used. 
Fixes: 3c68198e7511 ("sctp: Make hmac algorithm selection for cookie generation dynamic") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/67769ecb.050a0220.3a8527.003f.GAE@google.com [1] Suggested-by: Al Viro Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250108-net-sysctl-current-nsproxy-v1-4-5df34b2083e8@kernel.org Signed-off-by: Jakub Kicinski --- net/sctp/sysctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index e5a5af343c4c..9848d19630a4 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -387,7 +387,8 @@ static struct ctl_table sctp_net_table[] = { static int proc_sctp_do_hmac_alg(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { - struct net *net = current->nsproxy->net_ns; + struct net *net = container_of(ctl->data, struct net, + sctp.sctp_hmac_alg); struct ctl_table tbl; bool changed = false; char *none = "none"; From 9fc17b76fc70763780aa78b38fcf4742384044a5 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 8 Jan 2025 16:34:33 +0100 Subject: [PATCH 53/57] sctp: sysctl: rto_min/max: avoid using current->nsproxy As mentioned in a previous commit of this series, using the 'net' structure via 'current' is not recommended for different reasons: - Inconsistency: getting info from the reader's/writer's netns vs only from the opener's netns. - current->nsproxy can be NULL in some cases, resulting in an 'Oops' (null-ptr-deref), e.g. when the current task is exiting, as spotted by syzbot [1] using acct(2). The 'net' structure can be obtained from the table->data using container_of(). Note that table->data could also be used directly, as this is the only member needed from the 'net' structure, but that would increase the size of this fix, to use '*data' everywhere 'net->sctp.rto_min/max' is used. 
Fixes: 4f3fdf3bc59c ("sctp: add check rto_min and rto_max in sysctl") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/67769ecb.050a0220.3a8527.003f.GAE@google.com [1] Suggested-by: Al Viro Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250108-net-sysctl-current-nsproxy-v1-5-5df34b2083e8@kernel.org Signed-off-by: Jakub Kicinski --- net/sctp/sysctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 9848d19630a4..a5285815264d 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -433,7 +433,7 @@ static int proc_sctp_do_hmac_alg(const struct ctl_table *ctl, int write, static int proc_sctp_do_rto_min(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { - struct net *net = current->nsproxy->net_ns; + struct net *net = container_of(ctl->data, struct net, sctp.rto_min); unsigned int min = *(unsigned int *) ctl->extra1; unsigned int max = *(unsigned int *) ctl->extra2; struct ctl_table tbl; @@ -461,7 +461,7 @@ static int proc_sctp_do_rto_min(const struct ctl_table *ctl, int write, static int proc_sctp_do_rto_max(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { - struct net *net = current->nsproxy->net_ns; + struct net *net = container_of(ctl->data, struct net, sctp.rto_max); unsigned int min = *(unsigned int *) ctl->extra1; unsigned int max = *(unsigned int *) ctl->extra2; struct ctl_table tbl; From 15649fd5415eda664ef35780c2013adeb5d9c695 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 8 Jan 2025 16:34:34 +0100 Subject: [PATCH 54/57] sctp: sysctl: auth_enable: avoid using current->nsproxy As mentioned in a previous commit of this series, using the 'net' structure via 'current' is not recommended for different reasons: - Inconsistency: getting info from the reader's/writer's netns vs only from the opener's netns. 
- current->nsproxy can be NULL in some cases, resulting in an 'Oops' (null-ptr-deref), e.g. when the current task is exiting, as spotted by syzbot [1] using acct(2). The 'net' structure can be obtained from the table->data using container_of(). Note that table->data could also be used directly, but that would increase the size of this fix, while 'sctp.ctl_sock' still needs to be retrieved from 'net' structure. Fixes: b14878ccb7fa ("net: sctp: cache auth_enable per endpoint") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/67769ecb.050a0220.3a8527.003f.GAE@google.com [1] Suggested-by: Al Viro Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250108-net-sysctl-current-nsproxy-v1-6-5df34b2083e8@kernel.org Signed-off-by: Jakub Kicinski --- net/sctp/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index a5285815264d..9d29611621fe 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -499,7 +499,7 @@ static int proc_sctp_do_alpha_beta(const struct ctl_table *ctl, int write, static int proc_sctp_do_auth(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { - struct net *net = current->nsproxy->net_ns; + struct net *net = container_of(ctl->data, struct net, sctp.auth_enable); struct ctl_table tbl; int new_value, ret; From c10377bbc1972d858eaf0ab366a311b39f8ef1b6 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 8 Jan 2025 16:34:35 +0100 Subject: [PATCH 55/57] sctp: sysctl: udp_port: avoid using current->nsproxy As mentioned in a previous commit of this series, using the 'net' structure via 'current' is not recommended for different reasons: - Inconsistency: getting info from the reader's/writer's netns vs only from the opener's netns. - current->nsproxy can be NULL in some cases, resulting in an 'Oops' (null-ptr-deref), e.g. when the current task is exiting, as spotted by syzbot [1] using acct(2). 
The 'net' structure can be obtained from the table->data using container_of(). Note that table->data could also be used directly, but that would increase the size of this fix, while 'sctp.ctl_sock' still needs to be retrieved from 'net' structure. Fixes: 046c052b475e ("sctp: enable udp tunneling socks") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/67769ecb.050a0220.3a8527.003f.GAE@google.com [1] Suggested-by: Al Viro Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250108-net-sysctl-current-nsproxy-v1-7-5df34b2083e8@kernel.org Signed-off-by: Jakub Kicinski --- net/sctp/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 9d29611621fe..18fa4f44e8ec 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -528,7 +528,7 @@ static int proc_sctp_do_auth(const struct ctl_table *ctl, int write, static int proc_sctp_do_udp_port(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { - struct net *net = current->nsproxy->net_ns; + struct net *net = container_of(ctl->data, struct net, sctp.udp_port); unsigned int min = *(unsigned int *)ctl->extra1; unsigned int max = *(unsigned int *)ctl->extra2; struct ctl_table tbl; From 6259d2484d0ceff42245d1f09cc8cb6ee72d847a Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 8 Jan 2025 16:34:36 +0100 Subject: [PATCH 56/57] sctp: sysctl: plpmtud_probe_interval: avoid using current->nsproxy As mentioned in a previous commit of this series, using the 'net' structure via 'current' is not recommended for different reasons: - Inconsistency: getting info from the reader's/writer's netns vs only from the opener's netns. - current->nsproxy can be NULL in some cases, resulting in an 'Oops' (null-ptr-deref), e.g. when the current task is exiting, as spotted by syzbot [1] using acct(2). The 'net' structure can be obtained from the table->data using container_of(). 
Note that table->data could also be used directly, as this is the only member needed from the 'net' structure, but that would increase the size of this fix, because '*data' would then have to be used everywhere 'net->sctp.probe_interval' is used. Fixes: d1e462a7a5f3 ("sctp: add probe_interval in sysctl and sock/asoc/transport") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/67769ecb.050a0220.3a8527.003f.GAE@google.com [1] Suggested-by: Al Viro Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250108-net-sysctl-current-nsproxy-v1-8-5df34b2083e8@kernel.org Signed-off-by: Jakub Kicinski --- net/sctp/sysctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 18fa4f44e8ec..8e1e97be4df7 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -569,7 +569,8 @@ static int proc_sctp_do_udp_port(const struct ctl_table *ctl, int write, static int proc_sctp_do_probe_interval(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { - struct net *net = current->nsproxy->net_ns; + struct net *net = container_of(ctl->data, struct net, + sctp.probe_interval); struct ctl_table tbl; int ret, new_value; From 7f5611cbc4871c7fb1ad36c2e5a9edad63dca95c Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 8 Jan 2025 16:34:37 +0100 Subject: [PATCH 57/57] rds: sysctl: rds_tcp_{rcv,snd}buf: avoid using current->nsproxy As mentioned in a previous commit of this series, using the 'net' structure via 'current' is not recommended for different reasons: - Inconsistency: getting info from the reader's/writer's netns vs only from the opener's netns. - current->nsproxy can be NULL in some cases, resulting in an 'Oops' (null-ptr-deref), e.g. when the current task is exiting, as spotted by syzbot [1] using acct(2). The per-netns structure can be obtained from table->data using container_of(), then the 'net' one can be retrieved from the listen socket (if available).
Fixes: c6a58ffed536 ("RDS: TCP: Add sysctl tunables for sndbuf/rcvbuf on rds-tcp socket") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/67769ecb.050a0220.3a8527.003f.GAE@google.com [1] Suggested-by: Al Viro Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250108-net-sysctl-current-nsproxy-v1-9-5df34b2083e8@kernel.org Signed-off-by: Jakub Kicinski --- net/rds/tcp.c | 39 ++++++++++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 351ac1747224..0581c53e6517 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -61,8 +61,10 @@ static atomic_t rds_tcp_unloading = ATOMIC_INIT(0); static struct kmem_cache *rds_tcp_conn_slab; -static int rds_tcp_skbuf_handler(const struct ctl_table *ctl, int write, - void *buffer, size_t *lenp, loff_t *fpos); +static int rds_tcp_sndbuf_handler(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *fpos); +static int rds_tcp_rcvbuf_handler(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *fpos); static int rds_tcp_min_sndbuf = SOCK_MIN_SNDBUF; static int rds_tcp_min_rcvbuf = SOCK_MIN_RCVBUF; @@ -74,7 +76,7 @@ static struct ctl_table rds_tcp_sysctl_table[] = { /* data is per-net pointer */ .maxlen = sizeof(int), .mode = 0644, - .proc_handler = rds_tcp_skbuf_handler, + .proc_handler = rds_tcp_sndbuf_handler, .extra1 = &rds_tcp_min_sndbuf, }, #define RDS_TCP_RCVBUF 1 @@ -83,7 +85,7 @@ static struct ctl_table rds_tcp_sysctl_table[] = { /* data is per-net pointer */ .maxlen = sizeof(int), .mode = 0644, - .proc_handler = rds_tcp_skbuf_handler, + .proc_handler = rds_tcp_rcvbuf_handler, .extra1 = &rds_tcp_min_rcvbuf, }, }; @@ -682,10 +684,10 @@ static void rds_tcp_sysctl_reset(struct net *net) spin_unlock_irq(&rds_tcp_conn_lock); } -static int rds_tcp_skbuf_handler(const struct ctl_table *ctl, int write, +static int rds_tcp_skbuf_handler(struct rds_tcp_net *rtn, + const struct ctl_table *ctl, 
int write, void *buffer, size_t *lenp, loff_t *fpos) { - struct net *net = current->nsproxy->net_ns; int err; err = proc_dointvec_minmax(ctl, write, buffer, lenp, fpos); @@ -694,11 +696,34 @@ static int rds_tcp_skbuf_handler(const struct ctl_table *ctl, int write, *(int *)(ctl->extra1)); return err; } - if (write) + + if (write && rtn->rds_tcp_listen_sock && rtn->rds_tcp_listen_sock->sk) { + struct net *net = sock_net(rtn->rds_tcp_listen_sock->sk); + rds_tcp_sysctl_reset(net); + } + return 0; } +static int rds_tcp_sndbuf_handler(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *fpos) +{ + struct rds_tcp_net *rtn = container_of(ctl->data, struct rds_tcp_net, + sndbuf_size); + + return rds_tcp_skbuf_handler(rtn, ctl, write, buffer, lenp, fpos); +} + +static int rds_tcp_rcvbuf_handler(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *fpos) +{ + struct rds_tcp_net *rtn = container_of(ctl->data, struct rds_tcp_net, + rcvbuf_size); + + return rds_tcp_skbuf_handler(rtn, ctl, write, buffer, lenp, fpos); +} + static void rds_tcp_exit(void) { rds_tcp_set_unloading();