Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next

Alexei Starovoitov says:

====================
pull-request: bpf-next 2018-01-19

The following pull-request contains BPF updates for your *net-next* tree.

The main changes are:

1) bpf array map HW offload, from Jakub.

2) support for bpf_get_next_key() for LPM map, from Yonghong.

3) test_verifier now runs loaded programs, from Alexei.

4) xdp cpumap monitoring, from Jesper.

5) variety of tests, cleanups and small x64 JIT optimization, from Daniel.

6) user space can now retrieve HW JITed program, from Jiong.

Note there is a minor conflict between Russell's arm32 JIT fixes and
removal of bpf_jit_enable variable by Daniel which should be resolved by
keeping Russell's comment and removing that variable.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
2026-05-01 15:00:59 -07:00 · 2018-01-20 22:03:46 -05:00
parent 8565d26bcb 1391040b65
commit ea9722e265
44 changed files with 1835 additions and 229 deletions
@@ -25,8 +25,6 @@

 #include "bpf_jit_32.h"

-int bpf_jit_enable __read_mostly;
-
 /*
 * eBPF prog stack layout:
 *
@@ -31,8 +31,6 @@

 #include "bpf_jit.h"

-int bpf_jit_enable __read_mostly;
-
 #define TMP_REG_1 (MAX_BPF_JIT_REG + 0)
 #define TMP_REG_2 (MAX_BPF_JIT_REG + 1)
 #define TCALL_CNT (MAX_BPF_JIT_REG + 2)
@@ -1207,8 +1207,6 @@ jmp_cmp:
 	return 0;
 }

-int bpf_jit_enable __read_mostly;
-
 void bpf_jit_compile(struct bpf_prog *fp)
 {
 	struct jit_ctx ctx;
@@ -177,8 +177,6 @@ static u32 b_imm(unsigned int tgt, struct jit_ctx *ctx)
 		(ctx->idx * 4) - 4;
 }

-int bpf_jit_enable __read_mostly;
-
 enum which_ebpf_reg {
 	src_reg,
 	src_reg_no_fp,
@@ -18,8 +18,6 @@

 #include "bpf_jit32.h"

-int bpf_jit_enable __read_mostly;
-
 static inline void bpf_flush_icache(void *start, void *end)
 {
 	smp_wmb();
@@ -21,8 +21,6 @@

 #include "bpf_jit64.h"

-int bpf_jit_enable __read_mostly;
-
 static void bpf_jit_fill_ill_insns(void *area, unsigned int size)
 {
 	memset32(area, BREAKPOINT_INSTRUCTION, size/4);
@@ -28,8 +28,6 @@
 #include <asm/set_memory.h>
 #include "bpf_jit.h"

-int bpf_jit_enable __read_mostly;
-
 struct bpf_jit {
 	u32 seen;		/* Flags to remember seen eBPF instructions */
 	u32 seen_reg[16];	/* Array to remember which registers are used */
@@ -11,8 +11,6 @@

 #include "bpf_jit_32.h"

-int bpf_jit_enable __read_mostly;
-
 static inline bool is_simm13(unsigned int value)
 {
 	return value + 0x1000 < 0x2000;
@@ -12,8 +12,6 @@

 #include "bpf_jit_64.h"

-int bpf_jit_enable __read_mostly;
-
 static inline bool is_simm13(unsigned int value)
 {
 	return value + 0x1000 < 0x2000;
@@ -15,8 +15,6 @@
 #include <asm/set_memory.h>
 #include <linux/bpf.h>

-int bpf_jit_enable __read_mostly;
-
 /*
 * assembly code in arch/x86/net/bpf_jit.S
 */
@@ -154,6 +152,11 @@ static bool is_ereg(u32 reg)
 			     BIT(BPF_REG_AX));
 }

+/* Report whether 'reg' is BPF_REG_0; the ALU immediate emitter uses this
+ * to pick the short opcode form that is only valid for eax/rax.
+ */
+static bool is_axreg(u32 reg)
+{
+	if (reg != BPF_REG_0)
+		return false;
+
+	return true;
+}
+
 /* add modifiers if 'reg' maps to x64 registers r8..r15 */
 static u8 add_1mod(u8 byte, u32 reg)
 {
@@ -447,16 +450,36 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 			else if (is_ereg(dst_reg))
 				EMIT1(add_1mod(0x40, dst_reg));

+			/* b3 holds 'normal' opcode, b2 short form only valid
+			 * in case dst is eax/rax.
+			 */
 			switch (BPF_OP(insn->code)) {
-			case BPF_ADD: b3 = 0xC0; break;
-			case BPF_SUB: b3 = 0xE8; break;
-			case BPF_AND: b3 = 0xE0; break;
-			case BPF_OR: b3 = 0xC8; break;
-			case BPF_XOR: b3 = 0xF0; break;
+			case BPF_ADD:
+				b3 = 0xC0;
+				b2 = 0x05;
+				break;
+			case BPF_SUB:
+				b3 = 0xE8;
+				b2 = 0x2D;
+				break;
+			case BPF_AND:
+				b3 = 0xE0;
+				b2 = 0x25;
+				break;
+			case BPF_OR:
+				b3 = 0xC8;
+				b2 = 0x0D;
+				break;
+			case BPF_XOR:
+				b3 = 0xF0;
+				b2 = 0x35;
+				break;
 			}

 			if (is_imm8(imm32))
 				EMIT3(0x83, add_1reg(b3, dst_reg), imm32);
+			else if (is_axreg(dst_reg))
+				EMIT1_off32(b2, imm32);
 			else
 				EMIT2_off32(0x81, add_1reg(b3, dst_reg), imm32);
 			break;
@@ -157,7 +157,14 @@ nfp_bpf_cmsg_wait_reply(struct nfp_app_bpf *bpf, enum nfp_bpf_cmsg_type type,
 			int tag)
 {
 	struct sk_buff *skb;
-	int err;
+	int i, err;
+
+	for (i = 0; i < 50; i++) {
+		udelay(4);
+		skb = nfp_bpf_reply(bpf, tag);
+		if (skb)
+			return skb;
+	}

 	err = wait_event_interruptible_timeout(bpf->cmsg_wq,
 					       skb = nfp_bpf_reply(bpf, tag),
@@ -127,6 +127,7 @@ static int nfp_bpf_translate(struct nfp_net *nn, struct bpf_prog *prog)
 	struct nfp_prog *nfp_prog = prog->aux->offload->dev_priv;
 	unsigned int stack_size;
 	unsigned int max_instr;
+	int err;

 	stack_size = nn_readb(nn, NFP_NET_CFG_BPF_STACK_SZ) * 64;
 	if (prog->aux->stack_depth > stack_size) {
@@ -143,7 +144,14 @@ static int nfp_bpf_translate(struct nfp_net *nn, struct bpf_prog *prog)
 	if (!nfp_prog->prog)
 		return -ENOMEM;

-	return nfp_bpf_jit(nfp_prog);
+	err = nfp_bpf_jit(nfp_prog);
+	if (err)
+		return err;
+
+	prog->aux->offload->jited_len = nfp_prog->prog_len * sizeof(u64);
+	prog->aux->offload->jited_image = nfp_prog->prog;
+
+	return 0;
 }

 static int nfp_bpf_destroy(struct nfp_net *nn, struct bpf_prog *prog)
@@ -168,6 +176,8 @@ nfp_bpf_map_get_next_key(struct bpf_offloaded_map *offmap,
 static int
 nfp_bpf_map_delete_elem(struct bpf_offloaded_map *offmap, void *key)
 {
+	/* Array maps reject deletion with -EINVAL; every other map type is
+	 * handed to nfp_bpf_ctrl_del_entry().
+	 */
+	if (offmap->map.map_type == BPF_MAP_TYPE_ARRAY)
+		return -EINVAL;
 	return nfp_bpf_ctrl_del_entry(offmap, key);
 }

@@ -17,6 +17,7 @@
 #include <linux/bpf_verifier.h>
 #include <linux/debugfs.h>
 #include <linux/kernel.h>
+#include <linux/mutex.h>
 #include <linux/rtnetlink.h>
 #include <net/pkt_cls.h>

@@ -31,6 +32,19 @@ struct nsim_bpf_bound_prog {
 	struct list_head l;
 };

+#define NSIM_BPF_MAX_KEYS		2
+
+/* Per-map device state that netdevsim keeps for one offloaded map. */
+struct nsim_bpf_bound_map {
+	struct netdevsim *ns;		/* device the map was allocated on */
+	struct bpf_offloaded_map *map;	/* back-pointer to the offloaded map */
+	struct mutex mutex;		/* held by every map op around entry[] */
+	/* Fixed table of slots; a NULL key marks a slot as unused. */
+	struct nsim_map_entry {
+		void *key;	/* kmalloc'ed buffer of map.key_size bytes */
+		void *value;	/* kmalloc'ed buffer of map.value_size bytes */
+	} entry[NSIM_BPF_MAX_KEYS];
+	struct list_head l;		/* node on ns->bpf_bound_maps */
+};
+
 static int nsim_debugfs_bpf_string_read(struct seq_file *file, void *data)
 {
 	const char **str = file->private;
@@ -284,6 +298,224 @@ nsim_setup_prog_hw_checks(struct netdevsim *ns, struct netdev_bpf *bpf)
 	return 0;
 }

+/* True when slot @e is in use and its key equals @key over
+ * map->key_size bytes.
+ */
+static bool
+nsim_map_key_match(struct bpf_map *map, struct nsim_map_entry *e, void *key)
+{
+	if (!e->key)
+		return false;
+
+	return memcmp(key, e->key, map->key_size) == 0;
+}
+
+/* Return the index of the slot holding @key, or -ENOENT if no slot
+ * matches.
+ */
+static int nsim_map_key_find(struct bpf_offloaded_map *offmap, void *key)
+{
+	struct nsim_bpf_bound_map *nmap = offmap->dev_priv;
+	unsigned int slot = 0;
+
+	while (slot < ARRAY_SIZE(nmap->entry)) {
+		if (nsim_map_key_match(&offmap->map, &nmap->entry[slot], key))
+			return slot;
+		slot++;
+	}
+
+	return -ENOENT;
+}
+
+/* Allocate the key and value buffers of slot @idx.  On failure the slot
+ * is left without a key and -ENOMEM is returned.
+ */
+static int
+nsim_map_alloc_elem(struct bpf_offloaded_map *offmap, unsigned int idx)
+{
+	struct nsim_bpf_bound_map *nmap = offmap->dev_priv;
+	struct nsim_map_entry *slot = &nmap->entry[idx];
+
+	slot->key = kmalloc(offmap->map.key_size, GFP_USER);
+	if (!slot->key)
+		return -ENOMEM;
+
+	slot->value = kmalloc(offmap->map.value_size, GFP_USER);
+	if (slot->value)
+		return 0;
+
+	/* No value buffer: drop the key so the slot reads as unused. */
+	kfree(slot->key);
+	slot->key = NULL;
+	return -ENOMEM;
+}
+
+/* Copy into @next_key the key of the first used slot after the slot
+ * holding @key.  A NULL or unknown @key starts the scan at slot 0.
+ * Returns 0 on success or -ENOENT when no used slot remains.
+ */
+static int
+nsim_map_get_next_key(struct bpf_offloaded_map *offmap,
+		      void *key, void *next_key)
+{
+	struct nsim_bpf_bound_map *nmap = offmap->dev_priv;
+	int first = 0, ret = -ENOENT;
+	int idx;
+
+	mutex_lock(&nmap->mutex);
+
+	if (key) {
+		idx = nsim_map_key_find(offmap, key);
+		if (idx != -ENOENT)
+			first = idx + 1;
+	}
+
+	for (idx = first; idx < ARRAY_SIZE(nmap->entry); idx++) {
+		if (!nmap->entry[idx].key)
+			continue;
+
+		memcpy(next_key, nmap->entry[idx].key, offmap->map.key_size);
+		ret = 0;
+		break;
+	}
+
+	mutex_unlock(&nmap->mutex);
+
+	return ret;
+}
+
+/* Copy the value stored under @key into @value.  Returns 0 on success
+ * or the negative error from nsim_map_key_find() when the key is absent.
+ */
+static int
+nsim_map_lookup_elem(struct bpf_offloaded_map *offmap, void *key, void *value)
+{
+	struct nsim_bpf_bound_map *nmap = offmap->dev_priv;
+	int idx, err = 0;
+
+	mutex_lock(&nmap->mutex);
+
+	idx = nsim_map_key_find(offmap, key);
+	if (idx < 0)
+		err = idx;
+	else
+		memcpy(value, nmap->entry[idx].value, offmap->map.value_size);
+
+	mutex_unlock(&nmap->mutex);
+
+	return err;
+}
+
+/* Insert or overwrite the element stored under @key.
+ *
+ * BPF_EXIST fails with the lookup error when the key is absent and
+ * BPF_NOEXIST fails with -EEXIST when it is present.  A new key takes the
+ * first unused slot; -E2BIG is returned when every slot is in use and
+ * -ENOMEM when the slot buffers cannot be allocated.
+ */
+static int
+nsim_map_update_elem(struct bpf_offloaded_map *offmap,
+		     void *key, void *value, u64 flags)
+{
+	struct nsim_bpf_bound_map *nmap = offmap->dev_priv;
+	struct nsim_map_entry *slot;
+	int idx, err = 0;
+
+	mutex_lock(&nmap->mutex);
+
+	idx = nsim_map_key_find(offmap, key);
+	if (idx >= 0) {
+		if (flags == BPF_NOEXIST) {
+			err = -EEXIST;
+			goto exit_unlock;
+		}
+	} else {
+		if (flags == BPF_EXIST) {
+			err = idx;
+			goto exit_unlock;
+		}
+
+		/* Pick the first slot that has no key yet. */
+		idx = 0;
+		while (idx < ARRAY_SIZE(nmap->entry) && nmap->entry[idx].key)
+			idx++;
+		if (idx == ARRAY_SIZE(nmap->entry)) {
+			err = -E2BIG;
+			goto exit_unlock;
+		}
+
+		err = nsim_map_alloc_elem(offmap, idx);
+		if (err)
+			goto exit_unlock;
+	}
+
+	slot = &nmap->entry[idx];
+	memcpy(slot->key, key, offmap->map.key_size);
+	memcpy(slot->value, value, offmap->map.value_size);
+exit_unlock:
+	mutex_unlock(&nmap->mutex);
+
+	return err;
+}
+
+/* Remove the element stored under @key and free its buffers.  Array maps
+ * are rejected with -EINVAL; an absent key yields the lookup error.
+ */
+static int nsim_map_delete_elem(struct bpf_offloaded_map *offmap, void *key)
+{
+	struct nsim_bpf_bound_map *nmap = offmap->dev_priv;
+	int idx, err = 0;
+
+	if (offmap->map.map_type == BPF_MAP_TYPE_ARRAY)
+		return -EINVAL;
+
+	mutex_lock(&nmap->mutex);
+
+	idx = nsim_map_key_find(offmap, key);
+	if (idx < 0) {
+		err = idx;
+	} else {
+		struct nsim_map_entry *slot = &nmap->entry[idx];
+
+		kfree(slot->key);
+		kfree(slot->value);
+		/* Zeroing the slot clears the key, marking it unused. */
+		memset(slot, 0, sizeof(*slot));
+	}
+
+	mutex_unlock(&nmap->mutex);
+
+	return err;
+}
+
+/* Element operations installed as dev_ops on maps accepted by netdevsim. */
+static const struct bpf_map_dev_ops nsim_bpf_map_ops = {
+	.map_lookup_elem	= nsim_map_lookup_elem,
+	.map_update_elem	= nsim_map_update_elem,
+	.map_delete_elem	= nsim_map_delete_elem,
+	.map_get_next_key	= nsim_map_get_next_key,
+};
+
+/* Create the netdevsim state backing @offmap and link it to @ns.
+ *
+ * Only array and hash maps with no flags and at most NSIM_BPF_MAX_KEYS
+ * entries are accepted.  Array maps get every slot allocated up front
+ * with the slot index stored as its key.  Returns 0 on success, -EINVAL
+ * for an unsupported type or flags, and -ENOMEM when the map is too large
+ * or an allocation fails.
+ */
+static int
+nsim_bpf_map_alloc(struct netdevsim *ns, struct bpf_offloaded_map *offmap)
+{
+	struct nsim_bpf_bound_map *nmap;
+	unsigned int i;
+	int err;
+
+	if (WARN_ON(offmap->map.map_type != BPF_MAP_TYPE_ARRAY &&
+		    offmap->map.map_type != BPF_MAP_TYPE_HASH))
+		return -EINVAL;
+	if (offmap->map.max_entries > NSIM_BPF_MAX_KEYS)
+		return -ENOMEM;
+	if (offmap->map.map_flags)
+		return -EINVAL;
+
+	nmap = kzalloc(sizeof(*nmap), GFP_USER);
+	if (!nmap)
+		return -ENOMEM;
+
+	/* nsim_map_alloc_elem() reaches nmap through dev_priv, so it has to
+	 * be set before the slots are populated.
+	 */
+	offmap->dev_priv = nmap;
+	nmap->ns = ns;
+	nmap->map = offmap;
+	mutex_init(&nmap->mutex);
+
+	if (offmap->map.map_type == BPF_MAP_TYPE_ARRAY) {
+		for (i = 0; i < ARRAY_SIZE(nmap->entry); i++) {
+			u32 *key;
+
+			err = nsim_map_alloc_elem(offmap, i);
+			if (err)
+				goto err_free;
+			key = nmap->entry[i].key;
+			*key = i;
+		}
+	}
+
+	offmap->dev_ops = &nsim_bpf_map_ops;
+	list_add_tail(&nmap->l, &ns->bpf_bound_maps);
+
+	return 0;
+
+err_free:
+	/* nsim_map_alloc_elem() already undid slot i itself, so release
+	 * slots i-1 down to 0.  Testing before the decrement frees slot 0
+	 * and stops cleanly when i is 0 instead of wrapping the unsigned
+	 * index and walking far past the table.
+	 */
+	while (i--) {
+		kfree(nmap->entry[i].key);
+		kfree(nmap->entry[i].value);
+	}
+	kfree(nmap);
+	return err;
+}
+
+/* Release every slot buffer of @offmap's netdevsim state, unlink it from
+ * the device list and free the state itself.
+ */
+static void nsim_bpf_map_free(struct bpf_offloaded_map *offmap)
+{
+	struct nsim_bpf_bound_map *nmap = offmap->dev_priv;
+	struct nsim_map_entry *slot = nmap->entry;
+	struct nsim_map_entry *end = slot + ARRAY_SIZE(nmap->entry);
+
+	for (; slot != end; slot++) {
+		kfree(slot->key);
+		kfree(slot->value);
+	}
+
+	list_del_init(&nmap->l);
+	mutex_destroy(&nmap->mutex);
+	kfree(nmap);
+}
+
 int nsim_bpf(struct net_device *dev, struct netdev_bpf *bpf)
 {
 	struct netdevsim *ns = netdev_priv(dev);
@@ -328,6 +560,14 @@ int nsim_bpf(struct net_device *dev, struct netdev_bpf *bpf)
 			return err;

 		return nsim_xdp_set_prog(ns, bpf);
+	case BPF_OFFLOAD_MAP_ALLOC:
+		if (!ns->bpf_map_accept)
+			return -EOPNOTSUPP;
+
+		return nsim_bpf_map_alloc(ns, bpf->offmap);
+	case BPF_OFFLOAD_MAP_FREE:
+		nsim_bpf_map_free(bpf->offmap);
+		return 0;
 	default:
 		return -EINVAL;
 	}
@@ -336,6 +576,7 @@ int nsim_bpf(struct net_device *dev, struct netdev_bpf *bpf)
 int nsim_bpf_init(struct netdevsim *ns)
 {
 	INIT_LIST_HEAD(&ns->bpf_bound_progs);
+	INIT_LIST_HEAD(&ns->bpf_bound_maps);

 	debugfs_create_u32("bpf_offloaded_id", 0400, ns->ddir,
 			   &ns->bpf_offloaded_id);
@@ -362,12 +603,17 @@ int nsim_bpf_init(struct netdevsim *ns)
 	debugfs_create_bool("bpf_xdpoffload_accept", 0600, ns->ddir,
 			    &ns->bpf_xdpoffload_accept);

+	ns->bpf_map_accept = true;
+	debugfs_create_bool("bpf_map_accept", 0600, ns->ddir,
+			    &ns->bpf_map_accept);
+
 	return 0;
 }

+/* Emit a warning for each piece of BPF state still present on @ns: bound
+ * programs, bound maps, an XDP program or an offloaded program.
+ */
 void nsim_bpf_uninit(struct netdevsim *ns)
 {
 	WARN_ON(!list_empty(&ns->bpf_bound_progs));
+	WARN_ON(!list_empty(&ns->bpf_bound_maps));
 	WARN_ON(ns->xdp_prog);
 	WARN_ON(ns->bpf_offloaded);
 }
@@ -61,6 +61,9 @@ struct netdevsim {
 	bool bpf_tc_non_bound_accept;
 	bool bpf_xdpdrv_accept;
 	bool bpf_xdpoffload_accept;
+
+	bool bpf_map_accept;
+	struct list_head bpf_bound_maps;
 };

 extern struct dentry *nsim_ddir;
@@ -234,6 +234,8 @@ struct bpf_prog_offload {
 	struct list_head	offloads;
 	bool			dev_state;
 	const struct bpf_prog_offload_ops *dev_ops;
+	void			*jited_image;
+	u32			jited_len;
 };

 struct bpf_prog_aux {
@@ -584,6 +586,8 @@ void bpf_prog_offload_destroy(struct bpf_prog *prog);
 int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
 			       struct bpf_prog *prog);

+int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map);
+
 int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value);
 int bpf_map_offload_update_elem(struct bpf_map *map,
 				void *key, void *value, u64 flags);
@@ -17,7 +17,7 @@
 #define BPF_ALU64	0x07	/* alu mode in double word width */

 /* ld/ldx fields */
-#define BPF_DW		0x18	/* double word */
+#define BPF_DW		0x18	/* double word (64-bit) */
 #define BPF_XADD	0xc0	/* exclusive add */

 /* alu/jmp fields */
@@ -938,6 +938,9 @@ struct bpf_map_info {
 	__u32 max_entries;
 	__u32 map_flags;
 	char  name[BPF_OBJ_NAME_LEN];
+	__u32 ifindex;
+	__u64 netns_dev;
+	__u64 netns_ino;
 } __attribute__((aligned(8)));

 /* User bpf_sock_ops struct to access socket values and specify request ops
@@ -15,9 +15,10 @@

 /* ld/ldx fields */
 #define BPF_SIZE(code)  ((code) & 0x18)
-#define		BPF_W		0x00
-#define		BPF_H		0x08
-#define		BPF_B		0x10
+#define		BPF_W		0x00 /* 32-bit */
+#define		BPF_H		0x08 /* 16-bit */
+#define		BPF_B		0x10 /*  8-bit */
+/* eBPF		BPF_DW		0x18    64-bit */
 #define BPF_MODE(code)  ((code) & 0xe0)
 #define		BPF_IMM		0x00
 #define		BPF_ABS		0x20
@@ -49,6 +49,27 @@ static int bpf_array_alloc_percpu(struct bpf_array *array)
 }

 /* Called from syscall */
+static int array_map_alloc_check(union bpf_attr *attr)
+{
+	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
+	int numa_node = bpf_map_attr_numa_node(attr);
+
+	/* Array maps need at least one entry, a 4-byte key and a
+	 * non-empty value.
+	 */
+	if (attr->max_entries == 0 || attr->key_size != 4 ||
+	    attr->value_size == 0)
+		return -EINVAL;
+
+	/* Reject creation flags outside the supported mask. */
+	if (attr->map_flags & ~ARRAY_CREATE_FLAG_MASK)
+		return -EINVAL;
+
+	/* A per-cpu array cannot be given an explicit NUMA node. */
+	if (percpu && numa_node != NUMA_NO_NODE)
+		return -EINVAL;
+
+	/* With a larger value_size user space would not be able to access
+	 * the elements.
+	 */
+	if (attr->value_size > KMALLOC_MAX_SIZE)
+		return -E2BIG;
+
+	return 0;
+}
+
 static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 {
 	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
@@ -58,19 +79,6 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 	struct bpf_array *array;
 	u64 array_size, mask64;

-	/* check sanity of attributes */
-	if (attr->max_entries == 0 || attr->key_size != 4 ||
-	    attr->value_size == 0 ||
-	    attr->map_flags & ~ARRAY_CREATE_FLAG_MASK ||
-	    (percpu && numa_node != NUMA_NO_NODE))
-		return ERR_PTR(-EINVAL);
-
-	if (attr->value_size > KMALLOC_MAX_SIZE)
-		/* if value_size is bigger, the user space won't be able to
-		 * access the elements.
-		 */
-		return ERR_PTR(-E2BIG);
-
 	elem_size = round_up(attr->value_size, 8);

 	max_entries = attr->max_entries;
@@ -112,12 +120,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 	array->map.unpriv_array = unpriv;

 	/* copy mandatory map attributes */
-	array->map.map_type = attr->map_type;
-	array->map.key_size = attr->key_size;
-	array->map.value_size = attr->value_size;
-	array->map.max_entries = attr->max_entries;
-	array->map.map_flags = attr->map_flags;
-	array->map.numa_node = numa_node;
+	bpf_map_init_from_attr(&array->map, attr);
 	array->elem_size = elem_size;

 	if (!percpu)
@@ -327,6 +330,7 @@ static void array_map_free(struct bpf_map *map)
 }

 const struct bpf_map_ops array_map_ops = {
+	.map_alloc_check = array_map_alloc_check,
 	.map_alloc = array_map_alloc,
 	.map_free = array_map_free,
 	.map_get_next_key = array_map_get_next_key,
@@ -337,6 +341,7 @@ const struct bpf_map_ops array_map_ops = {
 };

 const struct bpf_map_ops percpu_array_map_ops = {
+	.map_alloc_check = array_map_alloc_check,
 	.map_alloc = array_map_alloc,
 	.map_free = array_map_free,
 	.map_get_next_key = array_map_get_next_key,
@@ -345,12 +350,12 @@ const struct bpf_map_ops percpu_array_map_ops = {
 	.map_delete_elem = array_map_delete_elem,
 };

+/* Attribute check for fd-holding array maps: value_size must be exactly
+ * sizeof(u32), after which the generic array checks are applied.
+ */
-static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr)
+static int fd_array_map_alloc_check(union bpf_attr *attr)
 {
 	/* only file descriptors can be stored in this type of map */
 	if (attr->value_size != sizeof(u32))
-		return ERR_PTR(-EINVAL);
-	return array_map_alloc(attr);
+		return -EINVAL;
+	return array_map_alloc_check(attr);
 }

 static void fd_array_map_free(struct bpf_map *map)
@@ -474,7 +479,8 @@ void bpf_fd_array_map_clear(struct bpf_map *map)
 }

 const struct bpf_map_ops prog_array_map_ops = {
-	.map_alloc = fd_array_map_alloc,
+	.map_alloc_check = fd_array_map_alloc_check,
+	.map_alloc = array_map_alloc,
 	.map_free = fd_array_map_free,
 	.map_get_next_key = array_map_get_next_key,
 	.map_lookup_elem = fd_array_map_lookup_elem,
@@ -561,7 +567,8 @@ static void perf_event_fd_array_release(struct bpf_map *map,
 }

 const struct bpf_map_ops perf_event_array_map_ops = {
-	.map_alloc = fd_array_map_alloc,
+	.map_alloc_check = fd_array_map_alloc_check,
+	.map_alloc = array_map_alloc,
 	.map_free = fd_array_map_free,
 	.map_get_next_key = array_map_get_next_key,
 	.map_lookup_elem = fd_array_map_lookup_elem,
@@ -592,7 +599,8 @@ static void cgroup_fd_array_free(struct bpf_map *map)
 }

 const struct bpf_map_ops cgroup_array_map_ops = {
-	.map_alloc = fd_array_map_alloc,
+	.map_alloc_check = fd_array_map_alloc_check,
+	.map_alloc = array_map_alloc,
 	.map_free = cgroup_fd_array_free,
 	.map_get_next_key = array_map_get_next_key,
 	.map_lookup_elem = fd_array_map_lookup_elem,
@@ -610,7 +618,7 @@ static struct bpf_map *array_of_map_alloc(union bpf_attr *attr)
 	if (IS_ERR(inner_map_meta))
 		return inner_map_meta;

-	map = fd_array_map_alloc(attr);
+	map = array_map_alloc(attr);
 	if (IS_ERR(map)) {
 		bpf_map_meta_free(inner_map_meta);
 		return map;
@@ -673,6 +681,7 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map,
 }

 const struct bpf_map_ops array_of_maps_map_ops = {
+	.map_alloc_check = fd_array_map_alloc_check,
 	.map_alloc = array_of_map_alloc,
 	.map_free = array_of_map_free,
 	.map_get_next_key = array_map_get_next_key,
@@ -300,6 +300,11 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
 }

 #ifdef CONFIG_BPF_JIT
+/* All BPF JIT sysctl knobs here. */
+int bpf_jit_enable   __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON);
+int bpf_jit_harden   __read_mostly;
+int bpf_jit_kallsyms __read_mostly;
+
 static __always_inline void
 bpf_get_prog_addr_region(const struct bpf_prog *prog,
 			 unsigned long *symbol_start,
@@ -381,8 +386,6 @@ static DEFINE_SPINLOCK(bpf_lock);
 static LIST_HEAD(bpf_kallsyms);
 static struct latch_tree_root bpf_tree __cacheline_aligned;

-int bpf_jit_kallsyms __read_mostly;
-
 static void bpf_prog_ksym_node_add(struct bpf_prog_aux *aux)
 {
 	WARN_ON_ONCE(!list_empty(&aux->ksym_lnode));
@@ -563,8 +566,6 @@ void __weak bpf_jit_free(struct bpf_prog *fp)
 	bpf_prog_unlock_free(fp);
 }

-int bpf_jit_harden __read_mostly;
-
 static int bpf_jit_blind_insn(const struct bpf_insn *from,
 			      const struct bpf_insn *aux,
 			      struct bpf_insn *to_buff)
@@ -1379,9 +1380,13 @@ void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth)
 }

 #else
+/* Placeholder that bpf_prog_select_runtime() assigns to fp->bpf_func in
+ * the build variant without the interpreters[] table; it ignores both
+ * arguments, warns once and returns 0.
+ */
-static unsigned int __bpf_prog_ret0(const void *ctx,
-				    const struct bpf_insn *insn)
+static unsigned int __bpf_prog_ret0_warn(const void *ctx,
+					 const struct bpf_insn *insn)
 {
+	/* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON
+	 * is not working properly, so warn about it!
+	 */
+	WARN_ON_ONCE(1);
 	return 0;
 }
 #endif
@@ -1441,7 +1446,7 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)

 	fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1];
 #else
-	fp->bpf_func = __bpf_prog_ret0;
+	fp->bpf_func = __bpf_prog_ret0_warn;
 #endif

 	/* eBPF JITs can rewrite the program in case constant
@@ -591,9 +591,100 @@ unlock:
 	raw_spin_unlock(&trie->lock);
 }

-static int trie_get_next_key(struct bpf_map *map, void *key, void *next_key)
+static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
 {
-	return -ENOTSUPP;
+	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
+	struct bpf_lpm_trie_key *key = _key, *next_key = _next_key;
+	struct lpm_trie_node *node, *next_node = NULL, *parent;
+	struct lpm_trie_node **node_stack = NULL;
+	struct lpm_trie_node __rcu **root;
+	int err = 0, stack_ptr = -1;
+	unsigned int next_bit;
+	size_t matchlen;
+
+	/* The get_next_key follows postorder. For the 4 node example in
+	 * the top of this file, the trie_get_next_key() returns the following
+	 * one after another:
+	 *   192.168.0.0/24
+	 *   192.168.1.0/24
+	 *   192.168.128.0/24
+	 *   192.168.0.0/16
+	 *
+	 * The idea is to return more specific keys before less specific ones.
+	 */
+
+	/* Empty trie */
+	if (!rcu_dereference(trie->root))
+		return -ENOENT;
+
+	/* For invalid key, find the leftmost node in the trie */
+	if (!key || key->prefixlen > trie->max_prefixlen) {
+		root = &trie->root;
+		goto find_leftmost;
+	}
+
+	node_stack = kmalloc(trie->max_prefixlen * sizeof(struct lpm_trie_node *),
+			     GFP_USER | __GFP_NOWARN);
+	if (!node_stack)
+		return -ENOMEM;
+
+	/* Try to find the exact node for the given key */
+	for (node = rcu_dereference(trie->root); node;) {
+		node_stack[++stack_ptr] = node;
+		matchlen = longest_prefix_match(trie, node, key);
+		if (node->prefixlen != matchlen ||
+		    node->prefixlen == key->prefixlen)
+			break;
+
+		next_bit = extract_bit(key->data, node->prefixlen);
+		node = rcu_dereference(node->child[next_bit]);
+	}
+	if (!node || node->prefixlen != key->prefixlen ||
+	    (node->flags & LPM_TREE_NODE_FLAG_IM)) {
+		root = &trie->root;
+		goto find_leftmost;
+	}
+
+	/* The node with the exactly-matching key has been found,
+	 * find the first node in postorder after the matched node.
+	 */
+	node = node_stack[stack_ptr];
+	while (stack_ptr > 0) {
+		parent = node_stack[stack_ptr - 1];
+		if (rcu_dereference(parent->child[0]) == node &&
+		    rcu_dereference(parent->child[1])) {
+			root = &parent->child[1];
+			goto find_leftmost;
+		}
+		if (!(parent->flags & LPM_TREE_NODE_FLAG_IM)) {
+			next_node = parent;
+			goto do_copy;
+		}
+
+		node = parent;
+		stack_ptr--;
+	}
+
+	/* did not find anything */
+	err = -ENOENT;
+	goto free_stack;
+
+find_leftmost:
+	/* Find the leftmost non-intermediate node, all intermediate nodes
+	 * have exact two children, so this function will never return NULL.
+	 */
+	for (node = rcu_dereference(*root); node;) {
+		if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
+			next_node = node;
+		node = rcu_dereference(node->child[0]);
+	}
+do_copy:
+	next_key->prefixlen = next_node->prefixlen;
+	memcpy((void *)next_key + offsetof(struct bpf_lpm_trie_key, data),
+	       next_node->data, trie->data_size);
+free_stack:
+	kfree(node_stack);
+	return err;
 }

 const struct bpf_map_ops trie_map_ops = {
--- a/Show More
+++ b/Show More