Merge tag 'bpf-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf

Pull bpf fixes from Daniel Borkmann:

 - Fix BPF verifier to force a checkpoint when the program's jump
   history becomes too long (Eduard Zingerman)

 - Add several fixes to the BPF bits iterator addressing issues like
   memory leaks and overflow problems (Hou Tao)

 - Fix an out-of-bounds write in trie_get_next_key (Byeonguk Jeong)

 - Fix BPF test infra's LIVE_FRAME frame update after a page has been
   recycled (Toke Høiland-Jørgensen)

 - Fix BPF verifier and undo the 40 bytes of extra stack space for
   bpf_fastcall patterns due to various bugs (Eduard Zingerman)

 - Fix a BPF sockmap race condition which could trigger a NULL pointer
   dereference in sock_map_link_update_prog (Cong Wang)

 - Fix tcp_bpf_recvmsg_parser to retrieve seq_copied from tcp_sk under
   the socket lock (Jiayuan Chen)

* tag 'bpf-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf:
  bpf, test_run: Fix LIVE_FRAME frame update after a page has been recycled
  selftests/bpf: Add three test cases for bits_iter
  bpf: Use __u64 to save the bits in bits iterator
  bpf: Check the validity of nr_words in bpf_iter_bits_new()
  bpf: Add bpf_mem_alloc_check_size() helper
  bpf: Free dynamically allocated bits in bpf_iter_bits_destroy()
  bpf: disallow 40-bytes extra stack for bpf_fastcall patterns
  selftests/bpf: Add test for trie_get_next_key()
  bpf: Fix out-of-bounds write in trie_get_next_key()
  selftests/bpf: Test with a very short loop
  bpf: Force checkpoint when jmp history is too long
  bpf: fix filed access without lock
  sock_map: fix a NULL pointer dereference in sock_map_link_update_prog()
Merged by Linus Torvalds on 2024-10-31 14:56:19 -10:00 (commit 5635f18942); 13 changed files with 269 additions and 88 deletions.

include/linux/bpf_mem_alloc.h

@@ -33,6 +33,9 @@ int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg
 int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size);
 void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma);
 
+/* Check the allocation size for kmalloc equivalent allocator */
+int bpf_mem_alloc_check_size(bool percpu, size_t size);
+
 /* kmalloc/kfree equivalent: */
 void *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size);
 void bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr);
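The intended call pattern is validate-then-allocate, which is exactly what kernel/bpf/helpers.c does below. A minimal sketch of a caller (the wrapper name is illustrative, not kernel code):

static void *checked_bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size)
{
	/* reject sizes the kmalloc-equivalent allocator cannot serve */
	if (bpf_mem_alloc_check_size(false, size))
		return NULL;
	return bpf_mem_alloc(ma, size);
}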

kernel/bpf/helpers.c

@@ -2851,21 +2851,47 @@ struct bpf_iter_bits {
 	__u64 __opaque[2];
 } __aligned(8);
 
+#define BITS_ITER_NR_WORDS_MAX 511
+
 struct bpf_iter_bits_kern {
 	union {
-		unsigned long *bits;
-		unsigned long bits_copy;
+		__u64 *bits;
+		__u64 bits_copy;
 	};
-	u32 nr_bits;
+	int nr_bits;
 	int bit;
 } __aligned(8);
 
+/* On 64-bit hosts, unsigned long and u64 have the same size, so passing
+ * a u64 pointer and an unsigned long pointer to find_next_bit() will
+ * return the same result, as both point to the same 8-byte area.
+ *
+ * For 32-bit little-endian hosts, using a u64 pointer or unsigned long
+ * pointer also makes no difference. This is because the first iterated
+ * unsigned long is composed of bits 0-31 of the u64 and the second unsigned
+ * long is composed of bits 32-63 of the u64.
+ *
+ * However, for 32-bit big-endian hosts, this is not the case. The first
+ * iterated unsigned long will be bits 32-63 of the u64, so swap these two
+ * ulong values within the u64.
+ */
+static void swap_ulong_in_u64(u64 *bits, unsigned int nr)
+{
+#if (BITS_PER_LONG == 32) && defined(__BIG_ENDIAN)
+	unsigned int i;
+
+	for (i = 0; i < nr; i++)
+		bits[i] = (bits[i] >> 32) | ((u64)(u32)bits[i] << 32);
+#endif
+}
+
 /**
  * bpf_iter_bits_new() - Initialize a new bits iterator for a given memory area
  * @it: The new bpf_iter_bits to be created
  * @unsafe_ptr__ign: A pointer pointing to a memory area to be iterated over
  * @nr_words: The size of the specified memory area, measured in 8-byte units.
- *            Due to the limitation of memalloc, it can't be greater than 512.
+ *            The maximum value of @nr_words is @BITS_ITER_NR_WORDS_MAX. This limit may be
+ *            further reduced by the BPF memory allocator implementation.
  *
  * This function initializes a new bpf_iter_bits structure for iterating over
  * a memory area which is specified by the @unsafe_ptr__ign and @nr_words. It
@@ -2892,6 +2918,8 @@ bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_words)
 	if (!unsafe_ptr__ign || !nr_words)
 		return -EINVAL;
+	if (nr_words > BITS_ITER_NR_WORDS_MAX)
+		return -E2BIG;
 
 	/* Optimization for u64 mask */
 	if (nr_bits == 64) {
@@ -2899,10 +2927,15 @@ bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_words)
 		if (err)
 			return -EFAULT;
 
+		swap_ulong_in_u64(&kit->bits_copy, nr_words);
+
 		kit->nr_bits = nr_bits;
 		return 0;
 	}
 
+	if (bpf_mem_alloc_check_size(false, nr_bytes))
+		return -E2BIG;
+
 	/* Fallback to memalloc */
 	kit->bits = bpf_mem_alloc(&bpf_global_ma, nr_bytes);
 	if (!kit->bits)
@@ -2914,6 +2947,8 @@ bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_words)
 		return err;
 	}
 
+	swap_ulong_in_u64(kit->bits, nr_words);
+
 	kit->nr_bits = nr_bits;
 	return 0;
 }
@@ -2930,17 +2965,16 @@ bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_words)
 __bpf_kfunc int *bpf_iter_bits_next(struct bpf_iter_bits *it)
 {
 	struct bpf_iter_bits_kern *kit = (void *)it;
-	u32 nr_bits = kit->nr_bits;
-	const unsigned long *bits;
-	int bit;
+	int bit = kit->bit, nr_bits = kit->nr_bits;
+	const void *bits;
 
-	if (nr_bits == 0)
+	if (!nr_bits || bit >= nr_bits)
 		return NULL;
 
 	bits = nr_bits == 64 ? &kit->bits_copy : kit->bits;
-	bit = find_next_bit(bits, nr_bits, kit->bit + 1);
+	bit = find_next_bit(bits, nr_bits, bit + 1);
 	if (bit >= nr_bits) {
-		kit->nr_bits = 0;
+		kit->bit = bit;
 		return NULL;
 	}
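For orientation, a minimal sketch of how a BPF program drives this iterator through the bpf_for_each() macro, as the selftests near the end of this diff do (assumes the usual vmlinux.h/bpf_helpers.h includes; the function name is made up):

SEC("syscall")
int count_low_bits(void *ctx)
{
	u64 mask = 0x00000000ffffffffULL;
	int nr = 0;
	int *bit;

	/* nr_words == 1 takes the on-stack bits_copy fast path */
	bpf_for_each(bits, bit, &mask, 1)
		nr++;

	return nr; /* 32 */
}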

kernel/bpf/lpm_trie.c

@@ -655,7 +655,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
 	if (!key || key->prefixlen > trie->max_prefixlen)
 		goto find_leftmost;
 
-	node_stack = kmalloc_array(trie->max_prefixlen,
+	node_stack = kmalloc_array(trie->max_prefixlen + 1,
 				   sizeof(struct lpm_trie_node *),
 				   GFP_ATOMIC | __GFP_NOWARN);
 	if (!node_stack)
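The off-by-one being fixed: prefix lengths 0 through max_prefixlen can all sit on a single root-to-leaf chain, i.e. max_prefixlen + 1 nodes, one more than the old stack sizing reserved. A sketch of the worst-case depth (illustrative helper, not kernel code; the selftest added below builds exactly this chain):

static unsigned int lpm_worst_case_depth(unsigned int max_prefixlen)
{
	unsigned int depth = 0, plen;

	/* one node per prefix length, e.g. /0 .. /32 for 32-bit data */
	for (plen = 0; plen <= max_prefixlen; plen++)
		depth++;

	return depth; /* max_prefixlen + 1 */
}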

kernel/bpf/memalloc.c

@@ -35,6 +35,8 @@
  */
 #define LLIST_NODE_SZ sizeof(struct llist_node)
 
+#define BPF_MEM_ALLOC_SIZE_MAX 4096
+
 /* similar to kmalloc, but sizeof == 8 bucket is gone */
 static u8 size_index[24] __ro_after_init = {
 	3,	/* 8 */
@@ -65,7 +67,7 @@ static u8 size_index[24] __ro_after_init = {
 
 static int bpf_mem_cache_idx(size_t size)
 {
-	if (!size || size > 4096)
+	if (!size || size > BPF_MEM_ALLOC_SIZE_MAX)
 		return -1;
 
 	if (size <= 192)
@@ -1005,3 +1007,13 @@ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
 
 	return !ret ? NULL : ret + LLIST_NODE_SZ;
 }
+
+int bpf_mem_alloc_check_size(bool percpu, size_t size)
+{
+	/* The size of percpu allocation doesn't have LLIST_NODE_SZ overhead */
+	if ((percpu && size > BPF_MEM_ALLOC_SIZE_MAX) ||
+	    (!percpu && size > BPF_MEM_ALLOC_SIZE_MAX - LLIST_NODE_SZ))
+		return -E2BIG;
+
+	return 0;
+}
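These two constants are what pin BITS_ITER_NR_WORDS_MAX at 511 in kernel/bpf/helpers.c: a non-percpu unit loses LLIST_NODE_SZ bytes of the 4096-byte cap. A standalone sanity check, assuming a 64-bit host where sizeof(struct llist_node) == 8:

#include <stdio.h>

int main(void)
{
	const size_t size_max = 4096;	/* BPF_MEM_ALLOC_SIZE_MAX */
	const size_t llist_node_sz = 8;	/* sizeof(struct llist_node), 64-bit */

	/* largest kmalloc-equivalent unit, in 8-byte words */
	printf("%zu\n", (size_max - llist_node_sz) / 8);	/* prints 511 */
	return 0;
}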

kernel/bpf/verifier.c

@@ -6804,20 +6804,10 @@ static int check_stack_slot_within_bounds(struct bpf_verifier_env *env,
 					   struct bpf_func_state *state,
 					   enum bpf_access_type t)
 {
-	struct bpf_insn_aux_data *aux = &env->insn_aux_data[env->insn_idx];
-	int min_valid_off, max_bpf_stack;
-
-	/* If accessing instruction is a spill/fill from bpf_fastcall pattern,
-	 * add room for all caller saved registers below MAX_BPF_STACK.
-	 * In case if bpf_fastcall rewrite won't happen maximal stack depth
-	 * would be checked by check_max_stack_depth_subprog().
-	 */
-	max_bpf_stack = MAX_BPF_STACK;
-	if (aux->fastcall_pattern)
-		max_bpf_stack += CALLER_SAVED_REGS * BPF_REG_SIZE;
+	int min_valid_off;
 
 	if (t == BPF_WRITE || env->allow_uninit_stack)
-		min_valid_off = -max_bpf_stack;
+		min_valid_off = -MAX_BPF_STACK;
 	else
 		min_valid_off = -state->allocated_stack;
@@ -17886,9 +17876,11 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 	struct bpf_verifier_state_list *sl, **pprev;
 	struct bpf_verifier_state *cur = env->cur_state, *new, *loop_entry;
 	int i, j, n, err, states_cnt = 0;
-	bool force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx);
-	bool add_new_state = force_new_state;
-	bool force_exact;
+	bool force_new_state, add_new_state, force_exact;
+
+	force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx) ||
+			  /* Avoid accumulating infinitely long jmp history */
+			  cur->jmp_history_cnt > 40;
 
 	/* bpf progs typically have pruning point every 4 instructions
 	 * http://vger.kernel.org/bpfconf2019.html#session-1
@@ -17898,6 +17890,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 	 * In tests that amounts to up to 50% reduction into total verifier
 	 * memory consumption and 20% verifier time speedup.
 	 */
+	add_new_state = force_new_state;
 	if (env->jmps_processed - env->prev_jmps_processed >= 2 &&
 	    env->insn_processed - env->prev_insn_processed >= 8)
 		add_new_state = true;
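The new cur->jmp_history_cnt > 40 condition is what the short_loop1 selftest at the bottom of this diff exercises. The env->test_state_freq flag next to it corresponds to loading a program with BPF_F_TEST_STATE_FREQ; a hedged libbpf sketch (the function name is illustrative):

#include <linux/bpf.h>
#include <bpf/libbpf.h>

static int load_with_forced_checkpoints(struct bpf_object *obj)
{
	struct bpf_program *prog = bpf_object__next_program(obj, NULL);

	/* testing-only flag: checkpoint at every instruction */
	bpf_program__set_flags(prog, bpf_program__flags(prog) |
				     BPF_F_TEST_STATE_FREQ);
	return bpf_object__load(obj);
}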

net/bpf/test_run.c

@@ -246,6 +246,7 @@ static void reset_ctx(struct xdp_page_head *head)
 	head->ctx.data_meta = head->orig_ctx.data_meta;
 	head->ctx.data_end = head->orig_ctx.data_end;
 	xdp_update_frame_from_buff(&head->ctx, head->frame);
+	head->frame->mem = head->orig_ctx.rxq->mem;
 }
 
 static int xdp_recv_frames(struct xdp_frame **frames, int nframes,
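The LIVE_FRAME path only runs when an XDP test run is invoked with BPF_F_TEST_XDP_LIVE_FRAMES, where repeated runs recycle pages through the page pool. A hedged userspace sketch of triggering it (packet setup elided):

#include <linux/bpf.h>
#include <bpf/bpf.h>

static int run_live_frames(int prog_fd, void *pkt, size_t pkt_len)
{
	LIBBPF_OPTS(bpf_test_run_opts, opts,
		.data_in = pkt,
		.data_size_in = pkt_len,
		.flags = BPF_F_TEST_XDP_LIVE_FRAMES,
		.repeat = 64,	/* repeated runs recycle pages */
	);

	return bpf_prog_test_run_opts(prog_fd, &opts);
}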

net/core/sock_map.c

@@ -1760,6 +1760,10 @@ static int sock_map_link_update_prog(struct bpf_link *link,
 		ret = -EINVAL;
 		goto out;
 	}
+	if (!sockmap_link->map) {
+		ret = -ENOLINK;
+		goto out;
+	}
 
 	ret = sock_map_prog_link_lookup(sockmap_link->map, &pprog, &plink,
 					sockmap_link->attach_type);
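The window being closed is a link update racing with the release of the map the link was attached to. A hedged userspace reproduction sketch (setup and error handling omitted; the three fds are assumed to exist):

#include <bpf/bpf.h>
#include <pthread.h>
#include <unistd.h>

static int link_fd, new_prog_fd, map_fd;	/* assumed set up elsewhere */

static void *updater(void *arg)
{
	(void)arg;
	/* before the fix this could dereference a NULL sockmap_link->map */
	bpf_link_update(link_fd, new_prog_fd, NULL);
	return NULL;
}

static void race_once(void)
{
	pthread_t t;

	pthread_create(&t, NULL, updater, NULL);
	close(map_fd);	/* tear the map down while the update is in flight */
	pthread_join(t, NULL);
}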

net/ipv4/tcp_bpf.c

@@ -221,11 +221,11 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk,
 				  int flags,
 				  int *addr_len)
 {
-	struct tcp_sock *tcp = tcp_sk(sk);
 	int peek = flags & MSG_PEEK;
-	u32 seq = tcp->copied_seq;
 	struct sk_psock *psock;
+	struct tcp_sock *tcp;
 	int copied = 0;
+	u32 seq;
 
 	if (unlikely(flags & MSG_ERRQUEUE))
 		return inet_recv_error(sk, msg, len, addr_len);
@@ -238,7 +238,8 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk,
 		return tcp_recvmsg(sk, msg, len, flags, addr_len);
 
 	lock_sock(sk);
-
+	tcp = tcp_sk(sk);
+	seq = tcp->copied_seq;
 	/* We may have received data on the sk_receive_queue pre-accept and
 	 * then we can not use read_skb in this context because we haven't
 	 * assigned a sk_socket yet so have no link to the ops. The work-around
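The shape of the fix in isolation: copied_seq is only stable while the socket lock is held, so the snapshot moves inside lock_sock(). A minimal sketch of the pattern (not the exact kernel code):

static u32 snapshot_copied_seq(struct sock *sk)
{
	u32 seq;

	lock_sock(sk);
	seq = tcp_sk(sk)->copied_seq;
	release_sock(sk);

	return seq;
}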

tools/testing/selftests/bpf/map_tests/lpm_trie_map_get_next_key.c (new file)

@@ -0,0 +1,109 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <linux/bpf.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <unistd.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include <pthread.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include <test_maps.h>
+
+struct test_lpm_key {
+	__u32 prefix;
+	__u32 data;
+};
+
+struct get_next_key_ctx {
+	struct test_lpm_key key;
+	bool start;
+	bool stop;
+	int map_fd;
+	int loop;
+};
+
+static void *get_next_key_fn(void *arg)
+{
+	struct get_next_key_ctx *ctx = arg;
+	struct test_lpm_key next_key;
+	int i = 0;
+
+	while (!ctx->start)
+		usleep(1);
+
+	while (!ctx->stop && i++ < ctx->loop)
+		bpf_map_get_next_key(ctx->map_fd, &ctx->key, &next_key);
+
+	return NULL;
+}
+
+static void abort_get_next_key(struct get_next_key_ctx *ctx, pthread_t *tids,
+			       unsigned int nr)
+{
+	unsigned int i;
+
+	ctx->stop = true;
+	ctx->start = true;
+	for (i = 0; i < nr; i++)
+		pthread_join(tids[i], NULL);
+}
+
+/* This test aims to prevent regression of future. As long as the kernel does
+ * not panic, it is considered as success.
+ */
+void test_lpm_trie_map_get_next_key(void)
+{
+#define MAX_NR_THREADS 8
+	LIBBPF_OPTS(bpf_map_create_opts, create_opts,
+		    .map_flags = BPF_F_NO_PREALLOC);
+	struct test_lpm_key key = {};
+	__u32 val = 0;
+	int map_fd;
+	const __u32 max_prefixlen = 8 * (sizeof(key) - sizeof(key.prefix));
+	const __u32 max_entries = max_prefixlen + 1;
+	unsigned int i, nr = MAX_NR_THREADS, loop = 65536;
+	pthread_t tids[MAX_NR_THREADS];
+	struct get_next_key_ctx ctx;
+	int err;
+
+	map_fd = bpf_map_create(BPF_MAP_TYPE_LPM_TRIE, "lpm_trie_map",
+				sizeof(struct test_lpm_key), sizeof(__u32),
+				max_entries, &create_opts);
+	CHECK(map_fd == -1, "bpf_map_create()", "error:%s\n",
+	      strerror(errno));
+
+	for (i = 0; i <= max_prefixlen; i++) {
+		key.prefix = i;
+		err = bpf_map_update_elem(map_fd, &key, &val, BPF_ANY);
+		CHECK(err, "bpf_map_update_elem()", "error:%s\n",
+		      strerror(errno));
+	}
+
+	ctx.start = false;
+	ctx.stop = false;
+	ctx.map_fd = map_fd;
+	ctx.loop = loop;
+	memcpy(&ctx.key, &key, sizeof(key));
+
+	for (i = 0; i < nr; i++) {
+		err = pthread_create(&tids[i], NULL, get_next_key_fn, &ctx);
+		if (err) {
+			abort_get_next_key(&ctx, tids, i);
+			CHECK(err, "pthread_create", "error %d\n", err);
+		}
+	}
+
+	ctx.start = true;
+	for (i = 0; i < nr; i++)
+		pthread_join(tids[i], NULL);
+
+	printf("%s:PASS\n", __func__);
+
+	close(map_fd);
+}

tools/testing/selftests/bpf/progs/verifier_bits_iter.c

@@ -15,6 +15,8 @@ int bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign,
 int *bpf_iter_bits_next(struct bpf_iter_bits *it) __ksym __weak;
 void bpf_iter_bits_destroy(struct bpf_iter_bits *it) __ksym __weak;
 
+u64 bits_array[511] = {};
+
 SEC("iter.s/cgroup")
 __description("bits iter without destroy")
 __failure __msg("Unreleased reference")
@@ -110,16 +112,16 @@ int bit_index(void)
 }
 
 SEC("syscall")
-__description("bits nomem")
+__description("bits too big")
 __success __retval(0)
-int bits_nomem(void)
+int bits_too_big(void)
 {
 	u64 data[4];
 	int nr = 0;
 	int *bit;
 
 	__builtin_memset(&data, 0xff, sizeof(data));
-	bpf_for_each(bits, bit, &data[0], 513) /* Be greater than 512 */
+	bpf_for_each(bits, bit, &data[0], 512) /* Be greater than 511 */
 		nr++;
 	return nr;
 }
@@ -151,3 +153,56 @@ int zero_words(void)
 		nr++;
 	return nr;
 }
+
+SEC("syscall")
+__description("huge words")
+__success __retval(0)
+int huge_words(void)
+{
+	u64 data[8] = {0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1};
+	int nr = 0;
+	int *bit;
+
+	bpf_for_each(bits, bit, &data[0], 67108865)
+		nr++;
+	return nr;
+}
+
+SEC("syscall")
+__description("max words")
+__success __retval(4)
+int max_words(void)
+{
+	volatile int nr = 0;
+	int *bit;
+
+	bits_array[0] = (1ULL << 63) | 1U;
+	bits_array[510] = (1ULL << 33) | (1ULL << 32);
+
+	bpf_for_each(bits, bit, bits_array, 511) {
+		if (nr == 0 && *bit != 0)
+			break;
+		if (nr == 2 && *bit != 32672)
+			break;
+		nr++;
+	}
+	return nr;
+}
+
+SEC("syscall")
+__description("bad words")
+__success __retval(0)
+int bad_words(void)
+{
+	void *bad_addr = (void *)(3UL << 30);
+	int nr = 0;
+	int *bit;
+
+	bpf_for_each(bits, bit, bad_addr, 1)
+		nr++;
+
+	bpf_for_each(bits, bit, bad_addr, 4)
+		nr++;
+
+	return nr;
+}
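The constants in these tests are not arbitrary: 511 is BITS_ITER_NR_WORDS_MAX, 3UL << 30 is an address that is very unlikely to be mapped, and 67108865 (2^26 + 1) is chosen so that the old u32 bit count wrapped to exactly 64 and silently took the single-u64 fast path. A standalone check of that wrap, assuming the old u32 arithmetic:

#include <stdio.h>

int main(void)
{
	unsigned int nr_words = 67108865;	/* 2^26 + 1 */
	unsigned int nr_bytes = nr_words * 8;	/* 0x20000008, still fits u32 */
	unsigned int nr_bits = nr_bytes * 8;	/* 0x100000040 truncates */

	printf("%u\n", nr_bits);	/* prints 64 */
	return 0;
}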

tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c

@@ -790,61 +790,6 @@ __naked static void cumulative_stack_depth_subprog(void)
 	:: __imm(bpf_get_smp_processor_id) : __clobber_all);
 }
 
-SEC("raw_tp")
-__arch_x86_64
-__log_level(4)
-__msg("stack depth 512")
-__xlated("0: r1 = 42")
-__xlated("1: *(u64 *)(r10 -512) = r1")
-__xlated("2: w0 = ")
-__xlated("3: r0 = &(void __percpu *)(r0)")
-__xlated("4: r0 = *(u32 *)(r0 +0)")
-__xlated("5: exit")
-__success
-__naked int bpf_fastcall_max_stack_ok(void)
-{
-	asm volatile(
-	"r1 = 42;"
-	"*(u64 *)(r10 - %[max_bpf_stack]) = r1;"
-	"*(u64 *)(r10 - %[max_bpf_stack_8]) = r1;"
-	"call %[bpf_get_smp_processor_id];"
-	"r1 = *(u64 *)(r10 - %[max_bpf_stack_8]);"
-	"exit;"
-	:
-	: __imm_const(max_bpf_stack, MAX_BPF_STACK),
-	  __imm_const(max_bpf_stack_8, MAX_BPF_STACK + 8),
-	  __imm(bpf_get_smp_processor_id)
-	: __clobber_all
-	);
-}
-
-SEC("raw_tp")
-__arch_x86_64
-__log_level(4)
-__msg("stack depth 520")
-__failure
-__naked int bpf_fastcall_max_stack_fail(void)
-{
-	asm volatile(
-	"r1 = 42;"
-	"*(u64 *)(r10 - %[max_bpf_stack]) = r1;"
-	"*(u64 *)(r10 - %[max_bpf_stack_8]) = r1;"
-	"call %[bpf_get_smp_processor_id];"
-	"r1 = *(u64 *)(r10 - %[max_bpf_stack_8]);"
-	/* call to prandom blocks bpf_fastcall rewrite */
-	"*(u64 *)(r10 - %[max_bpf_stack_8]) = r1;"
-	"call %[bpf_get_prandom_u32];"
-	"r1 = *(u64 *)(r10 - %[max_bpf_stack_8]);"
-	"exit;"
-	:
-	: __imm_const(max_bpf_stack, MAX_BPF_STACK),
-	  __imm_const(max_bpf_stack_8, MAX_BPF_STACK + 8),
-	  __imm(bpf_get_smp_processor_id),
-	  __imm(bpf_get_prandom_u32)
-	: __clobber_all
-	);
-}
-
 SEC("cgroup/getsockname_unix")
 __xlated("0: r2 = 1")
 /* bpf_cast_to_kern_ctx is replaced by a single assignment */

tools/testing/selftests/bpf/progs/verifier_search_pruning.c

@@ -2,6 +2,7 @@
 /* Converted from tools/testing/selftests/bpf/verifier/search_pruning.c */
 
 #include <linux/bpf.h>
+#include <../../../include/linux/filter.h>
 #include <bpf/bpf_helpers.h>
 #include "bpf_misc.h"
 
@@ -336,4 +337,26 @@ l0_%=: r1 = 42;					\
 	: __clobber_all);
 }
 
+/* Without a checkpoint forcibly inserted at the back-edge of a loop, this
+ * test would take a very long time to verify.
+ */
+SEC("kprobe")
+__failure __log_level(4)
+__msg("BPF program is too large.")
+__naked void short_loop1(void)
+{
+	asm volatile (
+	"   r7 = *(u16 *)(r1 +0);"
+	"1: r7 += 0x1ab064b9;"
+	"   .8byte %[jset];" /* same as 'if r7 & 0x702000 goto 1b;' */
+	"   r7 &= 0x1ee60e;"
+	"   r7 += r1;"
+	"   if r7 s> 0x37d2 goto +0;"
+	"   r0 = 0;"
+	"   exit;"
+	:
+	: __imm_insn(jset, BPF_JMP_IMM(BPF_JSET, BPF_REG_7, 0x702000, -2))
+	: __clobber_all);
+}
+
 char _license[] SEC("license") = "GPL";

tools/testing/selftests/bpf/.gitignore

@@ -15,3 +15,4 @@ test_usdt*
 test_verif_scale*
 test_xdp_noinline*
 xdp_synproxy*
+verifier_search_pruning*