Compare commits

...

60 commits

Author SHA1 Message Date
Tyler
85494ae7a9
Merge 504eaa2570 into 0fc810ae3a 2024-10-30 18:52:55 -05:00
Linus Torvalds
0fc810ae3a x86/uaccess: Avoid barrier_nospec() in 64-bit copy_from_user()
The barrier_nospec() in 64-bit copy_from_user() is slow. Instead use
pointer masking to force the user pointer to all 1's for an invalid
address.
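
As a rough, self-contained C sketch of the masking idea (the actual
kernel implementation is branch-free code in the uaccess path;
USER_PTR_LIMIT and mask_user_ptr() are illustrative names only):

  /* If the pointer is above the user-space limit, force it to all 1's so
   * the access faults deterministically, leaving no speculation window
   * that barrier_nospec() would otherwise have to close. */
  #define USER_PTR_LIMIT 0x00007fffffffffffUL   /* illustrative value */

  static inline unsigned long mask_user_ptr(unsigned long addr)
  {
          return addr > USER_PTR_LIMIT ? ~0UL : addr;
  }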

The kernel test robot reports a 2.6% improvement in the per_thread_ops
benchmark [1].

This is a variation on a patch originally by Josh Poimboeuf [2].

Link: https://lore.kernel.org/202410281344.d02c72a2-oliver.sang@intel.com [1]
Link: https://lore.kernel.org/5b887fe4c580214900e21f6c61095adf9a142735.1730166635.git.jpoimboe@kernel.org [2]
Tested-and-reviewed-by: Josh Poimboeuf <jpoimboe@kernel.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2024-10-30 11:38:10 -10:00
Linus Torvalds
14b7d43c5c perf tools fixes for v6.12: 2nd batch
- Update more header copies with the kernel sources, including const.h,
   msr-index.h, arm64's cputype.h, kvm's, bits.h and unaligned.h
 
 - The return from 'write' isn't a pid, fix cut'n'paste error in 'perf
   trace'.
 
 - Fix up the python binding build on architectures without
   HAVE_KVM_STAT_SUPPORT.
 
 - Add some more bounds checks to augmented_raw_syscalls.bpf.c (used to
   collect syscall pointer arguments in 'perf trace') to make the
   resulting bytecode pass the kernel BPF verifier, allowing us to go
   back to accepting clang 12.0.1 as the minimum version required for
   compiling BPF sources.
 
 - Add __NR_capget for x86 to fix a regression on running perf + intel PT
   (hw tracing) as non-root setting up the capabilities as described in
   https://www.kernel.org/doc/html/latest/admin-guide/perf-security.html.
 
 - Fix missing syscalltbl in architectures not explicitly listed, noticed
   on ARM 32-bit, which still needs a .tbl generator for the syscall
   id<->name tables; that should be added for v6.13.
 
 - Handle 'perf test' failure when handling broken DWARF for ASM files.
 
 Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

Merge tag 'perf-tools-fixes-for-v6.12-2-2024-10-30' of git://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools

Pull perf tools fixes from Arnaldo Carvalho de Melo:

 - Update more header copies with the kernel sources, including const.h,
   msr-index.h, arm64's cputype.h, kvm's, bits.h and unaligned.h

 - The return from 'write' isn't a pid, fix cut'n'paste error in 'perf
   trace'

 - Fix up the python binding build on architectures without
   HAVE_KVM_STAT_SUPPORT

 - Add some more bounds checks to augmented_raw_syscalls.bpf.c (used to
   collect syscall pointer arguments in 'perf trace') to make the
   resulting bytecode pass the kernel BPF verifier, allowing us to go
   back to accepting clang 12.0.1 as the minimum version required for
   compiling BPF sources

 - Add __NR_capget for x86 to fix a regression on running perf + intel
   PT (hw tracing) as non-root setting up the capabilities as described
   in https://www.kernel.org/doc/html/latest/admin-guide/perf-security.html

 - Fix missing syscalltbl in architectures not explicitly listed,
   noticed on ARM 32-bit, which still needs a .tbl generator for the
   syscall id<->name tables; that should be added for v6.13

 - Handle 'perf test' failure when handling broken DWARF for ASM files

* tag 'perf-tools-fixes-for-v6.12-2-2024-10-30' of git://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools:
  perf cap: Add __NR_capget to arch/x86 unistd
  tools headers: Update the linux/unaligned.h copy with the kernel sources
  tools headers arm64: Sync arm64's cputype.h with the kernel sources
  tools headers: Synchronize {uapi/}linux/bits.h with the kernel sources
  tools arch x86: Sync the msr-index.h copy with the kernel sources
  perf python: Fix up the build on architectures without HAVE_KVM_STAT_SUPPORT
  perf test: Handle perftool-testsuite_probe failure due to broken DWARF
  tools headers UAPI: Sync kvm headers with the kernel sources
  perf trace: Fix non-listed archs in the syscalltbl routines
  perf build: Change the clang check back to 12.0.1
  perf trace augmented_raw_syscalls: Add more checks to pass the verifier
  perf trace augmented_raw_syscalls: Add extra array index bounds checking to satisfy some BPF verifiers
  perf trace: The return from 'write' isn't a pid
  tools headers UAPI: Sync linux/const.h with the kernel headers
2024-10-30 11:17:47 -10:00
Linus Torvalds
4236f91380 SCSI fixes on 20241030
Two small fixes, both in drivers (ufs and scsi_debug).
 
 Signed-off-by: James E.J. Bottomley <James.Bottomley@HansenPartnership.com>

Merge tag 'scsi-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/jejb/scsi

Pull SCSI fixes from James Bottomley:
 "Two small fixes, both in drivers (ufs and scsi_debug)"

* tag 'scsi-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/jejb/scsi:
  scsi: ufs: core: Fix another deadlock during RTC update
  scsi: scsi_debug: Fix do_device_access() handling of unexpected SG copy length
2024-10-30 08:16:23 -10:00
Linus Torvalds
c1e939a21e cgroup: Fixes for v6.12-rc5
- cgroup_bpf_release_fn() could saturate system_wq with
   cgrp->bpf.release_work which can then form a circular dependency leading
   to deadlocks. Fix by using a dedicated workqueue. The system_wq's max
   concurrency limit is being increased separately.
 
 - Fix theoretical off-by-one bug when enforcing max cgroup hierarchy depth.

Merge tag 'cgroup-for-6.12-rc5-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup fixes from Tejun Heo:

 - cgroup_bpf_release_fn() could saturate system_wq with
   cgrp->bpf.release_work which can then form a circular dependency
   leading to deadlocks. Fix by using a dedicated workqueue. The
   system_wq's max concurrency limit is being increased separately.

 - Fix theoretical off-by-one bug when enforcing max cgroup hierarchy
   depth

* tag 'cgroup-for-6.12-rc5-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  cgroup: Fix potential overflow issue when checking max_depth
  cgroup/bpf: use a dedicated workqueue for cgroup bpf destruction
2024-10-29 16:41:30 -10:00
Linus Torvalds
daa9f66fe1 sched_ext: Fixes for v6.12-rc5
- Instances of scx_ops_bypass() could race each other leading to
   misbehavior. Fix by protecting the operation with a spinlock.
 
 - selftest and userspace header fixes.

Merge tag 'sched_ext-for-6.12-rc5-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext

Pull sched_ext fixes from Tejun Heo:

 - Instances of scx_ops_bypass() could race each other leading to
   misbehavior. Fix by protecting the operation with a spinlock.

 - selftest and userspace header fixes

* tag 'sched_ext-for-6.12-rc5-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext:
  sched_ext: Fix enq_last_no_enq_fails selftest
  sched_ext: Make cast_mask() inline
  scx: Fix raciness in scx_ops_bypass()
  scx: Fix exit selftest to use custom DSQ
  sched_ext: Fix function pointer type mismatches in BPF selftests
  selftests/sched_ext: add order-only dependency of runner.o on BPFOBJ
2024-10-29 16:35:40 -10:00
Linus Torvalds
7fbaacafbc slab fixes for 6.12-rc6

Merge tag 'slab-for-6.12-rc6' of git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab

Pull slab fixes from Vlastimil Babka:

 - Fix for a slub_kunit test warning with MEM_ALLOC_PROFILING_DEBUG (Pei
   Xiao)

 - Fix for a MTE-based KASAN BUG in krealloc() (Qun-Wei Lin)

* tag 'slab-for-6.12-rc6' of git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab:
  mm: krealloc: Fix MTE false alarm in __do_krealloc
  slub/kunit: fix a WARNING due to unwrapped __kmalloc_cache_noprof
2024-10-29 16:24:02 -10:00
Linus Torvalds
9251e3e93c 21 hotfixes. 13 are cc:stable. 13 are MM and 8 are non-MM.
No particular theme here - mainly singletons, a couple of doubletons.
 Please see the changelogs.

Merge tag 'mm-hotfixes-stable-2024-10-28-21-50' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Pull misc fixes from Andrew Morton:
 "21 hotfixes. 13 are cc:stable. 13 are MM and 8 are non-MM.

  No particular theme here - mainly singletons, a couple of doubletons.
  Please see the changelogs"

* tag 'mm-hotfixes-stable-2024-10-28-21-50' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (21 commits)
  mm: avoid unconditional one-tick sleep when swapcache_prepare fails
  mseal: update mseal.rst
  mm: split critical region in remap_file_pages() and invoke LSMs in between
  selftests/mm: fix deadlock for fork after pthread_create with atomic_bool
  Revert "selftests/mm: replace atomic_bool with pthread_barrier_t"
  Revert "selftests/mm: fix deadlock for fork after pthread_create on ARM"
  tools: testing: add expand-only mode VMA test
  mm/vma: add expand-only VMA merge mode and optimise do_brk_flags()
  resource,kexec: walk_system_ram_res_rev must retain resource flags
  nilfs2: fix kernel bug due to missing clearing of checked flag
  mm: numa_clear_kernel_node_hotplug: Add NUMA_NO_NODE check for node id
  ocfs2: pass u64 to ocfs2_truncate_inline maybe overflow
  mm: shmem: fix data-race in shmem_getattr()
  mm: mark mas allocation in vms_abort_munmap_vmas as __GFP_NOFAIL
  x86/traps: move kmsan check after instrumentation_begin
  resource: remove dependency on SPARSEMEM from GET_FREE_REGION
  mm/mmap: fix race in mmap_region() with ftruncate()
  mm/page_alloc: let GFP_ATOMIC order-0 allocs access highatomic reserves
  fork: only invoke khugepaged, ksm hooks if no error
  fork: do not invoke uffd on fork if error occurs
  ...
2024-10-29 16:19:15 -10:00
Linus Torvalds
d5b2ee0fe8 Hi
Addresses a significant boot-time delay issue:
 
 https://bugzilla.kernel.org/show_bug.cgi?id=219229
 
 BR, Jarkko

Merge tag 'tpmdd-next-6.12-rc6' of git://git.kernel.org/pub/scm/linux/kernel/git/jarkko/linux-tpmdd

Pull tpm fix from Jarkko Sakkinen:
 "Address a significant boot-time delay issue"

Link: https://bugzilla.kernel.org/show_bug.cgi?id=219229

* tag 'tpmdd-next-6.12-rc6' of git://git.kernel.org/pub/scm/linux/kernel/git/jarkko/linux-tpmdd:
  tpm: Lazily flush the auth session
  tpm: Rollback tpm2_load_null()
  tpm: Return tpm2_sessions_init() when null key creation fails
2024-10-29 16:04:24 -10:00
Qun-Wei Lin
704573851b mm: krealloc: Fix MTE false alarm in __do_krealloc
This patch addresses an issue introduced by commit 1a83a716ec ("mm:
krealloc: consider spare memory for __GFP_ZERO") which causes MTE
(Memory Tagging Extension) to falsely report a slab-out-of-bounds error.

The problem occurs when zeroing out spare memory in __do_krealloc. The
original code only considered software-based KASAN and did not account
for MTE. It does not reset the KASAN tag before calling memset, leading
to a mismatch between the pointer tag and the memory tag, resulting
in a false positive.
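
A hedged sketch of the resulting fix in __do_krealloc() (simplified; p is
the reallocated object, ks its allocated size, and the surrounding
conditions are approximate):

  /* Reset the KASAN/MTE tag on the pointer before zeroing the spare
   * bytes, so the pointer tag matches the memory granules that memset()
   * touches. */
  if ((flags & __GFP_ZERO) && new_size < ks)
          memset(kasan_reset_tag(p) + new_size, 0, ks - new_size);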

Example of the error:
==================================================================
swapper/0: BUG: KASAN: slab-out-of-bounds in __memset+0x84/0x188
swapper/0: Write at addr f4ffff8005f0fdf0 by task swapper/0/1
swapper/0: Pointer tag: [f4], memory tag: [fe]
swapper/0:
swapper/0: CPU: 4 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.12.
swapper/0: Hardware name: MT6991(ENG) (DT)
swapper/0: Call trace:
swapper/0:  dump_backtrace+0xfc/0x17c
swapper/0:  show_stack+0x18/0x28
swapper/0:  dump_stack_lvl+0x40/0xa0
swapper/0:  print_report+0x1b8/0x71c
swapper/0:  kasan_report+0xec/0x14c
swapper/0:  __do_kernel_fault+0x60/0x29c
swapper/0:  do_bad_area+0x30/0xdc
swapper/0:  do_tag_check_fault+0x20/0x34
swapper/0:  do_mem_abort+0x58/0x104
swapper/0:  el1_abort+0x3c/0x5c
swapper/0:  el1h_64_sync_handler+0x80/0xcc
swapper/0:  el1h_64_sync+0x68/0x6c
swapper/0:  __memset+0x84/0x188
swapper/0:  btf_populate_kfunc_set+0x280/0x3d8
swapper/0:  __register_btf_kfunc_id_set+0x43c/0x468
swapper/0:  register_btf_kfunc_id_set+0x48/0x60
swapper/0:  register_nf_nat_bpf+0x1c/0x40
swapper/0:  nf_nat_init+0xc0/0x128
swapper/0:  do_one_initcall+0x184/0x464
swapper/0:  do_initcall_level+0xdc/0x1b0
swapper/0:  do_initcalls+0x70/0xc0
swapper/0:  do_basic_setup+0x1c/0x28
swapper/0:  kernel_init_freeable+0x144/0x1b8
swapper/0:  kernel_init+0x20/0x1a8
swapper/0:  ret_from_fork+0x10/0x20
==================================================================

Fixes: 1a83a716ec ("mm: krealloc: consider spare memory for __GFP_ZERO")
Signed-off-by: Qun-Wei Lin <qun-wei.lin@mediatek.com>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
2024-10-29 10:40:53 +01:00
Barry Song
01626a1823 mm: avoid unconditional one-tick sleep when swapcache_prepare fails
Commit 13ddaf26be ("mm/swap: fix race when skipping swapcache")
introduced an unconditional one-tick sleep when `swapcache_prepare()`
fails, which has led to reports of UI stuttering on latency-sensitive
Android devices.  To address this, we can use a waitqueue to wake up tasks
that fail `swapcache_prepare()` sooner, instead of always sleeping for a
full tick.  While tasks may occasionally be woken by an unrelated
`do_swap_page()`, this method is preferable to two scenarios: rapid
re-entry into page faults, which can cause livelocks, and multiple
millisecond sleeps, which visibly degrade user experience.

Oven's testing shows that a single waitqueue resolves the UI stuttering
issue.  If a 'thundering herd' problem becomes apparent later, a waitqueue
hash similar to `folio_wait_table[PAGE_WAIT_TABLE_SIZE]` for page bit
locks can be introduced.
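
A hedged sketch of the waitqueue pattern described above (function name
and timeout value are illustrative; the waitqueue_active() guard matches
the note below):

  static DECLARE_WAIT_QUEUE_HEAD(swapcache_wq);

  /* Fault path, when swapcache_prepare() fails: */
  static void swapcache_wait_for_owner(void)
  {
          DEFINE_WAIT(wait);

          prepare_to_wait(&swapcache_wq, &wait, TASK_UNINTERRUPTIBLE);
          schedule_timeout(1);   /* still bounded, but usually woken sooner */
          finish_wait(&swapcache_wq, &wait);
  }

  /* Owner path, after completing the racing swap-in: */
  if (waitqueue_active(&swapcache_wq))
          wake_up(&swapcache_wq);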

[v-songbaohua@oppo.com: wake_up only when swapcache_wq waitqueue is active]
  Link: https://lkml.kernel.org/r/20241008130807.40833-1-21cnbao@gmail.com
Link: https://lkml.kernel.org/r/20240926211936.75373-1-21cnbao@gmail.com
Fixes: 13ddaf26be ("mm/swap: fix race when skipping swapcache")
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
Reported-by: Oven Liyang <liyangouwen1@oppo.com>
Tested-by: Oven Liyang <liyangouwen1@oppo.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:41 -07:00
Jeff Xu
1834300798 mseal: update mseal.rst
Pedro Falcato's optimization [1] for checking sealed VMAs, which replaces
the can_modify_mm() function with an in-loop check, necessitates an update
to the mseal.rst documentation to reflect this change.

Furthermore, the document has received offline comments regarding the code
sample and suggestions for sentence clarification to enhance reader
comprehension.

[1] https://lore.kernel.org/linux-mm/20240817-mseal-depessimize-v3-0-d8d2e037df30@gmail.com/

Update the doc after the in-loop change: mprotect/madvise can leave the
range partially updated, while munmap remains atomic.

Fix indentation and clarify some sections to improve readability.

Link: https://lkml.kernel.org/r/20241008040942.1478931-2-jeffxu@chromium.org
Fixes: df2a7df9a9 ("mm/munmap: replace can_modify_mm with can_modify_vma")
Fixes: 4a2dd02b09 ("mm/mprotect: replace can_modify_mm with can_modify_vma")
Fixes: 38075679b5 ("mm/mremap: replace can_modify_mm with can_modify_vma")
Fixes: 23c57d1fa2 ("mseal: replace can_modify_mm_madv with a vma variant")
Signed-off-by: Jeff Xu <jeffxu@chromium.org>
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Elliott Hughes <enh@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guenter Roeck <groeck@chromium.org>
Cc: Jann Horn <jannh@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Jorge Lucangeli Obes <jorgelo@chromium.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Muhammad Usama Anjum <usama.anjum@collabora.com>
Cc: Pedro Falcato <pedro.falcato@gmail.com>
Cc: Stephen Röttger <sroettger@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: "Theo de Raadt" <deraadt@openbsd.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:41 -07:00
Kirill A. Shutemov
58a039e679 mm: split critical region in remap_file_pages() and invoke LSMs in between
Commit ea7e2d5e49 ("mm: call the security_mmap_file() LSM hook in
remap_file_pages()") fixed a security issue: it added an LSM check when
trying to remap file pages, so that LSMs have the opportunity to evaluate
such an action, as for other memory operations such as mmap() and
mprotect().

However, that commit called security_mmap_file() inside the mmap_lock
lock, while the other calls do it before taking the lock, after commit
8b3ec6814c ("take security_mmap_file() outside of ->mmap_sem").

This caused a lock inversion issue with IMA, which was taking the
mmap_lock and i_mutex lock in the opposite order when the
remap_file_pages() system call was called.

Solve the issue by splitting the critical region in remap_file_pages() into
two regions: the first takes a read lock of mmap_lock, retrieves the VMA
and the file descriptor associated, and calculates the 'prot' and 'flags'
variables; the second takes a write lock on mmap_lock, checks that the VMA
flags and the VMA file descriptor are the same as the ones obtained in the
first critical region (otherwise the system call fails), and calls
do_mmap().

In between, after releasing the read lock and before taking the write
lock, call security_mmap_file(), solving the lock inversion issue.
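
A hedged outline of the resulting two-region structure (heavily
simplified; error paths and the exact do_mmap() arguments are omitted):

  mmap_read_lock(mm);
  vma = vma_lookup(mm, start);
  file = vma ? vma->vm_file : NULL;       /* remember the VMA's identity */
  vm_flags = vma ? vma->vm_flags : 0;
  /* ... derive 'prot' and 'flags' from the VMA ... */
  mmap_read_unlock(mm);

  ret = security_mmap_file(file, prot, flags);    /* no mmap_lock held */
  if (ret)
          return ret;

  if (mmap_write_lock_killable(mm))
          return -EINTR;
  vma = vma_lookup(mm, start);
  if (!vma || vma->vm_file != file || vma->vm_flags != vm_flags)
          ret = -EINVAL;          /* the VMA changed in between */
  else
          ret = do_mmap(file, start, size, prot, flags, /* ... */);
  mmap_write_unlock(mm);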

Link: https://lkml.kernel.org/r/20241018161415.3845146-1-roberto.sassu@huaweicloud.com
Fixes: ea7e2d5e49 ("mm: call the security_mmap_file() LSM hook in remap_file_pages()")
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Roberto Sassu <roberto.sassu@huawei.com>
Reported-by: syzbot+1cd571a672400ef3a930@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/linux-security-module/66f7b10e.050a0220.46d20.0036.GAE@google.com/
Tested-by: Roberto Sassu <roberto.sassu@huawei.com>
Reviewed-by: Roberto Sassu <roberto.sassu@huawei.com>
Reviewed-by: Jann Horn <jannh@google.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
Reviewed-by: Paul Moore <paul@paul-moore.com>
Tested-by: syzbot+1cd571a672400ef3a930@syzkaller.appspotmail.com
Cc: Jarkko Sakkinen <jarkko@kernel.org>
Cc: Dmitry Kasatkin <dmitry.kasatkin@gmail.com>
Cc: Eric Snowberg <eric.snowberg@oracle.com>
Cc: James Morris <jmorris@namei.org>
Cc: Mimi Zohar <zohar@linux.ibm.com>
Cc: "Serge E. Hallyn" <serge@hallyn.com>
Cc: Shu Han <ebpqwerty472123@gmail.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:41 -07:00
Edward Liaw
f2330b650e selftests/mm: fix deadlock for fork after pthread_create with atomic_bool
Some additional synchronization is needed on Android ARM64; we see a
deadlock with pthread_create when the parent thread races forward before
the child has a chance to start doing work.

Link: https://lkml.kernel.org/r/20241018171734.2315053-4-edliaw@google.com
Fixes: cff2945827 ("selftests/mm: extend and rename uffd pagemap test")
Signed-off-by: Edward Liaw <edliaw@google.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:41 -07:00
Edward Liaw
3673167a3a Revert "selftests/mm: replace atomic_bool with pthread_barrier_t"
This reverts commit e61ef21e27.

uffd_poll_thread may be called by other tests that do not initialize the
pthread_barrier, so this approach is not correct.  This will revert to
using atomic_bool instead.

Link: https://lkml.kernel.org/r/20241018171734.2315053-3-edliaw@google.com
Fixes: e61ef21e27 ("selftests/mm: replace atomic_bool with pthread_barrier_t")
Signed-off-by: Edward Liaw <edliaw@google.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:41 -07:00
Edward Liaw
5bb1f4c934 Revert "selftests/mm: fix deadlock for fork after pthread_create on ARM"
Patch series "selftests/mm: revert pthread_barrier change"

On Android arm, pthread_create followed by a fork caused a deadlock in
the case where the fork required work to be completed by the created
thread.

The previous patches incorrectly assumed that the parent would
always initialize the pthread_barrier for the child thread.  This
reverts the change and replaces the fix for wp-fork-with-event with the
original use of atomic_bool.


This patch (of 3):

This reverts commit e142cc87ac.

fork_event_consumer may be called by other tests that do not initialize
the pthread_barrier, so this approach is not correct.  The subsequent
patch will revert to using atomic_bool instead.

Link: https://lkml.kernel.org/r/20241018171734.2315053-1-edliaw@google.com
Link: https://lkml.kernel.org/r/20241018171734.2315053-2-edliaw@google.com
Fixes: e142cc87ac ("fix deadlock for fork after pthread_create on ARM")
Signed-off-by: Edward Liaw <edliaw@google.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:40 -07:00
Lorenzo Stoakes
e8133a7799 tools: testing: add expand-only mode VMA test
Add a test to assert that VMG_FLAG_JUST_EXPAND functions as expected - that
is, when the VMA iterator is positioned at the previous VMA and no VMAs
follow it, we observe an expansion with all state as expected.

Explicitly place a prior VMA that would otherwise fail this test if the
mode were not enabled (as it would traverse to the previous-previous VMA).

Link: https://lkml.kernel.org/r/d2f88330254a6448092412bf7dfe077a579ab0dc.1729174352.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Jann Horn <jannh@google.com>
Cc: kernel test robot <oliver.sang@intel.com>
Cc: Liam R. Howlett <Liam.Howlett@Oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:40 -07:00
Lorenzo Stoakes
c4d91e225f mm/vma: add expand-only VMA merge mode and optimise do_brk_flags()
Patch series "introduce VMA merge mode to improve brk() performance".

A ~5% performance regression was discovered on the
aim9.brk_test.ops_per_sec by the linux kernel test bot [0].

In the past to satisfy brk() performance we duplicated VMA expansion code
and special-cased do_brk_flags().  This is however horrid and undoes work
to abstract this logic, so in resolving the issue I have endeavoured to
avoid this.

Investigating further, I was able to observe that the use of a
vma_iter_next_range() and vma_prev() pair causes an unnecessary maple
tree walk.  In addition, there is work that we do that is simply
unnecessary for brk().

Therefore, add a special VMA merge mode VMG_FLAG_JUST_EXPAND to avoid
doing any of this - it assumes the VMA iterator is pointing at the
previous VMA and skips logic that brk() does not require.

This mostly eliminates the performance regression, reducing it to ~2%,
which is in the realm of noise.  In addition, the will-it-scale test brk2,
written to be more representative of real-world brk() usage, shows a
modest performance improvement - which gives me confidence that we are not
meaningfully regressing real workloads here.

This series includes a test asserting that the 'just expand' mode works as
expected.

With many thanks to Oliver Sang for helping with performance testing of
candidate patch sets!

[0]:https://lore.kernel.org/linux-mm/202409301043.629bea78-oliver.sang@intel.com


This patch (of 2):

We know in advance that do_brk_flags() wants only to perform a VMA
expansion (if the prior VMA is compatible), and that we assume no
mergeable VMA follows it.

These are the semantics of this function prior to the recent rewrite of
the VMA merging logic, however we are now doing more work than necessary -
positioning the VMA iterator at the prior VMA and performing tasks that
are not required.

Add a new field to the vmg struct to permit merge flags and add a new
merge flag VMG_FLAG_JUST_EXPAND which implies this behaviour, and have
do_brk_flags() use this.
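
A hedged sketch of how do_brk_flags() might opt in (the struct and field
names here follow the series description and could differ in detail from
the final code):

  /* The iterator already sits at the previous VMA, so ask the merge
   * machinery to attempt only an expansion of that VMA. */
  struct vma_merge_struct vmg = {
          .vmi         = vmi,
          .start       = addr,
          .end         = addr + len,
          .flags       = flags,
          .merge_flags = VMG_FLAG_JUST_EXPAND,
  };

  vma = vma_merge_new_range(&vmg);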

This fixes a reported performance regression in a brk() benchmarking suite.

Link: https://lkml.kernel.org/r/cover.1729174352.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/4e65d4395e5841c5acf8470dbcb714016364fd39.1729174352.git.lorenzo.stoakes@oracle.com
Fixes: cacded5e42 ("mm: avoid using vma_merge() for new VMAs")
Reported-by: kernel test robot <oliver.sang@intel.com>
Closes: https://lore.kernel.org/linux-mm/202409301043.629bea78-oliver.sang@intel.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
Cc: Jann Horn <jannh@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:40 -07:00
Gregory Price
b125a0def2 resource,kexec: walk_system_ram_res_rev must retain resource flags
walk_system_ram_res_rev() erroneously discards resource flags when passing
the information to the callback.

This causes systems with IORESOURCE_SYSRAM_DRIVER_MANAGED memory to have
these resources selected during kexec to store kexec buffers if that
memory happens to be placed above normal system RAM.

This leads to undefined behavior after reboot.  If the kexec buffer is
never touched, nothing happens.  If the kexec buffer is touched, it could
lead to a crash (like below) or undefined behavior.

Tested on a system with CXL memory expanders with driver managed memory,
TPM enabled, and CONFIG_IMA_KEXEC=y.  Adding printk's showed the flags
were being discarded and as a result the check for
IORESOURCE_SYSRAM_DRIVER_MANAGED passes.

find_next_iomem_res: name(System RAM (kmem))
		     start(10000000000)
		     end(1034fffffff)
		     flags(83000200)

locate_mem_hole_top_down: start(10000000000) end(1034fffffff) flags(0)

[.] BUG: unable to handle page fault for address: ffff89834ffff000
[.] #PF: supervisor read access in kernel mode
[.] #PF: error_code(0x0000) - not-present page
[.] PGD c04c8bf067 P4D c04c8bf067 PUD c04c8be067 PMD 0
[.] Oops: 0000 [#1] SMP
[.] RIP: 0010:ima_restore_measurement_list+0x95/0x4b0
[.] RSP: 0018:ffffc900000d3a80 EFLAGS: 00010286
[.] RAX: 0000000000001000 RBX: 0000000000000000 RCX: ffff89834ffff000
[.] RDX: 0000000000000018 RSI: ffff89834ffff000 RDI: ffff89834ffff018
[.] RBP: ffffc900000d3ba0 R08: 0000000000000020 R09: ffff888132b8a900
[.] R10: 4000000000000000 R11: 000000003a616d69 R12: 0000000000000000
[.] R13: ffffffff8404ac28 R14: 0000000000000000 R15: ffff89834ffff000
[.] FS:  0000000000000000(0000) GS:ffff893d44640000(0000) knlGS:0000000000000000
[.] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[.] ata5: SATA link down (SStatus 0 SControl 300)
[.] CR2: ffff89834ffff000 CR3: 000001034d00f001 CR4: 0000000000770ef0
[.] PKRU: 55555554
[.] Call Trace:
[.]  <TASK>
[.]  ? __die+0x78/0xc0
[.]  ? page_fault_oops+0x2a8/0x3a0
[.]  ? exc_page_fault+0x84/0x130
[.]  ? asm_exc_page_fault+0x22/0x30
[.]  ? ima_restore_measurement_list+0x95/0x4b0
[.]  ? template_desc_init_fields+0x317/0x410
[.]  ? crypto_alloc_tfm_node+0x9c/0xc0
[.]  ? init_ima_lsm+0x30/0x30
[.]  ima_load_kexec_buffer+0x72/0xa0
[.]  ima_init+0x44/0xa0
[.]  __initstub__kmod_ima__373_1201_init_ima7+0x1e/0xb0
[.]  ? init_ima_lsm+0x30/0x30
[.]  do_one_initcall+0xad/0x200
[.]  ? idr_alloc_cyclic+0xaa/0x110
[.]  ? new_slab+0x12c/0x420
[.]  ? new_slab+0x12c/0x420
[.]  ? number+0x12a/0x430
[.]  ? sysvec_apic_timer_interrupt+0xa/0x80
[.]  ? asm_sysvec_apic_timer_interrupt+0x16/0x20
[.]  ? parse_args+0xd4/0x380
[.]  ? parse_args+0x14b/0x380
[.]  kernel_init_freeable+0x1c1/0x2b0
[.]  ? rest_init+0xb0/0xb0
[.]  kernel_init+0x16/0x1a0
[.]  ret_from_fork+0x2f/0x40
[.]  ? rest_init+0xb0/0xb0
[.]  ret_from_fork_asm+0x11/0x20
[.]  </TASK>

Link: https://lore.kernel.org/all/20231114091658.228030-1-bhe@redhat.com/
Link: https://lkml.kernel.org/r/20241017190347.5578-1-gourry@gourry.net
Fixes: 7acf164b25 ("resource: add walk_system_ram_res_rev()")
Signed-off-by: Gregory Price <gourry@gourry.net>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Acked-by: Baoquan He <bhe@redhat.com>
Cc: AKASHI Takahiro <takahiro.akashi@linaro.org>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Cc: Mika Westerberg <mika.westerberg@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:40 -07:00
Ryusuke Konishi
41e192ad27 nilfs2: fix kernel bug due to missing clearing of checked flag
Syzbot reported that in directory operations after nilfs2 detects
filesystem corruption and degrades to read-only,
__block_write_begin_int(), which is called to prepare block writes, may
fail the BUG_ON check for accesses exceeding the folio/page size,
triggering a kernel bug.

This was found to be because the "checked" flag of a page/folio was not
cleared when it was discarded by nilfs2's own routine, which causes the
sanity check of directory entries to be skipped when the directory
page/folio is reloaded.  So, fix that.
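
A hedged sketch of the fix in nilfs2's own folio discard routine
(placement and the surrounding state-clearing are simplified):

  /* Clear the "checked" state along with the rest of the folio state,
   * so the directory-entry sanity check runs again when the folio is
   * reloaded after the discard. */
  folio_clear_uptodate(folio);
  folio_clear_mappedtodisk(folio);
  folio_clear_checked(folio);     /* the missing clearing */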

This became necessary when the use of nilfs2's own page discard routine
was extended beyond metadata files.

Link: https://lkml.kernel.org/r/20241017193359.5051-1-konishi.ryusuke@gmail.com
Fixes: 8c26c4e269 ("nilfs2: fix issue with flush kernel thread after remount in RO mode because of driver's internal error or metadata corruption")
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Reported-by: syzbot+d6ca2daf692c7a82f959@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=d6ca2daf692c7a82f959
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:40 -07:00
Nobuhiro Iwamatsu
d95fb348f0 mm: numa_clear_kernel_node_hotplug: Add NUMA_NO_NODE check for node id
The memory blocks acquired for the reserved region may include blocks
outside of memory management.  In this case, the nid variable is set to
NUMA_NO_NODE (-1), so an error occurs in node_set().  Add a check using
numa_valid_node() to numa_clear_kernel_node_hotplug() that skips
node_set() when nid is set to NUMA_NO_NODE.
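
A hedged sketch of the added check in numa_clear_kernel_node_hotplug()
(the surrounding loop is simplified):

  /* Reserved blocks outside memory management carry nid == NUMA_NO_NODE;
   * skip them instead of feeding -1 into node_set(). */
  for_each_reserved_mem_region(mb_region) {
          int nid = memblock_get_region_node(mb_region);

          if (!numa_valid_node(nid))
                  continue;
          node_set(nid, reserved_nodemask);
  }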

Link: https://lkml.kernel.org/r/1729070461-13576-1-git-send-email-nobuhiro1.iwamatsu@toshiba.co.jp
Fixes: 8748270821 ("mm: introduce numa_memblks")
Signed-off-by: Nobuhiro Iwamatsu <nobuhiro1.iwamatsu@toshiba.co.jp>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Suggested-by: Yuji Ishikawa <yuji2.ishikawa@toshiba.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:40 -07:00
Edward Adam Davis
bc0a2f3a73 ocfs2: pass u64 to ocfs2_truncate_inline maybe overflow
Syzbot reported a kernel BUG in ocfs2_truncate_inline.  There are two
reasons for this: first, the parameter value passed is greater than
ocfs2_max_inline_data_with_xattr; second, the start and end parameters of
ocfs2_truncate_inline are "unsigned int".

So, we need to add a sanity check for byte_start and byte_len right before
ocfs2_truncate_inline() in ocfs2_remove_inode_range(), if they are greater
than ocfs2_max_inline_data_with_xattr return -EINVAL.
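
A hedged sketch of the sanity check in ocfs2_remove_inode_range()
(variable names follow the description above):

  /* Reject ranges that cannot fit in the inline-data area before they
   * are narrowed into ocfs2_truncate_inline()'s unsigned int params. */
  if (byte_start > ocfs2_max_inline_data_with_xattr(inode->i_sb, di) ||
      byte_start + byte_len > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) {
          ret = -EINVAL;
          goto out;
  }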

Link: https://lkml.kernel.org/r/tencent_D48DB5122ADDAEDDD11918CFB68D93258C07@qq.com
Fixes: 1afc32b952 ("ocfs2: Write support for inline data")
Signed-off-by: Edward Adam Davis <eadavis@qq.com>
Reported-by: syzbot+81092778aac03460d6b7@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=81092778aac03460d6b7
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Changwei Ge <gechangwei@live.cn>
Cc: Gang He <ghe@suse.com>
Cc: Jun Piao <piaojun@huawei.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:40 -07:00
Jeongjun Park
d949d1d14f mm: shmem: fix data-race in shmem_getattr()
I got the following KCSAN report during syzbot testing:

==================================================================
BUG: KCSAN: data-race in generic_fillattr / inode_set_ctime_current

write to 0xffff888102eb3260 of 4 bytes by task 6565 on cpu 1:
 inode_set_ctime_to_ts include/linux/fs.h:1638 [inline]
 inode_set_ctime_current+0x169/0x1d0 fs/inode.c:2626
 shmem_mknod+0x117/0x180 mm/shmem.c:3443
 shmem_create+0x34/0x40 mm/shmem.c:3497
 lookup_open fs/namei.c:3578 [inline]
 open_last_lookups fs/namei.c:3647 [inline]
 path_openat+0xdbc/0x1f00 fs/namei.c:3883
 do_filp_open+0xf7/0x200 fs/namei.c:3913
 do_sys_openat2+0xab/0x120 fs/open.c:1416
 do_sys_open fs/open.c:1431 [inline]
 __do_sys_openat fs/open.c:1447 [inline]
 __se_sys_openat fs/open.c:1442 [inline]
 __x64_sys_openat+0xf3/0x120 fs/open.c:1442
 x64_sys_call+0x1025/0x2d60 arch/x86/include/generated/asm/syscalls_64.h:258
 do_syscall_x64 arch/x86/entry/common.c:52 [inline]
 do_syscall_64+0x54/0x120 arch/x86/entry/common.c:83
 entry_SYSCALL_64_after_hwframe+0x76/0x7e

read to 0xffff888102eb3260 of 4 bytes by task 3498 on cpu 0:
 inode_get_ctime_nsec include/linux/fs.h:1623 [inline]
 inode_get_ctime include/linux/fs.h:1629 [inline]
 generic_fillattr+0x1dd/0x2f0 fs/stat.c:62
 shmem_getattr+0x17b/0x200 mm/shmem.c:1157
 vfs_getattr_nosec fs/stat.c:166 [inline]
 vfs_getattr+0x19b/0x1e0 fs/stat.c:207
 vfs_statx_path fs/stat.c:251 [inline]
 vfs_statx+0x134/0x2f0 fs/stat.c:315
 vfs_fstatat+0xec/0x110 fs/stat.c:341
 __do_sys_newfstatat fs/stat.c:505 [inline]
 __se_sys_newfstatat+0x58/0x260 fs/stat.c:499
 __x64_sys_newfstatat+0x55/0x70 fs/stat.c:499
 x64_sys_call+0x141f/0x2d60 arch/x86/include/generated/asm/syscalls_64.h:263
 do_syscall_x64 arch/x86/entry/common.c:52 [inline]
 do_syscall_64+0x54/0x120 arch/x86/entry/common.c:83
 entry_SYSCALL_64_after_hwframe+0x76/0x7e

value changed: 0x2755ae53 -> 0x27ee44d3

Reported by Kernel Concurrency Sanitizer on:
CPU: 0 UID: 0 PID: 3498 Comm: udevd Not tainted 6.11.0-rc6-syzkaller-00326-gd1f2d51b711a-dirty #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 08/06/2024
==================================================================

When generic_fillattr() is called without holding the read lock, a data
race can occur on inode member variables, which can cause unexpected
behavior.

Since there is no special protection when shmem_getattr() calls
generic_fillattr(), a data race can be triggered by functions such as
shmem_unlink() or shmem_mknod().  This can cause unexpected results, so
merely documenting the race in a comment is not enough.

Therefore, when calling generic_fillattr() from shmem_getattr(), it is
appropriate to protect the inode using inode_lock_shared() and
inode_unlock_shared() to prevent the data race.
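
A hedged sketch of the locking added around the generic_fillattr() call
in shmem_getattr() (surrounding code omitted):

  /* Hold the shared inode lock so a concurrent shmem_mknod() or
   * shmem_unlink() updating timestamps cannot race with this read. */
  inode_lock_shared(inode);
  generic_fillattr(idmap, request_mask, inode, stat);
  inode_unlock_shared(inode);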

Link: https://lkml.kernel.org/r/20240909123558.70229-1-aha310510@gmail.com
Fixes: 44a30220bc ("shmem: recalculate file inode when fstat")
Signed-off-by: Jeongjun Park <aha310510@gmail.com>
Reported-by: syzbot <syzkaller@googlegroup.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:39 -07:00
Jann Horn
14611508cb mm: mark mas allocation in vms_abort_munmap_vmas as __GFP_NOFAIL
vms_abort_munmap_vmas() is a recovery path where, on entry, some VMAs have
already been torn down halfway (in a way we can't undo) but are still
present in the maple tree.

At this point, we *must* remove the VMAs from the VMA tree, otherwise we
get UAF.

Because removing VMA tree nodes can require memory allocation, the
existing code has an error path which tries to handle this by reattaching
the VMAs; but that can't be done safely.

A nicer way to fix it would probably be to preallocate enough maple tree
nodes for the removal before the point of no return, or something like
that; but for now, fix it the easy and kinda ugly way, by marking this
allocation __GFP_NOFAIL.
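
A hedged sketch of the change (the maple-tree store in the abort path
now carries __GFP_NOFAIL; the surrounding loop is omitted):

  /* The VMAs are already half torn down; removing them from the tree
   * must not fail, so let the allocator block until it succeeds. */
  mas_set_range(mas, vma->vm_start, vma->vm_end - 1);
  mas_store_gfp(mas, NULL, GFP_KERNEL | __GFP_NOFAIL);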

Link: https://lkml.kernel.org/r/20241016-fix-munmap-abort-v1-1-601c94b2240d@google.com
Fixes: 4f87153e82 ("mm: change failure of MAP_FIXED to restoring the gap on failure")
Signed-off-by: Jann Horn <jannh@google.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:39 -07:00
Sabyrzhan Tasbolatov
1db272864f x86/traps: move kmsan check after instrumentation_begin
During an x86_64 kernel build with CONFIG_KMSAN, objtool warns as follows:

  AR      built-in.a
  AR      vmlinux.a
  LD      vmlinux.o
vmlinux.o: warning: objtool: handle_bug+0x4: call to
    kmsan_unpoison_entry_regs() leaves .noinstr.text section
  OBJCOPY modules.builtin.modinfo
  GEN     modules.builtin
  MODPOST Module.symvers
  CC      .vmlinux.export.o

Moving kmsan_unpoison_entry_regs() _after_ instrumentation_begin() fixes
the warning.

The decode_bug(regs->ip, &imm) call is left before the KMSAN unpoisoning;
it has a return condition, and if we move it after
instrumentation_begin() it results in the warning "return with
instrumentation enabled".  Hence, I'm concerned that regs will not be
KMSAN-unpoisoned if `ud_type == BUG_NONE` is true.
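
A hedged sketch of the reordering in handle_bug() (simplified):

  static noinstr void handle_bug(struct pt_regs *regs)
  {
          /* ... decode_bug() and early exits ... */
          instrumentation_begin();
          /* Call the instrumented KMSAN helper only once instrumentation
           * is legal; objtool then stops flagging .noinstr.text. */
          kmsan_unpoison_entry_regs(regs);
          /* ... report the bug ... */
          instrumentation_end();
  }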

Link: https://lkml.kernel.org/r/20241016152407.3149001-1-snovitoll@gmail.com
Fixes: ba54d194f8 ("x86/traps: avoid KMSAN bugs originating from handle_bug()")
Signed-off-by: Sabyrzhan Tasbolatov <snovitoll@gmail.com>
Reviewed-by: Alexander Potapenko <glider@google.com>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:39 -07:00
Huang Ying
b7c5f9a1fb resource: remove dependency on SPARSEMEM from GET_FREE_REGION
We want to use the functions (get_free_mem_region()) configured via
GET_FREE_REGION in resource kunit tests.  However, GET_FREE_REGION
depends on SPARSEMEM now.  This means the resource kunit tests cannot be
built on architectures lacking SPARSEMEM, and it causes a config warning
as follows,

  WARNING: unmet direct dependencies detected for GET_FREE_REGION
  Depends on [n]: SPARSEMEM [=n]
  Selected by [y]:
  - RESOURCE_KUNIT_TEST [=y] && RUNTIME_TESTING_MENU [=y] && KUNIT [=y]

When get_free_mem_region() was introduced the only consumers were those
looking to pass the address range to memremap_pages().  That address
range needed to be mindful of the maximum addressable platform physical
address, which at the time only SPARSEMEM defined, via MAX_PHYSMEM_BITS.

Given that memremap_pages() also depended on SPARSEMEM via ZONE_DEVICE,
it was easier to just depend on that definition than invent a general
MAX_PHYSMEM_BITS concept outside of SPARSEMEM.

Turns out that decision was buggy and did not account for KASAN
consumption of physical address space.  That problem was resolved
recently with commit ea72ce5da2 ("x86/kaslr: Expose and use the end
of the physical memory address space"), and GET_FREE_REGION dropped its
MAX_PHYSMEM_BITS dependency.

Then commit 99185c10d5 ("resource, kunit: add test case for
region_intersects()"), went ahead and fixed up the only remaining
dependency on SPARSEMEM which was usage of the PA_SECTION_SHIFT macro
for setting the default alignment.  A PAGE_SIZE fallback is fine in the
SPARSEMEM=n case.

With those build dependencies gone, GET_FREE_REGION no longer depends on
SPARSEMEM.  So, remove the dependency on SPARSEMEM from GET_FREE_REGION
to fix the build issues.

Link: https://lkml.kernel.org/r/20241016014730.339369-1-ying.huang@intel.com
Link: https://lore.kernel.org/lkml/20240922225041.603186-1-linux@roeck-us.net/
Link: https://lkml.kernel.org/r/20241015051554.294734-1-ying.huang@intel.com
Fixes: 99185c10d5 ("resource, kunit: add test case for region_intersects()")
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Co-developed-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Tested-by: Guenter Roeck <linux@roeck-us.net>
Acked-by: David Hildenbrand <david@redhat.com>
Tested-by: Nathan Chancellor <nathan@kernel.org> # build
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Jonathan Cameron <jonathan.cameron@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:39 -07:00
Liam R. Howlett
79f3d123ca mm/mmap: fix race in mmap_region() with ftruncate()
Avoiding the zeroing of the vma tree in mmap_region() introduced a race
with truncate in the page table walk.  To avoid any races, create a hole
in the rmap during the operation by clearing the pagetable entries earlier
under the mmap write lock and (critically) before the new vma is installed
into the vma tree.  The result is that the old vma(s) are left in the vma
tree, but free_pgtables() removes them from the rmap and clears the ptes
while holding the necessary locks.

This change extends the fix required for hugetlbfs and the call_mmap()
function by moving the cleanup higher in the function and running it
unconditionally.

Link: https://lkml.kernel.org/r/20241016013455.2241533-1-Liam.Howlett@oracle.com
Fixes: f8d112a4e6 ("mm/mmap: avoid zeroing vma tree in mmap_region()")
Signed-off-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
Reported-by: Jann Horn <jannh@google.com>
Closes: https://lore.kernel.org/all/CAG48ez0ZpGzxi=-5O_uGQ0xKXOmbjeQ0LjZsRJ1Qtf2X5eOr1w@mail.gmail.com/
Reviewed-by: Jann Horn <jannh@google.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:39 -07:00
Matt Fleming
281dd25c1a mm/page_alloc: let GFP_ATOMIC order-0 allocs access highatomic reserves
Under memory pressure it's possible for GFP_ATOMIC order-0 allocations to
fail even though free pages are available in the highatomic reserves. 
GFP_ATOMIC allocations cannot trigger unreserve_highatomic_pageblock()
since it's only run from reclaim.

Given that such allocations will pass the watermarks in
__zone_watermark_unusable_free(), it makes sense to fallback to highatomic
reserves the same way that ALLOC_OOM can.

This fixes order-0 page allocation failures observed on Cloudflare's fleet
when handling network packets:

  kswapd1: page allocation failure: order:0, mode:0x820(GFP_ATOMIC),
  nodemask=(null),cpuset=/,mems_allowed=0-7
  CPU: 10 PID: 696 Comm: kswapd1 Kdump: loaded Tainted: G           O 6.6.43-CUSTOM #1
  Hardware name: MACHINE
  Call Trace:
   <IRQ>
   dump_stack_lvl+0x3c/0x50
   warn_alloc+0x13a/0x1c0
   __alloc_pages_slowpath.constprop.0+0xc9d/0xd10
   __alloc_pages+0x327/0x340
   __napi_alloc_skb+0x16d/0x1f0
   bnxt_rx_page_skb+0x96/0x1b0 [bnxt_en]
   bnxt_rx_pkt+0x201/0x15e0 [bnxt_en]
   __bnxt_poll_work+0x156/0x2b0 [bnxt_en]
   bnxt_poll+0xd9/0x1c0 [bnxt_en]
   __napi_poll+0x2b/0x1b0
   bpf_trampoline_6442524138+0x7d/0x1000
   __napi_poll+0x5/0x1b0
   net_rx_action+0x342/0x740
   handle_softirqs+0xcf/0x2b0
   irq_exit_rcu+0x6c/0x90
   sysvec_apic_timer_interrupt+0x72/0x90
   </IRQ>

[mfleming@cloudflare.com: update comment]
  Link: https://lkml.kernel.org/r/20241015125158.3597702-1-matt@readmodwrite.com
Link: https://lkml.kernel.org/r/20241011120737.3300370-1-matt@readmodwrite.com
Link: https://lore.kernel.org/all/CAGis_TWzSu=P7QJmjD58WWiu3zjMTVKSzdOwWE8ORaGytzWJwQ@mail.gmail.com/
Fixes: 1d91df85f3 ("mm/page_alloc: handle a missing case for memalloc_nocma_{save/restore} APIs")
Signed-off-by: Matt Fleming <mfleming@cloudflare.com>
Suggested-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:39 -07:00
Lorenzo Stoakes
985da552a9 fork: only invoke khugepaged, ksm hooks if no error
There is no reason to invoke these hooks early against an mm that is in an
incomplete state.

The change in commit d240629148 ("fork: use __mt_dup() to duplicate
maple tree in dup_mmap()") makes this more pertinent as we may be in a
state where entries in the maple tree are not yet consistent.

Their placement early in dup_mmap() only appears to have been meaningful
for early error checking, and since functionally it'd require a very small
allocation to fail (in practice 'too small to fail') that'd only occur in
the most dire circumstances, meaning the fork would fail or be OOM'd in
any case.

Since both khugepaged and KSM tracking are there to provide optimisations
to memory performance rather than critical functionality, it doesn't
really matter all that much if, under such dire memory pressure, we fail
to register an mm with these.

As a result, we follow the example of commit d2081b2bf8 ("mm:
khugepaged: make khugepaged_enter() void function") and make ksm_fork() a
void function also.

We only expose the mm to these functions once we are done with them and
only if no error occurred in the fork operation.
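
A hedged sketch of the resulting ordering at the end of dup_mmap()
(simplified):

  /* Expose the new mm to the optimisation machinery only once the
   * duplication has fully succeeded. */
  if (!retval) {
          ksm_fork(mm, oldmm);            /* now returns void */
          khugepaged_fork(mm, oldmm);     /* already void */
  }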

Link: https://lkml.kernel.org/r/e0cb8b840c9d1d5a6e84d4f8eff5f3f2022aa10c.1729014377.git.lorenzo.stoakes@oracle.com
Fixes: d240629148 ("fork: use __mt_dup() to duplicate maple tree in dup_mmap()")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reported-by: Jann Horn <jannh@google.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Jann Horn <jannh@google.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Linus Torvalds <torvalds@linuxfoundation.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:39 -07:00
Lorenzo Stoakes
f64e67e5d3 fork: do not invoke uffd on fork if error occurs
Patch series "fork: do not expose incomplete mm on fork".

During fork we may place the virtual memory address space into an
inconsistent state before the fork operation is complete.

In addition, we may encounter an error during the fork operation that
indicates that the virtual memory address space is invalidated.

As a result, we should not be exposing it in any way to external machinery
that might interact with the mm or VMAs, machinery that is not designed to
deal with incomplete state.

We specifically update the fork logic to defer khugepaged and ksm to the
end of the operation and only to be invoked if no error arose, and
disallow uffd from observing fork events should an error have occurred.


This patch (of 2):

Currently on fork we expose the virtual address space of a process to
userland unconditionally if uffd is registered in VMAs, regardless of
whether an error arose in the fork.

This is performed in dup_userfaultfd_complete() which is invoked
unconditionally, and performs two duties - invoking registered handlers
for the UFFD_EVENT_FORK event via dup_fctx(), and clearing down
userfaultfd_fork_ctx objects established in dup_userfaultfd().

This is problematic, because the virtual address space may not yet be
correctly initialised if an error arose.

The change in commit d240629148 ("fork: use __mt_dup() to duplicate
maple tree in dup_mmap()") makes this more pertinent as we may be in a
state where entries in the maple tree are not yet consistent.

We address this by, on fork error, ensuring that we roll back state that
we would otherwise expect to clean up through the event being handled by
userland and perform the memory freeing duty otherwise performed by
dup_userfaultfd_complete().

We do this by implementing a new function, dup_userfaultfd_fail(), which
performs the same loop, only decrementing reference counts.

Note that we perform mmgrab() on the parent and child mm's, however
userfaultfd_ctx_put() will mmdrop() this once the reference count drops to
zero, so we will avoid memory leaks correctly here.
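
A hedged sketch of dup_userfaultfd_fail() based on the description above
(structure and field names approximate):

  void dup_userfaultfd_fail(struct list_head *fcs)
  {
          struct userfaultfd_fork_ctx *fctx, *n;

          /* Drop the references that UFFD_EVENT_FORK delivery would have
           * consumed; userfaultfd_ctx_put() mmdrop()s at refcount zero. */
          list_for_each_entry_safe(fctx, n, fcs, list) {
                  userfaultfd_ctx_put(fctx->orig);
                  userfaultfd_ctx_put(fctx->new);
                  kfree(fctx);
          }
  }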

Link: https://lkml.kernel.org/r/cover.1729014377.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/d3691d58bb58712b6fb3df2be441d175bd3cdf07.1729014377.git.lorenzo.stoakes@oracle.com
Fixes: d240629148 ("fork: use __mt_dup() to duplicate maple tree in dup_mmap()")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reported-by: Jann Horn <jannh@google.com>
Reviewed-by: Jann Horn <jannh@google.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Linus Torvalds <torvalds@linuxfoundation.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:38 -07:00
David Hildenbrand
7c18d48110 mm/pagewalk: fix usage of pmd_leaf()/pud_leaf() without present check
pmd_leaf()/pud_leaf() only implies a pmd_present()/pud_present() check on
some architectures.  We really should check for
pmd_present()/pud_present() first.
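
A hedged sketch of the corrected ordering (illustrative; folio_walk_start()
handles more cases than shown):

  pmd_t pmd = pmdp_get_lockless(pmdp);

  /* A migration entry is !present yet can still satisfy pmd_leaf() on
   * some architectures, so test presence before trusting pmd_leaf(). */
  if (pmd_present(pmd) && pmd_leaf(pmd)) {
          /* handle the present huge mapping */
  } else {
          /* non-present or page-table entry: fall through to the pte
           * level / migration-entry handling */
  }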

This should explain the report we got on ppc64 (which has
CONFIG_PGTABLE_HAS_HUGE_LEAVES set in the config) that triggered:
	VM_WARN_ON_ONCE(pmd_leaf(pmdp_get_lockless(pmdp)));

Likely we had a PMD migration entry for which pmd_leaf() did not trigger. 
We raced with restoring the PMD migration entry, and suddenly saw a
pmd_leaf().  In this case, pte_offset_map_lock() saved us from more
trouble, because it rechecks the PMD value, but we would not have
processed the migration entry -- which is not too bad because the only
user of FW_MIGRATION is KSM for unsharing, and KSM only applies to small
folios.

Further, we shouldn't re-read the PMD/PUD value for our warning; the
primary purpose of the VM_WARN_ON_ONCE() is to find spurious use of
pmd_leaf()/pud_leaf() without CONFIG_PGTABLE_HAS_HUGE_LEAVES.

As a side note, we are currently not implementing FW_MIGRATION support for
PUD migration entries, which likely should exist due to hugetlb.  Add a
TODO so this won't fall through the cracks if more FW_MIGRATION users get
added.

I was able to write a quick reproducer and verify that the issue no
longer triggers with this fix.

https://gitlab.com/davidhildenbrand/scratchspace/-/blob/main/reproducers/move-pages-pmd-leaf.c

Without this fix after a couple of seconds in a VM with 2 NUMA nodes:

[   54.333753] ------------[ cut here ]------------
[   54.334901] WARNING: CPU: 20 PID: 1704 at mm/pagewalk.c:815 folio_walk_start+0x48f/0x6e0
[   54.336455] Modules linked in: ...
[   54.345009] CPU: 20 UID: 0 PID: 1704 Comm: move-pages-pmd- Not tainted 6.12.0-rc2+ #81
[   54.346529] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-2.fc40 04/01/2014
[   54.348191] RIP: 0010:folio_walk_start+0x48f/0x6e0
[   54.349134] Code: b5 ad 48 8d 35 00 00 00 00 e8 6d 59 d7 ff e8 08 74 da ff e9 9c fe ff ff 4c 8b 7c 24 08 4c 89 ff e8 26 2b be 00 e9 8a fe ff ff <0f> 0b e9 ec fe ff ff f7 c2 ff 0f 00 00 0f 85 81 fe ff ff 48 8b 02
[   54.352660] RSP: 0018:ffffb7e4c430bc78 EFLAGS: 00010282
[   54.353679] RAX: 80000002a3e008e7 RBX: ffff9946039aa580 RCX: ffff994380000000
[   54.355056] RDX: ffff994606aec000 RSI: 00007f004b000000 RDI: 0000000000000000
[   54.356440] RBP: 00007f004b000000 R08: 0000000000000591 R09: 0000000000000001
[   54.357820] R10: 0000000000000200 R11: 0000000000000001 R12: ffffb7e4c430bd10
[   54.359198] R13: ffff994606aec2c0 R14: 0000000000000002 R15: ffff994604a89b00
[   54.360564] FS:  00007f004ae006c0(0000) GS:ffff9947f7400000(0000) knlGS:0000000000000000
[   54.362111] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   54.363242] CR2: 00007f004adffe58 CR3: 0000000281e12005 CR4: 0000000000770ef0
[   54.364615] PKRU: 55555554
[   54.365153] Call Trace:
[   54.365646]  <TASK>
[   54.366073]  ? __warn.cold+0xb7/0x14d
[   54.366796]  ? folio_walk_start+0x48f/0x6e0
[   54.367628]  ? report_bug+0xff/0x140
[   54.368324]  ? handle_bug+0x58/0x90
[   54.369019]  ? exc_invalid_op+0x17/0x70
[   54.369771]  ? asm_exc_invalid_op+0x1a/0x20
[   54.370606]  ? folio_walk_start+0x48f/0x6e0
[   54.371415]  ? folio_walk_start+0x9e/0x6e0
[   54.372227]  do_pages_move+0x1c5/0x680
[   54.372972]  kernel_move_pages+0x1a1/0x2b0
[   54.373804]  __x64_sys_move_pages+0x25/0x30

Link: https://lkml.kernel.org/r/20241015111236.1290921-1-david@redhat.com
Fixes: aa39ca6940 ("mm/pagewalk: introduce folio_walk_start() + folio_walk_end()")
Signed-off-by: David Hildenbrand <david@redhat.com>
Reported-by: syzbot+7d917f67c05066cec295@syzkaller.appspotmail.com
Closes: https://lkml.kernel.org/r/670d3248.050a0220.3e960.0064.GAE@google.com
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Jann Horn <jannh@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:38 -07:00
Jarkko Sakkinen
df745e2509 tpm: Lazily flush the auth session
Move the allocation of chip->auth to tpm2_start_auth_session() so that
this field can be used as a flag to tell whether the auth session is
active or not.

Instead of flushing and reloading the auth session for every transaction
separately, keep the session open unless /dev/tpm0 is used.

Reported-by: Pengyu Ma <mapengyu@gmail.com>
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219229
Cc: stable@vger.kernel.org # v6.10+
Fixes: 7ca110f267 ("tpm: Address !chip->auth in tpm_buf_append_hmac_session*()")
Tested-by: Pengyu Ma <mapengyu@gmail.com>
Tested-by: Stefan Berger <stefanb@linux.ibm.com>
Reviewed-by: Stefan Berger <stefanb@linux.ibm.com>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
2024-10-29 00:46:20 +02:00
Ian Rogers
a5384c4267 perf cap: Add __NR_capget to arch/x86 unistd
As there are duplicated kernel headers in tools/include, libc can pick
up the wrong definitions.  This was causing the wrong system call to be
used for capget in perf.

Reported-by: Adrian Hunter <adrian.hunter@intel.com>
Fixes: e25ebda78e ("perf cap: Tidy up and improve capability testing")
Closes: https://lore.kernel.org/lkml/cc7d6bdf-1aeb-4179-9029-4baf50b59342@intel.com/
Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/20241026055448.312247-1-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-10-28 13:04:52 -03:00
Arnaldo Carvalho de Melo
55f1b540d8 tools headers: Update the linux/unaligned.h copy with the kernel sources
To pick up the changes in:

  7f053812da ("random: vDSO: minimize and simplify header includes")

That required adding a copy of include/vdso/unaligned.h and checking it
in tools/perf/check-headers.sh.

Addressing this perf tools build warning:

  Warning: Kernel ABI header differences:
    diff -u tools/include/linux/unaligned.h include/linux/unaligned.h

Please see tools/include/uapi/README for further details.

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Ian Rogers <irogers@google.com>
Cc: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/lkml/Zx-uHvAbPAESofEN@x1
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-10-28 12:34:28 -03:00
Arnaldo Carvalho de Melo
93e4b86b3e tools headers arm64: Sync arm64's cputype.h with the kernel sources
To get the changes in:

  924725707d ("arm64: cputype: Add Neoverse-N3 definitions")

That causes this perf source code to be rebuilt:

  CC      /tmp/build/perf-tools/util/arm-spe.o

The changes in the above patch add MIDR_NEOVERSE_N3, which probably needs
changes in arm-spe.c, so we probably need to add it to that array?  Or
maybe we need to leave this for later, when this is all tested on those
machines?

  static const struct midr_range neoverse_spe[] = {
          MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1),
          MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2),
          MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1),
          {},
  };

Mark Rutland recommended the following about arm-spe.c in a previous
update to this file:

"I would not touch this for now -- someone would have to go audit the
TRMs to check that those other cores have the same encoding, and I think
it'd be better to do that as a follow-up."

That addresses this perf build warning:

  Warning: Kernel ABI header differences:
    diff -u tools/arch/arm64/include/asm/cputype.h arch/arm64/include/asm/cputype.h

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/lkml/Zx-dffKdGsgkhG96@x1
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-10-28 12:33:50 -03:00
Arnaldo Carvalho de Melo
21a3a3d015 tools headers: Synchronize {uapi/}linux/bits.h with the kernel sources
To pick up the changes in this cset:

  947697c6f0 ("uapi: Define GENMASK_U128")

This addresses these perf build warnings:

  Warning: Kernel ABI header differences:
    diff -u tools/include/uapi/linux/bits.h include/uapi/linux/bits.h
    diff -u tools/include/linux/bits.h include/linux/bits.h

Please see tools/include/uapi/README for further details.

Acked-by: Yury Norov <yury.norov@gmail.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/lkml/Zx-ZVH7bHqtFn8Dv@x1
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-10-28 12:32:25 -03:00
Jarkko Sakkinen
cc7d859434 tpm: Rollback tpm2_load_null()
Do not continue on tpm2_create_primary() failure in tpm2_load_null().

Cc: stable@vger.kernel.org # v6.10+
Fixes: eb24c9788c ("tpm: disable the TPM if NULL name changes")
Reviewed-by: Stefan Berger <stefanb@linux.ibm.com>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
2024-10-28 17:22:01 +02:00
Jarkko Sakkinen
d658d59471 tpm: Return tpm2_sessions_init() when null key creation fails
Do not continue tpm2_sessions_init() further if the null key pair creation
fails.

Cc: stable@vger.kernel.org # v6.10+
Fixes: d2add27cf2 ("tpm: Add NULL primary creation")
Reviewed-by: Stefan Berger <stefanb@linux.ibm.com>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
2024-10-28 17:22:01 +02:00
Tejun Heo
c31f2ee5cd sched_ext: Fix enq_last_no_enq_fails selftest
cc9877fb76 ("sched_ext: Improve error reporting during loading") changed
how load failures are reported so that more error context can be
communicated. This breaks the enq_last_no_enq_fails test as attach no longer
fails. The scheduler is guaranteed to be ejected on attach completion with
full error information. Update enq_last_no_enq_fails so that it checks that
the scheduler is ejected using ops.exit().

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Vishal Chourasia <vishalc@linux.ibm.com>
Link: http://lkml.kernel.org/r/Zxknp7RAVNjmdJSc@linux.ibm.com
Fixes: cc9877fb76 ("sched_ext: Improve error reporting during loading")
2024-10-25 12:20:29 -10:00
Tejun Heo
7724abf0ca sched_ext: Make cast_mask() inline
cast_mask() doesn't do any actual work and is defined in a header file.
Force it to be inline. When it is not inlined and the function is not used,
it can cause verification failures like the following:

  # tools/testing/selftests/sched_ext/runner -t minimal
  ===== START =====
  TEST: minimal
  DESCRIPTION: Verify we can load a fully minimal scheduler
  OUTPUT:
  libbpf: prog 'cast_mask': missing BPF prog type, check ELF section name '.text'
  libbpf: prog 'cast_mask': failed to load: -22
  libbpf: failed to load object 'minimal'
  libbpf: failed to load BPF skeleton 'minimal': -22
  ERR: minimal.c:20
  Failed to open and load skel
  not ok 1 minimal #
  =====  END  =====
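
The shape of the fix, sketched (not the verbatim header change;
__always_inline is the usual way to force this in BPF code):

  /* Before: a plain header-defined helper. When clang doesn't inline
   * it, it lands in .text as its own symbol, which libbpf then rejects
   * as a program with a missing BPF prog type. */
  static struct cpumask *cast_mask(struct bpf_cpumask *mask)
  {
          return (struct cpumask *)mask;
  }

  /* After: force inlining so no standalone .text symbol is emitted. */
  static __always_inline struct cpumask *cast_mask(struct bpf_cpumask *mask)
  {
          return (struct cpumask *)mask;
  }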

Signed-off-by: Tejun Heo <tj@kernel.org>
Fixes: a748db0c8c ("tools/sched_ext: Receive misc updates from SCX repo")
2024-10-25 12:19:44 -10:00
David Vernet
0e7ffff1b8 scx: Fix raciness in scx_ops_bypass()
scx_ops_bypass() can currently race on the ops enable / disable path as
follows:

1. scx_ops_bypass(true) called on enable path, bypass depth is set to 1
2. An op on the init path exits, which schedules scx_ops_disable_workfn()
3. scx_ops_bypass(false) is called on the disable path, and bypass depth
   is decremented to 0
4. kthread is scheduled to execute scx_ops_disable_workfn()
5. scx_ops_bypass(true) called, bypass depth set to 1
6. scx_ops_bypass() races when iterating over CPUs

While it's not safe to take any blocking locks on the bypass path, it is
safe to take a raw spinlock which cannot be preempted. This patch therefore
updates scx_ops_bypass() to use a raw spinlock to synchronize, and changes
scx_ops_bypass_depth to be a regular int.
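
A sketch of the resulting synchronization (illustrative lock and
variable names, not the exact kernel code):

  static DEFINE_RAW_SPINLOCK(bypass_lock);
  static int scx_ops_bypass_depth;        /* plain int, guarded by lock */

  static void scx_ops_bypass(bool bypass)
  {
          unsigned long flags;

          raw_spin_lock_irqsave(&bypass_lock, flags);
          scx_ops_bypass_depth += bypass ? 1 : -1;
          WARN_ON_ONCE(scx_ops_bypass_depth < 0);

          if ((bypass && scx_ops_bypass_depth == 1) ||
              (!bypass && scx_ops_bypass_depth == 0)) {
                  /* ... iterate over CPUs; no other bypass toggle can
                   * race with us while the raw spinlock is held ... */
          }
          raw_spin_unlock_irqrestore(&bypass_lock, flags);
  }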

Without this change, we observe the following warnings when running the
'exit' sched_ext selftest (sometimes requires a couple of runs):

.[root@virtme-ng sched_ext]# ./runner -t exit
===== START =====
TEST: exit
...
[   14.935078] WARNING: CPU: 2 PID: 360 at kernel/sched/ext.c:4332 scx_ops_bypass+0x1ca/0x280
[   14.935126] Modules linked in:
[   14.935150] CPU: 2 UID: 0 PID: 360 Comm: sched_ext_ops_h Not tainted 6.11.0-virtme #24
[   14.935192] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Arch Linux 1.16.3-1-1 04/01/2014
[   14.935242] Sched_ext: exit (enabling+all)
[   14.935244] RIP: 0010:scx_ops_bypass+0x1ca/0x280
[   14.935300] Code: ff ff ff e8 48 96 10 00 fb e9 08 ff ff ff c6 05 7b 34 e8 01 01 90 48 c7 c7 89 86 88 87 e8 be 1d f8 ff 90 0f 0b 90 90 eb 95 90 <0f> 0b 90 41 8b 84 24 24 0a 00 00 eb 97 90 0f 0b 90 41 8b 84 24 24
[   14.935394] RSP: 0018:ffffb706c0957ce0 EFLAGS: 00010002
[   14.935424] RAX: 0000000000000009 RBX: 0000000000000001 RCX: 00000000e3fb8b2a
[   14.935465] RDX: 0000000000000001 RSI: 0000000000000004 RDI: ffffffff88a4c080
[   14.935512] RBP: 0000000000009b56 R08: 0000000000000004 R09: 00000003f12e520a
[   14.935555] R10: ffffffff863a9795 R11: 0000000000000000 R12: ffff8fc5fec31300
[   14.935598] R13: ffff8fc5fec31318 R14: 0000000000000286 R15: 0000000000000018
[   14.935642] FS:  0000000000000000(0000) GS:ffff8fc5fe680000(0000) knlGS:0000000000000000
[   14.935684] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   14.935721] CR2: 0000557d92890b88 CR3: 000000002464a000 CR4: 0000000000750ef0
[   14.935765] PKRU: 55555554
[   14.935782] Call Trace:
[   14.935802]  <TASK>
[   14.935823]  ? __warn+0xce/0x220
[   14.935850]  ? scx_ops_bypass+0x1ca/0x280
[   14.935881]  ? report_bug+0xc1/0x160
[   14.935909]  ? handle_bug+0x61/0x90
[   14.935934]  ? exc_invalid_op+0x1a/0x50
[   14.935959]  ? asm_exc_invalid_op+0x1a/0x20
[   14.935984]  ? raw_spin_rq_lock_nested+0x15/0x30
[   14.936019]  ? scx_ops_bypass+0x1ca/0x280
[   14.936046]  ? srso_alias_return_thunk+0x5/0xfbef5
[   14.936081]  ? __pfx_scx_ops_disable_workfn+0x10/0x10
[   14.936111]  scx_ops_disable_workfn+0x146/0xac0
[   14.936142]  ? finish_task_switch+0xa9/0x2c0
[   14.936172]  ? srso_alias_return_thunk+0x5/0xfbef5
[   14.936211]  ? __pfx_scx_ops_disable_workfn+0x10/0x10
[   14.936244]  kthread_worker_fn+0x101/0x2c0
[   14.936268]  ? __pfx_kthread_worker_fn+0x10/0x10
[   14.936299]  kthread+0xec/0x110
[   14.936327]  ? __pfx_kthread+0x10/0x10
[   14.936351]  ret_from_fork+0x37/0x50
[   14.936374]  ? __pfx_kthread+0x10/0x10
[   14.936400]  ret_from_fork_asm+0x1a/0x30
[   14.936427]  </TASK>
[   14.936443] irq event stamp: 21002
[   14.936467] hardirqs last  enabled at (21001): [<ffffffff863aa35f>] resched_cpu+0x9f/0xd0
[   14.936521] hardirqs last disabled at (21002): [<ffffffff863dd0ba>] scx_ops_bypass+0x11a/0x280
[   14.936571] softirqs last  enabled at (20642): [<ffffffff863683d7>] __irq_exit_rcu+0x67/0xd0
[   14.936622] softirqs last disabled at (20637): [<ffffffff863683d7>] __irq_exit_rcu+0x67/0xd0
[   14.936672] ---[ end trace 0000000000000000 ]---
[   14.953282] sched_ext: BPF scheduler "exit" disabled (unregistered from BPF)
[   14.953352] ------------[ cut here ]------------
[   14.953383] WARNING: CPU: 2 PID: 360 at kernel/sched/ext.c:4335 scx_ops_bypass+0x1d8/0x280
[   14.953428] Modules linked in:
[   14.953453] CPU: 2 UID: 0 PID: 360 Comm: sched_ext_ops_h Tainted: G        W          6.11.0-virtme #24
[   14.953505] Tainted: [W]=WARN
[   14.953527] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Arch Linux 1.16.3-1-1 04/01/2014
[   14.953574] RIP: 0010:scx_ops_bypass+0x1d8/0x280
[   14.953603] Code: c6 05 7b 34 e8 01 01 90 48 c7 c7 89 86 88 87 e8 be 1d f8 ff 90 0f 0b 90 90 eb 95 90 0f 0b 90 41 8b 84 24 24 0a 00 00 eb 97 90 <0f> 0b 90 41 8b 84 24 24 0a 00 00 eb 92 f3 0f 1e fa 49 8d 84 24 f0
[   14.953693] RSP: 0018:ffffb706c0957ce0 EFLAGS: 00010046
[   14.953722] RAX: 0000000000000001 RBX: 0000000000000000 RCX: 0000000000000001
[   14.953763] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8fc5fec31318
[   14.953804] RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000000000000
[   14.953845] R10: ffffffff863a9795 R11: 0000000000000000 R12: ffff8fc5fec31300
[   14.953888] R13: ffff8fc5fec31318 R14: 0000000000000286 R15: 0000000000000018
[   14.953934] FS:  0000000000000000(0000) GS:ffff8fc5fe680000(0000) knlGS:0000000000000000
[   14.953974] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   14.954009] CR2: 0000557d92890b88 CR3: 000000002464a000 CR4: 0000000000750ef0
[   14.954052] PKRU: 55555554
[   14.954068] Call Trace:
[   14.954085]  <TASK>
[   14.954102]  ? __warn+0xce/0x220
[   14.954126]  ? scx_ops_bypass+0x1d8/0x280
[   14.954150]  ? report_bug+0xc1/0x160
[   14.954178]  ? handle_bug+0x61/0x90
[   14.954203]  ? exc_invalid_op+0x1a/0x50
[   14.954226]  ? asm_exc_invalid_op+0x1a/0x20
[   14.954250]  ? raw_spin_rq_lock_nested+0x15/0x30
[   14.954285]  ? scx_ops_bypass+0x1d8/0x280
[   14.954311]  ? __mutex_unlock_slowpath+0x3a/0x260
[   14.954343]  scx_ops_disable_workfn+0xa3e/0xac0
[   14.954381]  ? __pfx_scx_ops_disable_workfn+0x10/0x10
[   14.954413]  kthread_worker_fn+0x101/0x2c0
[   14.954442]  ? __pfx_kthread_worker_fn+0x10/0x10
[   14.954479]  kthread+0xec/0x110
[   14.954507]  ? __pfx_kthread+0x10/0x10
[   14.954530]  ret_from_fork+0x37/0x50
[   14.954553]  ? __pfx_kthread+0x10/0x10
[   14.954576]  ret_from_fork_asm+0x1a/0x30
[   14.954603]  </TASK>
[   14.954621] irq event stamp: 21002
[   14.954644] hardirqs last  enabled at (21001): [<ffffffff863aa35f>] resched_cpu+0x9f/0xd0
[   14.954686] hardirqs last disabled at (21002): [<ffffffff863dd0ba>] scx_ops_bypass+0x11a/0x280
[   14.954735] softirqs last  enabled at (20642): [<ffffffff863683d7>] __irq_exit_rcu+0x67/0xd0
[   14.954782] softirqs last disabled at (20637): [<ffffffff863683d7>] __irq_exit_rcu+0x67/0xd0
[   14.954829] ---[ end trace 0000000000000000 ]---
[   15.022283] sched_ext: BPF scheduler "exit" disabled (unregistered from BPF)
[   15.092282] sched_ext: BPF scheduler "exit" disabled (unregistered from BPF)
[   15.149282] sched_ext: BPF scheduler "exit" disabled (unregistered from BPF)
ok 1 exit #
=====  END  =====

And with it, the test passes without issue after 1000s of runs:

.[root@virtme-ng sched_ext]# ./runner -t exit
===== START =====
TEST: exit
DESCRIPTION: Verify we can cleanly exit a scheduler in multiple places
OUTPUT:
[    7.412856] sched_ext: BPF scheduler "exit" enabled
[    7.427924] sched_ext: BPF scheduler "exit" disabled (unregistered from BPF)
[    7.466677] sched_ext: BPF scheduler "exit" enabled
[    7.475923] sched_ext: BPF scheduler "exit" disabled (unregistered from BPF)
[    7.512803] sched_ext: BPF scheduler "exit" enabled
[    7.532924] sched_ext: BPF scheduler "exit" disabled (unregistered from BPF)
[    7.586809] sched_ext: BPF scheduler "exit" enabled
[    7.595926] sched_ext: BPF scheduler "exit" disabled (unregistered from BPF)
[    7.661923] sched_ext: BPF scheduler "exit" disabled (unregistered from BPF)
[    7.723923] sched_ext: BPF scheduler "exit" disabled (unregistered from BPF)
ok 1 exit #
=====  END  =====

=============================

RESULTS:

PASSED:  1
SKIPPED: 0
FAILED:  0

Fixes: f0e1a0643a ("sched_ext: Implement BPF extensible scheduler class")
Signed-off-by: David Vernet <void@manifault.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2024-10-25 11:10:51 -10:00
Peter Wang
cb7e509c4e scsi: ufs: core: Fix another deadlock during RTC update
If ufshcd_rtc_work calls ufshcd_rpm_put_sync() and the pm's usage_count
is 0, we will enter the runtime suspend callback.  However, the runtime
suspend callback will wait to flush ufshcd_rtc_work, causing a deadlock.

Replace ufshcd_rpm_put_sync() with ufshcd_rpm_put() to avoid the
deadlock.
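
The pattern, sketched with a hypothetical driver struct (the real code
goes through the ufshcd_rpm_put*() wrappers):

  #include <linux/device.h>
  #include <linux/pm_runtime.h>
  #include <linux/workqueue.h>

  struct my_hba {                         /* hypothetical */
          struct device *dev;
          struct delayed_work rtc_work;
  };

  static void my_rtc_work_fn(struct work_struct *work)
  {
          struct my_hba *hba = container_of(work, struct my_hba,
                                            rtc_work.work);

          /* ... periodic RTC update over the UFS link ... */

          /*
           * A synchronous put may invoke the runtime-suspend callback
           * right here; if that callback flushes rtc_work, this work
           * item ends up waiting on itself. The asynchronous put only
           * marks the device idle and lets suspend run later, in
           * another context.
           */
          pm_runtime_put(hba->dev);       /* not pm_runtime_put_sync() */
  }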

Fixes: 6bf999e0eb ("scsi: ufs: core: Add UFS RTC support")
Cc: stable@vger.kernel.org #6.11.x
Signed-off-by: Peter Wang <peter.wang@mediatek.com>
Link: https://lore.kernel.org/r/20241024015453.21684-1-peter.wang@mediatek.com
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
2024-10-25 14:51:34 -04:00
John Garry
d28d17a845 scsi: scsi_debug: Fix do_device_access() handling of unexpected SG copy length
If the sg_copy_buffer() call returns less than sdebug_sector_size, then
we drop out of the copy loop. However, we still report that we copied
the full expected amount, which is incorrect.

Fix this by keeping a running total and returning that value.
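
A sketch of the fixed loop (illustrative function, not the exact
scsi_debug code):

  static int do_device_access_sketch(struct scsi_cmnd *scp, u8 *store,
                                     u32 sector_size, u32 num_sectors)
  {
          u32 total = 0;
          u32 i;

          for (i = 0; i < num_sectors; i++) {
                  size_t ret = sg_copy_buffer(scsi_sglist(scp),
                                              scsi_sg_count(scp),
                                              store + i * sector_size,
                                              sector_size,
                                              i * sector_size, true);

                  total += ret;
                  if (ret < sector_size)
                          break;  /* short copy: stop, report the truth */
          }
          return total;
  }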

Fixes: 84f3a3c01d ("scsi: scsi_debug: Atomic write support")
Reported-by: Colin Ian King <colin.i.king@gmail.com>
Suggested-by: Dan Carpenter <dan.carpenter@linaro.org>
Signed-off-by: John Garry <john.g.garry@oracle.com>
Link: https://lore.kernel.org/r/20241018101655.4207-1-john.g.garry@oracle.com
Reviewed-by: Dan Carpenter <dan.carpenter@linaro.org>
Reviewed-by: Colin Ian King <colin.i.king@gmail.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
2024-10-25 14:48:27 -04:00
David Vernet
895669fd0d scx: Fix exit selftest to use custom DSQ
In commit 63fb3ec805 ("sched_ext: Allow only user DSQs for
scx_bpf_consume(), scx_bpf_dsq_nr_queued() and bpf_iter_scx_dsq_new()"), we
updated the consume path to only accept user DSQs, thus making it invalid
to consume SCX_DSQ_GLOBAL. This selftest was doing that, so let's create a
custom DSQ and use that instead.  The test now passes:

[root@virtme-ng sched_ext]# ./runner -t exit
===== START =====
TEST: exit
DESCRIPTION: Verify we can cleanly exit a scheduler in multiple places
OUTPUT:
[   12.387229] sched_ext: BPF scheduler "exit" enabled
[   12.406064] sched_ext: BPF scheduler "exit" disabled (unregistered from BPF)
[   12.453325] sched_ext: BPF scheduler "exit" enabled
[   12.474064] sched_ext: BPF scheduler "exit" disabled (unregistered from BPF)
[   12.515241] sched_ext: BPF scheduler "exit" enabled
[   12.532064] sched_ext: BPF scheduler "exit" disabled (unregistered from BPF)
[   12.592063] sched_ext: BPF scheduler "exit" disabled (unregistered from BPF)
[   12.654063] sched_ext: BPF scheduler "exit" disabled (unregistered from BPF)
[   12.715062] sched_ext: BPF scheduler "exit" disabled (unregistered from BPF)
ok 1 exit #
=====  END  =====

Signed-off-by: David Vernet <void@manifault.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2024-10-25 07:03:36 -10:00
Vishal Chourasia
4f7f417042 sched_ext: Fix function pointer type mismatches in BPF selftests
Fix incompatible function pointer type warnings in sched_ext BPF selftests by
explicitly casting the function pointers when initializing struct_ops.
This addresses multiple -Wincompatible-function-pointer-types warnings from the
clang compiler where function signatures didn't match exactly.

The void * cast ensures the compiler accepts the function pointer
assignment despite minor type differences in the parameters.
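
For illustration, a made-up sched_ext selftest op following that
pattern ("myops" and its callback are invented names):

  SEC("struct_ops/myops_init_task")
  s32 myops_init_task(struct task_struct *p,
                      struct scx_init_task_args *args)
  {
          return 0;
  }

  SEC(".struct_ops.link")
  struct sched_ext_ops myops = {
          /* The (void *) cast silences clang's
           * -Wincompatible-function-pointer-types warning when the
           * callback's signature doesn't exactly match the struct_ops
           * member type. */
          .init_task      = (void *)myops_init_task,
          .name           = "myops",
  };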

Signed-off-by: Vishal Chourasia <vishalc@linux.ibm.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2024-10-24 06:56:17 -10:00
Arnaldo Carvalho de Melo
08a7d25255 tools arch x86: Sync the msr-index.h copy with the kernel sources
To pick up the changes from these csets:

  dc1e67f70f ("KVM VMX: Move MSR_IA32_VMX_MISC bit defines to asm/vmx.h")
  d7bfc9ffd5 ("KVM: VMX: Move MSR_IA32_VMX_BASIC bit defines to asm/vmx.h")
  beb2e44604 ("x86/cpu: KVM: Move macro to encode PAT value to common header")
  e7e80b66fb ("x86/cpu: KVM: Add common defines for architectural memory types (PAT, MTRRs, etc.)")

Those cause no changes to tooling:

  $ tools/perf/trace/beauty/tracepoints/x86_msr.sh > before
  $ cp arch/x86/include/asm/msr-index.h tools/arch/x86/include/asm/msr-index.h
  $ tools/perf/trace/beauty/tracepoints/x86_msr.sh > after
  $ diff -u before after
  $

To see how this works take a look at this previous update:

  https://git.kernel.org/torvalds/c/174372668933ede5

  1743726689 ("tools arch x86: Sync the msr-index.h copy with the kernel sources to pick IA32_MKTME_KEYID_PARTITIONING")

Just silences this perf build warning:

  Warning: Kernel ABI header differences:
    diff -u tools/arch/x86/include/asm/msr-index.h arch/x86/include/asm/msr-index.h

Please see tools/include/uapi/README for further details.

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Sean Christopherson <seanjc@google.com>
Cc: Xin Li <xin3.li@intel.com>
Link: https://lore.kernel.org/lkml/ZxpLSBzGin3vjs3b@x1
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-10-24 10:27:59 -03:00
Arnaldo Carvalho de Melo
758f181589 perf python: Fix up the build on architectures without HAVE_KVM_STAT_SUPPORT
Noticed while building on a raspbian arm 32-bit system.

There was also this other case, fixed by adding a missing util/stat.h
include with the prototypes:

  /tmp/tmp.MbiSHoF3dj/perf-6.12.0-rc3/tools/perf/util/python.c:1396:6: error: no previous prototype for ‘perf_stat__set_no_csv_summary’ [-Werror=missing-prototypes]
   1396 | void perf_stat__set_no_csv_summary(int set __maybe_unused)
        |      ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  /tmp/tmp.MbiSHoF3dj/perf-6.12.0-rc3/tools/perf/util/python.c:1400:6: error: no previous prototype for ‘perf_stat__set_big_num’ [-Werror=missing-prototypes]
   1400 | void perf_stat__set_big_num(int set __maybe_unused)
        |      ^~~~~~~~~~~~~~~~~~~~~~
  cc1: all warnings being treated as errors

On other architectures this must build due to some lucky indirect
inclusion of that header.

Fixes: 9dabf40034 ("perf python: Switch module to linking libraries from building source")
Reviewed-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/lkml/ZxllAtpmEw5fg9oy@x1
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-10-23 19:29:50 -03:00
Veronika Molnarova
06a130e42a perf test: Handle perftool-testsuite_probe failure due to broken DWARF
Test case test_adding_blacklisted ends in failure if the blacklisted
probe is of an assembler function with no DWARF available. At the same
time, probing the blacklisted function with ASM DWARF doesn't test the
blacklist itself as the failure is a result of the broken DWARF.

When the broken DWARF output is encountered, check if the probed
function was compiled by the assembler. If so, the broken DWARF message
is expected and does not indicate a perf issue; otherwise, report a
failure.  If the ASM DWARF affected the probe, try the next probe on the
blacklist. If the first 5 probes are defective due to broken DWARF, skip
the test case.

Fixes: def5480d63 ("perf testsuite probe: Add test for blacklisted kprobes handling")
Signed-off-by: Veronika Molnarova <vmolnaro@redhat.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Michael Petlan <mpetlan@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Veronika Molnarova <vmolnaro@redhat.com>
Link: https://lore.kernel.org/r/20241017161555.236769-1-vmolnaro@redhat.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-10-23 17:23:09 -03:00
Ihor Solodrai
9b3c11a867 selftests/sched_ext: add order-only dependency of runner.o on BPFOBJ
The runner.o may start building before libbpf headers are installed,
and as a result build fails. This happened a couple of times on
libbpf/ci test jobs:
  * https://github.com/libbpf/ci/actions/runs/11447667257/job/31849533100
  * https://github.com/theihor/libbpf-ci/actions/runs/11445162764/job/31841649552

Headers are installed in a recipe for $(BPFOBJ) target, and adding an
order-only dependency should ensure this doesn't happen.

Signed-off-by: Ihor Solodrai <ihor.solodrai@pm.me>
Signed-off-by: Tejun Heo <tj@kernel.org>
2024-10-23 08:56:32 -10:00
Arnaldo Carvalho de Melo
d822ca29a4 tools headers UAPI: Sync kvm headers with the kernel sources
To pick the changes in:

  aa8d1f48d3 ("KVM: x86/mmu: Introduce a quirk to control memslot zap behavior")

That doesn't change functionality in tools/perf, as no new ioctl is added
for the 'perf trace' scripts to harvest.

This addresses these perf build warnings:

  Warning: Kernel ABI header differences:
    diff -u tools/arch/x86/include/uapi/asm/kvm.h arch/x86/include/uapi/asm/kvm.h

Please see tools/include/uapi/README for further details.

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Yan Zhao <yan.y.zhao@intel.com>
Link: https://lore.kernel.org/lkml/ZxgN0O02YrAJ2qIC@x1
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-10-23 11:34:56 -03:00
Jiri Slaby
5d35634ecc perf trace: Fix non-listed archs in the syscalltbl routines
This fixes a build breakage on 32-bit arm, where the
syscalltbl__id_at_idx() function was missing.

Committer notes:

Generating a proper syscall table from a copy of
arch/arm/tools/syscall.tbl ends up being too big a patch for this rc
stage. I started doing it, but while testing I noticed some other
problems with using BPF to collect pointer args on arm7 (32-bit), so I
will maybe continue trying to make it work on the next cycle...

Fixes: 7a2fb5619c ("perf trace: Fix iteration of syscall ids in syscalltbl->entries")
Suggested-by: Howard Chu <howardchu95@gmail.com>
Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/lkml/3a592835-a14f-40be-8961-c0cee7720a94@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-10-23 11:34:56 -03:00
Howard Chu
7fbff3c0e0 perf build: Change the clang check back to 12.0.1
This serves as a revert for this patch:

  https://lore.kernel.org/linux-perf-users/ZuGL9ROeTV2uXoSp@x1/

Signed-off-by: Howard Chu <howardchu95@gmail.com>
Tested-by: James Clark <james.clark@linaro.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alan Maguire <alan.maguire@oracle.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/20241011021403.4089793-2-howardchu95@gmail.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-10-23 11:34:56 -03:00
Howard Chu
395d38419f perf trace augmented_raw_syscalls: Add more checks to pass the verifier
Add some more checks to pass the verifier in more kernels.

Signed-off-by: Howard Chu <howardchu95@gmail.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alan Maguire <alan.maguire@oracle.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/20241011021403.4089793-3-howardchu95@gmail.com
[ Reduced the patch removing things that can be done later ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-10-23 11:34:56 -03:00
Arnaldo Carvalho de Melo
ecabac70ff perf trace augmented_raw_syscalls: Add extra array index bounds checking to satisfy some BPF verifiers
In a RHEL8 kernel (4.18.0-513.11.1.el8_9.x86_64) that, as enterprise
kernels go, has backports from modern kernels, the verifier complains
about the lack of a bounds check for the index into the array of syscall
arguments, on BPF bytecode generated by clang 17, with:

  ; } else if (size < 0 && size >= -6) { /* buffer */
  116: (b7) r1 = -6
  117: (2d) if r1 > r6 goto pc-30
   R0=map_value(id=0,off=0,ks=4,vs=24688,imm=0) R1_w=inv-6 R2=map_value(id=0,off=16,ks=4,vs=8272,imm=0) R3=inv(id=0) R5=inv40 R6=inv(id=0,umin_value=18446744073709551610,var_off=(0xffffffff00000000; 0xffffffff)) R7=map_value(id=0,off=56,ks=4,vs=8272,imm=0) R8=invP6 R9=map_value(id=0,off=20,ks=4,vs=24,imm=0) R10=fp0 fp-8=mmmmmmmm fp-16=map_value fp-24=map_value fp-32=inv40 fp-40=ctx fp-48=map_value fp-56=inv1 fp-64=map_value fp-72=map_value fp-80=map_value
  ; index = -(size + 1);
  118: (a7) r6 ^= -1
  119: (67) r6 <<= 32
  120: (77) r6 >>= 32
  ; aug_size = args->args[index];
  121: (67) r6 <<= 3
  122: (79) r1 = *(u64 *)(r10 -24)
  123: (0f) r1 += r6
  last_idx 123 first_idx 116
  regs=40 stack=0 before 122: (79) r1 = *(u64 *)(r10 -24)
  regs=40 stack=0 before 121: (67) r6 <<= 3
  regs=40 stack=0 before 120: (77) r6 >>= 32
  regs=40 stack=0 before 119: (67) r6 <<= 32
  regs=40 stack=0 before 118: (a7) r6 ^= -1
  regs=40 stack=0 before 117: (2d) if r1 > r6 goto pc-30
  regs=42 stack=0 before 116: (b7) r1 = -6
   R0_w=map_value(id=0,off=0,ks=4,vs=24688,imm=0) R1_w=inv1 R2_w=map_value(id=0,off=16,ks=4,vs=8272,imm=0) R3_w=inv(id=0) R5_w=inv40 R6_rw=invP(id=0,smin_value=-2147483648,smax_value=0) R7_w=map_value(id=0,off=56,ks=4,vs=8272,imm=0) R8_w=invP6 R9_w=map_value(id=0,off=20,ks=4,vs=24,imm=0) R10=fp0 fp-8=mmmmmmmm fp-16_w=map_value fp-24_r=map_value fp-32_w=inv40 fp-40=ctx fp-48=map_value fp-56_w=inv1 fp-64_w=map_value fp-72=map_value fp-80=map_value
  parent didn't have regs=40 stack=0 marks
  last_idx 110 first_idx 98
  regs=40 stack=0 before 110: (6d) if r1 s> r6 goto pc+5
  regs=42 stack=0 before 109: (b7) r1 = 1
  regs=40 stack=0 before 108: (65) if r6 s> 0x1000 goto pc+7
  regs=40 stack=0 before 98: (55) if r6 != 0x1 goto pc+9
   R0_w=map_value(id=0,off=0,ks=4,vs=24688,imm=0) R1_w=invP12 R2_w=map_value(id=0,off=16,ks=4,vs=8272,imm=0) R3_rw=inv(id=0) R5_w=inv24 R6_rw=invP(id=0,smin_value=-2147483648,smax_value=2147483647) R7_w=map_value(id=0,off=40,ks=4,vs=8272,imm=0) R8_rw=invP4 R9_w=map_value(id=0,off=12,ks=4,vs=24,imm=0) R10=fp0 fp-8=mmmmmmmm fp-16_rw=map_value fp-24_r=map_value fp-32_rw=invP24 fp-40_r=ctx fp-48_r=map_value fp-56_w=invP1 fp-64_rw=map_value fp-72_r=map_value fp-80_r=map_value
  parent already had regs=40 stack=0 marks
  124: (79) r6 = *(u64 *)(r1 +16)
   R0=map_value(id=0,off=0,ks=4,vs=24688,imm=0) R1_w=map_value(id=0,off=0,ks=4,vs=8272,umax_value=34359738360,var_off=(0x0; 0x7fffffff8),s32_max_value=2147483640,u32_max_value=-8) R2=map_value(id=0,off=16,ks=4,vs=8272,imm=0) R3=inv(id=0) R5=inv40 R6_w=invP(id=0,umax_value=34359738360,var_off=(0x0; 0x7fffffff8),s32_max_value=2147483640,u32_max_value=-8) R7=map_value(id=0,off=56,ks=4,vs=8272,imm=0) R8=invP6 R9=map_value(id=0,off=20,ks=4,vs=24,imm=0) R10=fp0 fp-8=mmmmmmmm fp-16=map_value fp-24=map_value fp-32=inv40 fp-40=ctx fp-48=map_value fp-56=inv1 fp-64=map_value fp-72=map_value fp-80=map_value
  R1 unbounded memory access, make sure to bounds check any such access
  processed 466 insns (limit 1000000) max_states_per_insn 2 total_states 20 peak_states 20 mark_read 3

If we add this line, as used in other BPF programs, to cap that index:

   index &= 7;

The generated BPF program is considered safe by that version of the BPF
verifier, allowing perf to collect the syscall args in one more kernel
using the BPF based pointer contents collector.

With the above one-liner it works with that kernel:

  [root@dell-per740-01 ~]# uname -a
  Linux dell-per740-01.khw.eng.rdu2.dc.redhat.com 4.18.0-513.11.1.el8_9.x86_64 #1 SMP Thu Dec 7 03:06:13 EST 2023 x86_64 x86_64 x86_64 GNU/Linux
  [root@dell-per740-01 ~]# ~acme/bin/perf trace -e *sleep* sleep 1.234567890
       0.000 (1234.704 ms): sleep/3863610 nanosleep(rqtp: { .tv_sec: 1, .tv_nsec: 234567890 })                  = 0
  [root@dell-per740-01 ~]#

As well as with the one in Fedora 40:

  root@number:~# uname -a
  Linux number 6.11.3-200.fc40.x86_64 #1 SMP PREEMPT_DYNAMIC Thu Oct 10 22:31:19 UTC 2024 x86_64 GNU/Linux
  root@number:~# perf trace -e *sleep* sleep 1.234567890
       0.000 (1234.722 ms): sleep/14873 clock_nanosleep(rqtp: { .tv_sec: 1, .tv_nsec: 234567890 }, rmtp: 0x7ffe87311a40) = 0
  root@number:~#

Song Liu reported that this one-liner was being optimized out by clang
18, so I suggested, and he tested, that adding a compiler barrier before
it makes clang v18 keep it; the verifier in the kernel in Song's case
(Meta's 5.12 based kernel) was also happy with the resulting bytecode.
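
Putting the one-liner and the barrier together, the pattern looks
roughly like this (a sketch: the struct is a stand-in sized to match
the mask, not perf's actual layout, and barrier_var() is the usual
bpf_helpers.h macro):

  #include <bpf/bpf_helpers.h>

  struct args_sketch {
          unsigned long args[8];          /* sized to match the mask */
  };

  static __always_inline unsigned long read_arg(struct args_sketch *a,
                                                int size)
  {
          unsigned int index = -(size + 1);

          barrier_var(index);     /* keep clang 18 from eliding the mask */
          index &= 7;             /* verifier now sees 0..7, inside args[] */
          return a->args[index];
  }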

I'll investigate using virtme-ng[1] to have all the perf BPF based
functionality thoroughly tested over multiple kernels and clang
versions.

[1] https://kernel-recipes.org/en/2024/virtme-ng/

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alan Maguire <alan.maguire@oracle.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andrea Righi <andrea.righi@linux.dev>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Song Liu <songliubraving@fb.com>
Link: https://lore.kernel.org/lkml/Zw7JgJc0LOwSpuvx@x1
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-10-23 11:34:56 -03:00
Pei Xiao
2b059d0d1e slub/kunit: fix a WARNING due to unwrapped __kmalloc_cache_noprof
'modprobe slub_kunit' triggers a warning, as shown below. The root cause
is that __kmalloc_cache_noprof was used directly, which resulted in no
alloc_tag being allocated. This caused current->alloc_tag to be null,
leading to a warning in alloc_tag_add_check.

Let's add an alloc_hook layer to __kmalloc_cache_noprof specifically
within lib/slub_kunit.c, which is the only user of this internal slub
function outside the kmalloc implementation itself.
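
A sketch of such a wrapper (illustrative macro name; alloc_hooks() is
the accounting wrapper from <linux/alloc_tag.h>):

  #include <linux/alloc_tag.h>
  #include <linux/slab.h>

  /* Route the raw internal call through alloc_hooks() so an alloc_tag
   * is set up and alloc_tag_add_check() has nothing to warn about. */
  #define test_kmalloc_cache(s, flags, size) \
          alloc_hooks(__kmalloc_cache_noprof(s, flags, size))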

[58162.947016] WARNING: CPU: 2 PID: 6210 at
./include/linux/alloc_tag.h:125 alloc_tagging_slab_alloc_hook+0x268/0x27c
[58162.957721] Call trace:
[58162.957919]  alloc_tagging_slab_alloc_hook+0x268/0x27c
[58162.958286]  __kmalloc_cache_noprof+0x14c/0x344
[58162.958615]  test_kmalloc_redzone_access+0x50/0x10c [slub_kunit]
[58162.959045]  kunit_try_run_case+0x74/0x184 [kunit]
[58162.959401]  kunit_generic_run_threadfn_adapter+0x2c/0x4c [kunit]
[58162.959841]  kthread+0x10c/0x118
[58162.960093]  ret_from_fork+0x10/0x20
[58162.960363] ---[ end trace 0000000000000000 ]---

Signed-off-by: Pei Xiao <xiaopei01@kylinos.cn>
Fixes: a0a44d9175 ("mm, slab: don't wrap internal functions with alloc_hooks()")
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
2024-10-23 09:50:58 +02:00
Arnaldo Carvalho de Melo
39c6a35620 perf trace: The return from 'write' isn't a pid
When adding an explicit beautifier for the 'write' syscall, back when
the BPF based buffer collector was introduced, a cut'n'paste error
carried over the syscall_fmt->errpid setting from a nearby syscall
(waitid) that does return a pid.

So the write return was being suppressed by the return pretty printer.
Remove that field, reverting it back to the default return handler, which
prints positive numbers as-is and interprets negative values as errnos.

I actually introduced the problem while making Howard's original patch
work just with the 'write' syscall: we couldn't just look for any
buffers, as the ones that are filled in by the kernel couldn't use the
same sys_enter BPF collector.
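
In perf trace's syscall_fmt table the fix boils down to dropping the
copied field, roughly (illustrative, trimmed entries):

  static const struct syscall_fmt syscall_fmts[] = {
          { .name = "waitid", .errpid = true, },  /* returns a pid: keep */
          { .name = "write", },   /* returns a byte count or -errno:
                                   * default handler, no .errpid */
  };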

Fixes: b257fac12f ("perf trace: Pretty print buffer data")
Reported-by: James Clark <james.clark@linaro.org>
Link: https://lore.kernel.org/lkml/bcf50648-3c7e-4513-8717-0d14492c53b9@linaro.org
Link: https://lore.kernel.org/all/Zt8jTfzDYgBPvFCd@x1/#t
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alan Maguire <alan.maguire@oracle.com>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-10-17 10:34:43 -03:00
Arnaldo Carvalho de Melo
ab8aaab874 tools headers UAPI: Sync linux/const.h with the kernel headers
To pick up the changes in:

  947697c6f0 ("uapi: Define GENMASK_U128")

That causes no changes in tooling, just addresses this perf build
warning:

  Warning: Kernel ABI header differences:
    diff -u tools/include/uapi/linux/const.h include/uapi/linux/const.h

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Yury Norov <yury.norov@gmail.com>
Link: https://lore.kernel.org/lkml/ZwltGNJwujKu1Fgn@x1
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-10-17 10:34:43 -03:00
Xiu Jianfeng
3cc4e13bb1 cgroup: Fix potential overflow issue when checking max_depth
cgroup.max.depth is the maximum allowed descent depth below the current
cgroup. If the actual descent depth is equal or larger, an attempt to
create a new child cgroup will fail. However, because cgroup->max_depth
is of int type and has the default value INT_MAX, the condition
'level > cgroup->max_depth' will never be satisfied, and the level will
overflow after it reaches INT_MAX.

Fix it by starting the level from 0 and using '>=' instead.
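
A sketch of the pattern (not the exact kernel function):

  static bool exceeds_max_depth(struct cgroup *cgrp)
  {
          int level = 0;          /* was 1 before the fix */

          for (; cgrp; cgrp = cgroup_parent(cgrp)) {
                  /* The old check, 'level > cgrp->max_depth', never
                   * fires for max_depth == INT_MAX and lets level
                   * overflow; starting at 0 and using '>=' avoids both. */
                  if (level >= cgrp->max_depth)
                          return true;
                  level++;
          }
          return false;
  }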

It's worth mentioning that this issue is unlikely to occur in reality,
as it's impossible to have a hierarchy of depth INT_MAX, but it should
still be avoided logically.

Fixes: 1a926e0bba ("cgroup: implement hierarchy limits")
Signed-off-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Reviewed-by: Michal Koutný <mkoutny@suse.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2024-10-14 13:39:25 -10:00
Chen Ridong
117932eea9 cgroup/bpf: use a dedicated workqueue for cgroup bpf destruction
A hung_task problem shown below was found:

INFO: task kworker/0:0:8 blocked for more than 327 seconds.
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
Workqueue: events cgroup_bpf_release
Call Trace:
 <TASK>
 __schedule+0x5a2/0x2050
 ? find_held_lock+0x33/0x100
 ? wq_worker_sleeping+0x9e/0xe0
 schedule+0x9f/0x180
 schedule_preempt_disabled+0x25/0x50
 __mutex_lock+0x512/0x740
 ? cgroup_bpf_release+0x1e/0x4d0
 ? cgroup_bpf_release+0xcf/0x4d0
 ? process_scheduled_works+0x161/0x8a0
 ? cgroup_bpf_release+0x1e/0x4d0
 ? mutex_lock_nested+0x2b/0x40
 ? __pfx_delay_tsc+0x10/0x10
 mutex_lock_nested+0x2b/0x40
 cgroup_bpf_release+0xcf/0x4d0
 ? process_scheduled_works+0x161/0x8a0
 ? trace_event_raw_event_workqueue_execute_start+0x64/0xd0
 ? process_scheduled_works+0x161/0x8a0
 process_scheduled_works+0x23a/0x8a0
 worker_thread+0x231/0x5b0
 ? __pfx_worker_thread+0x10/0x10
 kthread+0x14d/0x1c0
 ? __pfx_kthread+0x10/0x10
 ret_from_fork+0x59/0x70
 ? __pfx_kthread+0x10/0x10
 ret_from_fork_asm+0x1b/0x30
 </TASK>

This issue can be reproduced by the following pressure test:
1. A large number of cpuset cgroups are deleted.
2. Set cpus on and off repeatedly.
3. Set watchdog_thresh repeatedly.
The scripts can be obtained at LINK mentioned above the signature.

The reason for this issue is that cgroup_mutex and cpu_hotplug_lock are
acquired in different tasks, which may lead to a deadlock through the
following steps:
1. A large number of cpusets are deleted asynchronously, which puts a
   large number of cgroup_bpf_release works into system_wq. The max_active
   of system_wq is WQ_DFL_ACTIVE(256). Consequently, all active works are
   cgroup_bpf_release works, and many cgroup_bpf_release works will be put
   into inactive queue. As illustrated in the diagram, there are 256 (in
   the active queue) + n (in the inactive queue) works.
2. Setting watchdog_thresh will hold cpu_hotplug_lock.read and put
   smp_call_on_cpu work into system_wq. However step 1 has already filled
   system_wq, 'sscs.work' is put into inactive queue. 'sscs.work' has
   to wait until the works that were put into the inactive queue earlier
   have executed (n cgroup_bpf_release), so it will be blocked for a while.
3. Cpu offline requires cpu_hotplug_lock.write, which is blocked by step 2.
4. Cpusets that were deleted at step 1 put cgroup_release works into
   cgroup_destroy_wq. They are competing to get cgroup_mutex all the time.
   When cgroup_mutex is acquired by the work at css_killed_work_fn, it will
   call cpuset_css_offline, which needs to acquire cpu_hotplug_lock.read.
   However, cpuset_css_offline will be blocked for step 3.
5. At this moment, there are 256 works in active queue that are
   cgroup_bpf_release, they are attempting to acquire cgroup_mutex, and as
   a result, all of them are blocked. Consequently, sscs.work can not be
   executed. Ultimately, this situation leads to four processes being
   blocked, forming a deadlock.

system_wq(step1)		WatchDog(step2)			cpu offline(step3)	cgroup_destroy_wq(step4)
...
2000+ cgroups deleted asyn
256 actives + n inactives
				__lockup_detector_reconfigure
				P(cpu_hotplug_lock.read)
				put sscs.work into system_wq
256 + n + 1(sscs.work)
sscs.work wait to be executed
				waiting for sscs.work to finish
								percpu_down_write
								P(cpu_hotplug_lock.write)
								...blocking...
											css_killed_work_fn
											P(cgroup_mutex)
											cpuset_css_offline
											P(cpu_hotplug_lock.read)
											...blocking...
256 cgroup_bpf_release
mutex_lock(&cgroup_mutex);
..blocking...

To fix the problem, place cgroup_bpf_release works on a dedicated
workqueue which can break the loop and solve the problem. System wqs are
for misc things which shouldn't create a large number of concurrent work
items. If something is going to generate >WQ_DFL_ACTIVE(256) concurrent
work items, it should use its own dedicated workqueue.
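
A sketch of that change (names follow the commit description; a
max_active of 1 is shown as one reasonable choice):

  static struct workqueue_struct *cgroup_bpf_destroy_wq;

  static int __init cgroup_bpf_wq_init(void)
  {
          cgroup_bpf_destroy_wq = alloc_workqueue("cgroup_bpf_destroy",
                                                  0, 1);
          if (!cgroup_bpf_destroy_wq)
                  panic("Failed to alloc workqueue for cgroup bpf destroy.\n");
          return 0;
  }
  core_initcall(cgroup_bpf_wq_init);

  /* ...and in the release path, queue the work there instead of on
   * system_wq:
   *
   *      queue_work(cgroup_bpf_destroy_wq, &cgrp->bpf.release_work);
   */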

Fixes: 4bfc0bb2c6 ("bpf: decouple the lifetime of cgroup_bpf from cgroup itself")
Cc: stable@vger.kernel.org # v5.3+
Link: https://lore.kernel.org/cgroups/e90c32d2-2a85-4f28-9154-09c7d320cb60@huawei.com/T/#t
Tested-by: Vishal Chourasia <vishalc@linux.ibm.com>
Signed-off-by: Chen Ridong <chenridong@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2024-10-08 08:43:22 -10:00
Tyler
504eaa2570
Update README 2024-08-25 19:11:16 +01:00
76 changed files with 776 additions and 439 deletions


@ -23,177 +23,166 @@ applications can additionally seal security critical data at runtime.
A similar feature already exists in the XNU kernel with the
VM_FLAGS_PERMANENT flag [1] and on OpenBSD with the mimmutable syscall [2].
User API
========
mseal()
-----------
The mseal() syscall has the following signature:
SYSCALL
=======
mseal syscall signature
-----------------------
``int mseal(void *addr, size_t len, unsigned long flags)``
**addr**/**len**: virtual memory address range.
The address range set by **addr**/**len** must meet:
- The start address must be in an allocated VMA.
- The start address must be page aligned.
- The end address (**addr** + **len**) must be in an allocated VMA.
- no gap (unallocated memory) between start and end address.
**addr/len**: virtual memory address range.
The ``len`` will be page aligned implicitly by the kernel.
The address range set by ``addr``/``len`` must meet:
- The start address must be in an allocated VMA.
- The start address must be page aligned.
- The end address (``addr`` + ``len``) must be in an allocated VMA.
- no gap (unallocated memory) between start and end address.
**flags**: reserved for future use.
The ``len`` will be page aligned implicitly by the kernel.
**Return values**:
- **0**: Success.
- **-EINVAL**:
* Invalid input ``flags``.
* The start address (``addr``) is not page aligned.
* Address range (``addr`` + ``len``) overflow.
- **-ENOMEM**:
* The start address (``addr``) is not allocated.
* The end address (``addr`` + ``len``) is not allocated.
* A gap (unallocated memory) between start and end address.
- **-EPERM**:
* sealing is supported only on 64-bit CPUs, 32-bit is not supported.
**flags**: reserved for future use.
**Note about error return**:
- For above error cases, users can expect the given memory range is
unmodified, i.e. no partial update.
- There might be other internal errors/cases not listed here, e.g.
error during merging/splitting VMAs, or the process reaching the maximum
number of supported VMAs. In those cases, partial updates to the given
memory range could happen. However, those cases should be rare.
**return values**:
**Architecture support**:
mseal only works on 64-bit CPUs, not 32-bit CPUs.
- ``0``: Success.
**Idempotent**:
users can call mseal multiple times; mseal on already sealed memory
is a no-op (not an error).
- ``-EINVAL``:
- Invalid input ``flags``.
- The start address (``addr``) is not page aligned.
- Address range (``addr`` + ``len``) overflow.
**no munseal**
Once a mapping is sealed, it can't be unsealed. The kernel should never
have munseal; this is consistent with other sealing features, e.g.
F_SEAL_SEAL for files.
- ``-ENOMEM``:
- The start address (``addr``) is not allocated.
- The end address (``addr`` + ``len``) is not allocated.
- A gap (unallocated memory) between start and end address.
Blocked mm syscall for sealed mapping
-------------------------------------
It might be important to note: **once the mapping is sealed, it will
stay in the process's memory until the process terminates**.
- ``-EPERM``:
- sealing is supported only on 64-bit CPUs, 32-bit is not supported.
Example::
- For above error cases, users can expect the given memory range is
unmodified, i.e. no partial update.
ptr = mmap(0, 4096, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
rc = mseal(ptr, 4096, 0);
/* munmap will fail */
rc = munmap(ptr, 4096);
assert(rc < 0);
- There might be other internal errors/cases not listed here, e.g.
error during merging/splitting VMAs, or the process reaching the max
number of supported VMAs. In those cases, partial updates to the given
memory range could happen. However, those cases should be rare.
Blocked mm syscall:
- munmap
- mmap
- mremap
- mprotect and pkey_mprotect
- some destructive madvise behaviors: MADV_DONTNEED, MADV_FREE,
MADV_DONTNEED_LOCKED, MADV_DONTFORK, MADV_WIPEONFORK
**Blocked operations after sealing**:
Unmapping, moving to another location, and shrinking the size,
via munmap() and mremap(), can leave an empty space, therefore
can be replaced with a VMA with a new set of attributes.
The first set of syscalls to block is munmap, mremap, mmap. They can
either leave an empty space in the address space, therefore allowing
replacement with a new mapping with a new set of attributes, or can
overwrite the existing mapping with another mapping.
Moving or expanding a different VMA into the current location,
via mremap().
mprotect and pkey_mprotect are blocked because they change the
protection bits (RWX) of the mapping.
Modifying a VMA via mmap(MAP_FIXED).
Certain destructive madvise behaviors, specifically MADV_DONTNEED,
MADV_FREE, MADV_DONTNEED_LOCKED, and MADV_WIPEONFORK, can introduce
risks when applied to anonymous memory by threads lacking write
permissions. Consequently, these operations are prohibited under such
conditions. The aforementioned behaviors have the potential to modify
region contents by discarding pages, effectively performing a memset(0)
operation on the anonymous memory.
Size expansion, via mremap(), does not appear to pose any
specific risks to sealed VMAs. It is included anyway because
the use case is unclear. In any case, users can rely on
merging to expand a sealed VMA.
Kernel will return -EPERM for blocked syscalls.
mprotect() and pkey_mprotect().
When a blocked syscall returns -EPERM due to sealing, the memory regions may
or may not be changed, depending on the syscall being blocked:
Some destructive madvise() behaviors (e.g. MADV_DONTNEED)
for anonymous memory, when users don't have write permission to the
memory. Those behaviors can alter region contents by discarding pages,
effectively a memset(0) for anonymous memory.
- munmap: munmap is atomic. If one of the VMAs in the given range is
sealed, none of the VMAs are updated.
- mprotect, pkey_mprotect, madvise: partial update might happen, e.g.
when mprotect spans multiple VMAs, it might update the beginning
VMAs before reaching the sealed VMA and return -EPERM.
- mmap and mremap: undefined behavior.
Kernel will return -EPERM for blocked operations.
For blocked operations, one can expect the given address is unmodified,
i.e. no partial update. Note, this is different from existing mm
system call behaviors, where partial updates are made till an error is
found and returned to userspace. To give an example:
Assume following code sequence:
- ptr = mmap(null, 8192, PROT_NONE);
- munmap(ptr + 4096, 4096);
- ret1 = mprotect(ptr, 8192, PROT_READ);
- mseal(ptr, 4096);
- ret2 = mprotect(ptr, 8192, PROT_NONE);
ret1 will be -ENOMEM; the page at ptr is updated to PROT_READ.
ret2 will be -EPERM; the page remains PROT_READ.
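
The same sequence as a compilable userspace sketch (assumes kernel
headers new enough to define SYS_mseal; mseal() has no glibc wrapper
yet):

  #include <assert.h>
  #include <stdio.h>
  #include <sys/mman.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  int main(void)
  {
          long page = sysconf(_SC_PAGESIZE);
          char *ptr = mmap(NULL, 2 * page, PROT_NONE,
                           MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
          int ret1, ret2;

          assert(ptr != MAP_FAILED);
          munmap(ptr + page, page);
          ret1 = mprotect(ptr, 2 * page, PROT_READ);
          /* fails, errno == ENOMEM; the first page is now PROT_READ */
          syscall(SYS_mseal, ptr, page, 0);
          ret2 = mprotect(ptr, 2 * page, PROT_NONE);
          /* fails, errno == EPERM; the first page stays PROT_READ */
          printf("ret1=%d ret2=%d\n", ret1, ret2);
          return 0;
  }
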
**Note**:
- mseal() only works on 64-bit CPUs, not 32-bit CPU.
- users can call mseal() multiple times; mseal() on already sealed memory
is a no-op (not an error).
- munseal() is not supported.
Use cases:
==========
Use cases
=========
- glibc:
The dynamic linker, during loading ELF executables, can apply sealing to
non-writable memory segments.
mapping segments.
- Chrome browser: protect some security sensitive data-structures.
- Chrome browser: protect some security sensitive data structures.
Notes on which memory to seal:
==============================
It might be important to note that sealing changes the lifetime of a mapping,
i.e. the sealed mapping won't be unmapped until the process terminates or the
exec system call is invoked. Applications can apply sealing to any virtual
memory region from userspace, but it is crucial to thoroughly analyze the
mapping's lifetime prior to applying the sealing.
When not to use mseal
=====================
Applications can apply sealing to any virtual memory region from userspace,
but it is *crucial to thoroughly analyze the mapping's lifetime* prior to
applying the sealing. This is because the sealed mapping *won't be unmapped*
until the process terminates or the exec system call is invoked.
For example:
- aio/shm
aio/shm can call mmap and munmap on behalf of userspace, e.g.
ksys_shmdt() in shm.c. The lifetimes of those mappings are not tied to
the lifetime of the process. If those mappings are sealed from userspace,
then munmap will fail, causing leaks in the VMA address space during the
lifetime of the process.
- aio/shm
- ptr allocated by malloc (heap)
Don't use mseal on the memory ptr return from malloc().
malloc() is implemented by an allocator, e.g. by glibc. The heap manager
might allocate a ptr from brk or from a mapping created by mmap.
If an app calls mseal on a ptr returned from malloc(), this can affect
the heap manager's ability to manage the mappings; the outcome is
non-deterministic.
aio/shm can call mmap()/munmap() on behalf of userspace, e.g. ksys_shmdt() in
shm.c. The lifetimes of those mappings are not tied to the lifetime of the
process. If those mappings are sealed from userspace, then munmap() will fail,
causing leaks in the VMA address space during the lifetime of the process.
Example::
- Brk (heap)
ptr = malloc(size);
/* don't call mseal on ptr return from malloc. */
mseal(ptr, size);
/* free will succeed, but the allocator can't shrink the heap below ptr */
free(ptr);
Currently, userspace applications can seal parts of the heap by calling
malloc() and mseal().
Let's assume the following calls from user space:
mseal doesn't block
===================
In a nutshell, mseal blocks certain mm syscalls from modifying some of a
VMA's attributes, such as the protection bits (RWX). A sealed mapping
doesn't mean the memory is immutable.
- ptr = malloc(size);
- mprotect(ptr, size, RO);
- mseal(ptr, size);
- free(ptr);
Technically, before mseal() is added, the user can change the protection of
the heap by calling mprotect(RO). As long as the user changes the protection
back to RW before free(), the memory range can be reused.
With mseal() added to the picture, however, the heap is then partially
sealed; the user can still free it, but the memory remains RO. If the
address is re-used by the heap manager for another malloc, the process
might crash soon after. Therefore, it is important not to apply sealing
to any memory that might get recycled.
Furthermore, even if the application never calls free() on the ptr,
the heap manager may invoke the brk system call to shrink the size of the
heap. In the kernel, the brk-shrink will call munmap(). Consequently,
depending on the location of the ptr, the outcome of brk-shrink is
nondeterministic.
Additional notes:
=================
As Jann Horn pointed out in [3], there are still a few ways to write
to RO memory, which is, in a way, by design. Those cases are not covered
by mseal(). If applications want to block such cases, sandbox tools (such as
seccomp, LSM, etc) might be considered.
to RO memory, which is, in a way, by design. And those could be blocked
by different security measures.
Those cases are:
- Write to read-only memory through /proc/self/mem interface.
- Write to read-only memory through ptrace (such as PTRACE_POKETEXT).
- userfaultfd.
- Write to read-only memory through /proc/self/mem interface (FOLL_FORCE).
- Write to read-only memory through ptrace (such as PTRACE_POKETEXT).
- userfaultfd.
The idea that inspired this patch comes from Stephen Röttgers work in V8
CFI [4]. Chrome browser in ChromeOS will be the first user of this API.
Reference:
==========
[1] https://github.com/apple-oss-distributions/xnu/blob/1031c584a5e37aff177559b9f69dbd3c8c3fd30a/osfmk/mach/vm_statistics.h#L274
[2] https://man.openbsd.org/mimmutable.2
[3] https://lore.kernel.org/lkml/CAG48ez3ShUYey+ZAFsU2i1RpQn0a5eOs2hzQ426FkcgnfUGLvA@mail.gmail.com
[4] https://docs.google.com/document/d/1O2jwK4dxI3nRcOJuPYkonhTkNQfbmwdvxQMyXgeaRHo/edit#heading=h.bvaojj9fu6hc
Reference
=========
- [1] https://github.com/apple-oss-distributions/xnu/blob/1031c584a5e37aff177559b9f69dbd3c8c3fd30a/osfmk/mach/vm_statistics.h#L274
- [2] https://man.openbsd.org/mimmutable.2
- [3] https://lore.kernel.org/lkml/CAG48ez3ShUYey+ZAFsU2i1RpQn0a5eOs2hzQ426FkcgnfUGLvA@mail.gmail.com
- [4] https://docs.google.com/document/d/1O2jwK4dxI3nRcOJuPYkonhTkNQfbmwdvxQMyXgeaRHo/edit#heading=h.bvaojj9fu6hc

README

@ -11,8 +11,10 @@ In order to build the documentation, use ``make htmldocs`` or
https://www.kernel.org/doc/html/latest/
There are various text files in the Documentation/ subdirectory,
several of them using the reStructuredText markup notation.
several of them using reStructuredText markup notation.
Please read the Documentation/process/changes.rst file, as it contains the
requirements for building and running the kernel, and information about
the problems which may result by upgrading your kernel.


@ -261,12 +261,6 @@ static noinstr bool handle_bug(struct pt_regs *regs)
int ud_type;
u32 imm;
/*
* Normally @regs are unpoisoned by irqentry_enter(), but handle_bug()
* is a rare case that uses @regs without passing them to
* irqentry_enter().
*/
kmsan_unpoison_entry_regs(regs);
ud_type = decode_bug(regs->ip, &imm);
if (ud_type == BUG_NONE)
return handled;
@ -275,6 +269,12 @@ static noinstr bool handle_bug(struct pt_regs *regs)
* All lies, just get the WARN/BUG out.
*/
instrumentation_begin();
/*
* Normally @regs are unpoisoned by irqentry_enter(), but handle_bug()
* is a rare case that uses @regs without passing them to
* irqentry_enter().
*/
kmsan_unpoison_entry_regs(regs);
/*
* Since we're emulating a CALL with exceptions, restore the interrupt
* state to what it was at the exception site.


@ -674,6 +674,16 @@ EXPORT_SYMBOL_GPL(tpm_chip_register);
*/
void tpm_chip_unregister(struct tpm_chip *chip)
{
#ifdef CONFIG_TCG_TPM2_HMAC
int rc;
rc = tpm_try_get_ops(chip);
if (!rc) {
tpm2_end_auth_session(chip);
tpm_put_ops(chip);
}
#endif
tpm_del_legacy_sysfs(chip);
if (tpm_is_hwrng_enabled(chip))
hwrng_unregister(&chip->hwrng);


@ -27,6 +27,9 @@ static ssize_t tpm_dev_transmit(struct tpm_chip *chip, struct tpm_space *space,
struct tpm_header *header = (void *)buf;
ssize_t ret, len;
if (chip->flags & TPM_CHIP_FLAG_TPM2)
tpm2_end_auth_session(chip);
ret = tpm2_prepare_space(chip, space, buf, bufsiz);
/* If the command is not implemented by the TPM, synthesize a
* response with a TPM2_RC_COMMAND_CODE return for user-space.


@ -379,10 +379,12 @@ int tpm_pm_suspend(struct device *dev)
rc = tpm_try_get_ops(chip);
if (!rc) {
if (chip->flags & TPM_CHIP_FLAG_TPM2)
if (chip->flags & TPM_CHIP_FLAG_TPM2) {
tpm2_end_auth_session(chip);
tpm2_shutdown(chip, TPM2_SU_STATE);
else
} else {
rc = tpm1_pm_suspend(chip, tpm_suspend_pcr);
}
tpm_put_ops(chip);
}


@ -333,6 +333,9 @@ void tpm_buf_append_hmac_session(struct tpm_chip *chip, struct tpm_buf *buf,
}
#ifdef CONFIG_TCG_TPM2_HMAC
/* The first write to /dev/tpm{rm0} will flush the session. */
attributes |= TPM2_SA_CONTINUE_SESSION;
/*
* The Architecture Guide requires us to strip trailing zeros
* before computing the HMAC
@ -484,7 +487,8 @@ static void tpm2_KDFe(u8 z[EC_PT_SZ], const char *str, u8 *pt_u, u8 *pt_v,
sha256_final(&sctx, out);
}
static void tpm_buf_append_salt(struct tpm_buf *buf, struct tpm_chip *chip)
static void tpm_buf_append_salt(struct tpm_buf *buf, struct tpm_chip *chip,
struct tpm2_auth *auth)
{
struct crypto_kpp *kpp;
struct kpp_request *req;
@ -543,7 +547,7 @@ static void tpm_buf_append_salt(struct tpm_buf *buf, struct tpm_chip *chip)
sg_set_buf(&s[0], chip->null_ec_key_x, EC_PT_SZ);
sg_set_buf(&s[1], chip->null_ec_key_y, EC_PT_SZ);
kpp_request_set_input(req, s, EC_PT_SZ*2);
sg_init_one(d, chip->auth->salt, EC_PT_SZ);
sg_init_one(d, auth->salt, EC_PT_SZ);
kpp_request_set_output(req, d, EC_PT_SZ);
crypto_kpp_compute_shared_secret(req);
kpp_request_free(req);
@ -554,8 +558,7 @@ static void tpm_buf_append_salt(struct tpm_buf *buf, struct tpm_chip *chip)
* This works because KDFe fully consumes the secret before it
* writes the salt
*/
tpm2_KDFe(chip->auth->salt, "SECRET", x, chip->null_ec_key_x,
chip->auth->salt);
tpm2_KDFe(auth->salt, "SECRET", x, chip->null_ec_key_x, auth->salt);
out:
crypto_free_kpp(kpp);
@ -853,7 +856,9 @@ int tpm_buf_check_hmac_response(struct tpm_chip *chip, struct tpm_buf *buf,
if (rc)
/* manually close the session if it wasn't consumed */
tpm2_flush_context(chip, auth->handle);
memzero_explicit(auth, sizeof(*auth));
kfree_sensitive(auth);
chip->auth = NULL;
} else {
/* reset for next use */
auth->session = TPM_HEADER_SIZE;
@ -881,7 +886,8 @@ void tpm2_end_auth_session(struct tpm_chip *chip)
return;
tpm2_flush_context(chip, auth->handle);
memzero_explicit(auth, sizeof(*auth));
kfree_sensitive(auth);
chip->auth = NULL;
}
EXPORT_SYMBOL(tpm2_end_auth_session);
@ -915,33 +921,37 @@ static int tpm2_parse_start_auth_session(struct tpm2_auth *auth,
static int tpm2_load_null(struct tpm_chip *chip, u32 *null_key)
{
int rc;
unsigned int offset = 0; /* dummy offset for null seed context */
u8 name[SHA256_DIGEST_SIZE + 2];
u32 tmp_null_key;
int rc;
rc = tpm2_load_context(chip, chip->null_key_context, &offset,
null_key);
if (rc != -EINVAL)
return rc;
&tmp_null_key);
if (rc != -EINVAL) {
if (!rc)
*null_key = tmp_null_key;
goto err;
}
/* an integrity failure may mean the TPM has been reset */
dev_err(&chip->dev, "NULL key integrity failure!\n");
/* check the null name against what we know */
tpm2_create_primary(chip, TPM2_RH_NULL, NULL, name);
if (memcmp(name, chip->null_key_name, sizeof(name)) == 0)
/* name unchanged, assume transient integrity failure */
return rc;
/*
* Fatal TPM failure: the NULL seed has actually changed, so
* the TPM must have been illegally reset. All in-kernel TPM
* operations will fail because the NULL primary can't be
* loaded to salt the sessions, but disable the TPM anyway so
* userspace programmes can't be compromised by it.
*/
dev_err(&chip->dev, "NULL name has changed, disabling TPM due to interference\n");
/* Try to re-create null key, given the integrity failure: */
rc = tpm2_create_primary(chip, TPM2_RH_NULL, &tmp_null_key, name);
if (rc)
goto err;
/* Return null key if the name has not been changed: */
if (!memcmp(name, chip->null_key_name, sizeof(name))) {
*null_key = tmp_null_key;
return 0;
}
/* Deduce from the name change TPM interference: */
dev_err(&chip->dev, "null key integrity check failed\n");
tpm2_flush_context(chip, tmp_null_key);
chip->flags |= TPM_CHIP_FLAG_DISABLE;
return rc;
err:
return rc ? -ENODEV : 0;
}
/**
@ -958,16 +968,20 @@ static int tpm2_load_null(struct tpm_chip *chip, u32 *null_key)
*/
int tpm2_start_auth_session(struct tpm_chip *chip)
{
struct tpm2_auth *auth;
struct tpm_buf buf;
struct tpm2_auth *auth = chip->auth;
int rc;
u32 null_key;
int rc;
if (!auth) {
dev_warn_once(&chip->dev, "auth session is not active\n");
if (chip->auth) {
dev_warn_once(&chip->dev, "auth session is active\n");
return 0;
}
auth = kzalloc(sizeof(*auth), GFP_KERNEL);
if (!auth)
return -ENOMEM;
rc = tpm2_load_null(chip, &null_key);
if (rc)
goto out;
@ -988,7 +1002,7 @@ int tpm2_start_auth_session(struct tpm_chip *chip)
tpm_buf_append(&buf, auth->our_nonce, sizeof(auth->our_nonce));
/* append encrypted salt and squirrel away unencrypted in auth */
tpm_buf_append_salt(&buf, chip);
tpm_buf_append_salt(&buf, chip, auth);
/* session type (HMAC, audit or policy) */
tpm_buf_append_u8(&buf, TPM2_SE_HMAC);
@ -1010,10 +1024,13 @@ int tpm2_start_auth_session(struct tpm_chip *chip)
tpm_buf_destroy(&buf);
if (rc)
goto out;
if (rc == TPM2_RC_SUCCESS) {
chip->auth = auth;
return 0;
}
out:
out:
kfree_sensitive(auth);
return rc;
}
EXPORT_SYMBOL(tpm2_start_auth_session);
@ -1347,18 +1364,21 @@ static int tpm2_create_null_primary(struct tpm_chip *chip)
*
* Derive and context save the null primary and allocate memory in the
* struct tpm_chip for the authorizations.
*
* Return:
* * 0 - OK
* * -errno - A system error
* * TPM_RC - A TPM error
*/
int tpm2_sessions_init(struct tpm_chip *chip)
{
int rc;
rc = tpm2_create_null_primary(chip);
if (rc)
dev_err(&chip->dev, "TPM: security failed (NULL seed derivation): %d\n", rc);
chip->auth = kmalloc(sizeof(*chip->auth), GFP_KERNEL);
if (!chip->auth)
return -ENOMEM;
if (rc) {
dev_err(&chip->dev, "null key creation failed with %d\n", rc);
return rc;
}
return rc;
}

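A recurring change in the tpm2-sessions.c hunks above is the switch from memzero_explicit(auth, sizeof(*auth)) on a long-lived buffer to kfree_sensitive(auth) on a heap allocation, with chip->auth cleared afterwards: the session is now allocated on demand in tpm2_start_auth_session() and scrubbed-and-freed in one step (kfree_sensitive() zeroes the allocation before freeing it). A minimal sketch of that lifecycle, condensed from the hunks with error paths trimmed:

struct tpm2_auth *auth = kzalloc(sizeof(*auth), GFP_KERNEL);
if (!auth)
        return -ENOMEM;
/* ... build and send the start-auth-session command ... */
if (rc == TPM2_RC_SUCCESS) {
        chip->auth = auth;              /* publish only on success */
        return 0;
}
kfree_sensitive(auth);                  /* zeroes, then frees */

/* and on teardown: */
tpm2_flush_context(chip, auth->handle);
kfree_sensitive(auth);
chip->auth = NULL;
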
View file

@ -3651,7 +3651,7 @@ static int do_device_access(struct sdeb_store_info *sip, struct scsi_cmnd *scp,
enum dma_data_direction dir;
struct scsi_data_buffer *sdb = &scp->sdb;
u8 *fsp;
int i;
int i, total = 0;
/*
* Even though reads are inherently atomic (in this driver), we expect
@ -3688,18 +3688,16 @@ static int do_device_access(struct sdeb_store_info *sip, struct scsi_cmnd *scp,
fsp + (block * sdebug_sector_size),
sdebug_sector_size, sg_skip, do_write);
sdeb_data_sector_unlock(sip, do_write);
if (ret != sdebug_sector_size) {
ret += (i * sdebug_sector_size);
total += ret;
if (ret != sdebug_sector_size)
break;
}
sg_skip += sdebug_sector_size;
if (++block >= sdebug_store_sectors)
block = 0;
}
ret = num * sdebug_sector_size;
sdeb_data_unlock(sip, atomic);
return ret;
return total;
}
/* Returns number of bytes copied or -1 if error. */

View file

@ -8219,7 +8219,7 @@ static void ufshcd_update_rtc(struct ufs_hba *hba)
err = ufshcd_query_attr(hba, UPIU_QUERY_OPCODE_WRITE_ATTR, QUERY_ATTR_IDN_SECONDS_PASSED,
0, 0, &val);
ufshcd_rpm_put_sync(hba);
ufshcd_rpm_put(hba);
if (err)
dev_err(hba->dev, "%s: Failed to update rtc %d\n", __func__, err);

View file

@ -401,6 +401,7 @@ void nilfs_clear_folio_dirty(struct folio *folio)
folio_clear_uptodate(folio);
folio_clear_mappedtodisk(folio);
folio_clear_checked(folio);
head = folio_buffers(folio);
if (head) {

View file

@ -1787,6 +1787,14 @@ int ocfs2_remove_inode_range(struct inode *inode,
return 0;
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
int id_count = ocfs2_max_inline_data_with_xattr(inode->i_sb, di);
if (byte_start > id_count || byte_start + byte_len > id_count) {
ret = -EINVAL;
mlog_errno(ret);
goto out;
}
ret = ocfs2_truncate_inline(inode, di_bh, byte_start,
byte_start + byte_len, 0);
if (ret) {

View file

@ -692,6 +692,34 @@ void dup_userfaultfd_complete(struct list_head *fcs)
}
}
void dup_userfaultfd_fail(struct list_head *fcs)
{
struct userfaultfd_fork_ctx *fctx, *n;
/*
* An error has occurred on fork, we will tear memory down, but have
* allocated memory for fctx's and raised reference counts for both the
* original and child contexts (and on the mm for each as a result).
*
* These would ordinarily be taken care of by a user handling the event,
* but we are no longer doing so, so manually clean up here.
*
* mm tear down will take care of cleaning up VMA contexts.
*/
list_for_each_entry_safe(fctx, n, fcs, list) {
struct userfaultfd_ctx *octx = fctx->orig;
struct userfaultfd_ctx *ctx = fctx->new;
atomic_dec(&octx->mmap_changing);
VM_BUG_ON(atomic_read(&octx->mmap_changing) < 0);
userfaultfd_ctx_put(octx);
userfaultfd_ctx_put(ctx);
list_del(&fctx->list);
kfree(fctx);
}
}
void mremap_userfaultfd_prep(struct vm_area_struct *vma,
struct vm_userfaultfd_ctx *vm_ctx)
{

View file

@ -54,12 +54,11 @@ static inline long mm_ksm_zero_pages(struct mm_struct *mm)
return atomic_long_read(&mm->ksm_zero_pages);
}
static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
static inline void ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
{
/* Adding mm to ksm is best effort on fork. */
if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
return __ksm_enter(mm);
return 0;
__ksm_enter(mm);
}
static inline int ksm_execve(struct mm_struct *mm)
@ -107,9 +106,8 @@ static inline int ksm_disable(struct mm_struct *mm)
return 0;
}
static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
static inline void ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
{
return 0;
}
static inline int ksm_execve(struct mm_struct *mm)

View file

@ -38,6 +38,7 @@
#else
#define can_do_masked_user_access() 0
#define masked_user_access_begin(src) NULL
#define mask_user_address(src) (src)
#endif
/*
@ -159,19 +160,27 @@ _inline_copy_from_user(void *to, const void __user *from, unsigned long n)
{
unsigned long res = n;
might_fault();
if (!should_fail_usercopy() && likely(access_ok(from, n))) {
if (should_fail_usercopy())
goto fail;
if (can_do_masked_user_access())
from = mask_user_address(from);
else {
if (!access_ok(from, n))
goto fail;
/*
* Ensure that bad access_ok() speculation will not
* lead to nasty side effects *after* the copy is
* finished:
*/
barrier_nospec();
instrument_copy_from_user_before(to, from, n);
res = raw_copy_from_user(to, from, n);
instrument_copy_from_user_after(to, from, n, res);
}
if (unlikely(res))
memset(to + (n - res), 0, res);
instrument_copy_from_user_before(to, from, n);
res = raw_copy_from_user(to, from, n);
instrument_copy_from_user_after(to, from, n, res);
if (likely(!res))
return 0;
fail:
memset(to + (n - res), 0, res);
return res;
}
extern __must_check unsigned long

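The uaccess.h hunk restructures _inline_copy_from_user() around address masking: when can_do_masked_user_access() is true, the pointer is sanitized with mask_user_address() instead of being checked with access_ok() followed by barrier_nospec(). The idea, roughly, is that an invalid user pointer is rewritten to a value that is guaranteed to fault, so no speculation barrier is needed. A plain-C illustration of the masking trick (the real arch implementation is an asm cmp/sbb sequence; using TASK_SIZE_MAX as the limit is an assumption of this sketch):

static inline const void __user *mask_user_address_sketch(const void __user *ptr)
{
        unsigned long p = (unsigned long)ptr;
        /* all-ones when the address is out of range, zero otherwise */
        unsigned long mask = 0UL - (p > TASK_SIZE_MAX);

        return (const void __user *)(p | mask); /* invalid => ~0, which faults */
}
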
View file

@ -249,6 +249,7 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma,
extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *);
extern void dup_userfaultfd_complete(struct list_head *);
void dup_userfaultfd_fail(struct list_head *);
extern void mremap_userfaultfd_prep(struct vm_area_struct *,
struct vm_userfaultfd_ctx *);
@ -351,6 +352,10 @@ static inline void dup_userfaultfd_complete(struct list_head *l)
{
}
static inline void dup_userfaultfd_fail(struct list_head *l)
{
}
static inline void mremap_userfaultfd_prep(struct vm_area_struct *vma,
struct vm_userfaultfd_ctx *ctx)
{

View file

@ -24,6 +24,23 @@
DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE);
EXPORT_SYMBOL(cgroup_bpf_enabled_key);
/*
* cgroup bpf destruction makes heavy use of work items and there can be a lot
* of concurrent destructions. Use a separate workqueue so that cgroup bpf
* destruction work items don't end up filling up max_active of system_wq
* which may lead to deadlock.
*/
static struct workqueue_struct *cgroup_bpf_destroy_wq;
static int __init cgroup_bpf_wq_init(void)
{
cgroup_bpf_destroy_wq = alloc_workqueue("cgroup_bpf_destroy", 0, 1);
if (!cgroup_bpf_destroy_wq)
panic("Failed to alloc workqueue for cgroup bpf destroy.\n");
return 0;
}
core_initcall(cgroup_bpf_wq_init);
/* __always_inline is necessary to prevent indirect call through run_prog
* function pointer.
*/
@ -334,7 +351,7 @@ static void cgroup_bpf_release_fn(struct percpu_ref *ref)
struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt);
INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release);
queue_work(system_wq, &cgrp->bpf.release_work);
queue_work(cgroup_bpf_destroy_wq, &cgrp->bpf.release_work);
}
/* Get underlying bpf_prog of bpf_prog_list entry, regardless if it's through

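The hunk above stops queuing cgroup-bpf release work on system_wq and introduces a dedicated workqueue created with max_active = 1, so a burst of concurrent destructions cannot fill system_wq's concurrency budget and deadlock it. The general pattern, as a kernel-style sketch (names here are illustrative):

static struct workqueue_struct *destroy_wq;

static int __init destroy_wq_init(void)
{
        /* flags = 0, max_active = 1: at most one item in flight */
        destroy_wq = alloc_workqueue("my_destroy", 0, 1);
        if (!destroy_wq)
                return -ENOMEM; /* the diff panics instead; the sketch keeps it soft */
        return 0;
}
core_initcall(destroy_wq_init);

/* then, wherever queue_work(system_wq, &work) was used: */
queue_work(destroy_wq, &work);
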
View file

@ -5789,7 +5789,7 @@ static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
{
struct cgroup *cgroup;
int ret = false;
int level = 1;
int level = 0;
lockdep_assert_held(&cgroup_mutex);
@ -5797,7 +5797,7 @@ static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
if (cgroup->nr_descendants >= cgroup->max_descendants)
goto fail;
if (level > cgroup->max_depth)
if (level >= cgroup->max_depth)
goto fail;
level++;

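The two one-line changes above make cgroup_check_hierarchy_limits() count from 0 and compare with >=, removing the +1 bias from the walk while keeping the boundary itself: a new child may sit at most max_depth levels below each ancestor. Reading the new code, with cgroup.max.depth = 2 on an ancestor, a child two levels below it passes (level is 1 when the walk reaches that ancestor) while one three levels below fails (level 2 >= 2). Condensed sketch of the walk:

int level = 0;  /* 0 at the would-be child's immediate parent */

for (cgroup = parent; cgroup; cgroup = cgroup_parent(cgroup)) {
        if (cgroup->nr_descendants >= cgroup->max_descendants)
                goto fail;
        if (level >= cgroup->max_depth) /* child would exceed this ancestor's depth */
                goto fail;
        level++;
}
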
View file

@ -653,11 +653,6 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
mm->exec_vm = oldmm->exec_vm;
mm->stack_vm = oldmm->stack_vm;
retval = ksm_fork(mm, oldmm);
if (retval)
goto out;
khugepaged_fork(mm, oldmm);
/* Use __mt_dup() to efficiently build an identical maple tree. */
retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL);
if (unlikely(retval))
@ -760,6 +755,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
vma_iter_free(&vmi);
if (!retval) {
mt_set_in_rcu(vmi.mas.tree);
ksm_fork(mm, oldmm);
khugepaged_fork(mm, oldmm);
} else if (mpnt) {
/*
* The entire maple tree has already been duplicated. If the
@ -775,7 +772,10 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
mmap_write_unlock(mm);
flush_tlb_mm(oldmm);
mmap_write_unlock(oldmm);
dup_userfaultfd_complete(&uf);
if (!retval)
dup_userfaultfd_complete(&uf);
else
dup_userfaultfd_fail(&uf);
fail_uprobe_end:
uprobe_end_dup_mmap();
return retval;

View file

@ -459,9 +459,7 @@ int walk_system_ram_res_rev(u64 start, u64 end, void *arg,
rams_size += 16;
}
rams[i].start = res.start;
rams[i++].end = res.end;
rams[i++] = res;
start = res.end + 1;
}

View file

@ -862,7 +862,8 @@ static DEFINE_MUTEX(scx_ops_enable_mutex);
DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled);
DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED);
static atomic_t scx_ops_bypass_depth = ATOMIC_INIT(0);
static int scx_ops_bypass_depth;
static DEFINE_RAW_SPINLOCK(__scx_ops_bypass_lock);
static bool scx_ops_init_task_enabled;
static bool scx_switching_all;
DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
@ -4298,18 +4299,20 @@ bool task_should_scx(struct task_struct *p)
*/
static void scx_ops_bypass(bool bypass)
{
int depth, cpu;
int cpu;
unsigned long flags;
raw_spin_lock_irqsave(&__scx_ops_bypass_lock, flags);
if (bypass) {
depth = atomic_inc_return(&scx_ops_bypass_depth);
WARN_ON_ONCE(depth <= 0);
if (depth != 1)
return;
scx_ops_bypass_depth++;
WARN_ON_ONCE(scx_ops_bypass_depth <= 0);
if (scx_ops_bypass_depth != 1)
goto unlock;
} else {
depth = atomic_dec_return(&scx_ops_bypass_depth);
WARN_ON_ONCE(depth < 0);
if (depth != 0)
return;
scx_ops_bypass_depth--;
WARN_ON_ONCE(scx_ops_bypass_depth < 0);
if (scx_ops_bypass_depth != 0)
goto unlock;
}
/*
@ -4326,7 +4329,7 @@ static void scx_ops_bypass(bool bypass)
struct rq_flags rf;
struct task_struct *p, *n;
rq_lock_irqsave(rq, &rf);
rq_lock(rq, &rf);
if (bypass) {
WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING);
@ -4362,11 +4365,13 @@ static void scx_ops_bypass(bool bypass)
sched_enq_and_set_task(&ctx);
}
rq_unlock_irqrestore(rq, &rf);
rq_unlock(rq, &rf);
/* resched to restore ticks and idle state */
resched_cpu(cpu);
}
unlock:
raw_spin_unlock_irqrestore(&__scx_ops_bypass_lock, flags);
}
static void free_exit_info(struct scx_exit_info *ei)

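The sched_ext hunks above trade the atomic scx_ops_bypass_depth counter for a plain int serialized by a raw spinlock held across the whole bypass flip; that outer irqsave is also why the per-rq locking inside drops from rq_lock_irqsave() to rq_lock(). Skeleton of the guarded-depth pattern (names assumed):

static int bypass_depth;
static DEFINE_RAW_SPINLOCK(bypass_lock);

static void bypass(bool enter)
{
        unsigned long flags;

        raw_spin_lock_irqsave(&bypass_lock, flags);
        if (enter) {
                if (++bypass_depth != 1)
                        goto unlock;    /* nested enter: nothing to do */
        } else {
                if (--bypass_depth != 0)
                        goto unlock;    /* still bypassed by someone else */
        }
        /* first enter / last exit: flip the actual state here */
unlock:
        raw_spin_unlock_irqrestore(&bypass_lock, flags);
}
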
View file

@ -141,7 +141,7 @@ static void test_kmalloc_redzone_access(struct kunit *test)
{
struct kmem_cache *s = test_kmem_cache_create("TestSlub_RZ_kmalloc", 32,
SLAB_KMALLOC|SLAB_STORE_USER|SLAB_RED_ZONE);
u8 *p = __kmalloc_cache_noprof(s, GFP_KERNEL, 18);
u8 *p = alloc_hooks(__kmalloc_cache_noprof(s, GFP_KERNEL, 18));
kasan_disable_current();

View file

@ -1085,7 +1085,6 @@ config HMM_MIRROR
depends on MMU
config GET_FREE_REGION
depends on SPARSEMEM
bool
config DEVICE_PRIVATE

View file

@ -4187,6 +4187,8 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
static DECLARE_WAIT_QUEUE_HEAD(swapcache_wq);
/*
* We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
@ -4199,6 +4201,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct folio *swapcache, *folio = NULL;
DECLARE_WAITQUEUE(wait, current);
struct page *page;
struct swap_info_struct *si = NULL;
rmap_t rmap_flags = RMAP_NONE;
@ -4297,7 +4300,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
* Relax a bit to prevent rapid
* repeated page faults.
*/
add_wait_queue(&swapcache_wq, &wait);
schedule_timeout_uninterruptible(1);
remove_wait_queue(&swapcache_wq, &wait);
goto out_page;
}
need_clear_cache = true;
@ -4604,8 +4609,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
/* Clear the swap cache pin for direct swapin after PTL unlock */
if (need_clear_cache)
if (need_clear_cache) {
swapcache_clear(si, entry, nr_pages);
if (waitqueue_active(&swapcache_wq))
wake_up(&swapcache_wq);
}
if (si)
put_swap_device(si);
return ret;
@ -4620,8 +4628,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
folio_unlock(swapcache);
folio_put(swapcache);
}
if (need_clear_cache)
if (need_clear_cache) {
swapcache_clear(si, entry, nr_pages);
if (waitqueue_active(&swapcache_wq))
wake_up(&swapcache_wq);
}
if (si)
put_swap_device(si);
return ret;

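The memory.c hunks above upgrade the blind one-jiffy nap in the swap-cache contention path to a real sleep/wake handshake: the faulting task parks itself on swapcache_wq before the timeout sleep, and the path that clears the pin wakes the queue if anyone is waiting. The bare pattern, as a kernel-style sketch:

static DECLARE_WAIT_QUEUE_HEAD(wq);

/* contended side: */
DECLARE_WAITQUEUE(wait, current);
add_wait_queue(&wq, &wait);
schedule_timeout_uninterruptible(1);    /* wakes early if wake_up() arrives */
remove_wait_queue(&wq, &wait);

/* releasing side, after dropping the resource: */
if (waitqueue_active(&wq))
        wake_up(&wq);
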
View file

@ -1418,6 +1418,13 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
vmg.flags = vm_flags;
}
/*
* clear PTEs while the vma is still in the tree so that rmap
* cannot race with the freeing later in the truncate scenario.
* This is also needed for call_mmap(), which is why vm_ops
* close function is called.
*/
vms_clean_up_area(&vms, &mas_detach);
vma = vma_merge_new_range(&vmg);
if (vma)
goto expanded;
@ -1439,11 +1446,6 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
if (file) {
vma->vm_file = get_file(file);
/*
* call_mmap() may map PTE, so ensure there are no existing PTEs
* and call the vm_ops close function if one exists.
*/
vms_clean_up_area(&vms, &mas_detach);
error = call_mmap(file, vma);
if (error)
goto unmap_and_free_vma;
@ -1640,6 +1642,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
unsigned long populate = 0;
unsigned long ret = -EINVAL;
struct file *file;
vm_flags_t vm_flags;
pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/mm/remap_file_pages.rst.\n",
current->comm, current->pid);
@ -1656,12 +1659,60 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
if (pgoff + (size >> PAGE_SHIFT) < pgoff)
return ret;
if (mmap_write_lock_killable(mm))
if (mmap_read_lock_killable(mm))
return -EINTR;
/*
* Look up VMA under read lock first so we can perform the security
* check without holding locks (which can be problematic). We reacquire a
* write lock later and check nothing changed underneath us.
*/
vma = vma_lookup(mm, start);
if (!vma || !(vma->vm_flags & VM_SHARED)) {
mmap_read_unlock(mm);
return -EINVAL;
}
prot |= vma->vm_flags & VM_READ ? PROT_READ : 0;
prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0;
prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0;
flags &= MAP_NONBLOCK;
flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
if (vma->vm_flags & VM_LOCKED)
flags |= MAP_LOCKED;
/* Save vm_flags used to calculate prot and flags, and recheck later. */
vm_flags = vma->vm_flags;
file = get_file(vma->vm_file);
mmap_read_unlock(mm);
/* Call outside mmap_lock to be consistent with other callers. */
ret = security_mmap_file(file, prot, flags);
if (ret) {
fput(file);
return ret;
}
ret = -EINVAL;
/* OK security check passed, take write lock + let it rip. */
if (mmap_write_lock_killable(mm)) {
fput(file);
return -EINTR;
}
vma = vma_lookup(mm, start);
if (!vma || !(vma->vm_flags & VM_SHARED))
if (!vma)
goto out;
/* Make sure things didn't change under us. */
if (vma->vm_flags != vm_flags)
goto out;
if (vma->vm_file != file)
goto out;
if (start + size > vma->vm_end) {
@ -1689,25 +1740,11 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
goto out;
}
prot |= vma->vm_flags & VM_READ ? PROT_READ : 0;
prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0;
prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0;
flags &= MAP_NONBLOCK;
flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
if (vma->vm_flags & VM_LOCKED)
flags |= MAP_LOCKED;
file = get_file(vma->vm_file);
ret = security_mmap_file(vma->vm_file, prot, flags);
if (ret)
goto out_fput;
ret = do_mmap(vma->vm_file, start, size,
prot, flags, 0, pgoff, &populate, NULL);
out_fput:
fput(file);
out:
mmap_write_unlock(mm);
fput(file);
if (populate)
mm_populate(ret, populate);
if (!IS_ERR_VALUE(ret))
@ -1754,7 +1791,8 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
VMG_STATE(vmg, mm, vmi, addr, addr + len, flags, PHYS_PFN(addr));
vmg.prev = vma;
vma_iter_next_range(vmi);
/* vmi is positioned at prev, which this mode expects. */
vmg.merge_flags = VMG_FLAG_JUST_EXPAND;
if (vma_merge_new_range(&vmg))
goto out;

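The remap_file_pages() rework above follows the classic optimistic-revalidation shape: gather what you need under mmap_read_lock, drop every mm lock for the part that may sleep (security_mmap_file()), then retake the write lock and verify nothing moved. Skeleton of the pattern, condensed from the hunk:

mmap_read_lock(mm);
vma = vma_lookup(mm, start);
vm_flags = vma->vm_flags;               /* snapshot for the later recheck */
file = get_file(vma->vm_file);
mmap_read_unlock(mm);

ret = security_mmap_file(file, prot, flags);    /* may sleep, no mm lock held */

mmap_write_lock(mm);
vma = vma_lookup(mm, start);
if (!vma || vma->vm_flags != vm_flags || vma->vm_file != file)
        ret = -EINVAL;                  /* changed underneath us: bail */
else
        /* ... the real work, as in the hunk ... */;
mmap_write_unlock(mm);
fput(file);
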
View file

@ -349,7 +349,7 @@ static void __init numa_clear_kernel_node_hotplug(void)
for_each_reserved_mem_region(mb_region) {
int nid = memblock_get_region_node(mb_region);
if (nid != MAX_NUMNODES)
if (numa_valid_node(nid))
node_set(nid, reserved_nodemask);
}

View file

@ -2893,12 +2893,12 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
page = __rmqueue(zone, order, migratetype, alloc_flags);
/*
* If the allocation fails, allow OOM handling access
* to HIGHATOMIC reserves as failing now is worse than
* failing a high-order atomic allocation in the
* future.
* If the allocation fails, allow OOM handling and
* order-0 (atomic) allocs access to HIGHATOMIC
* reserves as failing now is worse than failing a
* high-order atomic allocation in the future.
*/
if (!page && (alloc_flags & ALLOC_OOM))
if (!page && (alloc_flags & (ALLOC_OOM|ALLOC_NON_BLOCK)))
page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
if (!page) {

View file

@ -744,7 +744,8 @@ struct folio *folio_walk_start(struct folio_walk *fw,
pud = pudp_get(pudp);
if (pud_none(pud))
goto not_found;
if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && pud_leaf(pud)) {
if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) &&
(!pud_present(pud) || pud_leaf(pud))) {
ptl = pud_lock(vma->vm_mm, pudp);
pud = pudp_get(pudp);
@ -753,6 +754,10 @@ struct folio *folio_walk_start(struct folio_walk *fw,
fw->pudp = pudp;
fw->pud = pud;
/*
* TODO: FW_MIGRATION support for PUD migration entries
* once there are relevant users.
*/
if (!pud_present(pud) || pud_devmap(pud) || pud_special(pud)) {
spin_unlock(ptl);
goto not_found;
@ -769,12 +774,13 @@ struct folio *folio_walk_start(struct folio_walk *fw,
}
pmd_table:
VM_WARN_ON_ONCE(pud_leaf(*pudp));
VM_WARN_ON_ONCE(!pud_present(pud) || pud_leaf(pud));
pmdp = pmd_offset(pudp, addr);
pmd = pmdp_get_lockless(pmdp);
if (pmd_none(pmd))
goto not_found;
if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && pmd_leaf(pmd)) {
if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) &&
(!pmd_present(pmd) || pmd_leaf(pmd))) {
ptl = pmd_lock(vma->vm_mm, pmdp);
pmd = pmdp_get(pmdp);
@ -786,7 +792,7 @@ struct folio *folio_walk_start(struct folio_walk *fw,
if (pmd_none(pmd)) {
spin_unlock(ptl);
goto not_found;
} else if (!pmd_leaf(pmd)) {
} else if (pmd_present(pmd) && !pmd_leaf(pmd)) {
spin_unlock(ptl);
goto pte_table;
} else if (pmd_present(pmd)) {
@ -812,7 +818,7 @@ struct folio *folio_walk_start(struct folio_walk *fw,
}
pte_table:
VM_WARN_ON_ONCE(pmd_leaf(pmdp_get_lockless(pmdp)));
VM_WARN_ON_ONCE(!pmd_present(pmd) || pmd_leaf(pmd));
ptep = pte_offset_map_lock(vma->vm_mm, pmdp, addr, &ptl);
if (!ptep)
goto not_found;

View file

@ -1166,7 +1166,9 @@ static int shmem_getattr(struct mnt_idmap *idmap,
stat->attributes_mask |= (STATX_ATTR_APPEND |
STATX_ATTR_IMMUTABLE |
STATX_ATTR_NODUMP);
inode_lock_shared(inode);
generic_fillattr(idmap, request_mask, inode, stat);
inode_unlock_shared(inode);
if (shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0))
stat->blksize = HPAGE_PMD_SIZE;

View file

@ -1209,7 +1209,7 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags)
/* Zero out spare memory. */
if (want_init_on_alloc(flags)) {
kasan_disable_current();
memset((void *)p + new_size, 0, ks - new_size);
memset(kasan_reset_tag(p) + new_size, 0, ks - new_size);
kasan_enable_current();
}

View file

@ -917,6 +917,7 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
pgoff_t pgoff = vmg->pgoff;
pgoff_t pglen = PHYS_PFN(end - start);
bool can_merge_left, can_merge_right;
bool just_expand = vmg->merge_flags & VMG_FLAG_JUST_EXPAND;
mmap_assert_write_locked(vmg->mm);
VM_WARN_ON(vmg->vma);
@ -930,7 +931,7 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
return NULL;
can_merge_left = can_vma_merge_left(vmg);
can_merge_right = can_vma_merge_right(vmg, can_merge_left);
can_merge_right = !just_expand && can_vma_merge_right(vmg, can_merge_left);
/* If we can merge with the next VMA, adjust vmg accordingly. */
if (can_merge_right) {
@ -953,7 +954,11 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
if (can_merge_right && !can_merge_remove_vma(next))
vmg->end = end;
vma_prev(vmg->vmi); /* Equivalent to going to the previous range */
/* In expand-only case we are already positioned at prev. */
if (!just_expand) {
/* Equivalent to going to the previous range. */
vma_prev(vmg->vmi);
}
}
/*
@ -967,12 +972,14 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
}
/* If expansion failed, reset state. Allows us to retry merge later. */
vmg->vma = NULL;
vmg->start = start;
vmg->end = end;
vmg->pgoff = pgoff;
if (vmg->vma == prev)
vma_iter_set(vmg->vmi, start);
if (!just_expand) {
vmg->vma = NULL;
vmg->start = start;
vmg->end = end;
vmg->pgoff = pgoff;
if (vmg->vma == prev)
vma_iter_set(vmg->vmi, start);
}
return NULL;
}

View file

@ -59,6 +59,17 @@ enum vma_merge_state {
VMA_MERGE_SUCCESS,
};
enum vma_merge_flags {
VMG_FLAG_DEFAULT = 0,
/*
* If we can expand, simply do so. We know there is nothing to merge to
* the right. Does not reset state upon failure to merge. The VMA
* iterator is assumed to be positioned at the previous VMA, rather than
* at the gap.
*/
VMG_FLAG_JUST_EXPAND = 1 << 0,
};
/* Represents a VMA merge operation. */
struct vma_merge_struct {
struct mm_struct *mm;
@ -75,6 +86,7 @@ struct vma_merge_struct {
struct mempolicy *policy;
struct vm_userfaultfd_ctx uffd_ctx;
struct anon_vma_name *anon_name;
enum vma_merge_flags merge_flags;
enum vma_merge_state state;
};
@ -99,6 +111,7 @@ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma,
.flags = flags_, \
.pgoff = pgoff_, \
.state = VMA_MERGE_START, \
.merge_flags = VMG_FLAG_DEFAULT, \
}
#define VMG_VMA_STATE(name, vmi_, prev_, vma_, start_, end_) \
@ -118,6 +131,7 @@ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma,
.uffd_ctx = vma_->vm_userfaultfd_ctx, \
.anon_name = anon_vma_name(vma_), \
.state = VMA_MERGE_START, \
.merge_flags = VMG_FLAG_DEFAULT, \
}
#ifdef CONFIG_DEBUG_VM_MAPLE_TREE
@ -241,15 +255,9 @@ static inline void vms_abort_munmap_vmas(struct vma_munmap_struct *vms,
* failure method of leaving a gap where the MAP_FIXED mapping failed.
*/
mas_set_range(mas, vms->start, vms->end - 1);
if (unlikely(mas_store_gfp(mas, NULL, GFP_KERNEL))) {
pr_warn_once("%s: (%d) Unable to abort munmap() operation\n",
current->comm, current->pid);
/* Leaving vmas detached and in-tree may hamper recovery */
reattach_vmas(mas_detach);
} else {
/* Clean up the insertion of the unfortunate gap */
vms_complete_munmap_vmas(vms, mas_detach);
}
mas_store_gfp(mas, NULL, GFP_KERNEL|__GFP_NOFAIL);
/* Clean up the insertion of the unfortunate gap */
vms_complete_munmap_vmas(vms, mas_detach);
}
int

View file

@ -94,6 +94,7 @@
#define ARM_CPU_PART_NEOVERSE_V3 0xD84
#define ARM_CPU_PART_CORTEX_X925 0xD85
#define ARM_CPU_PART_CORTEX_A725 0xD87
#define ARM_CPU_PART_NEOVERSE_N3 0xD8E
#define APM_CPU_PART_XGENE 0x000
#define APM_CPU_VAR_POTENZA 0x00
@ -176,6 +177,7 @@
#define MIDR_NEOVERSE_V3 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V3)
#define MIDR_CORTEX_X925 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X925)
#define MIDR_CORTEX_A725 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A725)
#define MIDR_NEOVERSE_N3 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_N3)
#define MIDR_THUNDERX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX)
#define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX)
#define MIDR_THUNDERX_83XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_83XX)

View file

@ -36,6 +36,20 @@
#define EFER_FFXSR (1<<_EFER_FFXSR)
#define EFER_AUTOIBRS (1<<_EFER_AUTOIBRS)
/*
* Architectural memory types that are common to MTRRs, PAT, VMX MSRs, etc.
* Most MSRs support/allow only a subset of memory types, but the values
* themselves are common across all relevant MSRs.
*/
#define X86_MEMTYPE_UC 0ull /* Uncacheable, a.k.a. Strong Uncacheable */
#define X86_MEMTYPE_WC 1ull /* Write Combining */
/* RESERVED 2 */
/* RESERVED 3 */
#define X86_MEMTYPE_WT 4ull /* Write Through */
#define X86_MEMTYPE_WP 5ull /* Write Protected */
#define X86_MEMTYPE_WB 6ull /* Write Back */
#define X86_MEMTYPE_UC_MINUS 7ull /* Weak Uncacheable (PAT only) */
/* FRED MSRs */
#define MSR_IA32_FRED_RSP0 0x1cc /* Level 0 stack pointer */
#define MSR_IA32_FRED_RSP1 0x1cd /* Level 1 stack pointer */
@ -365,6 +379,12 @@
#define MSR_IA32_CR_PAT 0x00000277
#define PAT_VALUE(p0, p1, p2, p3, p4, p5, p6, p7) \
((X86_MEMTYPE_ ## p0) | (X86_MEMTYPE_ ## p1 << 8) | \
(X86_MEMTYPE_ ## p2 << 16) | (X86_MEMTYPE_ ## p3 << 24) | \
(X86_MEMTYPE_ ## p4 << 32) | (X86_MEMTYPE_ ## p5 << 40) | \
(X86_MEMTYPE_ ## p6 << 48) | (X86_MEMTYPE_ ## p7 << 56))
#define MSR_IA32_DEBUGCTLMSR 0x000001d9
#define MSR_IA32_LASTBRANCHFROMIP 0x000001db
#define MSR_IA32_LASTBRANCHTOIP 0x000001dc
@ -1159,15 +1179,6 @@
#define MSR_IA32_VMX_VMFUNC 0x00000491
#define MSR_IA32_VMX_PROCBASED_CTLS3 0x00000492
/* VMX_BASIC bits and bitmasks */
#define VMX_BASIC_VMCS_SIZE_SHIFT 32
#define VMX_BASIC_TRUE_CTLS (1ULL << 55)
#define VMX_BASIC_64 0x0001000000000000LLU
#define VMX_BASIC_MEM_TYPE_SHIFT 50
#define VMX_BASIC_MEM_TYPE_MASK 0x003c000000000000LLU
#define VMX_BASIC_MEM_TYPE_WB 6LLU
#define VMX_BASIC_INOUT 0x0040000000000000LLU
/* Resctrl MSRs: */
/* - Intel: */
#define MSR_IA32_L3_QOS_CFG 0xc81
@ -1185,11 +1196,6 @@
#define MSR_IA32_SMBA_BW_BASE 0xc0000280
#define MSR_IA32_EVT_CFG_BASE 0xc0000400
/* MSR_IA32_VMX_MISC bits */
#define MSR_IA32_VMX_MISC_INTEL_PT (1ULL << 14)
#define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29)
#define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE 0x1F
/* AMD-V MSRs */
#define MSR_VM_CR 0xc0010114
#define MSR_VM_IGNNE 0xc0010115

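The msr-index.h hunk consolidates the architectural memory-type encodings as X86_MEMTYPE_* and adds PAT_VALUE(), which token-pastes eight bare suffixes and packs one encoding per byte of the 64-bit IA32_PAT layout. Worked example of the expansion:

u64 pat = PAT_VALUE(WB, WT, UC_MINUS, UC, WB, WT, UC_MINUS, UC);
/*
 * = 6 | (4 << 8) | (7 << 16) | (0 << 24) |
 *   (6ull << 32) | (4ull << 40) | (7ull << 48) | (0ull << 56)
 * = 0x0007040600070406
 */
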
View file

@ -439,6 +439,7 @@ struct kvm_sync_regs {
#define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4)
#define KVM_X86_QUIRK_FIX_HYPERCALL_INSN (1 << 5)
#define KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS (1 << 6)
#define KVM_X86_QUIRK_SLOT_ZAP_ALL (1 << 7)
#define KVM_STATE_NESTED_FORMAT_VMX 0
#define KVM_STATE_NESTED_FORMAT_SVM 1

View file

@ -11,6 +11,9 @@
#ifndef __NR_getpgid
#define __NR_getpgid 132
#endif
#ifndef __NR_capget
#define __NR_capget 184
#endif
#ifndef __NR_gettid
#define __NR_gettid 224
#endif

View file

@ -11,6 +11,9 @@
#ifndef __NR_getpgid
#define __NR_getpgid 121
#endif
#ifndef __NR_capget
#define __NR_capget 125
#endif
#ifndef __NR_gettid
#define __NR_gettid 186
#endif

View file

@ -36,4 +36,19 @@
#define GENMASK_ULL(h, l) \
(GENMASK_INPUT_CHECK(h, l) + __GENMASK_ULL(h, l))
#if !defined(__ASSEMBLY__)
/*
* Missing asm support
*
* __GENMASK_U128() depends on _BIT128() which would not work
* in the asm code, as it shifts an 'unsigned __int128' data
* type instead of direct representation of 128 bit constants
* such as long and unsigned long. The fundamental problem is
* that a 128 bit constant will get silently truncated by the
* gcc compiler.
*/
#define GENMASK_U128(h, l) \
(GENMASK_INPUT_CHECK(h, l) + __GENMASK_U128(h, l))
#endif
#endif /* __LINUX_BITS_H */

View file

@ -9,16 +9,7 @@
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wpacked"
#pragma GCC diagnostic ignored "-Wattributes"
#define __get_unaligned_t(type, ptr) ({ \
const struct { type x; } __packed *__pptr = (typeof(__pptr))(ptr); \
__pptr->x; \
})
#define __put_unaligned_t(type, val, ptr) do { \
struct { type x; } __packed *__pptr = (typeof(__pptr))(ptr); \
__pptr->x = (val); \
} while (0)
#include <vdso/unaligned.h>
#define get_unaligned(ptr) __get_unaligned_t(typeof(*(ptr)), (ptr))
#define put_unaligned(val, ptr) __put_unaligned_t(typeof(*(ptr)), (val), (ptr))

View file

@ -12,4 +12,7 @@
(((~_ULL(0)) - (_ULL(1) << (l)) + 1) & \
(~_ULL(0) >> (__BITS_PER_LONG_LONG - 1 - (h))))
#define __GENMASK_U128(h, l) \
((_BIT128((h)) << 1) - (_BIT128(l)))
#endif /* _UAPI_LINUX_BITS_H */

View file

@ -28,6 +28,23 @@
#define _BITUL(x) (_UL(1) << (x))
#define _BITULL(x) (_ULL(1) << (x))
#if !defined(__ASSEMBLY__)
/*
* Missing asm support
*
* __BIT128() would not work in the asm code, as it shifts an
* 'unsigned __int128' data type as direct representation of
* 128 bit constants is not supported in the gcc compiler, as
* they get silently truncated.
*
* TODO: Please revisit this implementation when gcc compiler
* starts representing 128 bit constants directly like long
* and unsigned long etc. Subsequently drop the comment for
* GENMASK_U128() which would then start supporting asm code.
*/
#define _BIT128(x) ((unsigned __int128)(1) << (x))
#endif
#define __ALIGN_KERNEL(x, a) __ALIGN_KERNEL_MASK(x, (__typeof__(x))(a) - 1)
#define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask))

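Taken together, the three bits.h/const.h hunks above define GENMASK_U128(h, l) via (_BIT128(h) << 1) - _BIT128(l), i.e. "one past the top bit, minus the bottom bit". Two worked cases showing why no special-casing is needed (userspace C, assuming a compiler with __int128):

unsigned __int128 bit128(int x) { return (unsigned __int128)1 << x; }

/* bits 3..1: (1 << 4) - (1 << 1) = 16 - 2 = 14 = 0b1110 */
unsigned __int128 m1 = (bit128(3) << 1) - bit128(1);

/*
 * bits 127..0: bit128(127) << 1 wraps to 0 (unsigned overflow is
 * defined), and 0 - 1 underflows to all 128 bits set.
 */
unsigned __int128 m2 = (bit128(127) << 1) - bit128(0);
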
View file

@ -0,0 +1,15 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __VDSO_UNALIGNED_H
#define __VDSO_UNALIGNED_H
#define __get_unaligned_t(type, ptr) ({ \
const struct { type x; } __packed *__pptr = (typeof(__pptr))(ptr); \
__pptr->x; \
})
#define __put_unaligned_t(type, val, ptr) do { \
struct { type x; } __packed *__pptr = (typeof(__pptr))(ptr); \
__pptr->x = (val); \
} while (0)
#endif /* __VDSO_UNALIGNED_H */

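The new vdso/unaligned.h above carries the __get_unaligned_t()/__put_unaligned_t() helpers moved out of asm-generic (the earlier hunk replaces them there with this include). They funnel the access through a __packed single-member struct, so the compiler emits a correct unaligned load or store instead of undefined behaviour. A small userspace-flavoured usage sketch (__packed spelled out, little-endian result noted):

#define __packed __attribute__((__packed__))
/* __get_unaligned_t() / __put_unaligned_t() as defined above */

unsigned char buf[8] = { 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88 };

unsigned int v = __get_unaligned_t(unsigned int, buf + 1);  /* 0x55443322 on LE */
__put_unaligned_t(unsigned int, v, buf + 3);                /* store at odd offset */
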
View file

@ -704,8 +704,8 @@ ifeq ($(BUILD_BPF_SKEL),1)
BUILD_BPF_SKEL := 0
else
CLANG_VERSION := $(shell $(CLANG) --version | head -1 | sed 's/.*clang version \([[:digit:]]\+.[[:digit:]]\+.[[:digit:]]\+\).*/\1/g')
ifeq ($(call version-lt3,$(CLANG_VERSION),16.0.6),1)
$(warning Warning: Disabled BPF skeletons as at least $(CLANG) version 16.0.6 is reported to be a working setup with the current of BPF based perf features)
ifeq ($(call version-lt3,$(CLANG_VERSION),12.0.1),1)
$(warning Warning: Disabled BPF skeletons as reliable BTF generation needs at least $(CLANG) version 12.0.1)
BUILD_BPF_SKEL := 0
endif
endif

View file

@ -1399,7 +1399,7 @@ static const struct syscall_fmt syscall_fmts[] = {
.arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
{ .name = "waitid", .errpid = true,
.arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
{ .name = "write", .errpid = true,
{ .name = "write",
.arg = { [1] = { .scnprintf = SCA_BUF /* buf */, .from_user = true, }, }, },
};

View file

@ -22,6 +22,7 @@ FILES=(
"include/vdso/bits.h"
"include/linux/const.h"
"include/vdso/const.h"
"include/vdso/unaligned.h"
"include/linux/hash.h"
"include/linux/list-sort.h"
"include/uapi/linux/hw_breakpoint.h"

View file

@ -19,35 +19,74 @@
TEST_RESULT=0
# skip if not supported
BLACKFUNC=`head -n 1 /sys/kernel/debug/kprobes/blacklist 2> /dev/null | cut -f2`
if [ -z "$BLACKFUNC" ]; then
BLACKFUNC_LIST=`head -n 5 /sys/kernel/debug/kprobes/blacklist 2> /dev/null | cut -f2`
if [ -z "$BLACKFUNC_LIST" ]; then
print_overall_skipped
exit 0
fi
# try to find vmlinux with DWARF debug info
VMLINUX_FILE=$(perf probe -v random_probe |& grep "Using.*for symbols" | sed -r 's/^Using (.*) for symbols$/\1/')
# remove all previously added probes
clear_all_probes
### adding blacklisted function
# functions from blacklist should be skipped by perf probe
! $CMD_PERF probe $BLACKFUNC > $LOGS_DIR/adding_blacklisted.log 2> $LOGS_DIR/adding_blacklisted.err
PERF_EXIT_CODE=$?
REGEX_SCOPE_FAIL="Failed to find scope of probe point"
REGEX_SKIP_MESSAGE=" is blacklisted function, skip it\."
REGEX_NOT_FOUND_MESSAGE="Probe point \'$BLACKFUNC\' not found."
REGEX_NOT_FOUND_MESSAGE="Probe point \'$RE_EVENT\' not found."
REGEX_ERROR_MESSAGE="Error: Failed to add events."
REGEX_INVALID_ARGUMENT="Failed to write event: Invalid argument"
REGEX_SYMBOL_FAIL="Failed to find symbol at $RE_ADDRESS"
REGEX_OUT_SECTION="$BLACKFUNC is out of \.\w+, skip it"
../common/check_all_lines_matched.pl "$REGEX_SKIP_MESSAGE" "$REGEX_NOT_FOUND_MESSAGE" "$REGEX_ERROR_MESSAGE" "$REGEX_SCOPE_FAIL" "$REGEX_INVALID_ARGUMENT" "$REGEX_SYMBOL_FAIL" "$REGEX_OUT_SECTION" < $LOGS_DIR/adding_blacklisted.err
CHECK_EXIT_CODE=$?
REGEX_OUT_SECTION="$RE_EVENT is out of \.\w+, skip it"
REGEX_MISSING_DECL_LINE="A function DIE doesn't have decl_line. Maybe broken DWARF?"
print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "adding blacklisted function $BLACKFUNC"
(( TEST_RESULT += $? ))
BLACKFUNC=""
SKIP_DWARF=0
for BLACKFUNC in $BLACKFUNC_LIST; do
echo "Probing $BLACKFUNC"
# functions from blacklist should be skipped by perf probe
! $CMD_PERF probe $BLACKFUNC > $LOGS_DIR/adding_blacklisted.log 2> $LOGS_DIR/adding_blacklisted.err
PERF_EXIT_CODE=$?
# check for bad DWARF polluting the result
../common/check_all_patterns_found.pl "$REGEX_MISSING_DECL_LINE" >/dev/null < $LOGS_DIR/adding_blacklisted.err
if [ $? -eq 0 ]; then
SKIP_DWARF=1
echo "Result polluted by broken DWARF, trying another probe"
# confirm that the broken DWARF comes from assembler
if [ -n "$VMLINUX_FILE" ]; then
readelf -wi "$VMLINUX_FILE" |
awk -v probe="$BLACKFUNC" '/DW_AT_language/ { comp_lang = $0 }
$0 ~ probe { if (comp_lang) { print comp_lang }; exit }' |
grep -q "MIPS assembler"
CHECK_EXIT_CODE=$?
if [ $CHECK_EXIT_CODE -ne 0 ]; then
SKIP_DWARF=0 # broken DWARF while available
break
fi
fi
else
../common/check_all_lines_matched.pl "$REGEX_SKIP_MESSAGE" "$REGEX_NOT_FOUND_MESSAGE" "$REGEX_ERROR_MESSAGE" "$REGEX_SCOPE_FAIL" "$REGEX_INVALID_ARGUMENT" "$REGEX_SYMBOL_FAIL" "$REGEX_OUT_SECTION" < $LOGS_DIR/adding_blacklisted.err
CHECK_EXIT_CODE=$?
SKIP_DWARF=0
break
fi
done
if [ $SKIP_DWARF -eq 1 ]; then
print_testcase_skipped "adding blacklisted function $BLACKFUNC"
else
print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "adding blacklisted function $BLACKFUNC"
(( TEST_RESULT += $? ))
fi
### listing not-added probe

View file

@ -288,6 +288,10 @@ int sys_enter_rename(struct syscall_enter_args *args)
augmented_args->arg.size = PERF_ALIGN(oldpath_len + 1, sizeof(u64));
len += augmented_args->arg.size;
/* Every read from userspace is limited to value size */
if (augmented_args->arg.size > sizeof(augmented_args->arg.value))
return 1; /* Failure: don't filter */
struct augmented_arg *arg2 = (void *)&augmented_args->arg.value + augmented_args->arg.size;
newpath_len = augmented_arg__read_str(arg2, newpath_arg, sizeof(augmented_args->arg.value));
@ -315,6 +319,10 @@ int sys_enter_renameat2(struct syscall_enter_args *args)
augmented_args->arg.size = PERF_ALIGN(oldpath_len + 1, sizeof(u64));
len += augmented_args->arg.size;
/* Every read from userspace is limited to value size */
if (augmented_args->arg.size > sizeof(augmented_args->arg.value))
return 1; /* Failure: don't filter */
struct augmented_arg *arg2 = (void *)&augmented_args->arg.value + augmented_args->arg.size;
newpath_len = augmented_arg__read_str(arg2, newpath_arg, sizeof(augmented_args->arg.value));
@ -423,8 +431,9 @@ static bool pid_filter__has(struct pids_filtered *pids, pid_t pid)
static int augment_sys_enter(void *ctx, struct syscall_enter_args *args)
{
bool augmented, do_output = false;
int zero = 0, size, aug_size, index, output = 0,
int zero = 0, size, aug_size, index,
value_size = sizeof(struct augmented_arg) - offsetof(struct augmented_arg, value);
u64 output = 0; /* has to be u64, otherwise it won't pass the verifier */
unsigned int nr, *beauty_map;
struct beauty_payload_enter *payload;
void *arg, *payload_offset;
@ -477,6 +486,8 @@ static int augment_sys_enter(void *ctx, struct syscall_enter_args *args)
augmented = true;
} else if (size < 0 && size >= -6) { /* buffer */
index = -(size + 1);
barrier_var(index); // Prevent clang (noticed with v18) from removing the &= 7 trick.
index &= 7; // Satisfy the bounds checking with the verifier in some kernels.
aug_size = args->args[index];
if (aug_size > TRACE_AUG_MAX_BUF)
@ -488,10 +499,17 @@ static int augment_sys_enter(void *ctx, struct syscall_enter_args *args)
}
}
/* Augmented data size is limited to sizeof(augmented_arg->unnamed union with value field) */
if (aug_size > value_size)
aug_size = value_size;
/* write data to payload */
if (augmented) {
int written = offsetof(struct augmented_arg, value) + aug_size;
if (written < 0 || written > sizeof(struct augmented_arg))
return 1;
((struct augmented_arg *)payload_offset)->size = aug_size;
output += written;
payload_offset += written;
@ -499,7 +517,7 @@ static int augment_sys_enter(void *ctx, struct syscall_enter_args *args)
}
}
if (!do_output)
if (!do_output || (sizeof(struct syscall_enter_args) + output) > sizeof(struct beauty_payload_enter))
return 1;
return augmented__beauty_output(ctx, payload, sizeof(struct syscall_enter_args) + output);

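All of the augmented_raw_syscalls.bpf.c hunks above serve the BPF verifier's bounds tracking: array indexes are masked into a provably small range, read sizes are clamped to the destination's capacity before use, and the running output length is widened to u64 and re-checked against the payload size before the final output call. The index-clamping idiom in isolation (barrier_var() is libbpf's volatile-asm no-op that keeps clang from proving the mask redundant and deleting it):

int index = -(size + 1);        /* size: negative buffer code from the beauty map */

barrier_var(index);             /* hide the value from clang's range analysis */
index &= 7;                     /* verifier now sees 0 <= index <= 7 */
aug_size = args->args[index];
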
View file

@ -7,13 +7,9 @@
#include "debug.h"
#include <errno.h>
#include <string.h>
#include <unistd.h>
#include <linux/capability.h>
#include <sys/syscall.h>
#ifndef SYS_capget
#define SYS_capget 90
#endif
#include <unistd.h>
#define MAX_LINUX_CAPABILITY_U32S _LINUX_CAPABILITY_U32S_3
@ -21,9 +17,9 @@ bool perf_cap__capable(int cap, bool *used_root)
{
struct __user_cap_header_struct header = {
.version = _LINUX_CAPABILITY_VERSION_3,
.pid = getpid(),
.pid = 0,
};
struct __user_cap_data_struct data[MAX_LINUX_CAPABILITY_U32S];
struct __user_cap_data_struct data[MAX_LINUX_CAPABILITY_U32S] = {};
__u32 cap_val;
*used_root = false;

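The perf cap.c hunk drops both the getpid() call and the manual SYS_capget fallback define: for capget(2) a pid of 0 already designates the calling process, and zero-initializing the data array guarantees untouched u32 slots read as empty capability sets. A runnable userspace sketch of the same probe (via syscall(2); checking CAP_SYS_ADMIN is this sketch's choice):

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/capability.h>

int main(void)
{
        struct __user_cap_header_struct header = {
                .version = _LINUX_CAPABILITY_VERSION_3,
                .pid = 0,               /* 0 == the calling process */
        };
        struct __user_cap_data_struct data[_LINUX_CAPABILITY_U32S_3];

        memset(data, 0, sizeof(data));
        if (syscall(SYS_capget, &header, data))
                return 1;
        printf("CAP_SYS_ADMIN effective: %d\n",
               !!(data[CAP_TO_INDEX(CAP_SYS_ADMIN)].effective &
                  CAP_TO_MASK(CAP_SYS_ADMIN)));
        return 0;
}
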
View file

@ -19,6 +19,7 @@
#include "util/bpf-filter.h"
#include "util/env.h"
#include "util/kvm-stat.h"
#include "util/stat.h"
#include "util/kwork.h"
#include "util/sample.h"
#include "util/lock-contention.h"
@ -1355,6 +1356,7 @@ PyMODINIT_FUNC PyInit_perf(void)
unsigned int scripting_max_stack = PERF_MAX_STACK_DEPTH;
#ifdef HAVE_KVM_STAT_SUPPORT
bool kvm_entry_event(struct evsel *evsel __maybe_unused)
{
return false;
@ -1384,6 +1386,7 @@ void exit_event_decode_key(struct perf_kvm_stat *kvm __maybe_unused,
char *decode __maybe_unused)
{
}
#endif // HAVE_KVM_STAT_SUPPORT
int find_scripts(char **scripts_array __maybe_unused, char **scripts_path_array __maybe_unused,
int num __maybe_unused, int pathlen __maybe_unused)

View file

@ -46,6 +46,11 @@ static const char *const *syscalltbl_native = syscalltbl_mips_n64;
#include <asm/syscalls.c>
const int syscalltbl_native_max_id = SYSCALLTBL_LOONGARCH_MAX_ID;
static const char *const *syscalltbl_native = syscalltbl_loongarch;
#else
const int syscalltbl_native_max_id = 0;
static const char *const syscalltbl_native[] = {
[0] = "unknown",
};
#endif
struct syscall {
@ -182,6 +187,11 @@ int syscalltbl__id(struct syscalltbl *tbl, const char *name)
return audit_name_to_syscall(name, tbl->audit_machine);
}
int syscalltbl__id_at_idx(struct syscalltbl *tbl __maybe_unused, int idx)
{
return idx;
}
int syscalltbl__strglobmatch_next(struct syscalltbl *tbl __maybe_unused,
const char *syscall_glob __maybe_unused, int *idx __maybe_unused)
{

View file

@ -320,7 +320,7 @@ u32 bpf_cpumask_weight(const struct cpumask *cpumask) __ksym;
/*
* Access a cpumask in read-only mode (typically to check bits).
*/
const struct cpumask *cast_mask(struct bpf_cpumask *mask)
static __always_inline const struct cpumask *cast_mask(struct bpf_cpumask *mask)
{
return (const struct cpumask *)mask;
}

View file

@ -18,7 +18,7 @@ bool test_uffdio_wp = true;
unsigned long long *count_verify;
uffd_test_ops_t *uffd_test_ops;
uffd_test_case_ops_t *uffd_test_case_ops;
pthread_barrier_t ready_for_fork;
atomic_bool ready_for_fork;
static int uffd_mem_fd_create(off_t mem_size, bool hugetlb)
{
@ -519,8 +519,7 @@ void *uffd_poll_thread(void *arg)
pollfd[1].fd = pipefd[cpu*2];
pollfd[1].events = POLLIN;
/* Ready for parent thread to fork */
pthread_barrier_wait(&ready_for_fork);
ready_for_fork = true;
for (;;) {
ret = poll(pollfd, 2, -1);

View file

@ -33,6 +33,7 @@
#include <inttypes.h>
#include <stdint.h>
#include <sys/random.h>
#include <stdatomic.h>
#include "../kselftest.h"
#include "vm_util.h"
@ -104,7 +105,7 @@ extern bool map_shared;
extern bool test_uffdio_wp;
extern unsigned long long *count_verify;
extern volatile bool test_uffdio_copy_eexist;
extern pthread_barrier_t ready_for_fork;
extern atomic_bool ready_for_fork;
extern uffd_test_ops_t anon_uffd_test_ops;
extern uffd_test_ops_t shmem_uffd_test_ops;

View file

@ -241,8 +241,7 @@ static void *fork_event_consumer(void *data)
fork_event_args *args = data;
struct uffd_msg msg = { 0 };
/* Ready for parent thread to fork */
pthread_barrier_wait(&ready_for_fork);
ready_for_fork = true;
/* Read until a full msg received */
while (uffd_read_msg(args->parent_uffd, &msg));
@ -311,12 +310,11 @@ static int pagemap_test_fork(int uffd, bool with_event, bool test_pin)
/* Prepare a thread to resolve EVENT_FORK */
if (with_event) {
pthread_barrier_init(&ready_for_fork, NULL, 2);
ready_for_fork = false;
if (pthread_create(&thread, NULL, fork_event_consumer, &args))
err("pthread_create()");
/* Wait for child thread to start before forking */
pthread_barrier_wait(&ready_for_fork);
pthread_barrier_destroy(&ready_for_fork);
while (!ready_for_fork)
; /* Wait for the poll_thread to start executing before forking */
}
child = fork();
@ -781,7 +779,7 @@ static void uffd_sigbus_test_common(bool wp)
char c;
struct uffd_args args = { 0 };
pthread_barrier_init(&ready_for_fork, NULL, 2);
ready_for_fork = false;
fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
@ -798,9 +796,8 @@ static void uffd_sigbus_test_common(bool wp)
if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args))
err("uffd_poll_thread create");
/* Wait for child thread to start before forking */
pthread_barrier_wait(&ready_for_fork);
pthread_barrier_destroy(&ready_for_fork);
while (!ready_for_fork)
; /* Wait for the poll_thread to start executing before forking */
pid = fork();
if (pid < 0)
@ -841,7 +838,7 @@ static void uffd_events_test_common(bool wp)
char c;
struct uffd_args args = { 0 };
pthread_barrier_init(&ready_for_fork, NULL, 2);
ready_for_fork = false;
fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
if (uffd_register(uffd, area_dst, nr_pages * page_size,
@ -852,9 +849,8 @@ static void uffd_events_test_common(bool wp)
if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args))
err("uffd_poll_thread create");
/* Wait for child thread to start before forking */
pthread_barrier_wait(&ready_for_fork);
pthread_barrier_destroy(&ready_for_fork);
while (!ready_for_fork)
; /* Wait for the poll_thread to start executing before forking */
pid = fork();
if (pid < 0)

View file
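The userfaultfd selftest hunks above replace a pthread_barrier_t handshake with a C11 atomic_bool: the helper thread publishes readiness with a single store and the parent spins until it sees it, so there is no barrier object whose state gets duplicated into the child by fork() (nor any init/destroy churn between test cases). The handshake in miniature:

#include <pthread.h>
#include <stdatomic.h>
#include <unistd.h>

static atomic_bool ready_for_fork;
static pthread_t thread;

static void *worker(void *arg)
{
        /* ... set up poll fds etc ... */
        ready_for_fork = true;  /* now safe for the parent to fork */
        /* ... service loop ... */
        return NULL;
}

static int start_and_fork(void)
{
        ready_for_fork = false;
        if (pthread_create(&thread, NULL, worker, NULL))
                return -1;
        while (!ready_for_fork)
                ;       /* busy-wait, mirroring the test's pattern */
        return fork();
}
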

@ -184,7 +184,7 @@ auto-test-targets := \
testcase-targets := $(addsuffix .o,$(addprefix $(SCXOBJ_DIR)/,$(auto-test-targets)))
$(SCXOBJ_DIR)/runner.o: runner.c | $(SCXOBJ_DIR)
$(SCXOBJ_DIR)/runner.o: runner.c | $(SCXOBJ_DIR) $(BPFOBJ)
$(CC) $(CFLAGS) -c $< -o $@
# Create all of the test targets object files, whose testcase objects will be

View file

@ -51,8 +51,8 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(create_dsq_init)
SEC(".struct_ops.link")
struct sched_ext_ops create_dsq_ops = {
.init_task = create_dsq_init_task,
.exit_task = create_dsq_exit_task,
.init = create_dsq_init,
.init_task = (void *) create_dsq_init_task,
.exit_task = (void *) create_dsq_exit_task,
.init = (void *) create_dsq_init,
.name = "create_dsq",
};

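This diff, and the long run of sched_ext testcase diffs that follows, all make one mechanical change: every callback assigned into a struct sched_ext_ops initializer gains a (void *) cast. The BPF_STRUCT_OPS()/BPF_STRUCT_OPS_SLEEPABLE() wrappers emit functions whose C prototypes do not textually match the ops member types, and the cast keeps stricter compilers from rejecting the mismatch; libbpf still resolves each struct_ops member to the right program. Pattern in brief (names from the diff above):

SEC(".struct_ops.link")
struct sched_ext_ops create_dsq_ops = {
        /* cast: wrapper prototype != kernel ops member type */
        .init_task      = (void *) create_dsq_init_task,
        .init           = (void *) create_dsq_init,
        .name           = "create_dsq",
};
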
View file

@ -35,8 +35,8 @@ void BPF_STRUCT_OPS(ddsp_bogus_dsq_fail_exit, struct scx_exit_info *ei)
SEC(".struct_ops.link")
struct sched_ext_ops ddsp_bogus_dsq_fail_ops = {
.select_cpu = ddsp_bogus_dsq_fail_select_cpu,
.exit = ddsp_bogus_dsq_fail_exit,
.select_cpu = (void *) ddsp_bogus_dsq_fail_select_cpu,
.exit = (void *) ddsp_bogus_dsq_fail_exit,
.name = "ddsp_bogus_dsq_fail",
.timeout_ms = 1000U,
};

View file

@ -32,8 +32,8 @@ void BPF_STRUCT_OPS(ddsp_vtimelocal_fail_exit, struct scx_exit_info *ei)
SEC(".struct_ops.link")
struct sched_ext_ops ddsp_vtimelocal_fail_ops = {
.select_cpu = ddsp_vtimelocal_fail_select_cpu,
.exit = ddsp_vtimelocal_fail_exit,
.select_cpu = (void *) ddsp_vtimelocal_fail_select_cpu,
.exit = (void *) ddsp_vtimelocal_fail_exit,
.name = "ddsp_vtimelocal_fail",
.timeout_ms = 1000U,
};

View file

@ -56,10 +56,10 @@ void BPF_STRUCT_OPS(dsp_local_on_exit, struct scx_exit_info *ei)
SEC(".struct_ops.link")
struct sched_ext_ops dsp_local_on_ops = {
.select_cpu = dsp_local_on_select_cpu,
.enqueue = dsp_local_on_enqueue,
.dispatch = dsp_local_on_dispatch,
.exit = dsp_local_on_exit,
.select_cpu = (void *) dsp_local_on_select_cpu,
.enqueue = (void *) dsp_local_on_enqueue,
.dispatch = (void *) dsp_local_on_dispatch,
.exit = (void *) dsp_local_on_exit,
.name = "dsp_local_on",
.timeout_ms = 1000U,
};

View file

@ -12,10 +12,18 @@
char _license[] SEC("license") = "GPL";
u32 exit_kind;
void BPF_STRUCT_OPS_SLEEPABLE(enq_last_no_enq_fails_exit, struct scx_exit_info *info)
{
exit_kind = info->kind;
}
SEC(".struct_ops.link")
struct sched_ext_ops enq_last_no_enq_fails_ops = {
.name = "enq_last_no_enq_fails",
/* Need to define ops.enqueue() with SCX_OPS_ENQ_LAST */
.flags = SCX_OPS_ENQ_LAST,
.exit = (void *) enq_last_no_enq_fails_exit,
.timeout_ms = 1000U,
};

View file

@ -31,8 +31,12 @@ static enum scx_test_status run(void *ctx)
struct bpf_link *link;
link = bpf_map__attach_struct_ops(skel->maps.enq_last_no_enq_fails_ops);
if (link) {
SCX_ERR("Incorrectly succeeded in to attaching scheduler");
if (!link) {
SCX_ERR("Incorrectly failed at attaching scheduler");
return SCX_TEST_FAIL;
}
if (!skel->bss->exit_kind) {
SCX_ERR("Incorrectly stayed loaded");
return SCX_TEST_FAIL;
}
@ -50,7 +54,7 @@ static void cleanup(void *ctx)
struct scx_test enq_last_no_enq_fails = {
.name = "enq_last_no_enq_fails",
.description = "Verify we fail to load a scheduler if we specify "
.description = "Verify we eject a scheduler if we specify "
"the SCX_OPS_ENQ_LAST flag without defining "
"ops.enqueue()",
.setup = setup,

View file

@ -36,8 +36,8 @@ void BPF_STRUCT_OPS(enq_select_cpu_fails_enqueue, struct task_struct *p,
SEC(".struct_ops.link")
struct sched_ext_ops enq_select_cpu_fails_ops = {
.select_cpu = enq_select_cpu_fails_select_cpu,
.enqueue = enq_select_cpu_fails_enqueue,
.select_cpu = (void *) enq_select_cpu_fails_select_cpu,
.enqueue = (void *) enq_select_cpu_fails_enqueue,
.name = "enq_select_cpu_fails",
.timeout_ms = 1000U,
};

View file

@ -15,6 +15,8 @@ UEI_DEFINE(uei);
#define EXIT_CLEANLY() scx_bpf_exit(exit_point, "%d", exit_point)
#define DSQ_ID 0
s32 BPF_STRUCT_OPS(exit_select_cpu, struct task_struct *p,
s32 prev_cpu, u64 wake_flags)
{
@ -31,7 +33,7 @@ void BPF_STRUCT_OPS(exit_enqueue, struct task_struct *p, u64 enq_flags)
if (exit_point == EXIT_ENQUEUE)
EXIT_CLEANLY();
scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
scx_bpf_dispatch(p, DSQ_ID, SCX_SLICE_DFL, enq_flags);
}
void BPF_STRUCT_OPS(exit_dispatch, s32 cpu, struct task_struct *p)
@ -39,7 +41,7 @@ void BPF_STRUCT_OPS(exit_dispatch, s32 cpu, struct task_struct *p)
if (exit_point == EXIT_DISPATCH)
EXIT_CLEANLY();
scx_bpf_consume(SCX_DSQ_GLOBAL);
scx_bpf_consume(DSQ_ID);
}
void BPF_STRUCT_OPS(exit_enable, struct task_struct *p)
@ -67,18 +69,18 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(exit_init)
if (exit_point == EXIT_INIT)
EXIT_CLEANLY();
return 0;
return scx_bpf_create_dsq(DSQ_ID, -1);
}
SEC(".struct_ops.link")
struct sched_ext_ops exit_ops = {
.select_cpu = exit_select_cpu,
.enqueue = exit_enqueue,
.dispatch = exit_dispatch,
.init_task = exit_init_task,
.enable = exit_enable,
.exit = exit_exit,
.init = exit_init,
.select_cpu = (void *) exit_select_cpu,
.enqueue = (void *) exit_enqueue,
.dispatch = (void *) exit_dispatch,
.init_task = (void *) exit_init_task,
.enable = (void *) exit_enable,
.exit = (void *) exit_exit,
.init = (void *) exit_init,
.name = "exit",
.timeout_ms = 1000U,
};

View file

@ -46,16 +46,16 @@ void BPF_STRUCT_OPS_SLEEPABLE(hotplug_cpu_offline, s32 cpu)
SEC(".struct_ops.link")
struct sched_ext_ops hotplug_cb_ops = {
.cpu_online = hotplug_cpu_online,
.cpu_offline = hotplug_cpu_offline,
.exit = hotplug_exit,
.cpu_online = (void *) hotplug_cpu_online,
.cpu_offline = (void *) hotplug_cpu_offline,
.exit = (void *) hotplug_exit,
.name = "hotplug_cbs",
.timeout_ms = 1000U,
};
SEC(".struct_ops.link")
struct sched_ext_ops hotplug_nocb_ops = {
.exit = hotplug_exit,
.exit = (void *) hotplug_exit,
.name = "hotplug_nocbs",
.timeout_ms = 1000U,
};

View file

@ -45,9 +45,9 @@ void BPF_STRUCT_OPS(cnt_disable, struct task_struct *p)
SEC(".struct_ops.link")
struct sched_ext_ops init_enable_count_ops = {
.init_task = cnt_init_task,
.exit_task = cnt_exit_task,
.enable = cnt_enable,
.disable = cnt_disable,
.init_task = (void *) cnt_init_task,
.exit_task = (void *) cnt_exit_task,
.enable = (void *) cnt_enable,
.disable = (void *) cnt_disable,
.name = "init_enable_count",
};

View file

@ -131,34 +131,34 @@ void BPF_STRUCT_OPS(maximal_exit, struct scx_exit_info *info)
SEC(".struct_ops.link")
struct sched_ext_ops maximal_ops = {
.select_cpu = maximal_select_cpu,
.enqueue = maximal_enqueue,
.dequeue = maximal_dequeue,
.dispatch = maximal_dispatch,
.runnable = maximal_runnable,
.running = maximal_running,
.stopping = maximal_stopping,
.quiescent = maximal_quiescent,
.yield = maximal_yield,
.core_sched_before = maximal_core_sched_before,
.set_weight = maximal_set_weight,
.set_cpumask = maximal_set_cpumask,
.update_idle = maximal_update_idle,
.cpu_acquire = maximal_cpu_acquire,
.cpu_release = maximal_cpu_release,
.cpu_online = maximal_cpu_online,
.cpu_offline = maximal_cpu_offline,
.init_task = maximal_init_task,
.enable = maximal_enable,
.exit_task = maximal_exit_task,
.disable = maximal_disable,
.cgroup_init = maximal_cgroup_init,
.cgroup_exit = maximal_cgroup_exit,
.cgroup_prep_move = maximal_cgroup_prep_move,
.cgroup_move = maximal_cgroup_move,
.cgroup_cancel_move = maximal_cgroup_cancel_move,
.cgroup_set_weight = maximal_cgroup_set_weight,
.init = maximal_init,
.exit = maximal_exit,
.select_cpu = (void *) maximal_select_cpu,
.enqueue = (void *) maximal_enqueue,
.dequeue = (void *) maximal_dequeue,
.dispatch = (void *) maximal_dispatch,
.runnable = (void *) maximal_runnable,
.running = (void *) maximal_running,
.stopping = (void *) maximal_stopping,
.quiescent = (void *) maximal_quiescent,
.yield = (void *) maximal_yield,
.core_sched_before = (void *) maximal_core_sched_before,
.set_weight = (void *) maximal_set_weight,
.set_cpumask = (void *) maximal_set_cpumask,
.update_idle = (void *) maximal_update_idle,
.cpu_acquire = (void *) maximal_cpu_acquire,
.cpu_release = (void *) maximal_cpu_release,
.cpu_online = (void *) maximal_cpu_online,
.cpu_offline = (void *) maximal_cpu_offline,
.init_task = (void *) maximal_init_task,
.enable = (void *) maximal_enable,
.exit_task = (void *) maximal_exit_task,
.disable = (void *) maximal_disable,
.cgroup_init = (void *) maximal_cgroup_init,
.cgroup_exit = (void *) maximal_cgroup_exit,
.cgroup_prep_move = (void *) maximal_cgroup_prep_move,
.cgroup_move = (void *) maximal_cgroup_move,
.cgroup_cancel_move = (void *) maximal_cgroup_cancel_move,
.cgroup_set_weight = (void *) maximal_cgroup_set_weight,
.init = (void *) maximal_init,
.exit = (void *) maximal_exit,
.name = "maximal",
};

View file

@ -29,8 +29,8 @@ bool BPF_STRUCT_OPS(maybe_null_success_yield, struct task_struct *from,
SEC(".struct_ops.link")
struct sched_ext_ops maybe_null_success = {
.dispatch = maybe_null_success_dispatch,
.yield = maybe_null_success_yield,
.enable = maybe_null_running,
.dispatch = (void *) maybe_null_success_dispatch,
.yield = (void *) maybe_null_success_yield,
.enable = (void *) maybe_null_running,
.name = "minimal",
};

View file

@ -19,7 +19,7 @@ void BPF_STRUCT_OPS(maybe_null_fail_dispatch, s32 cpu, struct task_struct *p)
SEC(".struct_ops.link")
struct sched_ext_ops maybe_null_fail = {
.dispatch = maybe_null_fail_dispatch,
.enable = maybe_null_running,
.dispatch = (void *) maybe_null_fail_dispatch,
.enable = (void *) maybe_null_running,
.name = "maybe_null_fail_dispatch",
};

View file

@ -22,7 +22,7 @@ bool BPF_STRUCT_OPS(maybe_null_fail_yield, struct task_struct *from,
SEC(".struct_ops.link")
struct sched_ext_ops maybe_null_fail = {
.yield = maybe_null_fail_yield,
.enable = maybe_null_running,
.yield = (void *) maybe_null_fail_yield,
.enable = (void *) maybe_null_running,
.name = "maybe_null_fail_yield",
};

View file

@ -28,6 +28,6 @@ void BPF_STRUCT_OPS(prog_run_exit, struct scx_exit_info *ei)
SEC(".struct_ops.link")
struct sched_ext_ops prog_run_ops = {
.exit = prog_run_exit,
.exit = (void *) prog_run_exit,
.name = "prog_run",
};

View file

@ -35,6 +35,6 @@ void BPF_STRUCT_OPS(select_cpu_dfl_enqueue, struct task_struct *p,
SEC(".struct_ops.link")
struct sched_ext_ops select_cpu_dfl_ops = {
.enqueue = select_cpu_dfl_enqueue,
.enqueue = (void *) select_cpu_dfl_enqueue,
.name = "select_cpu_dfl",
};

View file

@@ -82,8 +82,8 @@ s32 BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_init_task,
 SEC(".struct_ops.link")
 struct sched_ext_ops select_cpu_dfl_nodispatch_ops = {
-	.select_cpu = select_cpu_dfl_nodispatch_select_cpu,
-	.enqueue = select_cpu_dfl_nodispatch_enqueue,
-	.init_task = select_cpu_dfl_nodispatch_init_task,
+	.select_cpu = (void *) select_cpu_dfl_nodispatch_select_cpu,
+	.enqueue = (void *) select_cpu_dfl_nodispatch_enqueue,
+	.init_task = (void *) select_cpu_dfl_nodispatch_init_task,
 	.name = "select_cpu_dfl_nodispatch",
 };


@@ -35,7 +35,7 @@ s32 BPF_STRUCT_OPS(select_cpu_dispatch_select_cpu, struct task_struct *p,
 SEC(".struct_ops.link")
 struct sched_ext_ops select_cpu_dispatch_ops = {
-	.select_cpu = select_cpu_dispatch_select_cpu,
+	.select_cpu = (void *) select_cpu_dispatch_select_cpu,
 	.name = "select_cpu_dispatch",
 	.timeout_ms = 1000U,
 };


@@ -30,8 +30,8 @@ void BPF_STRUCT_OPS(select_cpu_dispatch_bad_dsq_exit, struct scx_exit_info *ei)
 SEC(".struct_ops.link")
 struct sched_ext_ops select_cpu_dispatch_bad_dsq_ops = {
-	.select_cpu = select_cpu_dispatch_bad_dsq_select_cpu,
-	.exit = select_cpu_dispatch_bad_dsq_exit,
+	.select_cpu = (void *) select_cpu_dispatch_bad_dsq_select_cpu,
+	.exit = (void *) select_cpu_dispatch_bad_dsq_exit,
 	.name = "select_cpu_dispatch_bad_dsq",
 	.timeout_ms = 1000U,
 };


@@ -31,8 +31,8 @@ void BPF_STRUCT_OPS(select_cpu_dispatch_dbl_dsp_exit, struct scx_exit_info *ei)
 SEC(".struct_ops.link")
 struct sched_ext_ops select_cpu_dispatch_dbl_dsp_ops = {
-	.select_cpu = select_cpu_dispatch_dbl_dsp_select_cpu,
-	.exit = select_cpu_dispatch_dbl_dsp_exit,
+	.select_cpu = (void *) select_cpu_dispatch_dbl_dsp_select_cpu,
+	.exit = (void *) select_cpu_dispatch_dbl_dsp_exit,
 	.name = "select_cpu_dispatch_dbl_dsp",
 	.timeout_ms = 1000U,
 };


@@ -81,12 +81,12 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(select_cpu_vtime_init)
 SEC(".struct_ops.link")
 struct sched_ext_ops select_cpu_vtime_ops = {
-	.select_cpu = select_cpu_vtime_select_cpu,
-	.dispatch = select_cpu_vtime_dispatch,
-	.running = select_cpu_vtime_running,
-	.stopping = select_cpu_vtime_stopping,
-	.enable = select_cpu_vtime_enable,
-	.init = select_cpu_vtime_init,
+	.select_cpu = (void *) select_cpu_vtime_select_cpu,
+	.dispatch = (void *) select_cpu_vtime_dispatch,
+	.running = (void *) select_cpu_vtime_running,
+	.stopping = (void *) select_cpu_vtime_stopping,
+	.enable = (void *) select_cpu_vtime_enable,
+	.init = (void *) select_cpu_vtime_init,
 	.name = "select_cpu_vtime",
 	.timeout_ms = 1000U,
 };
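For orientation, the select_cpu_vtime test exercises virtual-time-ordered dispatch. A hedged sketch of what such an enqueue/init pair typically looks like, assuming the scx kfuncs of this kernel series (VTIME_DSQ is an illustrative DSQ id, not code from this diff):

#define VTIME_DSQ 0 /* illustrative user DSQ id, created in .init below */

void BPF_STRUCT_OPS(sketch_enqueue, struct task_struct *p, u64 enq_flags)
{
	/* Insert into a DSQ ordered by the task's vtime rather than FIFO. */
	scx_bpf_dispatch_vtime(p, VTIME_DSQ, SCX_SLICE_DFL,
			       p->scx.dsq_vtime, enq_flags);
}

s32 BPF_STRUCT_OPS_SLEEPABLE(sketch_init)
{
	/* Create the custom DSQ on any NUMA node (-1). */
	return scx_bpf_create_dsq(VTIME_DSQ, -1);
}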


@@ -1522,6 +1522,45 @@ static bool test_copy_vma(void)
 	return true;
 }
 
+static bool test_expand_only_mode(void)
+{
+	unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
+	struct mm_struct mm = {};
+	VMA_ITERATOR(vmi, &mm, 0);
+	struct vm_area_struct *vma_prev, *vma;
+	VMG_STATE(vmg, &mm, &vmi, 0x5000, 0x9000, flags, 5);
+
+	/*
+	 * Place a VMA prior to the one we're expanding so we assert that we do
+	 * not erroneously try to traverse to the previous VMA even though we
+	 * have, through the use of VMG_FLAG_JUST_EXPAND, indicated we do not
+	 * need to do so.
+	 */
+	alloc_and_link_vma(&mm, 0, 0x2000, 0, flags);
+
+	/*
+	 * We will be positioned at the prev VMA, but looking to expand to
+	 * 0x9000.
+	 */
+	vma_iter_set(&vmi, 0x3000);
+	vma_prev = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags);
+	vmg.prev = vma_prev;
+	vmg.merge_flags = VMG_FLAG_JUST_EXPAND;
+
+	vma = vma_merge_new_range(&vmg);
+	ASSERT_NE(vma, NULL);
+	ASSERT_EQ(vma, vma_prev);
+	ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
+	ASSERT_EQ(vma->vm_start, 0x3000);
+	ASSERT_EQ(vma->vm_end, 0x9000);
+	ASSERT_EQ(vma->vm_pgoff, 3);
+	ASSERT_TRUE(vma_write_started(vma));
+	ASSERT_EQ(vma_iter_addr(&vmi), 0x3000);
+
+	cleanup_mm(&mm, &vmi);
+	return true;
+}
+
 int main(void)
 {
 	int num_tests = 0, num_fail = 0;
@@ -1553,6 +1592,7 @@ int main(void)
 	TEST(vmi_prealloc_fail);
 	TEST(merge_extend);
 	TEST(copy_vma);
+	TEST(expand_only_mode);
 
 #undef TEST
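For readers outside this file: TEST(name) in this harness is a token-pasting macro, so TEST(expand_only_mode) invokes the new test_expand_only_mode() above and tallies its result into num_tests/num_fail. A sketch of the shape such a macro takes (reconstructed for illustration; the exact failure message is an assumption, not part of this diff):

#define TEST(name)							\
	do {								\
		num_tests++;						\
		if (!test_##name()) {					\
			num_fail++;					\
			fprintf(stderr, "Failed test: " #name "\n");	\
		}							\
	} while (0)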