Compare commits

...

65 commits

Author SHA1 Message Date
Rana Dolui
02e618a319
Merge e2cab215a5 into 0fc810ae3a 2024-10-31 15:03:10 +00:00
Linus Torvalds
0fc810ae3a x86/uaccess: Avoid barrier_nospec() in 64-bit copy_from_user()
The barrier_nospec() in 64-bit copy_from_user() is slow. Instead use
pointer masking to force the user pointer to all 1's for an invalid
address.
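
Roughly, the masking idea is (an illustrative C sketch only; the helper
name and the TASK_SIZE_MAX bound are stand-ins, the real code does this
branchlessly in inline asm):

  /* If the pointer is not a valid user address, force it to all 1's so
   * the access faults, instead of paying for a barrier_nospec(). */
  static inline unsigned long mask_user_ptr(unsigned long ptr)
  {
          return ptr > TASK_SIZE_MAX ? ~0UL : ptr;
  }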

The kernel test robot reports a 2.6% improvement in the per_thread_ops
benchmark [1].

This is a variation on a patch originally by Josh Poimboeuf [2].

Link: https://lore.kernel.org/202410281344.d02c72a2-oliver.sang@intel.com [1]
Link: https://lore.kernel.org/5b887fe4c580214900e21f6c61095adf9a142735.1730166635.git.jpoimboe@kernel.org [2]
Tested-and-reviewed-by: Josh Poimboeuf <jpoimboe@kernel.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2024-10-30 11:38:10 -10:00
Linus Torvalds
14b7d43c5c perf tools fixes for v6.12: 2nd batch
- Update more header copies with the kernel sources, including const.h,
   msr-index.h, arm64's cputype.h, kvm's, bits.h and unaligned.h
 
 - The return from 'write' isn't a pid, fix cut'n'paste error in 'perf
   trace'.
 
 - Fix up the python binding build on architectures without
   HAVE_KVM_STAT_SUPPORT.
 
 - Add some more bounds checks to augmented_raw_syscalls.bpf.c (used to
   collect syscall pointer arguments in 'perf trace') to make the
   resulting bytecode pass the kernel BPF verifier, allowing us to go
   back to accepting clang 12.0.1 as the minimum version required for
   compiling BPF sources.
 
 - Add __NR_capget for x86 to fix a regression when running perf + Intel PT
   (hw tracing) as non-root with the capabilities set up as described in
   https://www.kernel.org/doc/html/latest/admin-guide/perf-security.html.
 
 - Fix missing syscalltbl in non-explicitly listed architectures, noticed
   on ARM 32-bit, which still needs a .tbl generator for the syscall
   id<->name tables; that should be added for v6.13.
 
 - Handle 'perf test' failure when handling broken DWARF for ASM files.
 
 Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
 -----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQR2GiIUctdOfX2qHhGyPKLppCJ+JwUCZyKQVQAKCRCyPKLppCJ+
 JxZKAQCOU0YgvvQ0LH6PfB9uGqRC/zOEHp9CnXxTK17rpKD/iAD/YYvH97Rrfx2V
 H5FdoyK7OtFrkV8WhNcKMKHFfBMl8Ac=
 =XDkJ
 -----END PGP SIGNATURE-----

Merge tag 'perf-tools-fixes-for-v6.12-2-2024-10-30' of git://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools

Pull perf tools fixes from Arnaldo Carvalho de Melo:

 - Update more header copies with the kernel sources, including const.h,
   msr-index.h, arm64's cputype.h, kvm's, bits.h and unaligned.h

 - The return from 'write' isn't a pid, fix cut'n'paste error in 'perf
   trace'

 - Fix up the python binding build on architectures without
   HAVE_KVM_STAT_SUPPORT

 - Add some more bounds checks to augmented_raw_syscalls.bpf.c (used to
   collect syscall pointer arguments in 'perf trace') to make the
   resulting bytecode pass the kernel BPF verifier, allowing us to go
   back to accepting clang 12.0.1 as the minimum version required for
   compiling BPF sources

 - Add __NR_capget for x86 to fix a regression when running perf + Intel
   PT (hw tracing) as non-root with the capabilities set up as described
   in https://www.kernel.org/doc/html/latest/admin-guide/perf-security.html

 - Fix missing syscalltbl in non-explicitly listed architectures,
   noticed on ARM 32-bit, which still needs a .tbl generator for the
   syscall id<->name tables; that should be added for v6.13

 - Handle 'perf test' failure when handling broken DWARF for ASM files

* tag 'perf-tools-fixes-for-v6.12-2-2024-10-30' of git://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools:
  perf cap: Add __NR_capget to arch/x86 unistd
  tools headers: Update the linux/unaligned.h copy with the kernel sources
  tools headers arm64: Sync arm64's cputype.h with the kernel sources
  tools headers: Synchronize {uapi/}linux/bits.h with the kernel sources
  tools arch x86: Sync the msr-index.h copy with the kernel sources
  perf python: Fix up the build on architectures without HAVE_KVM_STAT_SUPPORT
  perf test: Handle perftool-testsuite_probe failure due to broken DWARF
  tools headers UAPI: Sync kvm headers with the kernel sources
  perf trace: Fix non-listed archs in the syscalltbl routines
  perf build: Change the clang check back to 12.0.1
  perf trace augmented_raw_syscalls: Add more checks to pass the verifier
  perf trace augmented_raw_syscalls: Add extra array index bounds checking to satisfy some BPF verifiers
  perf trace: The return from 'write' isn't a pid
  tools headers UAPI: Sync linux/const.h with the kernel headers
2024-10-30 11:17:47 -10:00
Linus Torvalds
4236f91380 SCSI fixes on 20241030
Two small fixes, both in drivers (ufs and scsi_debug).
 
 Signed-off-by: James E.J. Bottomley <James.Bottomley@HansenPartnership.com>
 -----BEGIN PGP SIGNATURE-----
 
 iJwEABMIAEQWIQTnYEDbdso9F2cI+arnQslM7pishQUCZyH+cSYcamFtZXMuYm90
 dG9tbGV5QGhhbnNlbnBhcnRuZXJzaGlwLmNvbQAKCRDnQslM7pishVdMAQDdOiaS
 9DO+ly/Il64wXZqb9WKcVYRIjmz7m7g5xdMgrgEA1yfD6G7GgQ3zvbVPNC7Y9ecr
 4O2iR5EGAVb1Y7UaEQU=
 =551G
 -----END PGP SIGNATURE-----

Merge tag 'scsi-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/jejb/scsi

Pull SCSI fixes from James Bottomley:
 "Two small fixes, both in drivers (ufs and scsi_debug)"

* tag 'scsi-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/jejb/scsi:
  scsi: ufs: core: Fix another deadlock during RTC update
  scsi: scsi_debug: Fix do_device_access() handling of unexpected SG copy length
2024-10-30 08:16:23 -10:00
Linus Torvalds
c1e939a21e cgroup: Fixes for v6.12-rc5
- cgroup_bpf_release_fn() could saturate system_wq with
   cgrp->bpf.release_work which can then form a circular dependency leading
   to deadlocks. Fix by using a dedicated workqueue. The system_wq's max
   concurrency limit is being increased separately.
 
 - Fix theoretical off-by-one bug when enforcing max cgroup hierarchy depth.
 -----BEGIN PGP SIGNATURE-----
 
 iIQEABYKACwWIQTfIjM1kS57o3GsC/uxYfJx3gVYGQUCZyGCPA4cdGpAa2VybmVs
 Lm9yZwAKCRCxYfJx3gVYGS2MAQDmtRNBlDYl36fiLAsylU4Coz5P0Y4ISmtSWT+c
 zrEUZAD/WKSlCfy4RFngmnfkYbrJ+tWOVTMtsDqby8IzYLDGBw8=
 =glRQ
 -----END PGP SIGNATURE-----

Merge tag 'cgroup-for-6.12-rc5-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup fixes from Tejun Heo:

 - cgroup_bpf_release_fn() could saturate system_wq with
   cgrp->bpf.release_work which can then form a circular dependency
   leading to deadlocks. Fix by using a dedicated workqueue. The
   system_wq's max concurrency limit is being increased separately.

 - Fix theoretical off-by-one bug when enforcing max cgroup hierarchy
   depth

* tag 'cgroup-for-6.12-rc5-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  cgroup: Fix potential overflow issue when checking max_depth
  cgroup/bpf: use a dedicated workqueue for cgroup bpf destruction
2024-10-29 16:41:30 -10:00
Linus Torvalds
daa9f66fe1 sched_ext: Fixes for v6.12-rc5
- Instances of scx_ops_bypass() could race each other leading to
   misbehavior. Fix by protecting the operation with a spinlock.
 
 - selftest and userspace header fixes.
 -----BEGIN PGP SIGNATURE-----
 
 iIQEABYKACwWIQTfIjM1kS57o3GsC/uxYfJx3gVYGQUCZyF/5Q4cdGpAa2VybmVs
 Lm9yZwAKCRCxYfJx3gVYGRi+AP4+jGUz+O1LS0bCNj44Xlr0v6kci5dfJR7TlBv5
 hwROcgEA84i7nRq6oJ1IkK7ItLbZYwgZyxqdn0Pgsq+oMWhgAwE=
 =R766
 -----END PGP SIGNATURE-----

Merge tag 'sched_ext-for-6.12-rc5-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext

Pull sched_ext fixes from Tejun Heo:

 - Instances of scx_ops_bypass() could race each other leading to
   misbehavior. Fix by protecting the operation with a spinlock.

 - selftest and userspace header fixes

* tag 'sched_ext-for-6.12-rc5-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext:
  sched_ext: Fix enq_last_no_enq_fails selftest
  sched_ext: Make cast_mask() inline
  scx: Fix raciness in scx_ops_bypass()
  scx: Fix exit selftest to use custom DSQ
  sched_ext: Fix function pointer type mismatches in BPF selftests
  selftests/sched_ext: add order-only dependency of runner.o on BPFOBJ
2024-10-29 16:35:40 -10:00
Linus Torvalds
7fbaacafbc slab fixes for 6.12-rc6
-----BEGIN PGP SIGNATURE-----
 
 iQEzBAABCAAdFiEEe7vIQRWZI0iWSE3xu+CwddJFiJoFAmcgrxcACgkQu+CwddJF
 iJrq9ggAiZ/2c7p23s52LdVhT9GTyV5omVOh2kDztVx4w6RM3RbkhkLWdqt0XUag
 uf1TJe6kOvnCeHEFEEo3sqPj820XebxKDf0GGCdI6a9f4n30ipKH+vWSQ0iutKO/
 dOBdArxr0FGOV5VZR9i3xQ6sUqZXXUbJdte0c0ovp6Q6HDHTeQeKNhOQ2fv33TG/
 7jBh5HVyhI6JE/+TOxrMaklH0IqYBb6z49wdbaN7XBvXVXlb5MtOZy109gfUHDwe
 tfktifyE45VtmF0WdHfxDbCnqyDSG1Jm3wsLDbMq+voJ1BQlUvIZ5Dv4kucYqffm
 VN5HkH6uQ09aoounBoU4g50UYeNpiQ==
 =xAw8
 -----END PGP SIGNATURE-----

Merge tag 'slab-for-6.12-rc6' of git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab

Pull slab fixes from Vlastimil Babka:

 - Fix for a slub_kunit test warning with MEM_ALLOC_PROFILING_DEBUG (Pei
   Xiao)

 - Fix for a MTE-based KASAN BUG in krealloc() (Qun-Wei Lin)

* tag 'slab-for-6.12-rc6' of git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab:
  mm: krealloc: Fix MTE false alarm in __do_krealloc
  slub/kunit: fix a WARNING due to unwrapped __kmalloc_cache_noprof
2024-10-29 16:24:02 -10:00
Linus Torvalds
9251e3e93c 21 hotfixes. 13 are cc:stable. 13 are MM and 8 are non-MM.
No particular theme here - mainly singletons, a couple of doubletons.
 Please see the changelogs.
 -----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQTTMBEPP41GrTpTJgfdBJ7gKXxAjgUCZyBpwAAKCRDdBJ7gKXxA
 jt9XAPsEfjtMc6wtcII5zXLXbLbznnCenaX0bSOmAHMQsQS63QEAp/JTyjN1rBjm
 DExd7kbYx9ya61fnBLZ2WfEMm0Sbigc=
 =PIza
 -----END PGP SIGNATURE-----

Merge tag 'mm-hotfixes-stable-2024-10-28-21-50' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Pull misc fixes from Andrew Morton:
 "21 hotfixes. 13 are cc:stable. 13 are MM and 8 are non-MM.

  No particular theme here - mainly singletons, a couple of doubletons.
  Please see the changelogs"

* tag 'mm-hotfixes-stable-2024-10-28-21-50' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (21 commits)
  mm: avoid unconditional one-tick sleep when swapcache_prepare fails
  mseal: update mseal.rst
  mm: split critical region in remap_file_pages() and invoke LSMs in between
  selftests/mm: fix deadlock for fork after pthread_create with atomic_bool
  Revert "selftests/mm: replace atomic_bool with pthread_barrier_t"
  Revert "selftests/mm: fix deadlock for fork after pthread_create on ARM"
  tools: testing: add expand-only mode VMA test
  mm/vma: add expand-only VMA merge mode and optimise do_brk_flags()
  resource,kexec: walk_system_ram_res_rev must retain resource flags
  nilfs2: fix kernel bug due to missing clearing of checked flag
  mm: numa_clear_kernel_node_hotplug: Add NUMA_NO_NODE check for node id
  ocfs2: pass u64 to ocfs2_truncate_inline maybe overflow
  mm: shmem: fix data-race in shmem_getattr()
  mm: mark mas allocation in vms_abort_munmap_vmas as __GFP_NOFAIL
  x86/traps: move kmsan check after instrumentation_begin
  resource: remove dependency on SPARSEMEM from GET_FREE_REGION
  mm/mmap: fix race in mmap_region() with ftruncate()
  mm/page_alloc: let GFP_ATOMIC order-0 allocs access highatomic reserves
  fork: only invoke khugepaged, ksm hooks if no error
  fork: do not invoke uffd on fork if error occurs
  ...
2024-10-29 16:19:15 -10:00
Linus Torvalds
d5b2ee0fe8 Hi
Addresses a significant boot-time delay issue:
 
 https://bugzilla.kernel.org/show_bug.cgi?id=219229
 
 BR, Jarkko
 -----BEGIN PGP SIGNATURE-----
 
 iIgEABYKADAWIQRE6pSOnaBC00OEHEIaerohdGur0gUCZyAhHxIcamFya2tvQGtl
 cm5lbC5vcmcACgkQGnq6IXRrq9IdzQEAw8uoorY2IHBLFFyvYPebfHAZq5rPci8x
 eu1606gzODsBAIddSR4tgzFfqm0JVeh5vBa85wIZ43rRbUcpscrgEN8B
 =J+AF
 -----END PGP SIGNATURE-----

Merge tag 'tpmdd-next-6.12-rc6' of git://git.kernel.org/pub/scm/linux/kernel/git/jarkko/linux-tpmdd

Pull tpm fix from Jarkko Sakkinen:
 "Address a significant boot-time delay issue"

Link: https://bugzilla.kernel.org/show_bug.cgi?id=219229

* tag 'tpmdd-next-6.12-rc6' of git://git.kernel.org/pub/scm/linux/kernel/git/jarkko/linux-tpmdd:
  tpm: Lazily flush the auth session
  tpm: Rollback tpm2_load_null()
  tpm: Return tpm2_sessions_init() when null key creation fails
2024-10-29 16:04:24 -10:00
Qun-Wei Lin
704573851b mm: krealloc: Fix MTE false alarm in __do_krealloc
This patch addresses an issue introduced by commit 1a83a716ec ("mm:
krealloc: consider spare memory for __GFP_ZERO") which causes MTE
(Memory Tagging Extension) to falsely report a slab-out-of-bounds error.

The problem occurs when zeroing out spare memory in __do_krealloc. The
original code only considered software-based KASAN and did not account
for MTE. It does not reset the KASAN tag before calling memset, leading
to a mismatch between the pointer tag and the memory tag, resulting
in a false positive.

Example of the error:
==================================================================
swapper/0: BUG: KASAN: slab-out-of-bounds in __memset+0x84/0x188
swapper/0: Write at addr f4ffff8005f0fdf0 by task swapper/0/1
swapper/0: Pointer tag: [f4], memory tag: [fe]
swapper/0:
swapper/0: CPU: 4 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.12.
swapper/0: Hardware name: MT6991(ENG) (DT)
swapper/0: Call trace:
swapper/0:  dump_backtrace+0xfc/0x17c
swapper/0:  show_stack+0x18/0x28
swapper/0:  dump_stack_lvl+0x40/0xa0
swapper/0:  print_report+0x1b8/0x71c
swapper/0:  kasan_report+0xec/0x14c
swapper/0:  __do_kernel_fault+0x60/0x29c
swapper/0:  do_bad_area+0x30/0xdc
swapper/0:  do_tag_check_fault+0x20/0x34
swapper/0:  do_mem_abort+0x58/0x104
swapper/0:  el1_abort+0x3c/0x5c
swapper/0:  el1h_64_sync_handler+0x80/0xcc
swapper/0:  el1h_64_sync+0x68/0x6c
swapper/0:  __memset+0x84/0x188
swapper/0:  btf_populate_kfunc_set+0x280/0x3d8
swapper/0:  __register_btf_kfunc_id_set+0x43c/0x468
swapper/0:  register_btf_kfunc_id_set+0x48/0x60
swapper/0:  register_nf_nat_bpf+0x1c/0x40
swapper/0:  nf_nat_init+0xc0/0x128
swapper/0:  do_one_initcall+0x184/0x464
swapper/0:  do_initcall_level+0xdc/0x1b0
swapper/0:  do_initcalls+0x70/0xc0
swapper/0:  do_basic_setup+0x1c/0x28
swapper/0:  kernel_init_freeable+0x144/0x1b8
swapper/0:  kernel_init+0x20/0x1a8
swapper/0:  ret_from_fork+0x10/0x20
==================================================================
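
The implied fix is to reset the KASAN tag before zeroing the spare
region; a minimal sketch (simplified, not the exact __do_krealloc()
hunk, variable names approximate):

  /* Zero the spare memory through a tag-reset pointer so the memset's
   * pointer tag matches the memory tag under MTE as well as under
   * software KASAN. */
  if ((flags & __GFP_ZERO) && new_size < ks)
          memset(kasan_reset_tag(p) + new_size, 0, ks - new_size);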

Fixes: 1a83a716ec ("mm: krealloc: consider spare memory for __GFP_ZERO")
Signed-off-by: Qun-Wei Lin <qun-wei.lin@mediatek.com>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
2024-10-29 10:40:53 +01:00
Barry Song
01626a1823 mm: avoid unconditional one-tick sleep when swapcache_prepare fails
Commit 13ddaf26be ("mm/swap: fix race when skipping swapcache")
introduced an unconditional one-tick sleep when `swapcache_prepare()`
fails, which has led to reports of UI stuttering on latency-sensitive
Android devices.  To address this, we can use a waitqueue to wake up tasks
that fail `swapcache_prepare()` sooner, instead of always sleeping for a
full tick.  While tasks may occasionally be woken by an unrelated
`do_swap_page()`, this method is preferable to two scenarios: rapid
re-entry into page faults, which can cause livelocks, and multiple
millisecond sleeps, which visibly degrade user experience.

Oven's testing shows that a single waitqueue resolves the UI stuttering
issue.  If a 'thundering herd' problem becomes apparent later, a waitqueue
hash similar to `folio_wait_table[PAGE_WAIT_TABLE_SIZE]` for page bit
locks can be introduced.
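
A minimal sketch of the waitqueue approach (simplified; the real call
sites are in do_swap_page() and the swap-in completion path):

  static DECLARE_WAIT_QUEUE_HEAD(swapcache_wq);

  /* Loser of the swapcache_prepare() race: sleep, but allow an early
   * wakeup instead of always burning a full tick. */
  DEFINE_WAIT(wait);
  add_wait_queue(&swapcache_wq, &wait);
  schedule_timeout_uninterruptible(1);
  remove_wait_queue(&swapcache_wq, &wait);

  /* Winner: once the swap-in is finished, wake any waiters early. */
  if (waitqueue_active(&swapcache_wq))
          wake_up(&swapcache_wq);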

[v-songbaohua@oppo.com: wake_up only when swapcache_wq waitqueue is active]
  Link: https://lkml.kernel.org/r/20241008130807.40833-1-21cnbao@gmail.com
Link: https://lkml.kernel.org/r/20240926211936.75373-1-21cnbao@gmail.com
Fixes: 13ddaf26be ("mm/swap: fix race when skipping swapcache")
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
Reported-by: Oven Liyang <liyangouwen1@oppo.com>
Tested-by: Oven Liyang <liyangouwen1@oppo.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:41 -07:00
Jeff Xu
1834300798 mseal: update mseal.rst
Pedro Falcato's optimization [1] for checking sealed VMAs, which replaces
the can_modify_mm() function with an in-loop check, necessitates an update
to the mseal.rst documentation to reflect this change.

Furthermore, the document has received offline comments regarding the code
sample and suggestions for sentence clarification to enhance reader
comprehension.

[1] https://lore.kernel.org/linux-mm/20240817-mseal-depessimize-v3-0-d8d2e037df30@gmail.com/

Update the doc after the in-loop change: mprotect/madvise can now be
partially applied, while munmap remains atomic.

Fix indentation and clarify some sections to improve readability.

Link: https://lkml.kernel.org/r/20241008040942.1478931-2-jeffxu@chromium.org
Fixes: df2a7df9a9 ("mm/munmap: replace can_modify_mm with can_modify_vma")
Fixes: 4a2dd02b09 ("mm/mprotect: replace can_modify_mm with can_modify_vma")
Fixes: 38075679b5 ("mm/mremap: replace can_modify_mm with can_modify_vma")
Fixes: 23c57d1fa2 ("mseal: replace can_modify_mm_madv with a vma variant")
Signed-off-by: Jeff Xu <jeffxu@chromium.org>
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Elliott Hughes <enh@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guenter Roeck <groeck@chromium.org>
Cc: Jann Horn <jannh@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Jorge Lucangeli Obes <jorgelo@chromium.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Muhammad Usama Anjum <usama.anjum@collabora.com>
Cc: Pedro Falcato <pedro.falcato@gmail.com>
Cc: Stephen Röttger <sroettger@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: "Theo de Raadt" <deraadt@openbsd.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:41 -07:00
Kirill A. Shutemov
58a039e679 mm: split critical region in remap_file_pages() and invoke LSMs in between
Commit ea7e2d5e49 ("mm: call the security_mmap_file() LSM hook in
remap_file_pages()") fixed a security issue: it added an LSM check when
trying to remap file pages, so that LSMs have the opportunity to evaluate
such an action, as they do for other memory operations such as mmap() and
mprotect().

However, that commit called security_mmap_file() while holding the
mmap_lock, whereas the other callers do it before taking the lock, ever
since commit 8b3ec6814c ("take security_mmap_file() outside of
->mmap_sem").

This caused a lock inversion issue with IMA, which was taking the mmap_lock
and i_mutex lock in the opposite order when the remap_file_pages() system
call was called.

Solve the issue by splitting the critical region in remap_file_pages() into
two regions: the first takes a read lock of mmap_lock, retrieves the VMA
and the file descriptor associated, and calculates the 'prot' and 'flags'
variables; the second takes a write lock on mmap_lock, checks that the VMA
flags and the VMA file descriptor are the same as the ones obtained in the
first critical region (otherwise the system call fails), and calls
do_mmap().

In between, after releasing the read lock and before taking the write
lock, call security_mmap_file(), and solve the lock inversion issue.
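
A rough sketch of the resulting control flow (heavily simplified; local
variable names and the do_mmap() argument list are approximate):

  mmap_read_lock(mm);                              /* critical region 1 */
  vma = vma_lookup(mm, start);
  file = get_file(vma->vm_file);
  /* compute prot and flags from the VMA */
  mmap_read_unlock(mm);

  ret = security_mmap_file(file, prot, flags);     /* LSM check, no mmap_lock held */
  if (!ret) {
          mmap_write_lock(mm);                     /* critical region 2 */
          vma = vma_lookup(mm, start);
          if (!vma || vma->vm_file != file || vma->vm_flags != saved_vm_flags)
                  ret = -EINVAL;                   /* VMA changed in between: fail */
          else
                  ret = do_mmap(file, start, size, prot, flags, 0, pgoff,
                                &populate, NULL);
          mmap_write_unlock(mm);
  }
  fput(file);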

Link: https://lkml.kernel.org/r/20241018161415.3845146-1-roberto.sassu@huaweicloud.com
Fixes: ea7e2d5e49 ("mm: call the security_mmap_file() LSM hook in remap_file_pages()")
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Roberto Sassu <roberto.sassu@huawei.com>
Reported-by: syzbot+1cd571a672400ef3a930@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/linux-security-module/66f7b10e.050a0220.46d20.0036.GAE@google.com/
Tested-by: Roberto Sassu <roberto.sassu@huawei.com>
Reviewed-by: Roberto Sassu <roberto.sassu@huawei.com>
Reviewed-by: Jann Horn <jannh@google.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
Reviewed-by: Paul Moore <paul@paul-moore.com>
Tested-by: syzbot+1cd571a672400ef3a930@syzkaller.appspotmail.com
Cc: Jarkko Sakkinen <jarkko@kernel.org>
Cc: Dmitry Kasatkin <dmitry.kasatkin@gmail.com>
Cc: Eric Snowberg <eric.snowberg@oracle.com>
Cc: James Morris <jmorris@namei.org>
Cc: Mimi Zohar <zohar@linux.ibm.com>
Cc: "Serge E. Hallyn" <serge@hallyn.com>
Cc: Shu Han <ebpqwerty472123@gmail.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:41 -07:00
Edward Liaw
f2330b650e selftests/mm: fix deadlock for fork after pthread_create with atomic_bool
Some additional synchronization is needed on Android ARM64; we see a
deadlock with pthread_create when the parent thread races forward before
the child has a chance to start doing work.

Link: https://lkml.kernel.org/r/20241018171734.2315053-4-edliaw@google.com
Fixes: cff2945827 ("selftests/mm: extend and rename uffd pagemap test")
Signed-off-by: Edward Liaw <edliaw@google.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:41 -07:00
Edward Liaw
3673167a3a Revert "selftests/mm: replace atomic_bool with pthread_barrier_t"
This reverts commit e61ef21e27.

uffd_poll_thread may be called by other tests that do not initialize the
pthread_barrier, so this approach is not correct.  This will revert to
using atomic_bool instead.

Link: https://lkml.kernel.org/r/20241018171734.2315053-3-edliaw@google.com
Fixes: e61ef21e27 ("selftests/mm: replace atomic_bool with pthread_barrier_t")
Signed-off-by: Edward Liaw <edliaw@google.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:41 -07:00
Edward Liaw
5bb1f4c934 Revert "selftests/mm: fix deadlock for fork after pthread_create on ARM"
Patch series "selftests/mm: revert pthread_barrier change"

On Android arm, pthread_create followed by a fork caused a deadlock in
the case where the fork required work to be completed by the created
thread.

The previous patches incorrectly assumed that the parent would
always initialize the pthread_barrier for the child thread.  This
reverts the change and replaces the fix for wp-fork-with-event with the
original use of atomic_bool.


This patch (of 3):

This reverts commit e142cc87ac.

fork_event_consumer may be called by other tests that do not initialize
the pthread_barrier, so this approach is not correct.  The subsequent
patch will revert to using atomic_bool instead.

Link: https://lkml.kernel.org/r/20241018171734.2315053-1-edliaw@google.com
Link: https://lkml.kernel.org/r/20241018171734.2315053-2-edliaw@google.com
Fixes: e142cc87ac ("fix deadlock for fork after pthread_create on ARM")
Signed-off-by: Edward Liaw <edliaw@google.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:40 -07:00
Lorenzo Stoakes
e8133a7799 tools: testing: add expand-only mode VMA test
Add a test to assert that VMG_FLAG_JUST_EXPAND functions as expected - that
is, when the VMA iterator is positioned at the previous VMA and no VMAs
follow it, we observe an expansion with all state as expected.

Explicitly place a prior VMA that would otherwise fail this test if the
mode were not enabled (as it would traverse to the previous-previous VMA).

Link: https://lkml.kernel.org/r/d2f88330254a6448092412bf7dfe077a579ab0dc.1729174352.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Jann Horn <jannh@google.com>
Cc: kernel test robot <oliver.sang@intel.com>
Cc: Liam R. Howlett <Liam.Howlett@Oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:40 -07:00
Lorenzo Stoakes
c4d91e225f mm/vma: add expand-only VMA merge mode and optimise do_brk_flags()
Patch series "introduce VMA merge mode to improve brk() performance".

A ~5% performance regression was discovered on the
aim9.brk_test.ops_per_sec by the linux kernel test bot [0].

In the past to satisfy brk() performance we duplicated VMA expansion code
and special-cased do_brk_flags().  This is however horrid and undoes work
to abstract this logic, so in resolving the issue I have endeavoured to
avoid this.

Investigating further, I was able to observe that the use of a
vma_iter_next_range() and vma_prev() pair causes an unnecessary maple
tree walk.  In addition, there is work that we do that is simply
unnecessary for brk().

Therefore, add a special VMA merge mode VMG_FLAG_JUST_EXPAND to avoid
doing any of this - it assumes the VMA iterator is pointing at the
previous VMA and skips logic that brk() does not require.

This mostly eliminates the performance regression, reducing it to ~2%, which
is in the realm of noise.  In addition, the will-it-scale test brk2,
written to be more representative of real-world brk() usage, shows a
modest performance improvement - which gives me confidence that we are not
meaningfully regressing real workloads here.

This series includes a test asserting that the 'just expand' mode works as
expected.

With many thanks to Oliver Sang for helping with performance testing of
candidate patch sets!

[0]:https://lore.kernel.org/linux-mm/202409301043.629bea78-oliver.sang@intel.com


This patch (of 2):

We know in advance that do_brk_flags() wants only to perform a VMA
expansion (if the prior VMA is compatible), and that we assume no
mergeable VMA follows it.

These are the semantics of this function prior to the recent rewrite of
the VMA merging logic, however we are now doing more work than necessary -
positioning the VMA iterator at the prior VMA and performing tasks that
are not required.

Add a new field to the vmg struct to permit merge flags and add a new
merge flag VMG_FLAG_JUST_EXPAND which implies this behaviour, and have
do_brk_flags() use this.
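
A minimal sketch of how do_brk_flags() uses the new mode (the field name
and the merge helper are approximate):

  /* The VMA iterator is already positioned at the VMA preceding the brk
   * area, so only try to expand it; skip the next-VMA merge logic. */
  struct vma_merge_struct vmg = {
          .vmi = vmi,
          .prev = vma,
          /* other fields set up as for a normal merge */
          .merge_flags = VMG_FLAG_JUST_EXPAND,
  };

  vma = vma_merge_new_range(&vmg);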

This fixes a reported performance regression in a brk() benchmarking suite.

Link: https://lkml.kernel.org/r/cover.1729174352.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/4e65d4395e5841c5acf8470dbcb714016364fd39.1729174352.git.lorenzo.stoakes@oracle.com
Fixes: cacded5e42 ("mm: avoid using vma_merge() for new VMAs")
Reported-by: kernel test robot <oliver.sang@intel.com>
Closes: https://lore.kernel.org/linux-mm/202409301043.629bea78-oliver.sang@intel.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
Cc: Jann Horn <jannh@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:40 -07:00
Gregory Price
b125a0def2 resource,kexec: walk_system_ram_res_rev must retain resource flags
walk_system_ram_res_rev() erroneously discards resource flags when passing
the information to the callback.

This causes systems with IORESOURCE_SYSRAM_DRIVER_MANAGED memory to have
these resources selected during kexec to store kexec buffers if that
memory happens to be placed above normal system RAM.

This leads to undefined behavior after reboot.  If the kexec buffer is
never touched, nothing happens.  If the kexec buffer is touched, it could
lead to a crash (like below) or undefined behavior.

Tested on a system with CXL memory expanders with driver managed memory,
TPM enabled, and CONFIG_IMA_KEXEC=y.  Adding printk's showed the flags
were being discarded and as a result the check for
IORESOURCE_SYSRAM_DRIVER_MANAGED passes.

find_next_iomem_res: name(System RAM (kmem))
		     start(10000000000)
		     end(1034fffffff)
		     flags(83000200)

locate_mem_hole_top_down: start(10000000000) end(1034fffffff) flags(0)

[.] BUG: unable to handle page fault for address: ffff89834ffff000
[.] #PF: supervisor read access in kernel mode
[.] #PF: error_code(0x0000) - not-present page
[.] PGD c04c8bf067 P4D c04c8bf067 PUD c04c8be067 PMD 0
[.] Oops: 0000 [#1] SMP
[.] RIP: 0010:ima_restore_measurement_list+0x95/0x4b0
[.] RSP: 0018:ffffc900000d3a80 EFLAGS: 00010286
[.] RAX: 0000000000001000 RBX: 0000000000000000 RCX: ffff89834ffff000
[.] RDX: 0000000000000018 RSI: ffff89834ffff000 RDI: ffff89834ffff018
[.] RBP: ffffc900000d3ba0 R08: 0000000000000020 R09: ffff888132b8a900
[.] R10: 4000000000000000 R11: 000000003a616d69 R12: 0000000000000000
[.] R13: ffffffff8404ac28 R14: 0000000000000000 R15: ffff89834ffff000
[.] FS:  0000000000000000(0000) GS:ffff893d44640000(0000) knlGS:0000000000000000
[.] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[.] ata5: SATA link down (SStatus 0 SControl 300)
[.] CR2: ffff89834ffff000 CR3: 000001034d00f001 CR4: 0000000000770ef0
[.] PKRU: 55555554
[.] Call Trace:
[.]  <TASK>
[.]  ? __die+0x78/0xc0
[.]  ? page_fault_oops+0x2a8/0x3a0
[.]  ? exc_page_fault+0x84/0x130
[.]  ? asm_exc_page_fault+0x22/0x30
[.]  ? ima_restore_measurement_list+0x95/0x4b0
[.]  ? template_desc_init_fields+0x317/0x410
[.]  ? crypto_alloc_tfm_node+0x9c/0xc0
[.]  ? init_ima_lsm+0x30/0x30
[.]  ima_load_kexec_buffer+0x72/0xa0
[.]  ima_init+0x44/0xa0
[.]  __initstub__kmod_ima__373_1201_init_ima7+0x1e/0xb0
[.]  ? init_ima_lsm+0x30/0x30
[.]  do_one_initcall+0xad/0x200
[.]  ? idr_alloc_cyclic+0xaa/0x110
[.]  ? new_slab+0x12c/0x420
[.]  ? new_slab+0x12c/0x420
[.]  ? number+0x12a/0x430
[.]  ? sysvec_apic_timer_interrupt+0xa/0x80
[.]  ? asm_sysvec_apic_timer_interrupt+0x16/0x20
[.]  ? parse_args+0xd4/0x380
[.]  ? parse_args+0x14b/0x380
[.]  kernel_init_freeable+0x1c1/0x2b0
[.]  ? rest_init+0xb0/0xb0
[.]  kernel_init+0x16/0x1a0
[.]  ret_from_fork+0x2f/0x40
[.]  ? rest_init+0xb0/0xb0
[.]  ret_from_fork_asm+0x11/0x20
[.]  </TASK>
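
A minimal sketch of the kind of fix implied (names approximate): when
walk_system_ram_res_rev() snapshots the matching resources, it must copy
the flags along with the range instead of leaving them zeroed:

  rams[i].start = res->start;
  rams[i].end   = res->end;
  rams[i].flags = res->flags;     /* previously discarded, callbacks saw 0 */
  i++;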

Link: https://lore.kernel.org/all/20231114091658.228030-1-bhe@redhat.com/
Link: https://lkml.kernel.org/r/20241017190347.5578-1-gourry@gourry.net
Fixes: 7acf164b25 ("resource: add walk_system_ram_res_rev()")
Signed-off-by: Gregory Price <gourry@gourry.net>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Acked-by: Baoquan He <bhe@redhat.com>
Cc: AKASHI Takahiro <takahiro.akashi@linaro.org>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Cc: Mika Westerberg <mika.westerberg@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:40 -07:00
Ryusuke Konishi
41e192ad27 nilfs2: fix kernel bug due to missing clearing of checked flag
Syzbot reported that in directory operations after nilfs2 detects
filesystem corruption and degrades to read-only,
__block_write_begin_int(), which is called to prepare block writes, may
fail the BUG_ON check for accesses exceeding the folio/page size,
triggering a kernel bug.

This was found to be because the "checked" flag of a page/folio was not
cleared when it was discarded by nilfs2's own routine, which causes the
sanity check of directory entries to be skipped when the directory
page/folio is reloaded.  So, fix that.

This was necessary when the use of nilfs2's own page discard routine was
applied to more than just metadata files.

Link: https://lkml.kernel.org/r/20241017193359.5051-1-konishi.ryusuke@gmail.com
Fixes: 8c26c4e269 ("nilfs2: fix issue with flush kernel thread after remount in RO mode because of driver's internal error or metadata corruption")
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Reported-by: syzbot+d6ca2daf692c7a82f959@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=d6ca2daf692c7a82f959
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:40 -07:00
Nobuhiro Iwamatsu
d95fb348f0 mm: numa_clear_kernel_node_hotplug: Add NUMA_NO_NODE check for node id
The memory blocks acquired for the reserved regions may include blocks
outside of memory management.  In this case, the nid variable is set to
NUMA_NO_NODE (-1), so an error occurs in node_set().  This adds a check
using numa_valid_node() to numa_clear_kernel_node_hotplug() that skips
node_set() when nid is set to NUMA_NO_NODE.
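
A minimal sketch of the added check (surrounding context approximate):

  nid = memblock_get_region_node(mb_region);
  if (!numa_valid_node(nid))
          continue;       /* reserved block outside memory management */
  node_set(nid, reserved_nodemask);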

Link: https://lkml.kernel.org/r/1729070461-13576-1-git-send-email-nobuhiro1.iwamatsu@toshiba.co.jp
Fixes: 8748270821 ("mm: introduce numa_memblks")
Signed-off-by: Nobuhiro Iwamatsu <nobuhiro1.iwamatsu@toshiba.co.jp>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Suggested-by: Yuji Ishikawa <yuji2.ishikawa@toshiba.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:40 -07:00
Edward Adam Davis
bc0a2f3a73 ocfs2: pass u64 to ocfs2_truncate_inline maybe overflow
Syzbot reported a kernel BUG in ocfs2_truncate_inline.  There are two
reasons for this: first, the parameter value passed is greater than
ocfs2_max_inline_data_with_xattr; second, the start and end parameters of
ocfs2_truncate_inline are "unsigned int".

So, we need to add a sanity check for byte_start and byte_len right before
ocfs2_truncate_inline() in ocfs2_remove_inode_range(): if they are greater
than ocfs2_max_inline_data_with_xattr, return -EINVAL.
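
A minimal sketch of that sanity check (simplified; the inline-data guard
around it is an assumption):

  if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
          int max_inline = ocfs2_max_inline_data_with_xattr(inode->i_sb, di);

          if (byte_start > max_inline || byte_start + byte_len > max_inline) {
                  ret = -EINVAL;
                  goto out;
          }
  }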

Link: https://lkml.kernel.org/r/tencent_D48DB5122ADDAEDDD11918CFB68D93258C07@qq.com
Fixes: 1afc32b952 ("ocfs2: Write support for inline data")
Signed-off-by: Edward Adam Davis <eadavis@qq.com>
Reported-by: syzbot+81092778aac03460d6b7@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=81092778aac03460d6b7
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Changwei Ge <gechangwei@live.cn>
Cc: Gang He <ghe@suse.com>
Cc: Jun Piao <piaojun@huawei.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:40 -07:00
Jeongjun Park
d949d1d14f mm: shmem: fix data-race in shmem_getattr()
I got the following KCSAN report during syzbot testing:

==================================================================
BUG: KCSAN: data-race in generic_fillattr / inode_set_ctime_current

write to 0xffff888102eb3260 of 4 bytes by task 6565 on cpu 1:
 inode_set_ctime_to_ts include/linux/fs.h:1638 [inline]
 inode_set_ctime_current+0x169/0x1d0 fs/inode.c:2626
 shmem_mknod+0x117/0x180 mm/shmem.c:3443
 shmem_create+0x34/0x40 mm/shmem.c:3497
 lookup_open fs/namei.c:3578 [inline]
 open_last_lookups fs/namei.c:3647 [inline]
 path_openat+0xdbc/0x1f00 fs/namei.c:3883
 do_filp_open+0xf7/0x200 fs/namei.c:3913
 do_sys_openat2+0xab/0x120 fs/open.c:1416
 do_sys_open fs/open.c:1431 [inline]
 __do_sys_openat fs/open.c:1447 [inline]
 __se_sys_openat fs/open.c:1442 [inline]
 __x64_sys_openat+0xf3/0x120 fs/open.c:1442
 x64_sys_call+0x1025/0x2d60 arch/x86/include/generated/asm/syscalls_64.h:258
 do_syscall_x64 arch/x86/entry/common.c:52 [inline]
 do_syscall_64+0x54/0x120 arch/x86/entry/common.c:83
 entry_SYSCALL_64_after_hwframe+0x76/0x7e

read to 0xffff888102eb3260 of 4 bytes by task 3498 on cpu 0:
 inode_get_ctime_nsec include/linux/fs.h:1623 [inline]
 inode_get_ctime include/linux/fs.h:1629 [inline]
 generic_fillattr+0x1dd/0x2f0 fs/stat.c:62
 shmem_getattr+0x17b/0x200 mm/shmem.c:1157
 vfs_getattr_nosec fs/stat.c:166 [inline]
 vfs_getattr+0x19b/0x1e0 fs/stat.c:207
 vfs_statx_path fs/stat.c:251 [inline]
 vfs_statx+0x134/0x2f0 fs/stat.c:315
 vfs_fstatat+0xec/0x110 fs/stat.c:341
 __do_sys_newfstatat fs/stat.c:505 [inline]
 __se_sys_newfstatat+0x58/0x260 fs/stat.c:499
 __x64_sys_newfstatat+0x55/0x70 fs/stat.c:499
 x64_sys_call+0x141f/0x2d60 arch/x86/include/generated/asm/syscalls_64.h:263
 do_syscall_x64 arch/x86/entry/common.c:52 [inline]
 do_syscall_64+0x54/0x120 arch/x86/entry/common.c:83
 entry_SYSCALL_64_after_hwframe+0x76/0x7e

value changed: 0x2755ae53 -> 0x27ee44d3

Reported by Kernel Concurrency Sanitizer on:
CPU: 0 UID: 0 PID: 3498 Comm: udevd Not tainted 6.11.0-rc6-syzkaller-00326-gd1f2d51b711a-dirty #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 08/06/2024
==================================================================

When calling generic_fillattr(), if you don't hold the read lock, a data
race can occur on inode member variables, which can cause unexpected
behavior.

Since there is no special protection when shmem_getattr() calls
generic_fillattr(), a data race can be triggered by functions such as
shmem_unlink() or shmem_mknod().  This can cause unexpected results, so
commenting it out is not enough.

Therefore, when calling generic_fillattr() from shmem_getattr(), it is
appropriate to protect the inode using inode_lock_shared() and
inode_unlock_shared() to prevent data-race.
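
A minimal sketch of the resulting locking in shmem_getattr() (simplified):

  inode_lock_shared(inode);
  generic_fillattr(idmap, request_mask, inode, stat);
  inode_unlock_shared(inode);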

Link: https://lkml.kernel.org/r/20240909123558.70229-1-aha310510@gmail.com
Fixes: 44a30220bc ("shmem: recalculate file inode when fstat")
Signed-off-by: Jeongjun Park <aha310510@gmail.com>
Reported-by: syzbot <syzkaller@googlegroup.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:39 -07:00
Jann Horn
14611508cb mm: mark mas allocation in vms_abort_munmap_vmas as __GFP_NOFAIL
vms_abort_munmap_vmas() is a recovery path where, on entry, some VMAs have
already been torn down halfway (in a way we can't undo) but are still
present in the maple tree.

At this point, we *must* remove the VMAs from the VMA tree, otherwise we
get UAF.

Because removing VMA tree nodes can require memory allocation, the
existing code has an error path which tries to handle this by reattaching
the VMAs; but that can't be done safely.

A nicer way to fix it would probably be to preallocate enough maple tree
nodes for the removal before the point of no return, or something like
that; but for now, fix it the easy and kinda ugly way, by marking this
allocation __GFP_NOFAIL.
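
A minimal sketch of the change (maple-tree state names approximate):

  /* Removing the half-torn-down VMAs from the tree must not fail, so
   * make the node allocation for the store unfailable instead of trying
   * to "reattach" the VMAs on allocation failure. */
  mas_set_range(mas_detach, vms->start, vms->end - 1);
  mas_store_gfp(mas_detach, NULL, GFP_KERNEL | __GFP_NOFAIL);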

Link: https://lkml.kernel.org/r/20241016-fix-munmap-abort-v1-1-601c94b2240d@google.com
Fixes: 4f87153e82 ("mm: change failure of MAP_FIXED to restoring the gap on failure")
Signed-off-by: Jann Horn <jannh@google.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:39 -07:00
Sabyrzhan Tasbolatov
1db272864f x86/traps: move kmsan check after instrumentation_begin
During an x86_64 kernel build with CONFIG_KMSAN, objtool emits the following warning:

  AR      built-in.a
  AR      vmlinux.a
  LD      vmlinux.o
vmlinux.o: warning: objtool: handle_bug+0x4: call to
    kmsan_unpoison_entry_regs() leaves .noinstr.text section
  OBJCOPY modules.builtin.modinfo
  GEN     modules.builtin
  MODPOST Module.symvers
  CC      .vmlinux.export.o

Moving kmsan_unpoison_entry_regs() _after_ instrumentation_begin() fixes
the warning.

decode_bug(regs->ip, &imm) is still left before the KMSAN unpoisoning, but
it has a return condition, and if we move it after
instrumentation_begin() it results in the warning "return with
instrumentation enabled"; hence, I'm concerned that regs will not be KMSAN
unpoisoned if `ud_type == BUG_NONE` is true.
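
A minimal sketch of the reordering (simplified):

  static noinstr void handle_bug(struct pt_regs *regs)
  {
          /* decode_bug(regs->ip, &imm) still runs before this point */
          instrumentation_begin();
          kmsan_unpoison_entry_regs(regs);  /* now after instrumentation_begin() */
          /* report the bug, then instrumentation_end() before returning */
  }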

Link: https://lkml.kernel.org/r/20241016152407.3149001-1-snovitoll@gmail.com
Fixes: ba54d194f8 ("x86/traps: avoid KMSAN bugs originating from handle_bug()")
Signed-off-by: Sabyrzhan Tasbolatov <snovitoll@gmail.com>
Reviewed-by: Alexander Potapenko <glider@google.com>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:39 -07:00
Huang Ying
b7c5f9a1fb resource: remove dependency on SPARSEMEM from GET_FREE_REGION
We want to use the functions (get_free_mem_region()) configured via
GET_FREE_REGION in resource kunit tests.  However, GET_FREE_REGION
depends on SPARSEMEM now.  This means the resource kunit tests cannot be
built on some architectures lacking SPARSEMEM, or it causes a config
warning as follows:

  WARNING: unmet direct dependencies detected for GET_FREE_REGION
  Depends on [n]: SPARSEMEM [=n]
  Selected by [y]:
  - RESOURCE_KUNIT_TEST [=y] && RUNTIME_TESTING_MENU [=y] && KUNIT [=y]

When get_free_mem_region() was introduced the only consumers were those
looking to pass the address range to memremap_pages().  That address
range needed to be mindful of the maximum addressable platform physical
address, which at the time only SPARSEMEM defined via MAX_PHYSMEM_BITS.

Given that memremap_pages() also depended on SPARSEMEM via ZONE_DEVICE,
it was easier to just depend on that definition than invent a general
MAX_PHYSMEM_BITS concept outside of SPARSEMEM.

Turns out that decision was buggy and did not account for KASAN
consumption of physical address space.  That problem was resolved
recently with commit ea72ce5da2 ("x86/kaslr: Expose and use the end
of the physical memory address space"), and GET_FREE_REGION dropped its
MAX_PHYSMEM_BITS dependency.

Then commit 99185c10d5 ("resource, kunit: add test case for
region_intersects()"), went ahead and fixed up the only remaining
dependency on SPARSEMEM which was usage of the PA_SECTION_SHIFT macro
for setting the default alignment.  A PAGE_SIZE fallback is fine in the
SPARSEMEM=n case.

With those build dependencies gone, GET_FREE_REGION no longer depends on
SPARSEMEM.  So, the patch removes the dependency on SPARSEMEM from
GET_FREE_REGION to fix the build issues.

Link: https://lkml.kernel.org/r/20241016014730.339369-1-ying.huang@intel.com
Link: https://lore.kernel.org/lkml/20240922225041.603186-1-linux@roeck-us.net/
Link: https://lkml.kernel.org/r/20241015051554.294734-1-ying.huang@intel.com
Fixes: 99185c10d5 ("resource, kunit: add test case for region_intersects()")
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Co-developed-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Tested-by: Guenter Roeck <linux@roeck-us.net>
Acked-by: David Hildenbrand <david@redhat.com>
Tested-by: Nathan Chancellor <nathan@kernel.org> # build
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Jonathan Cameron <jonathan.cameron@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:39 -07:00
Liam R. Howlett
79f3d123ca mm/mmap: fix race in mmap_region() with ftruncate()
Avoiding the zeroing of the vma tree in mmap_region() introduced a race
with truncate in the page table walk.  To avoid any races, create a hole
in the rmap during the operation by clearing the pagetable entries earlier
under the mmap write lock and (critically) before the new vma is installed
into the vma tree.  The result is that the old vma(s) are left in the vma
tree, but free_pgtables() removes them from the rmap and clears the ptes
while holding the necessary locks.

This change extends the fix required for hugetlbfs and the call_mmap()
function by moving the cleanup higher in the function and running it
unconditionally.

Link: https://lkml.kernel.org/r/20241016013455.2241533-1-Liam.Howlett@oracle.com
Fixes: f8d112a4e6 ("mm/mmap: avoid zeroing vma tree in mmap_region()")
Signed-off-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
Reported-by: Jann Horn <jannh@google.com>
Closes: https://lore.kernel.org/all/CAG48ez0ZpGzxi=-5O_uGQ0xKXOmbjeQ0LjZsRJ1Qtf2X5eOr1w@mail.gmail.com/
Reviewed-by: Jann Horn <jannh@google.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:39 -07:00
Matt Fleming
281dd25c1a mm/page_alloc: let GFP_ATOMIC order-0 allocs access highatomic reserves
Under memory pressure it's possible for GFP_ATOMIC order-0 allocations to
fail even though free pages are available in the highatomic reserves. 
GFP_ATOMIC allocations cannot trigger unreserve_highatomic_pageblock()
since it's only run from reclaim.

Given that such allocations will pass the watermarks in
__zone_watermark_unusable_free(), it makes sense to fallback to highatomic
reserves the same way that ALLOC_OOM can.
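
A rough sketch of the idea (flag and helper names are approximate, not
necessarily the exact hunk):

  /* If the freelists are otherwise exhausted, let non-blocking
   * (GFP_ATOMIC-style) requests use the highatomic reserve, the same
   * way ALLOC_OOM already can. */
  if (!page && (alloc_flags & (ALLOC_OOM | ALLOC_NON_BLOCK)))
          page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);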

This fixes order-0 page allocation failures observed on Cloudflare's fleet
when handling network packets:

  kswapd1: page allocation failure: order:0, mode:0x820(GFP_ATOMIC),
  nodemask=(null),cpuset=/,mems_allowed=0-7
  CPU: 10 PID: 696 Comm: kswapd1 Kdump: loaded Tainted: G           O 6.6.43-CUSTOM #1
  Hardware name: MACHINE
  Call Trace:
   <IRQ>
   dump_stack_lvl+0x3c/0x50
   warn_alloc+0x13a/0x1c0
   __alloc_pages_slowpath.constprop.0+0xc9d/0xd10
   __alloc_pages+0x327/0x340
   __napi_alloc_skb+0x16d/0x1f0
   bnxt_rx_page_skb+0x96/0x1b0 [bnxt_en]
   bnxt_rx_pkt+0x201/0x15e0 [bnxt_en]
   __bnxt_poll_work+0x156/0x2b0 [bnxt_en]
   bnxt_poll+0xd9/0x1c0 [bnxt_en]
   __napi_poll+0x2b/0x1b0
   bpf_trampoline_6442524138+0x7d/0x1000
   __napi_poll+0x5/0x1b0
   net_rx_action+0x342/0x740
   handle_softirqs+0xcf/0x2b0
   irq_exit_rcu+0x6c/0x90
   sysvec_apic_timer_interrupt+0x72/0x90
   </IRQ>

[mfleming@cloudflare.com: update comment]
  Link: https://lkml.kernel.org/r/20241015125158.3597702-1-matt@readmodwrite.com
Link: https://lkml.kernel.org/r/20241011120737.3300370-1-matt@readmodwrite.com
Link: https://lore.kernel.org/all/CAGis_TWzSu=P7QJmjD58WWiu3zjMTVKSzdOwWE8ORaGytzWJwQ@mail.gmail.com/
Fixes: 1d91df85f3 ("mm/page_alloc: handle a missing case for memalloc_nocma_{save/restore} APIs")
Signed-off-by: Matt Fleming <mfleming@cloudflare.com>
Suggested-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:39 -07:00
Lorenzo Stoakes
985da552a9 fork: only invoke khugepaged, ksm hooks if no error
There is no reason to invoke these hooks early against an mm that is in an
incomplete state.

The change in commit d240629148 ("fork: use __mt_dup() to duplicate
maple tree in dup_mmap()") makes this more pertinent as we may be in a
state where entries in the maple tree are not yet consistent.

Their placement early in dup_mmap() only appears to have been meaningful
for early error checking, and since functionally it'd require a very small
allocation to fail (in practice 'too small to fail') that'd only occur in
the most dire circumstances, meaning the fork would fail or be OOM'd in
any case.

Since both khugepaged and KSM tracking are there to provide optimisations
to memory performance rather than critical functionality, it doesn't
really matter all that much if, under such dire memory pressure, we fail
to register an mm with these.

As a result, we follow the example of commit d2081b2bf8 ("mm:
khugepaged: make khugepaged_enter() void function") and make ksm_fork() a
void function also.

We only expose the mm to these functions once we are done with them and
only if no error occurred in the fork operation.

Link: https://lkml.kernel.org/r/e0cb8b840c9d1d5a6e84d4f8eff5f3f2022aa10c.1729014377.git.lorenzo.stoakes@oracle.com
Fixes: d240629148 ("fork: use __mt_dup() to duplicate maple tree in dup_mmap()")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reported-by: Jann Horn <jannh@google.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Jann Horn <jannh@google.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Linus Torvalds <torvalds@linuxfoundation.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:39 -07:00
Lorenzo Stoakes
f64e67e5d3 fork: do not invoke uffd on fork if error occurs
Patch series "fork: do not expose incomplete mm on fork".

During fork we may place the virtual memory address space into an
inconsistent state before the fork operation is complete.

In addition, we may encounter an error during the fork operation that
indicates that the virtual memory address space is invalidated.

As a result, we should not be exposing it in any way to external machinery
that might interact with the mm or VMAs, machinery that is not designed to
deal with incomplete state.

We specifically update the fork logic to defer khugepaged and ksm to the
end of the operation and only to be invoked if no error arose, and
disallow uffd from observing fork events should an error have occurred.


This patch (of 2):

Currently on fork we expose the virtual address space of a process to
userland unconditionally if uffd is registered in VMAs, regardless of
whether an error arose in the fork.

This is performed in dup_userfaultfd_complete() which is invoked
unconditionally, and performs two duties - invoking registered handlers
for the UFFD_EVENT_FORK event via dup_fctx(), and clearing down
userfaultfd_fork_ctx objects established in dup_userfaultfd().

This is problematic, because the virtual address space may not yet be
correctly initialised if an error arose.

The change in commit d240629148 ("fork: use __mt_dup() to duplicate
maple tree in dup_mmap()") makes this more pertinent as we may be in a
state where entries in the maple tree are not yet consistent.

We address this by, on fork error, ensuring that we roll back state that
we would otherwise expect to clean up through the event being handled by
userland and perform the memory freeing duty otherwise performed by
dup_userfaultfd_complete().

We do this by implementing a new function, dup_userfaultfd_fail(), which
performs the same loop, only decrementing reference counts.

Note that we perform mmgrab() on the parent and child mm's, however
userfaultfd_ctx_put() will mmdrop() this once the reference count drops to
zero, so we will avoid memory leaks correctly here.
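
A minimal sketch of what dup_userfaultfd_fail() does (simplified; field
names approximate, and bookkeeping such as mmap_changing is omitted):

  void dup_userfaultfd_fail(struct list_head *fcs)
  {
          struct userfaultfd_fork_ctx *fctx, *n;

          /* Walk the contexts queued by dup_userfaultfd() and drop the
           * references that dup_userfaultfd_complete() would otherwise
           * have handed to the UFFD_EVENT_FORK handlers. */
          list_for_each_entry_safe(fctx, n, fcs, list) {
                  userfaultfd_ctx_put(fctx->orig);
                  userfaultfd_ctx_put(fctx->new);
                  list_del(&fctx->list);
                  kfree(fctx);
          }
  }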

Link: https://lkml.kernel.org/r/cover.1729014377.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/d3691d58bb58712b6fb3df2be441d175bd3cdf07.1729014377.git.lorenzo.stoakes@oracle.com
Fixes: d240629148 ("fork: use __mt_dup() to duplicate maple tree in dup_mmap()")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reported-by: Jann Horn <jannh@google.com>
Reviewed-by: Jann Horn <jannh@google.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Linus Torvalds <torvalds@linuxfoundation.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:38 -07:00
David Hildenbrand
7c18d48110 mm/pagewalk: fix usage of pmd_leaf()/pud_leaf() without present check
pmd_leaf()/pud_leaf() only implies a pmd_present()/pud_present() check on
some architectures.  We really should check for
pmd_present()/pud_present() first.
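
A minimal sketch of the intended check order (simplified):

  pmd_t pmd = pmdp_get_lockless(pmdp);

  if (pmd_present(pmd) && pmd_leaf(pmd)) {
          /* genuine huge/leaf mapping: handle the folio here */
  } else {
          /* non-present (e.g. a migration entry) or a page table: fall
           * through to pte_offset_map_lock(), which rechecks the PMD */
  }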

This should explain the report we got on ppc64 (which has
CONFIG_PGTABLE_HAS_HUGE_LEAVES set in the config) that triggered:
	VM_WARN_ON_ONCE(pmd_leaf(pmdp_get_lockless(pmdp)));

Likely we had a PMD migration entry for which pmd_leaf() did not trigger. 
We raced with restoring the PMD migration entry, and suddenly saw a
pmd_leaf().  In this case, pte_offset_map_lock() saved us from more
trouble, because it rechecks the PMD value, but we would not have
processed the migration entry -- which is not too bad because the only
user of FW_MIGRATION is KSM for unsharing, and KSM only applies to small
folios.

Further, we shouldn't re-read the PMD/PUD value for our warning; the
primary purpose of the VM_WARN_ON_ONCE() is to find spurious use of
pmd_leaf()/pud_leaf() without CONFIG_PGTABLE_HAS_HUGE_LEAVES.

As a side note, we are currently not implementing FW_MIGRATION support for
PUD migration entries, which likely should exist due to hugetlb.  Add a
TODO so this won't fall through the cracks if more FW_MIGRATION users get
added.

I was able to write a quick reproducer and verify that the issue no longer triggers with this fix.

https://gitlab.com/davidhildenbrand/scratchspace/-/blob/main/reproducers/move-pages-pmd-leaf.c

Without this fix after a couple of seconds in a VM with 2 NUMA nodes:

[   54.333753] ------------[ cut here ]------------
[   54.334901] WARNING: CPU: 20 PID: 1704 at mm/pagewalk.c:815 folio_walk_start+0x48f/0x6e0
[   54.336455] Modules linked in: ...
[   54.345009] CPU: 20 UID: 0 PID: 1704 Comm: move-pages-pmd- Not tainted 6.12.0-rc2+ #81
[   54.346529] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-2.fc40 04/01/2014
[   54.348191] RIP: 0010:folio_walk_start+0x48f/0x6e0
[   54.349134] Code: b5 ad 48 8d 35 00 00 00 00 e8 6d 59 d7 ff e8 08 74 da ff e9 9c fe ff ff 4c 8b 7c 24 08 4c 89 ff e8 26 2b be 00 e9 8a fe ff ff <0f> 0b e9 ec fe ff ff f7 c2 ff 0f 00 00 0f 85 81 fe ff ff 48 8b 02
[   54.352660] RSP: 0018:ffffb7e4c430bc78 EFLAGS: 00010282
[   54.353679] RAX: 80000002a3e008e7 RBX: ffff9946039aa580 RCX: ffff994380000000
[   54.355056] RDX: ffff994606aec000 RSI: 00007f004b000000 RDI: 0000000000000000
[   54.356440] RBP: 00007f004b000000 R08: 0000000000000591 R09: 0000000000000001
[   54.357820] R10: 0000000000000200 R11: 0000000000000001 R12: ffffb7e4c430bd10
[   54.359198] R13: ffff994606aec2c0 R14: 0000000000000002 R15: ffff994604a89b00
[   54.360564] FS:  00007f004ae006c0(0000) GS:ffff9947f7400000(0000) knlGS:0000000000000000
[   54.362111] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   54.363242] CR2: 00007f004adffe58 CR3: 0000000281e12005 CR4: 0000000000770ef0
[   54.364615] PKRU: 55555554
[   54.365153] Call Trace:
[   54.365646]  <TASK>
[   54.366073]  ? __warn.cold+0xb7/0x14d
[   54.366796]  ? folio_walk_start+0x48f/0x6e0
[   54.367628]  ? report_bug+0xff/0x140
[   54.368324]  ? handle_bug+0x58/0x90
[   54.369019]  ? exc_invalid_op+0x17/0x70
[   54.369771]  ? asm_exc_invalid_op+0x1a/0x20
[   54.370606]  ? folio_walk_start+0x48f/0x6e0
[   54.371415]  ? folio_walk_start+0x9e/0x6e0
[   54.372227]  do_pages_move+0x1c5/0x680
[   54.372972]  kernel_move_pages+0x1a1/0x2b0
[   54.373804]  __x64_sys_move_pages+0x25/0x30

Link: https://lkml.kernel.org/r/20241015111236.1290921-1-david@redhat.com
Fixes: aa39ca6940 ("mm/pagewalk: introduce folio_walk_start() + folio_walk_end()")
Signed-off-by: David Hildenbrand <david@redhat.com>
Reported-by: syzbot+7d917f67c05066cec295@syzkaller.appspotmail.com
Closes: https://lkml.kernel.org/r/670d3248.050a0220.3e960.0064.GAE@google.com
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Jann Horn <jannh@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:38 -07:00
Jarkko Sakkinen
df745e2509 tpm: Lazily flush the auth session
Move the allocation of chip->auth to tpm2_start_auth_session() so that this
field can be used as a flag to tell whether the auth session is active or not.

Instead of flushing and reloading the auth session for every transaction
separately, keep the session open unless /dev/tpm0 is used.

Reported-by: Pengyu Ma <mapengyu@gmail.com>
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219229
Cc: stable@vger.kernel.org # v6.10+
Fixes: 7ca110f267 ("tpm: Address !chip->auth in tpm_buf_append_hmac_session*()")
Tested-by: Pengyu Ma <mapengyu@gmail.com>
Tested-by: Stefan Berger <stefanb@linux.ibm.com>
Reviewed-by: Stefan Berger <stefanb@linux.ibm.com>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
2024-10-29 00:46:20 +02:00
Ian Rogers
a5384c4267 perf cap: Add __NR_capget to arch/x86 unistd
As there are duplicated kernel headers in tools/include, libc can pick
up the wrong definitions. This was causing the wrong system call to be
used for capget in perf.

Reported-by: Adrian Hunter <adrian.hunter@intel.com>
Fixes: e25ebda78e ("perf cap: Tidy up and improve capability testing")
Closes: https://lore.kernel.org/lkml/cc7d6bdf-1aeb-4179-9029-4baf50b59342@intel.com/
Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/20241026055448.312247-1-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-10-28 13:04:52 -03:00
Arnaldo Carvalho de Melo
55f1b540d8 tools headers: Update the linux/unaligned.h copy with the kernel sources
To pick up the changes in:

  7f053812da ("random: vDSO: minimize and simplify header includes")

That required adding a copy of include/vdso/unaligned.h and checking it
in tools/perf/check-headers.h.

Addressing this perf tools build warning:

  Warning: Kernel ABI header differences:
    diff -u tools/include/linux/unaligned.h include/linux/unaligned.h

Please see tools/include/uapi/README for further details.

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Ian Rogers <irogers@google.com>
Cc: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/lkml/Zx-uHvAbPAESofEN@x1
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-10-28 12:34:28 -03:00
Arnaldo Carvalho de Melo
93e4b86b3e tools headers arm64: Sync arm64's cputype.h with the kernel sources
To get the changes in:

  924725707d ("arm64: cputype: Add Neoverse-N3 definitions")

That causes this perf source code to be rebuilt:

  CC      /tmp/build/perf-tools/util/arm-spe.o

The changes in the above patch add MIDR_NEOVERSE_N3, which probably needs
changes in arm-spe.c, so perhaps we need to add it to that array? Or
maybe we need to leave this for later, when this is all tested on those
machines?

  static const struct midr_range neoverse_spe[] = {
          MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1),
          MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2),
          MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1),
          {},
  };

Mark Rutland recommended about arm-spe.c in a previous update to this
file:

"I would not touch this for now -- someone would have to go audit the
TRMs to check that those other cores have the same encoding, and I think
it'd be better to do that as a follow-up."

That addresses this perf build warning:

  Warning: Kernel ABI header differences:
    diff -u tools/arch/arm64/include/asm/cputype.h arch/arm64/include/asm/cputype.h

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/lkml/Zx-dffKdGsgkhG96@x1
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-10-28 12:33:50 -03:00
Arnaldo Carvalho de Melo
21a3a3d015 tools headers: Synchronize {uapi/}linux/bits.h with the kernel sources
To pick up the changes in this cset:

  947697c6f0 ("uapi: Define GENMASK_U128")

This addresses these perf build warnings:

  Warning: Kernel ABI header differences:
    diff -u tools/include/uapi/linux/bits.h include/uapi/linux/bits.h
    diff -u tools/include/linux/bits.h include/linux/bits.h

Please see tools/include/uapi/README for further details.

Acked-by: Yury Norov <yury.norov@gmail.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/lkml/Zx-ZVH7bHqtFn8Dv@x1
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-10-28 12:32:25 -03:00
Jarkko Sakkinen
cc7d859434 tpm: Rollback tpm2_load_null()
Do not continue on tpm2_create_primary() failure in tpm2_load_null().

Cc: stable@vger.kernel.org # v6.10+
Fixes: eb24c9788c ("tpm: disable the TPM if NULL name changes")
Reviewed-by: Stefan Berger <stefanb@linux.ibm.com>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
2024-10-28 17:22:01 +02:00
Jarkko Sakkinen
d658d59471 tpm: Return tpm2_sessions_init() when null key creation fails
Do not continue tpm2_sessions_init() further if the null key pair creation
fails.

Cc: stable@vger.kernel.org # v6.10+
Fixes: d2add27cf2 ("tpm: Add NULL primary creation")
Reviewed-by: Stefan Berger <stefanb@linux.ibm.com>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
2024-10-28 17:22:01 +02:00
Tejun Heo
c31f2ee5cd sched_ext: Fix enq_last_no_enq_fails selftest
cc9877fb76 ("sched_ext: Improve error reporting during loading") changed
how load failures are reported so that more error context can be
communicated. This breaks the enq_last_no_enq_fails test as attach no longer
fails. The scheduler is guaranteed to be ejected on attach completion with
full error information. Update enq_last_no_enq_fails so that it checks that
the scheduler is ejected using ops.exit().

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Vishal Chourasia <vishalc@linux.ibm.com>
Link: http://lkml.kernel.org/r/Zxknp7RAVNjmdJSc@linux.ibm.com
Fixes: cc9877fb76 ("sched_ext: Improve error reporting during loading")
2024-10-25 12:20:29 -10:00
Tejun Heo
7724abf0ca sched_ext: Make cast_mask() inline
cast_mask() doesn't do any actual work and is defined in a header file.
Force it to be inline. When it is not inlined and the function is not used,
it can cause verification failures like the following:

  # tools/testing/selftests/sched_ext/runner -t minimal
  ===== START =====
  TEST: minimal
  DESCRIPTION: Verify we can load a fully minimal scheduler
  OUTPUT:
  libbpf: prog 'cast_mask': missing BPF prog type, check ELF section name '.text'
  libbpf: prog 'cast_mask': failed to load: -22
  libbpf: failed to load object 'minimal'
  libbpf: failed to load BPF skeleton 'minimal': -22
  ERR: minimal.c:20
  Failed to open and load skel
  not ok 1 minimal #
  =====  END  =====
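
A minimal sketch of the change described above, assuming the helper lives in
the sched_ext common BPF header (the exact file and declaration may differ):

  /* Force inlining so an unused copy never ends up as a standalone
   * program in .text that libbpf then refuses to load. */
  static __always_inline const struct cpumask *cast_mask(struct bpf_cpumask *mask)
  {
  	return (const struct cpumask *)mask;
  }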

Signed-off-by: Tejun Heo <tj@kernel.org>
Fixes: a748db0c8c ("tools/sched_ext: Receive misc updates from SCX repo")
2024-10-25 12:19:44 -10:00
David Vernet
0e7ffff1b8 scx: Fix raciness in scx_ops_bypass()
scx_ops_bypass() can currently race on the ops enable / disable path as
follows:

1. scx_ops_bypass(true) called on enable path, bypass depth is set to 1
2. An op on the init path exits, which schedules scx_ops_disable_workfn()
3. scx_ops_bypass(false) is called on the disable path, and bypass depth
   is decremented to 0
4. kthread is scheduled to execute scx_ops_disable_workfn()
5. scx_ops_bypass(true) called, bypass depth set to 1
6. scx_ops_bypass() races when iterating over CPUs

While it's not safe to take any blocking locks on the bypass path, it is
safe to take a raw spinlock which cannot be preempted. This patch therefore
updates scx_ops_bypass() to use a raw spinlock to synchronize, and changes
scx_ops_bypass_depth to be a regular int.
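
A rough sketch of the synchronization pattern being described; the names and
structure below are illustrative, not the exact kernel code:

  static DEFINE_RAW_SPINLOCK(scx_ops_bypass_lock);
  static int scx_ops_bypass_depth;	/* plain int, protected by the lock */

  static void scx_ops_bypass(bool bypass)
  {
  	unsigned long flags;

  	/* A raw spinlock is safe here: it cannot sleep and cannot be
  	 * preempted, so the enable and disable paths can no longer
  	 * interleave while the depth is being updated. */
  	raw_spin_lock_irqsave(&scx_ops_bypass_lock, flags);
  	scx_ops_bypass_depth += bypass ? 1 : -1;
  	WARN_ON_ONCE(scx_ops_bypass_depth < 0);
  	/* ... iterate over CPUs while still holding the lock ... */
  	raw_spin_unlock_irqrestore(&scx_ops_bypass_lock, flags);
  }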

Without this change, we observe the following warnings when running the
'exit' sched_ext selftest (sometimes requires a couple of runs):

.[root@virtme-ng sched_ext]# ./runner -t exit
===== START =====
TEST: exit
...
[   14.935078] WARNING: CPU: 2 PID: 360 at kernel/sched/ext.c:4332 scx_ops_bypass+0x1ca/0x280
[   14.935126] Modules linked in:
[   14.935150] CPU: 2 UID: 0 PID: 360 Comm: sched_ext_ops_h Not tainted 6.11.0-virtme #24
[   14.935192] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Arch Linux 1.16.3-1-1 04/01/2014
[   14.935242] Sched_ext: exit (enabling+all)
[   14.935244] RIP: 0010:scx_ops_bypass+0x1ca/0x280
[   14.935300] Code: ff ff ff e8 48 96 10 00 fb e9 08 ff ff ff c6 05 7b 34 e8 01 01 90 48 c7 c7 89 86 88 87 e8 be 1d f8 ff 90 0f 0b 90 90 eb 95 90 <0f> 0b 90 41 8b 84 24 24 0a 00 00 eb 97 90 0f 0b 90 41 8b 84 24 24
[   14.935394] RSP: 0018:ffffb706c0957ce0 EFLAGS: 00010002
[   14.935424] RAX: 0000000000000009 RBX: 0000000000000001 RCX: 00000000e3fb8b2a
[   14.935465] RDX: 0000000000000001 RSI: 0000000000000004 RDI: ffffffff88a4c080
[   14.935512] RBP: 0000000000009b56 R08: 0000000000000004 R09: 00000003f12e520a
[   14.935555] R10: ffffffff863a9795 R11: 0000000000000000 R12: ffff8fc5fec31300
[   14.935598] R13: ffff8fc5fec31318 R14: 0000000000000286 R15: 0000000000000018
[   14.935642] FS:  0000000000000000(0000) GS:ffff8fc5fe680000(0000) knlGS:0000000000000000
[   14.935684] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   14.935721] CR2: 0000557d92890b88 CR3: 000000002464a000 CR4: 0000000000750ef0
[   14.935765] PKRU: 55555554
[   14.935782] Call Trace:
[   14.935802]  <TASK>
[   14.935823]  ? __warn+0xce/0x220
[   14.935850]  ? scx_ops_bypass+0x1ca/0x280
[   14.935881]  ? report_bug+0xc1/0x160
[   14.935909]  ? handle_bug+0x61/0x90
[   14.935934]  ? exc_invalid_op+0x1a/0x50
[   14.935959]  ? asm_exc_invalid_op+0x1a/0x20
[   14.935984]  ? raw_spin_rq_lock_nested+0x15/0x30
[   14.936019]  ? scx_ops_bypass+0x1ca/0x280
[   14.936046]  ? srso_alias_return_thunk+0x5/0xfbef5
[   14.936081]  ? __pfx_scx_ops_disable_workfn+0x10/0x10
[   14.936111]  scx_ops_disable_workfn+0x146/0xac0
[   14.936142]  ? finish_task_switch+0xa9/0x2c0
[   14.936172]  ? srso_alias_return_thunk+0x5/0xfbef5
[   14.936211]  ? __pfx_scx_ops_disable_workfn+0x10/0x10
[   14.936244]  kthread_worker_fn+0x101/0x2c0
[   14.936268]  ? __pfx_kthread_worker_fn+0x10/0x10
[   14.936299]  kthread+0xec/0x110
[   14.936327]  ? __pfx_kthread+0x10/0x10
[   14.936351]  ret_from_fork+0x37/0x50
[   14.936374]  ? __pfx_kthread+0x10/0x10
[   14.936400]  ret_from_fork_asm+0x1a/0x30
[   14.936427]  </TASK>
[   14.936443] irq event stamp: 21002
[   14.936467] hardirqs last  enabled at (21001): [<ffffffff863aa35f>] resched_cpu+0x9f/0xd0
[   14.936521] hardirqs last disabled at (21002): [<ffffffff863dd0ba>] scx_ops_bypass+0x11a/0x280
[   14.936571] softirqs last  enabled at (20642): [<ffffffff863683d7>] __irq_exit_rcu+0x67/0xd0
[   14.936622] softirqs last disabled at (20637): [<ffffffff863683d7>] __irq_exit_rcu+0x67/0xd0
[   14.936672] ---[ end trace 0000000000000000 ]---
[   14.953282] sched_ext: BPF scheduler "exit" disabled (unregistered from BPF)
[   14.953352] ------------[ cut here ]------------
[   14.953383] WARNING: CPU: 2 PID: 360 at kernel/sched/ext.c:4335 scx_ops_bypass+0x1d8/0x280
[   14.953428] Modules linked in:
[   14.953453] CPU: 2 UID: 0 PID: 360 Comm: sched_ext_ops_h Tainted: G        W          6.11.0-virtme #24
[   14.953505] Tainted: [W]=WARN
[   14.953527] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Arch Linux 1.16.3-1-1 04/01/2014
[   14.953574] RIP: 0010:scx_ops_bypass+0x1d8/0x280
[   14.953603] Code: c6 05 7b 34 e8 01 01 90 48 c7 c7 89 86 88 87 e8 be 1d f8 ff 90 0f 0b 90 90 eb 95 90 0f 0b 90 41 8b 84 24 24 0a 00 00 eb 97 90 <0f> 0b 90 41 8b 84 24 24 0a 00 00 eb 92 f3 0f 1e fa 49 8d 84 24 f0
[   14.953693] RSP: 0018:ffffb706c0957ce0 EFLAGS: 00010046
[   14.953722] RAX: 0000000000000001 RBX: 0000000000000000 RCX: 0000000000000001
[   14.953763] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8fc5fec31318
[   14.953804] RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000000000000
[   14.953845] R10: ffffffff863a9795 R11: 0000000000000000 R12: ffff8fc5fec31300
[   14.953888] R13: ffff8fc5fec31318 R14: 0000000000000286 R15: 0000000000000018
[   14.953934] FS:  0000000000000000(0000) GS:ffff8fc5fe680000(0000) knlGS:0000000000000000
[   14.953974] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   14.954009] CR2: 0000557d92890b88 CR3: 000000002464a000 CR4: 0000000000750ef0
[   14.954052] PKRU: 55555554
[   14.954068] Call Trace:
[   14.954085]  <TASK>
[   14.954102]  ? __warn+0xce/0x220
[   14.954126]  ? scx_ops_bypass+0x1d8/0x280
[   14.954150]  ? report_bug+0xc1/0x160
[   14.954178]  ? handle_bug+0x61/0x90
[   14.954203]  ? exc_invalid_op+0x1a/0x50
[   14.954226]  ? asm_exc_invalid_op+0x1a/0x20
[   14.954250]  ? raw_spin_rq_lock_nested+0x15/0x30
[   14.954285]  ? scx_ops_bypass+0x1d8/0x280
[   14.954311]  ? __mutex_unlock_slowpath+0x3a/0x260
[   14.954343]  scx_ops_disable_workfn+0xa3e/0xac0
[   14.954381]  ? __pfx_scx_ops_disable_workfn+0x10/0x10
[   14.954413]  kthread_worker_fn+0x101/0x2c0
[   14.954442]  ? __pfx_kthread_worker_fn+0x10/0x10
[   14.954479]  kthread+0xec/0x110
[   14.954507]  ? __pfx_kthread+0x10/0x10
[   14.954530]  ret_from_fork+0x37/0x50
[   14.954553]  ? __pfx_kthread+0x10/0x10
[   14.954576]  ret_from_fork_asm+0x1a/0x30
[   14.954603]  </TASK>
[   14.954621] irq event stamp: 21002
[   14.954644] hardirqs last  enabled at (21001): [<ffffffff863aa35f>] resched_cpu+0x9f/0xd0
[   14.954686] hardirqs last disabled at (21002): [<ffffffff863dd0ba>] scx_ops_bypass+0x11a/0x280
[   14.954735] softirqs last  enabled at (20642): [<ffffffff863683d7>] __irq_exit_rcu+0x67/0xd0
[   14.954782] softirqs last disabled at (20637): [<ffffffff863683d7>] __irq_exit_rcu+0x67/0xd0
[   14.954829] ---[ end trace 0000000000000000 ]---
[   15.022283] sched_ext: BPF scheduler "exit" disabled (unregistered from BPF)
[   15.092282] sched_ext: BPF scheduler "exit" disabled (unregistered from BPF)
[   15.149282] sched_ext: BPF scheduler "exit" disabled (unregistered from BPF)
ok 1 exit #
=====  END  =====

And with it, the test passes without issue after 1000s of runs:

.[root@virtme-ng sched_ext]# ./runner -t exit
===== START =====
TEST: exit
DESCRIPTION: Verify we can cleanly exit a scheduler in multiple places
OUTPUT:
[    7.412856] sched_ext: BPF scheduler "exit" enabled
[    7.427924] sched_ext: BPF scheduler "exit" disabled (unregistered from BPF)
[    7.466677] sched_ext: BPF scheduler "exit" enabled
[    7.475923] sched_ext: BPF scheduler "exit" disabled (unregistered from BPF)
[    7.512803] sched_ext: BPF scheduler "exit" enabled
[    7.532924] sched_ext: BPF scheduler "exit" disabled (unregistered from BPF)
[    7.586809] sched_ext: BPF scheduler "exit" enabled
[    7.595926] sched_ext: BPF scheduler "exit" disabled (unregistered from BPF)
[    7.661923] sched_ext: BPF scheduler "exit" disabled (unregistered from BPF)
[    7.723923] sched_ext: BPF scheduler "exit" disabled (unregistered from BPF)
ok 1 exit #
=====  END  =====

=============================

RESULTS:

PASSED:  1
SKIPPED: 0
FAILED:  0

Fixes: f0e1a0643a ("sched_ext: Implement BPF extensible scheduler class")
Signed-off-by: David Vernet <void@manifault.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2024-10-25 11:10:51 -10:00
Peter Wang
cb7e509c4e scsi: ufs: core: Fix another deadlock during RTC update
If ufshcd_rtc_work calls ufshcd_rpm_put_sync() and the pm's usage_count
is 0, we will enter the runtime suspend callback.  However, the runtime
suspend callback will wait to flush ufshcd_rtc_work, causing a deadlock.

Replace ufshcd_rpm_put_sync() with ufshcd_rpm_put() to avoid the
deadlock.
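
For reference, a hedged sketch of the difference; ufshcd_rpm_put*() are thin
wrappers around the runtime PM helpers for the UFS device W-LUN, so the exact
wrapper bodies may differ:

  /* Synchronous put: may run the runtime-suspend callback in this very
   * context, and that callback flushes ufshcd_rtc_work -> deadlock. */
  pm_runtime_put_sync(&hba->ufs_device_wlun->sdev_gendev);

  /* Asynchronous put: only drops the usage count; the suspend happens
   * later, outside ufshcd_rtc_work, so nothing waits on itself. */
  pm_runtime_put(&hba->ufs_device_wlun->sdev_gendev);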

Fixes: 6bf999e0eb ("scsi: ufs: core: Add UFS RTC support")
Cc: stable@vger.kernel.org #6.11.x
Signed-off-by: Peter Wang <peter.wang@mediatek.com>
Link: https://lore.kernel.org/r/20241024015453.21684-1-peter.wang@mediatek.com
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
2024-10-25 14:51:34 -04:00
John Garry
d28d17a845 scsi: scsi_debug: Fix do_device_access() handling of unexpected SG copy length
If the sg_copy_buffer() call returns less than sdebug_sector_size, then
we drop out of the copy loop. However, we still report that we copied
the full expected amount, which is not proper.

Fix this by keeping a running total and returning that value.
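
A hedged sketch of that pattern (illustrative only; the argument list of the
real do_device_access() differs, and sdebug_sector_size is the module's
configured sector size):

  /* Accumulate what sg_copy_buffer() actually copied instead of
   * assuming every iteration moved a full sector. */
  static int copy_sectors_sketch(struct scatterlist *sgl, unsigned int nents,
  			       u8 *buf, u32 num, u64 skip, bool to_buffer)
  {
  	int ret = 0;
  	u32 i;

  	for (i = 0; i < num; i++) {
  		size_t len = sg_copy_buffer(sgl, nents,
  					    buf + i * sdebug_sector_size,
  					    sdebug_sector_size, skip, to_buffer);

  		ret += len;
  		if (len < sdebug_sector_size)
  			break;	/* short copy: stop, report the running total */
  		skip += len;
  	}

  	return ret;	/* not num * sdebug_sector_size */
  }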

Fixes: 84f3a3c01d ("scsi: scsi_debug: Atomic write support")
Reported-by: Colin Ian King <colin.i.king@gmail.com>
Suggested-by: Dan Carpenter <dan.carpenter@linaro.org>
Signed-off-by: John Garry <john.g.garry@oracle.com>
Link: https://lore.kernel.org/r/20241018101655.4207-1-john.g.garry@oracle.com
Reviewed-by: Dan Carpenter <dan.carpenter@linaro.org>
Reviewed-by: Colin Ian King <colin.i.king@gmail.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
2024-10-25 14:48:27 -04:00
David Vernet
895669fd0d scx: Fix exit selftest to use custom DSQ
In commit 63fb3ec805 ("sched_ext: Allow only user DSQs for
scx_bpf_consume(), scx_bpf_dsq_nr_queued() and bpf_iter_scx_dsq_new()"), we
updated the consume path to only accept user DSQs, thus making it invalid
to consume SCX_DSQ_GLOBAL. This selftest was doing that, so let's create a
custom DSQ and use that instead.  The test now passes:

[root@virtme-ng sched_ext]# ./runner -t exit
===== START =====
TEST: exit
DESCRIPTION: Verify we can cleanly exit a scheduler in multiple places
OUTPUT:
[   12.387229] sched_ext: BPF scheduler "exit" enabled
[   12.406064] sched_ext: BPF scheduler "exit" disabled (unregistered from BPF)
[   12.453325] sched_ext: BPF scheduler "exit" enabled
[   12.474064] sched_ext: BPF scheduler "exit" disabled (unregistered from BPF)
[   12.515241] sched_ext: BPF scheduler "exit" enabled
[   12.532064] sched_ext: BPF scheduler "exit" disabled (unregistered from BPF)
[   12.592063] sched_ext: BPF scheduler "exit" disabled (unregistered from BPF)
[   12.654063] sched_ext: BPF scheduler "exit" disabled (unregistered from BPF)
[   12.715062] sched_ext: BPF scheduler "exit" disabled (unregistered from BPF)
ok 1 exit #
=====  END  =====
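
The shape of the change, sketched assuming the scx_bpf_create_dsq() and
scx_bpf_consume() kfuncs referenced above; the selftest's real callback names
and DSQ id differ:

  #define EXIT_DSQ_ID 0x1234	/* hypothetical custom (user) DSQ id */

  s32 BPF_STRUCT_OPS_SLEEPABLE(exit_init)
  {
  	/* create a user DSQ instead of relying on SCX_DSQ_GLOBAL */
  	return scx_bpf_create_dsq(EXIT_DSQ_ID, -1);
  }

  void BPF_STRUCT_OPS(exit_dispatch, s32 cpu, struct task_struct *prev)
  {
  	/* consuming SCX_DSQ_GLOBAL is no longer allowed, use the custom DSQ */
  	scx_bpf_consume(EXIT_DSQ_ID);
  }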

Signed-off-by: David Vernet <void@manifault.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2024-10-25 07:03:36 -10:00
Vishal Chourasia
4f7f417042 sched_ext: Fix function pointer type mismatches in BPF selftests
Fix incompatible function pointer type warnings in sched_ext BPF selftests by
explicitly casting the function pointers when initializing struct_ops.
This addresses multiple -Wincompatible-function-pointer-types warnings from the
clang compiler where function signatures didn't match exactly.

The void * cast ensures the compiler accepts the function pointer
assignment despite minor type differences in the parameters.
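
A hedged example of the cast being described; the scheduler and callback names
below are made up for illustration, and the callbacks are assumed to be defined
elsewhere in the selftest:

  /* Without the (void *) casts, clang emits
   * -Wincompatible-function-pointer-types because the BPF-side callback
   * signatures don't exactly match the struct_ops member types. */
  SEC(".struct_ops.link")
  struct sched_ext_ops dummy_ops = {
  	.enqueue	= (void *)dummy_enqueue,
  	.dispatch	= (void *)dummy_dispatch,
  	.init		= (void *)dummy_init,
  	.exit		= (void *)dummy_exit,
  	.name		= "dummy",
  };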

Signed-off-by: Vishal Chourasia <vishalc@linux.ibm.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2024-10-24 06:56:17 -10:00
Arnaldo Carvalho de Melo
08a7d25255 tools arch x86: Sync the msr-index.h copy with the kernel sources
To pick up the changes from these csets:

  dc1e67f70f ("KVM VMX: Move MSR_IA32_VMX_MISC bit defines to asm/vmx.h")
  d7bfc9ffd5 ("KVM: VMX: Move MSR_IA32_VMX_BASIC bit defines to asm/vmx.h")
  beb2e44604 ("x86/cpu: KVM: Move macro to encode PAT value to common header")
  e7e80b66fb ("x86/cpu: KVM: Add common defines for architectural memory types (PAT, MTRRs, etc.)")

Those cause no changes to tooling:

  $ tools/perf/trace/beauty/tracepoints/x86_msr.sh > before
  $ cp arch/x86/include/asm/msr-index.h tools/arch/x86/include/asm/msr-index.h
  $ tools/perf/trace/beauty/tracepoints/x86_msr.sh > after
  $ diff -u before after
  $

To see how this works take a look at this previous update:

  https://git.kernel.org/torvalds/c/174372668933ede5

  1743726689 ("tools arch x86: Sync the msr-index.h copy with the kernel sources to pick IA32_MKTME_KEYID_PARTITIONING")

Just silences this perf build warning:

  Warning: Kernel ABI header differences:
    diff -u tools/arch/x86/include/asm/msr-index.h arch/x86/include/asm/msr-index.h

Please see tools/include/uapi/README for further details.

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Sean Christopherson <seanjc@google.com>
Cc: Xin Li <xin3.li@intel.com>
Link: https://lore.kernel.org/lkml/ZxpLSBzGin3vjs3b@x1
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-10-24 10:27:59 -03:00
Arnaldo Carvalho de Melo
758f181589 perf python: Fix up the build on architectures without HAVE_KVM_STAT_SUPPORT
Noticed while building on a raspbian arm 32-bit system.

There was also this other case, fixed by adding a missing util/stat.h
with the prototypes:

  /tmp/tmp.MbiSHoF3dj/perf-6.12.0-rc3/tools/perf/util/python.c:1396:6: error: no previous prototype for ‘perf_stat__set_no_csv_summary’ [-Werror=missing-prototypes]
   1396 | void perf_stat__set_no_csv_summary(int set __maybe_unused)
        |      ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  /tmp/tmp.MbiSHoF3dj/perf-6.12.0-rc3/tools/perf/util/python.c:1400:6: error: no previous prototype for ‘perf_stat__set_big_num’ [-Werror=missing-prototypes]
   1400 | void perf_stat__set_big_num(int set __maybe_unused)
        |      ^~~~~~~~~~~~~~~~~~~~~~
  cc1: all warnings being treated as errors

On other architectures this must have been building due to some lucky
indirect inclusion of that header.

Fixes: 9dabf40034 ("perf python: Switch module to linking libraries from building source")
Reviewed-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/lkml/ZxllAtpmEw5fg9oy@x1
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-10-23 19:29:50 -03:00
Veronika Molnarova
06a130e42a perf test: Handle perftool-testsuite_probe failure due to broken DWARF
Test case test_adding_blacklisted ends in failure if the blacklisted
probe is of an assembler function with no DWARF available. At the same
time, probing the blacklisted function with ASM DWARF doesn't test the
blacklist itself as the failure is a result of the broken DWARF.

When the broken DWARF output is encountered, check whether the probed
function was compiled by the assembler. If so, the broken DWARF message
is expected and does not indicate a perf issue; otherwise, report a
failure. If the ASM DWARF affected the probe, try the next probe on the
blacklist. If the first 5 probes are defective due to broken DWARF, skip
the test case.

Fixes: def5480d63 ("perf testsuite probe: Add test for blacklisted kprobes handling")
Signed-off-by: Veronika Molnarova <vmolnaro@redhat.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Michael Petlan <mpetlan@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Veronika Molnarova <vmolnaro@redhat.com>
Link: https://lore.kernel.org/r/20241017161555.236769-1-vmolnaro@redhat.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-10-23 17:23:09 -03:00
Ihor Solodrai
9b3c11a867 selftests/sched_ext: add order-only dependency of runner.o on BPFOBJ
The runner.o may start building before libbpf headers are installed,
and as a result build fails. This happened a couple of times on
libbpf/ci test jobs:
  * https://github.com/libbpf/ci/actions/runs/11447667257/job/31849533100
  * https://github.com/theihor/libbpf-ci/actions/runs/11445162764/job/31841649552

Headers are installed in a recipe for $(BPFOBJ) target, and adding an
order-only dependency should ensure this doesn't happen.

Signed-off-by: Ihor Solodrai <ihor.solodrai@pm.me>
Signed-off-by: Tejun Heo <tj@kernel.org>
2024-10-23 08:56:32 -10:00
Arnaldo Carvalho de Melo
d822ca29a4 tools headers UAPI: Sync kvm headers with the kernel sources
To pick the changes in:

  aa8d1f48d3 ("KVM: x86/mmu: Introduce a quirk to control memslot zap behavior")

That doesn't change functionality in tools/perf, as no new ioctl is added
for the 'perf trace' scripts to harvest.

This addresses these perf build warnings:

  Warning: Kernel ABI header differences:
    diff -u tools/arch/x86/include/uapi/asm/kvm.h arch/x86/include/uapi/asm/kvm.h

Please see tools/include/uapi/README for further details.

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Yan Zhao <yan.y.zhao@intel.com>
Link: https://lore.kernel.org/lkml/ZxgN0O02YrAJ2qIC@x1
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-10-23 11:34:56 -03:00
Jiri Slaby
5d35634ecc perf trace: Fix non-listed archs in the syscalltbl routines
This fixes a build breakage on 32-bit arm, where the
syscalltbl__id_at_idx() function was missing.

Committer notes:

Generating a proper syscall table from a copy of
arch/arm/tools/syscall.tbl ends up being too big a patch for this rc
stage. I started doing it, but while testing I noticed some other problems
with using BPF to collect pointer args on arm7 (32-bit), so I will maybe
continue trying to make it work in the next cycle...

Fixes: 7a2fb5619c ("perf trace: Fix iteration of syscall ids in syscalltbl->entries")
Suggested-by: Howard Chu <howardchu95@gmail.com>
Signed-off-by: <jslaby@suse.cz>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/lkml/3a592835-a14f-40be-8961-c0cee7720a94@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-10-23 11:34:56 -03:00
Howard Chu
7fbff3c0e0 perf build: Change the clang check back to 12.0.1
This serves as a revert for this patch:

  https://lore.kernel.org/linux-perf-users/ZuGL9ROeTV2uXoSp@x1/

Signed-off-by: Howard Chu <howardchu95@gmail.com>
Tested-by: James Clark <james.clark@linaro.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alan Maguire <alan.maguire@oracle.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/20241011021403.4089793-2-howardchu95@gmail.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-10-23 11:34:56 -03:00
Howard Chu
395d38419f perf trace augmented_raw_syscalls: Add more checks to pass the verifier
Add some more checks to pass the verifier in more kernels.

Signed-off-by: Howard Chu <howardchu95@gmail.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alan Maguire <alan.maguire@oracle.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/20241011021403.4089793-3-howardchu95@gmail.com
[ Reduced the patch removing things that can be done later ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-10-23 11:34:56 -03:00
Arnaldo Carvalho de Melo
ecabac70ff perf trace augmented_raw_syscalls: Add extra array index bounds checking to satisfy some BPF verifiers
In a RHEL8 kernel (4.18.0-513.11.1.el8_9.x86_64), which, as enterprise
kernels go, has backports from modern kernels, the verifier complains
about the lack of a bounds check for the index into the array of syscall
arguments, on BPF bytecode generated by clang 17, with:

  ; } else if (size < 0 && size >= -6) { /* buffer */
  116: (b7) r1 = -6
  117: (2d) if r1 > r6 goto pc-30
   R0=map_value(id=0,off=0,ks=4,vs=24688,imm=0) R1_w=inv-6 R2=map_value(id=0,off=16,ks=4,vs=8272,imm=0) R3=inv(id=0) R5=inv40 R6=inv(id=0,umin_value=18446744073709551610,var_off=(0xffffffff00000000; 0xffffffff)) R7=map_value(id=0,off=56,ks=4,vs=8272,imm=0) R8=invP6 R9=map_value(id=0,off=20,ks=4,vs=24,imm=0) R10=fp0 fp-8=mmmmmmmm fp-16=map_value fp-24=map_value fp-32=inv40 fp-40=ctx fp-48=map_value fp-56=inv1 fp-64=map_value fp-72=map_value fp-80=map_value
  ; index = -(size + 1);
  118: (a7) r6 ^= -1
  119: (67) r6 <<= 32
  120: (77) r6 >>= 32
  ; aug_size = args->args[index];
  121: (67) r6 <<= 3
  122: (79) r1 = *(u64 *)(r10 -24)
  123: (0f) r1 += r6
  last_idx 123 first_idx 116
  regs=40 stack=0 before 122: (79) r1 = *(u64 *)(r10 -24)
  regs=40 stack=0 before 121: (67) r6 <<= 3
  regs=40 stack=0 before 120: (77) r6 >>= 32
  regs=40 stack=0 before 119: (67) r6 <<= 32
  regs=40 stack=0 before 118: (a7) r6 ^= -1
  regs=40 stack=0 before 117: (2d) if r1 > r6 goto pc-30
  regs=42 stack=0 before 116: (b7) r1 = -6
   R0_w=map_value(id=0,off=0,ks=4,vs=24688,imm=0) R1_w=inv1 R2_w=map_value(id=0,off=16,ks=4,vs=8272,imm=0) R3_w=inv(id=0) R5_w=inv40 R6_rw=invP(id=0,smin_value=-2147483648,smax_value=0) R7_w=map_value(id=0,off=56,ks=4,vs=8272,imm=0) R8_w=invP6 R9_w=map_value(id=0,off=20,ks=4,vs=24,imm=0) R10=fp0 fp-8=mmmmmmmm fp-16_w=map_value fp-24_r=map_value fp-32_w=inv40 fp-40=ctx fp-48=map_value fp-56_w=inv1 fp-64_w=map_value fp-72=map_value fp-80=map_value
  parent didn't have regs=40 stack=0 marks
  last_idx 110 first_idx 98
  regs=40 stack=0 before 110: (6d) if r1 s> r6 goto pc+5
  regs=42 stack=0 before 109: (b7) r1 = 1
  regs=40 stack=0 before 108: (65) if r6 s> 0x1000 goto pc+7
  regs=40 stack=0 before 98: (55) if r6 != 0x1 goto pc+9
   R0_w=map_value(id=0,off=0,ks=4,vs=24688,imm=0) R1_w=invP12 R2_w=map_value(id=0,off=16,ks=4,vs=8272,imm=0) R3_rw=inv(id=0) R5_w=inv24 R6_rw=invP(id=0,smin_value=-2147483648,smax_value=2147483647) R7_w=map_value(id=0,off=40,ks=4,vs=8272,imm=0) R8_rw=invP4 R9_w=map_value(id=0,off=12,ks=4,vs=24,imm=0) R10=fp0 fp-8=mmmmmmmm fp-16_rw=map_value fp-24_r=map_value fp-32_rw=invP24 fp-40_r=ctx fp-48_r=map_value fp-56_w=invP1 fp-64_rw=map_value fp-72_r=map_value fp-80_r=map_value
  parent already had regs=40 stack=0 marks
  124: (79) r6 = *(u64 *)(r1 +16)
   R0=map_value(id=0,off=0,ks=4,vs=24688,imm=0) R1_w=map_value(id=0,off=0,ks=4,vs=8272,umax_value=34359738360,var_off=(0x0; 0x7fffffff8),s32_max_value=2147483640,u32_max_value=-8) R2=map_value(id=0,off=16,ks=4,vs=8272,imm=0) R3=inv(id=0) R5=inv40 R6_w=invP(id=0,umax_value=34359738360,var_off=(0x0; 0x7fffffff8),s32_max_value=2147483640,u32_max_value=-8) R7=map_value(id=0,off=56,ks=4,vs=8272,imm=0) R8=invP6 R9=map_value(id=0,off=20,ks=4,vs=24,imm=0) R10=fp0 fp-8=mmmmmmmm fp-16=map_value fp-24=map_value fp-32=inv40 fp-40=ctx fp-48=map_value fp-56=inv1 fp-64=map_value fp-72=map_value fp-80=map_value
  R1 unbounded memory access, make sure to bounds check any such access
  processed 466 insns (limit 1000000) max_states_per_insn 2 total_states 20 peak_states 20 mark_read 3

If we add this line, as used in other BPF programs, to cap that index:

   index &= 7;

The generated BPF program is considered safe by that version of the BPF
verifier, allowing perf to collect the syscall args in one more kernel
using the BPF based pointer contents collector.

With the above one-liner it works with that kernel:

  [root@dell-per740-01 ~]# uname -a
  Linux dell-per740-01.khw.eng.rdu2.dc.redhat.com 4.18.0-513.11.1.el8_9.x86_64 #1 SMP Thu Dec 7 03:06:13 EST 2023 x86_64 x86_64 x86_64 GNU/Linux
  [root@dell-per740-01 ~]# ~acme/bin/perf trace -e *sleep* sleep 1.234567890
       0.000 (1234.704 ms): sleep/3863610 nanosleep(rqtp: { .tv_sec: 1, .tv_nsec: 234567890 })                  = 0
  [root@dell-per740-01 ~]#

As well as with the one in Fedora 40:

  root@number:~# uname -a
  Linux number 6.11.3-200.fc40.x86_64 #1 SMP PREEMPT_DYNAMIC Thu Oct 10 22:31:19 UTC 2024 x86_64 GNU/Linux
  root@number:~# perf trace -e *sleep* sleep 1.234567890
       0.000 (1234.722 ms): sleep/14873 clock_nanosleep(rqtp: { .tv_sec: 1, .tv_nsec: 234567890 }, rmtp: 0x7ffe87311a40) = 0
  root@number:~#

Song Liu reported that this one-liner was being optimized out by clang
18, so I suggested adding a compiler barrier before it; he tested that,
clang v18 then kept the masking, and the verifier in the kernel in Song's
case (Meta's 5.12 based kernel) was also happy with the resulting
bytecode.
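
Putting the two pieces together, the guarded index computation looks roughly
like this (a sketch; the surrounding code in augmented_raw_syscalls.bpf.c
differs in detail):

  } else if (size < 0 && size >= -6) { /* buffer */
  	index = -(size + 1);
  	/* compiler barrier: keep clang from optimizing the masking away */
  	asm volatile("" ::: "memory");
  	index &= 7;	/* bound the array index for older BPF verifiers */
  	aug_size = args->args[index];
  }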

I'll investigate using virtme-ng[1] to have all the perf BPF based
functionality thoroughly tested over multiple kernels and clang
versions.

[1] https://kernel-recipes.org/en/2024/virtme-ng/

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alan Maguire <alan.maguire@oracle.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andrea Righi <andrea.righi@linux.dev>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Song Liu <songliubraving@fb.com>
Link: https://lore.kernel.org/lkml/Zw7JgJc0LOwSpuvx@x1
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-10-23 11:34:56 -03:00
Pei Xiao
2b059d0d1e slub/kunit: fix a WARNING due to unwrapped __kmalloc_cache_noprof
'modprobe slub_kunit' will trigger the warning shown below. The root cause
is that __kmalloc_cache_noprof was directly used, which resulted in no
alloc_tag being allocated. This caused current->alloc_tag to be null,
leading to a warning in alloc_tag_add_check.

Let's add an alloc_hook layer to __kmalloc_cache_noprof specifically
within lib/slub_kunit.c, which is the only user of this internal slub
function outside kmalloc implementation itself.
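
One possible shape of such a layer, sketched from the description above; the
actual helper added to lib/slub_kunit.c may be named and structured
differently:

  /* Route the direct internal call through alloc_hooks() so an alloc_tag
   * is set up before __kmalloc_cache_noprof() runs, avoiding the
   * alloc_tag_add_check() warning below. */
  #define test_kmalloc_cache(s, gfp, size) \
  	alloc_hooks(__kmalloc_cache_noprof(s, gfp, size))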

[58162.947016] WARNING: CPU: 2 PID: 6210 at
./include/linux/alloc_tag.h:125 alloc_tagging_slab_alloc_hook+0x268/0x27c
[58162.957721] Call trace:
[58162.957919]  alloc_tagging_slab_alloc_hook+0x268/0x27c
[58162.958286]  __kmalloc_cache_noprof+0x14c/0x344
[58162.958615]  test_kmalloc_redzone_access+0x50/0x10c [slub_kunit]
[58162.959045]  kunit_try_run_case+0x74/0x184 [kunit]
[58162.959401]  kunit_generic_run_threadfn_adapter+0x2c/0x4c [kunit]
[58162.959841]  kthread+0x10c/0x118
[58162.960093]  ret_from_fork+0x10/0x20
[58162.960363] ---[ end trace 0000000000000000 ]---

Signed-off-by: Pei Xiao <xiaopei01@kylinos.cn>
Fixes: a0a44d9175 ("mm, slab: don't wrap internal functions with alloc_hooks()")
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
2024-10-23 09:50:58 +02:00
Arnaldo Carvalho de Melo
39c6a35620 perf trace: The return from 'write' isn't a pid
When an explicit beautifier for the 'write' syscall was added, at the time
the BPF based buffer collector was introduced, a cut'n'paste error carried
over the syscall_fmt->errpid setting from a nearby syscall (waitid) that
returns a pid.

So the write return was being suppressed by the return pretty printer;
remove that field, reverting it back to the default return handler, which
prints positive numbers as-is and interprets negative values as errnos.

I actually introduced the problem while making Howard's original patch
work just with the 'write' syscall, as we couldn't just look for any
buffers: the ones that are filled in by the kernel couldn't use the same
sys_enter BPF collector.

Fixes: b257fac12f ("perf trace: Pretty print buffer data")
Reported-by: James Clark <james.clark@linaro.org>
Link: https://lore.kernel.org/lkml/bcf50648-3c7e-4513-8717-0d14492c53b9@linaro.org
Link: https://lore.kernel.org/all/Zt8jTfzDYgBPvFCd@x1/#t
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alan Maguire <alan.maguire@oracle.com>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-10-17 10:34:43 -03:00
Arnaldo Carvalho de Melo
ab8aaab874 tools headers UAPI: Sync linux/const.h with the kernel headers
To pick up the changes in:

  947697c6f0 ("uapi: Define GENMASK_U128")

That causes no changes in tooling, just addresses this perf build
warning:

  Warning: Kernel ABI header differences:
    diff -u tools/include/uapi/linux/const.h include/uapi/linux/const.h

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Yury Norov <yury.norov@gmail.com>
Link: https://lore.kernel.org/lkml/ZwltGNJwujKu1Fgn@x1
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-10-17 10:34:43 -03:00
Xiu Jianfeng
3cc4e13bb1 cgroup: Fix potential overflow issue when checking max_depth
cgroup.max.depth is the maximum allowed descent depth below the current
cgroup. If the actual descent depth is equal to or larger, an attempt to
create a new child cgroup will fail. However, because cgroup->max_depth
is of int type and has the default value INT_MAX, the condition
'level > cgroup->max_depth' will never be satisfied, and the level will
overflow after it reaches INT_MAX.

Fix it by starting the level from 0 and using '>=' instead.

It's worth mentioning that this issue is unlikely to occur in reality,
as it's impossible to have a hierarchy of depth INT_MAX, but it should
be avoided logically.
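
A hedged illustration of the arithmetic, not the kernel code itself: counting
from 1 with a '>' check against an INT_MAX default can never fail and lets the
counter keep growing, while counting from 0 with '>=' cannot.

  #include <limits.h>
  #include <stdbool.h>

  /* Can we create a child below a point that already has 'ancestors'
   * levels above it, given a max_depth limit? */
  static bool descent_allowed(int ancestors, int max_depth)
  {
  	int level = 0;			/* was: int level = 1; */

  	for (int i = 0; i < ancestors; i++) {
  		if (level >= max_depth)	/* was: level > max_depth */
  			return false;
  		level++;
  	}
  	return true;
  }

  /* With max_depth == INT_MAX the old form never returned false. */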

Fixes: 1a926e0bba ("cgroup: implement hierarchy limits")
Signed-off-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Reviewed-by: Michal Koutný <mkoutny@suse.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2024-10-14 13:39:25 -10:00
Chen Ridong
117932eea9 cgroup/bpf: use a dedicated workqueue for cgroup bpf destruction
A hung_task problem shown below was found:

INFO: task kworker/0:0:8 blocked for more than 327 seconds.
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
Workqueue: events cgroup_bpf_release
Call Trace:
 <TASK>
 __schedule+0x5a2/0x2050
 ? find_held_lock+0x33/0x100
 ? wq_worker_sleeping+0x9e/0xe0
 schedule+0x9f/0x180
 schedule_preempt_disabled+0x25/0x50
 __mutex_lock+0x512/0x740
 ? cgroup_bpf_release+0x1e/0x4d0
 ? cgroup_bpf_release+0xcf/0x4d0
 ? process_scheduled_works+0x161/0x8a0
 ? cgroup_bpf_release+0x1e/0x4d0
 ? mutex_lock_nested+0x2b/0x40
 ? __pfx_delay_tsc+0x10/0x10
 mutex_lock_nested+0x2b/0x40
 cgroup_bpf_release+0xcf/0x4d0
 ? process_scheduled_works+0x161/0x8a0
 ? trace_event_raw_event_workqueue_execute_start+0x64/0xd0
 ? process_scheduled_works+0x161/0x8a0
 process_scheduled_works+0x23a/0x8a0
 worker_thread+0x231/0x5b0
 ? __pfx_worker_thread+0x10/0x10
 kthread+0x14d/0x1c0
 ? __pfx_kthread+0x10/0x10
 ret_from_fork+0x59/0x70
 ? __pfx_kthread+0x10/0x10
 ret_from_fork_asm+0x1b/0x30
 </TASK>

This issue can be reproduced by the following pressure test:
1. A large number of cpuset cgroups are deleted.
2. Set CPUs on and off repeatedly.
3. Set watchdog_thresh repeatedly.
The scripts can be obtained at LINK mentioned above the signature.

The reason for this issue is that cgroup_mutex and cpu_hotplug_lock are
acquired in different tasks, which may lead to a deadlock through the
following steps:
1. A large number of cpusets are deleted asynchronously, which puts a
   large number of cgroup_bpf_release works into system_wq. The max_active
   of system_wq is WQ_DFL_ACTIVE(256). Consequently, all active works are
   cgroup_bpf_release works, and many cgroup_bpf_release works will be put
   into the inactive queue. As illustrated in the diagram, there are 256 (in
   the active queue) + n (in the inactive queue) works.
2. Setting watchdog_thresh will hold cpu_hotplug_lock.read and put
   smp_call_on_cpu work into system_wq. However, step 1 has already filled
   system_wq, so 'sscs.work' is put into the inactive queue. 'sscs.work' has
   to wait until the works that were put into the inactive queue earlier
   have executed (n cgroup_bpf_release), so it will be blocked for a while.
3. Cpu offline requires cpu_hotplug_lock.write, which is blocked by step 2.
4. Cpusets that were deleted at step 1 put cgroup_release works into
   cgroup_destroy_wq. They are competing to get cgroup_mutex all the time.
   When cgroup_mutex is acquired by the work at css_killed_work_fn, it will
   call cpuset_css_offline, which needs to acquire cpu_hotplug_lock.read.
   However, cpuset_css_offline will be blocked because of step 3.
5. At this moment, there are 256 works in active queue that are
   cgroup_bpf_release, they are attempting to acquire cgroup_mutex, and as
   a result, all of them are blocked. Consequently, sscs.work can not be
   executed. Ultimately, this situation leads to four processes being
   blocked, forming a deadlock.

system_wq(step1)		WatchDog(step2)			cpu offline(step3)	cgroup_destroy_wq(step4)
...
2000+ cgroups deleted asyn
256 actives + n inactives
				__lockup_detector_reconfigure
				P(cpu_hotplug_lock.read)
				put sscs.work into system_wq
256 + n + 1(sscs.work)
sscs.work wait to be executed
				waiting for sscs.work to finish
								percpu_down_write
								P(cpu_hotplug_lock.write)
								...blocking...
											css_killed_work_fn
											P(cgroup_mutex)
											cpuset_css_offline
											P(cpu_hotplug_lock.read)
											...blocking...
256 cgroup_bpf_release
mutex_lock(&cgroup_mutex);
..blocking...

To fix the problem, place cgroup_bpf_release works on a dedicated
workqueue which can break the loop and solve the problem. System wqs are
for misc things which shouldn't create a large number of concurrent work
items. If something is going to generate >WQ_DFL_ACTIVE(256) concurrent
work items, it should use its own dedicated workqueue.
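
A sketch of that approach, assuming the release work item is the existing
cgrp->bpf.release_work; details of the actual patch may differ:

  static struct workqueue_struct *cgroup_bpf_destroy_wq;

  static int __init cgroup_bpf_wq_init(void)
  {
  	/* max_active = 1 is enough: the release works serialize on
  	 * cgroup_mutex anyway, and a dedicated queue keeps them from
  	 * monopolizing system_wq's WQ_DFL_ACTIVE slots. */
  	cgroup_bpf_destroy_wq = alloc_workqueue("cgroup_bpf_destroy", 0, 1);
  	if (!cgroup_bpf_destroy_wq)
  		panic("Failed to alloc workqueue for cgroup bpf destroy.\n");
  	return 0;
  }
  core_initcall(cgroup_bpf_wq_init);

  /* in the cgroup_bpf release path, instead of queueing on system_wq: */
  queue_work(cgroup_bpf_destroy_wq, &cgrp->bpf.release_work);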

Fixes: 4bfc0bb2c6 ("bpf: decouple the lifetime of cgroup_bpf from cgroup itself")
Cc: stable@vger.kernel.org # v5.3+
Link: https://lore.kernel.org/cgroups/e90c32d2-2a85-4f28-9154-09c7d320cb60@huawei.com/T/#t
Tested-by: Vishal Chourasia <vishalc@linux.ibm.com>
Signed-off-by: Chen Ridong <chenridong@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2024-10-08 08:43:22 -10:00
Rana Dolui
e2cab215a5 Change this file format to .md and add lots of things 2024-04-22 16:40:13 +00:00
Rana Dolui
aa7bc57d74
Update README
update readme and add more things about linux.
2024-04-22 21:53:21 +05:30
Rana Dolui
8848fb79d1
Delete linux_img.png 2024-04-22 20:40:56 +05:30
Rana Dolui
392c1feca0 Adding Image for Readme 2024-04-22 14:59:42 +00:00
Rana Dolui
fc83b45b1b adding image 2024-04-22 14:50:15 +00:00
Rana Dolui
641340679c
adding image 2024-04-22 20:01:51 +05:30
79 changed files with 882 additions and 441 deletions

Binary file not shown.  (After: 20 KiB)

BIN  Documentation/linux_img.png  (new file)
Binary file not shown.  (After: 20 KiB)

@@ -23,177 +23,166 @@ applications can additionally seal security critical data at runtime.

A similar feature already exists in the XNU kernel with the
VM_FLAGS_PERMANENT flag [1] and on OpenBSD with the mimmutable syscall [2].

SYSCALL
=======
mseal syscall signature
-----------------------
``int mseal(void \* addr, size_t len, unsigned long flags)``

**addr**/**len**: virtual memory address range.

The address range set by **addr**/**len** must meet:
   - The start address must be in an allocated VMA.
   - The start address must be page aligned.
   - The end address (**addr** + **len**) must be in an allocated VMA.
   - no gap (unallocated memory) between start and end address.

The ``len`` will be paged aligned implicitly by the kernel.

**flags**: reserved for future use.

**Return values**:
   - **0**: Success.
   - **-EINVAL**:
      * Invalid input ``flags``.
      * The start address (``addr``) is not page aligned.
      * Address range (``addr`` + ``len``) overflow.
   - **-ENOMEM**:
      * The start address (``addr``) is not allocated.
      * The end address (``addr`` + ``len``) is not allocated.
      * A gap (unallocated memory) between start and end address.
   - **-EPERM**:
      * sealing is supported only on 64-bit CPUs, 32-bit is not supported.

**Note about error return**:
   - For above error cases, users can expect the given memory range is
     unmodified, i.e. no partial update.
   - There might be other internal errors/cases not listed here, e.g.
     error during merging/splitting VMAs, or the process reaching the maximum
     number of supported VMAs. In those cases, partial updates to the given
     memory range could happen. However, those cases should be rare.

**Architecture support**:
   mseal only works on 64-bit CPUs, not 32-bit CPUs.

**Idempotent**:
   users can call mseal multiple times. mseal on an already sealed memory
   is a no-action (not error).

**no munseal**
   Once mapping is sealed, it can't be unsealed. The kernel should never
   have munseal, this is consistent with other sealing feature, e.g.
   F_SEAL_SEAL for file.

Blocked mm syscall for sealed mapping
-------------------------------------
It might be important to note: **once the mapping is sealed, it will
stay in the process's memory until the process terminates**.

Example::

   *ptr = mmap(0, 4096, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
   rc = mseal(ptr, 4096, 0);
   /* munmap will fail */
   rc = munmap(ptr, 4096);
   assert(rc < 0);

Blocked mm syscall:
   - munmap
   - mmap
   - mremap
   - mprotect and pkey_mprotect
   - some destructive madvise behaviors: MADV_DONTNEED, MADV_FREE,
     MADV_DONTNEED_LOCKED, MADV_FREE, MADV_DONTFORK, MADV_WIPEONFORK

The first set of syscalls to block is munmap, mremap, mmap. They can
either leave an empty space in the address space, therefore allowing
replacement with a new mapping with new set of attributes, or can
overwrite the existing mapping with another mapping.

mprotect and pkey_mprotect are blocked because they changes the
protection bits (RWX) of the mapping.

Certain destructive madvise behaviors, specifically MADV_DONTNEED,
MADV_FREE, MADV_DONTNEED_LOCKED, and MADV_WIPEONFORK, can introduce
risks when applied to anonymous memory by threads lacking write
permissions. Consequently, these operations are prohibited under such
conditions. The aforementioned behaviors have the potential to modify
region contents by discarding pages, effectively performing a memset(0)
operation on the anonymous memory.

Kernel will return -EPERM for blocked syscalls.

When blocked syscall return -EPERM due to sealing, the memory regions may
or may not be changed, depends on the syscall being blocked:

   - munmap: munmap is atomic. If one of VMAs in the given range is
     sealed, none of VMAs are updated.
   - mprotect, pkey_mprotect, madvise: partial update might happen, e.g.
     when mprotect over multiple VMAs, mprotect might update the beginning
     VMAs before reaching the sealed VMA and return -EPERM.
   - mmap and mremap: undefined behavior.

Use cases
=========
- glibc:
  The dynamic linker, during loading ELF executables, can apply sealing to
  mapping segments.

- Chrome browser: protect some security sensitive data structures.

When not to use mseal
=====================
Applications can apply sealing to any virtual memory region from userspace,
but it is *crucial to thoroughly analyze the mapping's lifetime* prior to
apply the sealing. This is because the sealed mapping *wont be unmapped*
until the process terminates or the exec system call is invoked.

For example:
   - aio/shm
     aio/shm can call mmap and munmap on behalf of userspace, e.g.
     ksys_shmdt() in shm.c. The lifetimes of those mapping are not tied to
     the lifetime of the process. If those memories are sealed from userspace,
     then munmap will fail, causing leaks in VMA address space during the
     lifetime of the process.

   - ptr allocated by malloc (heap)
     Don't use mseal on the memory ptr return from malloc().
     malloc() is implemented by allocator, e.g. by glibc. Heap manager might
     allocate a ptr from brk or mapping created by mmap.
     If an app calls mseal on a ptr returned from malloc(), this can affect
     the heap manager's ability to manage the mappings; the outcome is
     non-deterministic.

     Example::

        ptr = malloc(size);
        /* don't call mseal on ptr return from malloc. */
        mseal(ptr, size);
        /* free will success, allocator can't shrink heap lower than ptr */
        free(ptr);

mseal doesn't block
===================
In a nutshell, mseal blocks certain mm syscall from modifying some of VMA's
attributes, such as protection bits (RWX). Sealed mappings doesn't mean the
memory is immutable.

As Jann Horn pointed out in [3], there are still a few ways to write
to RO memory, which is, in a way, by design. And those could be blocked
by different security measures.

Those cases are:
   - Write to read-only memory through /proc/self/mem interface (FOLL_FORCE).
   - Write to read-only memory through ptrace (such as PTRACE_POKETEXT).
   - userfaultfd.

The idea that inspired this patch comes from Stephen Röttgers work in V8
CFI [4]. Chrome browser in ChromeOS will be the first user of this API.

Reference
=========
- [1] https://github.com/apple-oss-distributions/xnu/blob/1031c584a5e37aff177559b9f69dbd3c8c3fd30a/osfmk/mach/vm_statistics.h#L274
- [2] https://man.openbsd.org/mimmutable.2
- [3] https://lore.kernel.org/lkml/CAG48ez3ShUYey+ZAFsU2i1RpQn0a5eOs2hzQ426FkcgnfUGLvA@mail.gmail.com
- [4] https://docs.google.com/document/d/1O2jwK4dxI3nRcOJuPYkonhTkNQfbmwdvxQMyXgeaRHo/edit#heading=h.bvaojj9fu6hc

README  (50 changed lines)

@@ -1,6 +1,4 @@
# Linux Kernel

There are several guides for kernel developers and users. These guides can
be rendered in a number of formats, like HTML and PDF. Please read
Documentation/admin-guide/README.rst first.

@@ -9,6 +7,8 @@ In order to build the documentation, use ``make htmldocs`` or
``make pdfdocs``. The formatted documentation can also be read online at:

https://www.kernel.org/doc/html/latest/

![](Documentation/images/linux_img.png)

There are various text files in the Documentation/ subdirectory,
several of them using the reStructuredText markup notation.

@@ -16,3 +16,47 @@ several of them using the reStructuredText markup notation.
Please read the Documentation/process/changes.rst file, as it contains the
requirements for building and running the kernel, and information about
the problems which may result by upgrading your kernel.

## 📚 Guides and Documentation
Explore the [`Documentation/`](https://docs.kernel.org/) subdirectory for detailed guides and documentation formatted in reStructuredText.
Before building or running the kernel, review [`Documentation/process/changes.rst`](Documentation/process/changes.rst) for prerequisites and upgrade information.
## ⚙️ Customization
Customize the Linux kernel using configuration options for features, drivers, and subsystems to meet specific hardware or application requirements.
### Benefits
- Optimization for specific hardware
- Reduced kernel image size
- Enhanced security by enabling/disabling features
## 🌍 Open Source
The Linux kernel is open-source under the GNU GPL, fostering a diverse community of contributors and enabling innovation and flexibility.
### Features
- Transparent development process
- Extensive community support and documentation
- Customization for various use cases and hardware platforms
## 🔒 Security
The Linux kernel includes robust security features such as access control, memory protection, and support for advanced security modules.
### Benefits
- Address Space Layout Randomization (ASLR)
- Support for Security-Enhanced Linux (SELinux) and AppArmor
- Regular security audits and patch management
## 📅 Long-Term Support (LTS)
LTS versions of the Linux kernel receive updates and security patches for approximately two years, providing reliability and stability for long-term deployments.
### Benefits
- Stability prioritized over new features
- Extended support for critical infrastructure and business applications
- Smooth upgrade process and community support
## 🚗 Driver Support
The Linux kernel provides in-kernel drivers for a wide range of hardware components and supports third-party drivers for broader hardware compatibility.
### Benefits
- Comprehensive hardware support
- Plug-and-play functionality
- Stable API for driver development

62
README.md Normal file
View file

@ -0,0 +1,62 @@
# Linux Kernel
There are several guides for kernel developers and users. These guides can
be rendered in a number of formats, like HTML and PDF. Please read
Documentation/admin-guide/README.rst first.
In order to build the documentation, use ``make htmldocs`` or
``make pdfdocs``. The formatted documentation can also be read online at:
https://www.kernel.org/doc/html/latest/
![](Documentation/images/linux_img.png)
There are various text files in the Documentation/ subdirectory,
several of them using the reStructuredText markup notation.
Please read the Documentation/process/changes.rst file, as it contains the
requirements for building and running the kernel, and information about
the problems which may result by upgrading your kernel.
## 📚 Guides and Documentation
Explore the [`Documentation/`](https://docs.kernel.org/) subdirectory for detailed guides and documentation formatted in reStructuredText.
Before building or running the kernel, review [`Documentation/process/changes.rst`](Documentation/process/changes.rst) for prerequisites and upgrade information.
## ⚙️ Customization
Customize the Linux kernel using configuration options for features, drivers, and subsystems to meet specific hardware or application requirements.
### Benefits
- Optimization for specific hardware
- Reduced kernel image size
- Enhanced security by enabling/disabling features
## 🌍 Open Source
The Linux kernel is open-source under the GNU GPL, fostering a diverse community of contributors and enabling innovation and flexibility.
### Features
- Transparent development process
- Extensive community support and documentation
- Customization for various use cases and hardware platforms
## 🔒 Security
The Linux kernel includes robust security features such as access control, memory protection, and support for advanced security modules.
### Benefits
- Address Space Layout Randomization (ASLR)
- Support for Security-Enhanced Linux (SELinux) and AppArmor
- Regular security audits and patch management
## 📅 Long-Term Support (LTS)
LTS versions of the Linux kernel receive updates and security patches for approximately two years, providing reliability and stability for long-term deployments.
### Benefits
- Stability prioritized over new features
- Extended support for critical infrastructure and business applications
- Smooth upgrade process and community support
## 🚗 Driver Support
The Linux kernel provides in-kernel drivers for a wide range of hardware components and supports third-party drivers for broader hardware compatibility.
### Benefits
- Comprehensive hardware support
- Plug-and-play functionality
- Stable API for driver development

View file

@ -261,12 +261,6 @@ static noinstr bool handle_bug(struct pt_regs *regs)
int ud_type; int ud_type;
u32 imm; u32 imm;
/*
* Normally @regs are unpoisoned by irqentry_enter(), but handle_bug()
* is a rare case that uses @regs without passing them to
* irqentry_enter().
*/
kmsan_unpoison_entry_regs(regs);
ud_type = decode_bug(regs->ip, &imm); ud_type = decode_bug(regs->ip, &imm);
if (ud_type == BUG_NONE) if (ud_type == BUG_NONE)
return handled; return handled;
@ -275,6 +269,12 @@ static noinstr bool handle_bug(struct pt_regs *regs)
* All lies, just get the WARN/BUG out. * All lies, just get the WARN/BUG out.
*/ */
instrumentation_begin(); instrumentation_begin();
/*
* Normally @regs are unpoisoned by irqentry_enter(), but handle_bug()
* is a rare case that uses @regs without passing them to
* irqentry_enter().
*/
kmsan_unpoison_entry_regs(regs);
/* /*
* Since we're emulating a CALL with exceptions, restore the interrupt * Since we're emulating a CALL with exceptions, restore the interrupt
* state to what it was at the exception site. * state to what it was at the exception site.

View file

@ -674,6 +674,16 @@ EXPORT_SYMBOL_GPL(tpm_chip_register);
*/ */
void tpm_chip_unregister(struct tpm_chip *chip) void tpm_chip_unregister(struct tpm_chip *chip)
{ {
#ifdef CONFIG_TCG_TPM2_HMAC
int rc;
rc = tpm_try_get_ops(chip);
if (!rc) {
tpm2_end_auth_session(chip);
tpm_put_ops(chip);
}
#endif
tpm_del_legacy_sysfs(chip); tpm_del_legacy_sysfs(chip);
if (tpm_is_hwrng_enabled(chip)) if (tpm_is_hwrng_enabled(chip))
hwrng_unregister(&chip->hwrng); hwrng_unregister(&chip->hwrng);

View file

@ -27,6 +27,9 @@ static ssize_t tpm_dev_transmit(struct tpm_chip *chip, struct tpm_space *space,
struct tpm_header *header = (void *)buf; struct tpm_header *header = (void *)buf;
ssize_t ret, len; ssize_t ret, len;
if (chip->flags & TPM_CHIP_FLAG_TPM2)
tpm2_end_auth_session(chip);
ret = tpm2_prepare_space(chip, space, buf, bufsiz); ret = tpm2_prepare_space(chip, space, buf, bufsiz);
/* If the command is not implemented by the TPM, synthesize a /* If the command is not implemented by the TPM, synthesize a
* response with a TPM2_RC_COMMAND_CODE return for user-space. * response with a TPM2_RC_COMMAND_CODE return for user-space.

View file

@ -379,10 +379,12 @@ int tpm_pm_suspend(struct device *dev)
rc = tpm_try_get_ops(chip); rc = tpm_try_get_ops(chip);
if (!rc) { if (!rc) {
if (chip->flags & TPM_CHIP_FLAG_TPM2) if (chip->flags & TPM_CHIP_FLAG_TPM2) {
tpm2_end_auth_session(chip);
tpm2_shutdown(chip, TPM2_SU_STATE); tpm2_shutdown(chip, TPM2_SU_STATE);
else } else {
rc = tpm1_pm_suspend(chip, tpm_suspend_pcr); rc = tpm1_pm_suspend(chip, tpm_suspend_pcr);
}
tpm_put_ops(chip); tpm_put_ops(chip);
} }

View file

@ -333,6 +333,9 @@ void tpm_buf_append_hmac_session(struct tpm_chip *chip, struct tpm_buf *buf,
} }
#ifdef CONFIG_TCG_TPM2_HMAC #ifdef CONFIG_TCG_TPM2_HMAC
/* The first write to /dev/tpm{rm0} will flush the session. */
attributes |= TPM2_SA_CONTINUE_SESSION;
/* /*
* The Architecture Guide requires us to strip trailing zeros * The Architecture Guide requires us to strip trailing zeros
* before computing the HMAC * before computing the HMAC
@ -484,7 +487,8 @@ static void tpm2_KDFe(u8 z[EC_PT_SZ], const char *str, u8 *pt_u, u8 *pt_v,
sha256_final(&sctx, out); sha256_final(&sctx, out);
} }
static void tpm_buf_append_salt(struct tpm_buf *buf, struct tpm_chip *chip) static void tpm_buf_append_salt(struct tpm_buf *buf, struct tpm_chip *chip,
struct tpm2_auth *auth)
{ {
struct crypto_kpp *kpp; struct crypto_kpp *kpp;
struct kpp_request *req; struct kpp_request *req;
@ -543,7 +547,7 @@ static void tpm_buf_append_salt(struct tpm_buf *buf, struct tpm_chip *chip)
sg_set_buf(&s[0], chip->null_ec_key_x, EC_PT_SZ); sg_set_buf(&s[0], chip->null_ec_key_x, EC_PT_SZ);
sg_set_buf(&s[1], chip->null_ec_key_y, EC_PT_SZ); sg_set_buf(&s[1], chip->null_ec_key_y, EC_PT_SZ);
kpp_request_set_input(req, s, EC_PT_SZ*2); kpp_request_set_input(req, s, EC_PT_SZ*2);
sg_init_one(d, chip->auth->salt, EC_PT_SZ); sg_init_one(d, auth->salt, EC_PT_SZ);
kpp_request_set_output(req, d, EC_PT_SZ); kpp_request_set_output(req, d, EC_PT_SZ);
crypto_kpp_compute_shared_secret(req); crypto_kpp_compute_shared_secret(req);
kpp_request_free(req); kpp_request_free(req);
@ -554,8 +558,7 @@ static void tpm_buf_append_salt(struct tpm_buf *buf, struct tpm_chip *chip)
* This works because KDFe fully consumes the secret before it * This works because KDFe fully consumes the secret before it
* writes the salt * writes the salt
*/ */
tpm2_KDFe(chip->auth->salt, "SECRET", x, chip->null_ec_key_x, tpm2_KDFe(auth->salt, "SECRET", x, chip->null_ec_key_x, auth->salt);
chip->auth->salt);
out: out:
crypto_free_kpp(kpp); crypto_free_kpp(kpp);
@ -853,7 +856,9 @@ int tpm_buf_check_hmac_response(struct tpm_chip *chip, struct tpm_buf *buf,
if (rc) if (rc)
/* manually close the session if it wasn't consumed */ /* manually close the session if it wasn't consumed */
tpm2_flush_context(chip, auth->handle); tpm2_flush_context(chip, auth->handle);
memzero_explicit(auth, sizeof(*auth));
kfree_sensitive(auth);
chip->auth = NULL;
} else { } else {
/* reset for next use */ /* reset for next use */
auth->session = TPM_HEADER_SIZE; auth->session = TPM_HEADER_SIZE;
@ -881,7 +886,8 @@ void tpm2_end_auth_session(struct tpm_chip *chip)
return; return;
tpm2_flush_context(chip, auth->handle); tpm2_flush_context(chip, auth->handle);
memzero_explicit(auth, sizeof(*auth)); kfree_sensitive(auth);
chip->auth = NULL;
} }
EXPORT_SYMBOL(tpm2_end_auth_session); EXPORT_SYMBOL(tpm2_end_auth_session);
@ -915,33 +921,37 @@ static int tpm2_parse_start_auth_session(struct tpm2_auth *auth,
static int tpm2_load_null(struct tpm_chip *chip, u32 *null_key) static int tpm2_load_null(struct tpm_chip *chip, u32 *null_key)
{ {
int rc;
unsigned int offset = 0; /* dummy offset for null seed context */ unsigned int offset = 0; /* dummy offset for null seed context */
u8 name[SHA256_DIGEST_SIZE + 2]; u8 name[SHA256_DIGEST_SIZE + 2];
u32 tmp_null_key;
int rc;
rc = tpm2_load_context(chip, chip->null_key_context, &offset, rc = tpm2_load_context(chip, chip->null_key_context, &offset,
null_key); &tmp_null_key);
if (rc != -EINVAL) if (rc != -EINVAL) {
return rc; if (!rc)
*null_key = tmp_null_key;
goto err;
}
/* an integrity failure may mean the TPM has been reset */ /* Try to re-create null key, given the integrity failure: */
dev_err(&chip->dev, "NULL key integrity failure!\n"); rc = tpm2_create_primary(chip, TPM2_RH_NULL, &tmp_null_key, name);
/* check the null name against what we know */ if (rc)
tpm2_create_primary(chip, TPM2_RH_NULL, NULL, name); goto err;
if (memcmp(name, chip->null_key_name, sizeof(name)) == 0)
/* name unchanged, assume transient integrity failure */ /* Return null key if the name has not been changed: */
return rc; if (!memcmp(name, chip->null_key_name, sizeof(name))) {
/* *null_key = tmp_null_key;
* Fatal TPM failure: the NULL seed has actually changed, so return 0;
* the TPM must have been illegally reset. All in-kernel TPM }
* operations will fail because the NULL primary can't be
* loaded to salt the sessions, but disable the TPM anyway so /* Deduce from the name change TPM interference: */
* userspace programmes can't be compromised by it. dev_err(&chip->dev, "null key integrity check failed\n");
*/ tpm2_flush_context(chip, tmp_null_key);
dev_err(&chip->dev, "NULL name has changed, disabling TPM due to interference\n");
chip->flags |= TPM_CHIP_FLAG_DISABLE; chip->flags |= TPM_CHIP_FLAG_DISABLE;
return rc; err:
return rc ? -ENODEV : 0;
} }
/** /**
@ -958,16 +968,20 @@ static int tpm2_load_null(struct tpm_chip *chip, u32 *null_key)
*/ */
int tpm2_start_auth_session(struct tpm_chip *chip) int tpm2_start_auth_session(struct tpm_chip *chip)
{ {
struct tpm2_auth *auth;
struct tpm_buf buf; struct tpm_buf buf;
struct tpm2_auth *auth = chip->auth;
int rc;
u32 null_key; u32 null_key;
int rc;
if (!auth) { if (chip->auth) {
dev_warn_once(&chip->dev, "auth session is not active\n"); dev_warn_once(&chip->dev, "auth session is active\n");
return 0; return 0;
} }
auth = kzalloc(sizeof(*auth), GFP_KERNEL);
if (!auth)
return -ENOMEM;
rc = tpm2_load_null(chip, &null_key); rc = tpm2_load_null(chip, &null_key);
if (rc) if (rc)
goto out; goto out;
@ -988,7 +1002,7 @@ int tpm2_start_auth_session(struct tpm_chip *chip)
tpm_buf_append(&buf, auth->our_nonce, sizeof(auth->our_nonce)); tpm_buf_append(&buf, auth->our_nonce, sizeof(auth->our_nonce));
/* append encrypted salt and squirrel away unencrypted in auth */ /* append encrypted salt and squirrel away unencrypted in auth */
tpm_buf_append_salt(&buf, chip); tpm_buf_append_salt(&buf, chip, auth);
/* session type (HMAC, audit or policy) */ /* session type (HMAC, audit or policy) */
tpm_buf_append_u8(&buf, TPM2_SE_HMAC); tpm_buf_append_u8(&buf, TPM2_SE_HMAC);
@ -1010,10 +1024,13 @@ int tpm2_start_auth_session(struct tpm_chip *chip)
tpm_buf_destroy(&buf); tpm_buf_destroy(&buf);
if (rc) if (rc == TPM2_RC_SUCCESS) {
goto out; chip->auth = auth;
return 0;
}
out: out:
kfree_sensitive(auth);
return rc; return rc;
} }
EXPORT_SYMBOL(tpm2_start_auth_session); EXPORT_SYMBOL(tpm2_start_auth_session);
@ -1347,18 +1364,21 @@ static int tpm2_create_null_primary(struct tpm_chip *chip)
* *
* Derive and context save the null primary and allocate memory in the * Derive and context save the null primary and allocate memory in the
* struct tpm_chip for the authorizations. * struct tpm_chip for the authorizations.
*
* Return:
* * 0 - OK
* * -errno - A system error
* * TPM_RC - A TPM error
*/ */
int tpm2_sessions_init(struct tpm_chip *chip) int tpm2_sessions_init(struct tpm_chip *chip)
{ {
int rc; int rc;
rc = tpm2_create_null_primary(chip); rc = tpm2_create_null_primary(chip);
if (rc) if (rc) {
dev_err(&chip->dev, "TPM: security failed (NULL seed derivation): %d\n", rc); dev_err(&chip->dev, "null key creation failed with %d\n", rc);
return rc;
chip->auth = kmalloc(sizeof(*chip->auth), GFP_KERNEL); }
if (!chip->auth)
return -ENOMEM;
return rc; return rc;
} }

View file

@ -3651,7 +3651,7 @@ static int do_device_access(struct sdeb_store_info *sip, struct scsi_cmnd *scp,
enum dma_data_direction dir; enum dma_data_direction dir;
struct scsi_data_buffer *sdb = &scp->sdb; struct scsi_data_buffer *sdb = &scp->sdb;
u8 *fsp; u8 *fsp;
int i; int i, total = 0;
/* /*
* Even though reads are inherently atomic (in this driver), we expect * Even though reads are inherently atomic (in this driver), we expect
@ -3688,18 +3688,16 @@ static int do_device_access(struct sdeb_store_info *sip, struct scsi_cmnd *scp,
fsp + (block * sdebug_sector_size), fsp + (block * sdebug_sector_size),
sdebug_sector_size, sg_skip, do_write); sdebug_sector_size, sg_skip, do_write);
sdeb_data_sector_unlock(sip, do_write); sdeb_data_sector_unlock(sip, do_write);
if (ret != sdebug_sector_size) { total += ret;
ret += (i * sdebug_sector_size); if (ret != sdebug_sector_size)
break; break;
}
sg_skip += sdebug_sector_size; sg_skip += sdebug_sector_size;
if (++block >= sdebug_store_sectors) if (++block >= sdebug_store_sectors)
block = 0; block = 0;
} }
ret = num * sdebug_sector_size;
sdeb_data_unlock(sip, atomic); sdeb_data_unlock(sip, atomic);
return ret; return total;
} }
/* Returns number of bytes copied or -1 if error. */ /* Returns number of bytes copied or -1 if error. */

View file

@ -8219,7 +8219,7 @@ static void ufshcd_update_rtc(struct ufs_hba *hba)
err = ufshcd_query_attr(hba, UPIU_QUERY_OPCODE_WRITE_ATTR, QUERY_ATTR_IDN_SECONDS_PASSED, err = ufshcd_query_attr(hba, UPIU_QUERY_OPCODE_WRITE_ATTR, QUERY_ATTR_IDN_SECONDS_PASSED,
0, 0, &val); 0, 0, &val);
ufshcd_rpm_put_sync(hba); ufshcd_rpm_put(hba);
if (err) if (err)
dev_err(hba->dev, "%s: Failed to update rtc %d\n", __func__, err); dev_err(hba->dev, "%s: Failed to update rtc %d\n", __func__, err);

View file

@ -401,6 +401,7 @@ void nilfs_clear_folio_dirty(struct folio *folio)
folio_clear_uptodate(folio); folio_clear_uptodate(folio);
folio_clear_mappedtodisk(folio); folio_clear_mappedtodisk(folio);
folio_clear_checked(folio);
head = folio_buffers(folio); head = folio_buffers(folio);
if (head) { if (head) {

View file

@ -1787,6 +1787,14 @@ int ocfs2_remove_inode_range(struct inode *inode,
return 0; return 0;
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
int id_count = ocfs2_max_inline_data_with_xattr(inode->i_sb, di);
if (byte_start > id_count || byte_start + byte_len > id_count) {
ret = -EINVAL;
mlog_errno(ret);
goto out;
}
ret = ocfs2_truncate_inline(inode, di_bh, byte_start, ret = ocfs2_truncate_inline(inode, di_bh, byte_start,
byte_start + byte_len, 0); byte_start + byte_len, 0);
if (ret) { if (ret) {

View file

@ -692,6 +692,34 @@ void dup_userfaultfd_complete(struct list_head *fcs)
} }
} }
void dup_userfaultfd_fail(struct list_head *fcs)
{
struct userfaultfd_fork_ctx *fctx, *n;
/*
* An error has occurred on fork, we will tear memory down, but have
* allocated memory for fctx's and raised reference counts for both the
* original and child contexts (and on the mm for each as a result).
*
* These would ordinarily be taken care of by a user handling the event,
* but we are no longer doing so, so manually clean up here.
*
* mm tear down will take care of cleaning up VMA contexts.
*/
list_for_each_entry_safe(fctx, n, fcs, list) {
struct userfaultfd_ctx *octx = fctx->orig;
struct userfaultfd_ctx *ctx = fctx->new;
atomic_dec(&octx->mmap_changing);
VM_BUG_ON(atomic_read(&octx->mmap_changing) < 0);
userfaultfd_ctx_put(octx);
userfaultfd_ctx_put(ctx);
list_del(&fctx->list);
kfree(fctx);
}
}
void mremap_userfaultfd_prep(struct vm_area_struct *vma, void mremap_userfaultfd_prep(struct vm_area_struct *vma,
struct vm_userfaultfd_ctx *vm_ctx) struct vm_userfaultfd_ctx *vm_ctx)
{ {

View file

@ -54,12 +54,11 @@ static inline long mm_ksm_zero_pages(struct mm_struct *mm)
return atomic_long_read(&mm->ksm_zero_pages); return atomic_long_read(&mm->ksm_zero_pages);
} }
static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) static inline void ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
{ {
/* Adding mm to ksm is best effort on fork. */
if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
return __ksm_enter(mm); __ksm_enter(mm);
return 0;
} }
static inline int ksm_execve(struct mm_struct *mm) static inline int ksm_execve(struct mm_struct *mm)
@ -107,9 +106,8 @@ static inline int ksm_disable(struct mm_struct *mm)
return 0; return 0;
} }
static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) static inline void ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
{ {
return 0;
} }
static inline int ksm_execve(struct mm_struct *mm) static inline int ksm_execve(struct mm_struct *mm)

View file

@ -38,6 +38,7 @@
#else #else
#define can_do_masked_user_access() 0 #define can_do_masked_user_access() 0
#define masked_user_access_begin(src) NULL #define masked_user_access_begin(src) NULL
#define mask_user_address(src) (src)
#endif #endif
/* /*
@ -159,19 +160,27 @@ _inline_copy_from_user(void *to, const void __user *from, unsigned long n)
{ {
unsigned long res = n; unsigned long res = n;
might_fault(); might_fault();
if (!should_fail_usercopy() && likely(access_ok(from, n))) { if (should_fail_usercopy())
goto fail;
if (can_do_masked_user_access())
from = mask_user_address(from);
else {
if (!access_ok(from, n))
goto fail;
/* /*
* Ensure that bad access_ok() speculation will not * Ensure that bad access_ok() speculation will not
* lead to nasty side effects *after* the copy is * lead to nasty side effects *after* the copy is
* finished: * finished:
*/ */
barrier_nospec(); barrier_nospec();
instrument_copy_from_user_before(to, from, n);
res = raw_copy_from_user(to, from, n);
instrument_copy_from_user_after(to, from, n, res);
} }
if (unlikely(res)) instrument_copy_from_user_before(to, from, n);
memset(to + (n - res), 0, res); res = raw_copy_from_user(to, from, n);
instrument_copy_from_user_after(to, from, n, res);
if (likely(!res))
return 0;
fail:
memset(to + (n - res), 0, res);
return res; return res;
} }
extern __must_check unsigned long extern __must_check unsigned long
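The hunk above lets _inline_copy_from_user() replace the access_ok()-plus-barrier_nospec() path with mask_user_address() where masked user access is supported. A hedged, architecture-neutral sketch of the idea follows (not the kernel's actual implementation; USER_PTR_MAX_SKETCH is an invented placeholder for the highest valid user address): an out-of-range pointer is forced to all 1's, so the copy simply faults instead of requiring a speculation barrier.

```c
#include <stdint.h>
#include <stdio.h>

/* Invented placeholder for the highest valid user-space address;
 * the real boundary is architecture-specific. */
#define USER_PTR_MAX_SKETCH	((1UL << 47) - 4096)

/* Conceptual mask_user_address(): valid pointers pass through unchanged,
 * anything above the boundary becomes ~0UL and will fault when accessed. */
static inline unsigned long mask_user_address_sketch(unsigned long ptr)
{
	unsigned long mask = 0UL - (ptr > USER_PTR_MAX_SKETCH);

	return ptr | mask;	/* all 1's if the pointer was out of range */
}

int main(void)
{
	printf("%#lx -> %#lx\n", 0x1000UL, mask_user_address_sketch(0x1000UL));
	printf("%#lx -> %#lx\n", ~0UL - 8, mask_user_address_sketch(~0UL - 8));
	return 0;
}
```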

View file

@ -249,6 +249,7 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma,
extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *); extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *);
extern void dup_userfaultfd_complete(struct list_head *); extern void dup_userfaultfd_complete(struct list_head *);
void dup_userfaultfd_fail(struct list_head *);
extern void mremap_userfaultfd_prep(struct vm_area_struct *, extern void mremap_userfaultfd_prep(struct vm_area_struct *,
struct vm_userfaultfd_ctx *); struct vm_userfaultfd_ctx *);
@ -351,6 +352,10 @@ static inline void dup_userfaultfd_complete(struct list_head *l)
{ {
} }
static inline void dup_userfaultfd_fail(struct list_head *l)
{
}
static inline void mremap_userfaultfd_prep(struct vm_area_struct *vma, static inline void mremap_userfaultfd_prep(struct vm_area_struct *vma,
struct vm_userfaultfd_ctx *ctx) struct vm_userfaultfd_ctx *ctx)
{ {

View file

@ -24,6 +24,23 @@
DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE); DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE);
EXPORT_SYMBOL(cgroup_bpf_enabled_key); EXPORT_SYMBOL(cgroup_bpf_enabled_key);
/*
* cgroup bpf destruction makes heavy use of work items and there can be a lot
* of concurrent destructions. Use a separate workqueue so that cgroup bpf
* destruction work items don't end up filling up max_active of system_wq
* which may lead to deadlock.
*/
static struct workqueue_struct *cgroup_bpf_destroy_wq;
static int __init cgroup_bpf_wq_init(void)
{
cgroup_bpf_destroy_wq = alloc_workqueue("cgroup_bpf_destroy", 0, 1);
if (!cgroup_bpf_destroy_wq)
panic("Failed to alloc workqueue for cgroup bpf destroy.\n");
return 0;
}
core_initcall(cgroup_bpf_wq_init);
/* __always_inline is necessary to prevent indirect call through run_prog /* __always_inline is necessary to prevent indirect call through run_prog
* function pointer. * function pointer.
*/ */
@ -334,7 +351,7 @@ static void cgroup_bpf_release_fn(struct percpu_ref *ref)
struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt); struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt);
INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release); INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release);
queue_work(system_wq, &cgrp->bpf.release_work); queue_work(cgroup_bpf_destroy_wq, &cgrp->bpf.release_work);
} }
/* Get underlying bpf_prog of bpf_prog_list entry, regardless if it's through /* Get underlying bpf_prog of bpf_prog_list entry, regardless if it's through

View file

@ -5789,7 +5789,7 @@ static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
{ {
struct cgroup *cgroup; struct cgroup *cgroup;
int ret = false; int ret = false;
int level = 1; int level = 0;
lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&cgroup_mutex);
@ -5797,7 +5797,7 @@ static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
if (cgroup->nr_descendants >= cgroup->max_descendants) if (cgroup->nr_descendants >= cgroup->max_descendants)
goto fail; goto fail;
if (level > cgroup->max_depth) if (level >= cgroup->max_depth)
goto fail; goto fail;
level++; level++;

View file

@ -653,11 +653,6 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
mm->exec_vm = oldmm->exec_vm; mm->exec_vm = oldmm->exec_vm;
mm->stack_vm = oldmm->stack_vm; mm->stack_vm = oldmm->stack_vm;
retval = ksm_fork(mm, oldmm);
if (retval)
goto out;
khugepaged_fork(mm, oldmm);
/* Use __mt_dup() to efficiently build an identical maple tree. */ /* Use __mt_dup() to efficiently build an identical maple tree. */
retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL); retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL);
if (unlikely(retval)) if (unlikely(retval))
@ -760,6 +755,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
vma_iter_free(&vmi); vma_iter_free(&vmi);
if (!retval) { if (!retval) {
mt_set_in_rcu(vmi.mas.tree); mt_set_in_rcu(vmi.mas.tree);
ksm_fork(mm, oldmm);
khugepaged_fork(mm, oldmm);
} else if (mpnt) { } else if (mpnt) {
/* /*
* The entire maple tree has already been duplicated. If the * The entire maple tree has already been duplicated. If the
@ -775,7 +772,10 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
mmap_write_unlock(mm); mmap_write_unlock(mm);
flush_tlb_mm(oldmm); flush_tlb_mm(oldmm);
mmap_write_unlock(oldmm); mmap_write_unlock(oldmm);
dup_userfaultfd_complete(&uf); if (!retval)
dup_userfaultfd_complete(&uf);
else
dup_userfaultfd_fail(&uf);
fail_uprobe_end: fail_uprobe_end:
uprobe_end_dup_mmap(); uprobe_end_dup_mmap();
return retval; return retval;

View file

@ -459,9 +459,7 @@ int walk_system_ram_res_rev(u64 start, u64 end, void *arg,
rams_size += 16; rams_size += 16;
} }
rams[i].start = res.start; rams[i++] = res;
rams[i++].end = res.end;
start = res.end + 1; start = res.end + 1;
} }

View file

@ -862,7 +862,8 @@ static DEFINE_MUTEX(scx_ops_enable_mutex);
DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled); DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled);
DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED); static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED);
static atomic_t scx_ops_bypass_depth = ATOMIC_INIT(0); static int scx_ops_bypass_depth;
static DEFINE_RAW_SPINLOCK(__scx_ops_bypass_lock);
static bool scx_ops_init_task_enabled; static bool scx_ops_init_task_enabled;
static bool scx_switching_all; static bool scx_switching_all;
DEFINE_STATIC_KEY_FALSE(__scx_switched_all); DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
@ -4298,18 +4299,20 @@ bool task_should_scx(struct task_struct *p)
*/ */
static void scx_ops_bypass(bool bypass) static void scx_ops_bypass(bool bypass)
{ {
int depth, cpu; int cpu;
unsigned long flags;
raw_spin_lock_irqsave(&__scx_ops_bypass_lock, flags);
if (bypass) { if (bypass) {
depth = atomic_inc_return(&scx_ops_bypass_depth); scx_ops_bypass_depth++;
WARN_ON_ONCE(depth <= 0); WARN_ON_ONCE(scx_ops_bypass_depth <= 0);
if (depth != 1) if (scx_ops_bypass_depth != 1)
return; goto unlock;
} else { } else {
depth = atomic_dec_return(&scx_ops_bypass_depth); scx_ops_bypass_depth--;
WARN_ON_ONCE(depth < 0); WARN_ON_ONCE(scx_ops_bypass_depth < 0);
if (depth != 0) if (scx_ops_bypass_depth != 0)
return; goto unlock;
} }
/* /*
@ -4326,7 +4329,7 @@ static void scx_ops_bypass(bool bypass)
struct rq_flags rf; struct rq_flags rf;
struct task_struct *p, *n; struct task_struct *p, *n;
rq_lock_irqsave(rq, &rf); rq_lock(rq, &rf);
if (bypass) { if (bypass) {
WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING); WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING);
@ -4362,11 +4365,13 @@ static void scx_ops_bypass(bool bypass)
sched_enq_and_set_task(&ctx); sched_enq_and_set_task(&ctx);
} }
rq_unlock_irqrestore(rq, &rf); rq_unlock(rq, &rf);
/* resched to restore ticks and idle state */ /* resched to restore ticks and idle state */
resched_cpu(cpu); resched_cpu(cpu);
} }
unlock:
raw_spin_unlock_irqrestore(&__scx_ops_bypass_lock, flags);
} }
static void free_exit_info(struct scx_exit_info *ei) static void free_exit_info(struct scx_exit_info *ei)

View file

@ -141,7 +141,7 @@ static void test_kmalloc_redzone_access(struct kunit *test)
{ {
struct kmem_cache *s = test_kmem_cache_create("TestSlub_RZ_kmalloc", 32, struct kmem_cache *s = test_kmem_cache_create("TestSlub_RZ_kmalloc", 32,
SLAB_KMALLOC|SLAB_STORE_USER|SLAB_RED_ZONE); SLAB_KMALLOC|SLAB_STORE_USER|SLAB_RED_ZONE);
u8 *p = __kmalloc_cache_noprof(s, GFP_KERNEL, 18); u8 *p = alloc_hooks(__kmalloc_cache_noprof(s, GFP_KERNEL, 18));
kasan_disable_current(); kasan_disable_current();

View file

@ -1085,7 +1085,6 @@ config HMM_MIRROR
depends on MMU depends on MMU
config GET_FREE_REGION config GET_FREE_REGION
depends on SPARSEMEM
bool bool
config DEVICE_PRIVATE config DEVICE_PRIVATE

View file

@ -4187,6 +4187,8 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
} }
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
static DECLARE_WAIT_QUEUE_HEAD(swapcache_wq);
/* /*
* We enter with non-exclusive mmap_lock (to exclude vma changes, * We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked. * but allow concurrent faults), and pte mapped but not yet locked.
@ -4199,6 +4201,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
{ {
struct vm_area_struct *vma = vmf->vma; struct vm_area_struct *vma = vmf->vma;
struct folio *swapcache, *folio = NULL; struct folio *swapcache, *folio = NULL;
DECLARE_WAITQUEUE(wait, current);
struct page *page; struct page *page;
struct swap_info_struct *si = NULL; struct swap_info_struct *si = NULL;
rmap_t rmap_flags = RMAP_NONE; rmap_t rmap_flags = RMAP_NONE;
@ -4297,7 +4300,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
* Relax a bit to prevent rapid * Relax a bit to prevent rapid
* repeated page faults. * repeated page faults.
*/ */
add_wait_queue(&swapcache_wq, &wait);
schedule_timeout_uninterruptible(1); schedule_timeout_uninterruptible(1);
remove_wait_queue(&swapcache_wq, &wait);
goto out_page; goto out_page;
} }
need_clear_cache = true; need_clear_cache = true;
@ -4604,8 +4609,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
pte_unmap_unlock(vmf->pte, vmf->ptl); pte_unmap_unlock(vmf->pte, vmf->ptl);
out: out:
/* Clear the swap cache pin for direct swapin after PTL unlock */ /* Clear the swap cache pin for direct swapin after PTL unlock */
if (need_clear_cache) if (need_clear_cache) {
swapcache_clear(si, entry, nr_pages); swapcache_clear(si, entry, nr_pages);
if (waitqueue_active(&swapcache_wq))
wake_up(&swapcache_wq);
}
if (si) if (si)
put_swap_device(si); put_swap_device(si);
return ret; return ret;
@ -4620,8 +4628,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
folio_unlock(swapcache); folio_unlock(swapcache);
folio_put(swapcache); folio_put(swapcache);
} }
if (need_clear_cache) if (need_clear_cache) {
swapcache_clear(si, entry, nr_pages); swapcache_clear(si, entry, nr_pages);
if (waitqueue_active(&swapcache_wq))
wake_up(&swapcache_wq);
}
if (si) if (si)
put_swap_device(si); put_swap_device(si);
return ret; return ret;

View file

@ -1418,6 +1418,13 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
vmg.flags = vm_flags; vmg.flags = vm_flags;
} }
/*
* clear PTEs while the vma is still in the tree so that rmap
* cannot race with the freeing later in the truncate scenario.
* This is also needed for call_mmap(), which is why vm_ops
* close function is called.
*/
vms_clean_up_area(&vms, &mas_detach);
vma = vma_merge_new_range(&vmg); vma = vma_merge_new_range(&vmg);
if (vma) if (vma)
goto expanded; goto expanded;
@ -1439,11 +1446,6 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
if (file) { if (file) {
vma->vm_file = get_file(file); vma->vm_file = get_file(file);
/*
* call_mmap() may map PTE, so ensure there are no existing PTEs
* and call the vm_ops close function if one exists.
*/
vms_clean_up_area(&vms, &mas_detach);
error = call_mmap(file, vma); error = call_mmap(file, vma);
if (error) if (error)
goto unmap_and_free_vma; goto unmap_and_free_vma;
@ -1640,6 +1642,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
unsigned long populate = 0; unsigned long populate = 0;
unsigned long ret = -EINVAL; unsigned long ret = -EINVAL;
struct file *file; struct file *file;
vm_flags_t vm_flags;
pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/mm/remap_file_pages.rst.\n", pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/mm/remap_file_pages.rst.\n",
current->comm, current->pid); current->comm, current->pid);
@ -1656,12 +1659,60 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
if (pgoff + (size >> PAGE_SHIFT) < pgoff) if (pgoff + (size >> PAGE_SHIFT) < pgoff)
return ret; return ret;
if (mmap_write_lock_killable(mm)) if (mmap_read_lock_killable(mm))
return -EINTR; return -EINTR;
/*
* Look up VMA under read lock first so we can perform the security
* without holding locks (which can be problematic). We reacquire a
* write lock later and check nothing changed underneath us.
*/
vma = vma_lookup(mm, start);
if (!vma || !(vma->vm_flags & VM_SHARED)) {
mmap_read_unlock(mm);
return -EINVAL;
}
prot |= vma->vm_flags & VM_READ ? PROT_READ : 0;
prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0;
prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0;
flags &= MAP_NONBLOCK;
flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
if (vma->vm_flags & VM_LOCKED)
flags |= MAP_LOCKED;
/* Save vm_flags used to calculate prot and flags, and recheck later. */
vm_flags = vma->vm_flags;
file = get_file(vma->vm_file);
mmap_read_unlock(mm);
/* Call outside mmap_lock to be consistent with other callers. */
ret = security_mmap_file(file, prot, flags);
if (ret) {
fput(file);
return ret;
}
ret = -EINVAL;
/* OK security check passed, take write lock + let it rip. */
if (mmap_write_lock_killable(mm)) {
fput(file);
return -EINTR;
}
vma = vma_lookup(mm, start); vma = vma_lookup(mm, start);
if (!vma || !(vma->vm_flags & VM_SHARED)) if (!vma)
goto out;
/* Make sure things didn't change under us. */
if (vma->vm_flags != vm_flags)
goto out;
if (vma->vm_file != file)
goto out; goto out;
if (start + size > vma->vm_end) { if (start + size > vma->vm_end) {
@ -1689,25 +1740,11 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
goto out; goto out;
} }
prot |= vma->vm_flags & VM_READ ? PROT_READ : 0;
prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0;
prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0;
flags &= MAP_NONBLOCK;
flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
if (vma->vm_flags & VM_LOCKED)
flags |= MAP_LOCKED;
file = get_file(vma->vm_file);
ret = security_mmap_file(vma->vm_file, prot, flags);
if (ret)
goto out_fput;
ret = do_mmap(vma->vm_file, start, size, ret = do_mmap(vma->vm_file, start, size,
prot, flags, 0, pgoff, &populate, NULL); prot, flags, 0, pgoff, &populate, NULL);
out_fput:
fput(file);
out: out:
mmap_write_unlock(mm); mmap_write_unlock(mm);
fput(file);
if (populate) if (populate)
mm_populate(ret, populate); mm_populate(ret, populate);
if (!IS_ERR_VALUE(ret)) if (!IS_ERR_VALUE(ret))
@ -1754,7 +1791,8 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
VMG_STATE(vmg, mm, vmi, addr, addr + len, flags, PHYS_PFN(addr)); VMG_STATE(vmg, mm, vmi, addr, addr + len, flags, PHYS_PFN(addr));
vmg.prev = vma; vmg.prev = vma;
vma_iter_next_range(vmi); /* vmi is positioned at prev, which this mode expects. */
vmg.merge_flags = VMG_FLAG_JUST_EXPAND;
if (vma_merge_new_range(&vmg)) if (vma_merge_new_range(&vmg))
goto out; goto out;

View file

@ -349,7 +349,7 @@ static void __init numa_clear_kernel_node_hotplug(void)
for_each_reserved_mem_region(mb_region) { for_each_reserved_mem_region(mb_region) {
int nid = memblock_get_region_node(mb_region); int nid = memblock_get_region_node(mb_region);
if (nid != MAX_NUMNODES) if (numa_valid_node(nid))
node_set(nid, reserved_nodemask); node_set(nid, reserved_nodemask);
} }

View file

@ -2893,12 +2893,12 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
page = __rmqueue(zone, order, migratetype, alloc_flags); page = __rmqueue(zone, order, migratetype, alloc_flags);
/* /*
* If the allocation fails, allow OOM handling access * If the allocation fails, allow OOM handling and
* to HIGHATOMIC reserves as failing now is worse than * order-0 (atomic) allocs access to HIGHATOMIC
* failing a high-order atomic allocation in the * reserves as failing now is worse than failing a
* future. * high-order atomic allocation in the future.
*/ */
if (!page && (alloc_flags & ALLOC_OOM)) if (!page && (alloc_flags & (ALLOC_OOM|ALLOC_NON_BLOCK)))
page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
if (!page) { if (!page) {

View file

@ -744,7 +744,8 @@ struct folio *folio_walk_start(struct folio_walk *fw,
pud = pudp_get(pudp); pud = pudp_get(pudp);
if (pud_none(pud)) if (pud_none(pud))
goto not_found; goto not_found;
if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && pud_leaf(pud)) { if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) &&
(!pud_present(pud) || pud_leaf(pud))) {
ptl = pud_lock(vma->vm_mm, pudp); ptl = pud_lock(vma->vm_mm, pudp);
pud = pudp_get(pudp); pud = pudp_get(pudp);
@ -753,6 +754,10 @@ struct folio *folio_walk_start(struct folio_walk *fw,
fw->pudp = pudp; fw->pudp = pudp;
fw->pud = pud; fw->pud = pud;
/*
* TODO: FW_MIGRATION support for PUD migration entries
* once there are relevant users.
*/
if (!pud_present(pud) || pud_devmap(pud) || pud_special(pud)) { if (!pud_present(pud) || pud_devmap(pud) || pud_special(pud)) {
spin_unlock(ptl); spin_unlock(ptl);
goto not_found; goto not_found;
@ -769,12 +774,13 @@ struct folio *folio_walk_start(struct folio_walk *fw,
} }
pmd_table: pmd_table:
VM_WARN_ON_ONCE(pud_leaf(*pudp)); VM_WARN_ON_ONCE(!pud_present(pud) || pud_leaf(pud));
pmdp = pmd_offset(pudp, addr); pmdp = pmd_offset(pudp, addr);
pmd = pmdp_get_lockless(pmdp); pmd = pmdp_get_lockless(pmdp);
if (pmd_none(pmd)) if (pmd_none(pmd))
goto not_found; goto not_found;
if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && pmd_leaf(pmd)) { if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) &&
(!pmd_present(pmd) || pmd_leaf(pmd))) {
ptl = pmd_lock(vma->vm_mm, pmdp); ptl = pmd_lock(vma->vm_mm, pmdp);
pmd = pmdp_get(pmdp); pmd = pmdp_get(pmdp);
@ -786,7 +792,7 @@ struct folio *folio_walk_start(struct folio_walk *fw,
if (pmd_none(pmd)) { if (pmd_none(pmd)) {
spin_unlock(ptl); spin_unlock(ptl);
goto not_found; goto not_found;
} else if (!pmd_leaf(pmd)) { } else if (pmd_present(pmd) && !pmd_leaf(pmd)) {
spin_unlock(ptl); spin_unlock(ptl);
goto pte_table; goto pte_table;
} else if (pmd_present(pmd)) { } else if (pmd_present(pmd)) {
@ -812,7 +818,7 @@ struct folio *folio_walk_start(struct folio_walk *fw,
} }
pte_table: pte_table:
VM_WARN_ON_ONCE(pmd_leaf(pmdp_get_lockless(pmdp))); VM_WARN_ON_ONCE(!pmd_present(pmd) || pmd_leaf(pmd));
ptep = pte_offset_map_lock(vma->vm_mm, pmdp, addr, &ptl); ptep = pte_offset_map_lock(vma->vm_mm, pmdp, addr, &ptl);
if (!ptep) if (!ptep)
goto not_found; goto not_found;

View file

@ -1166,7 +1166,9 @@ static int shmem_getattr(struct mnt_idmap *idmap,
stat->attributes_mask |= (STATX_ATTR_APPEND | stat->attributes_mask |= (STATX_ATTR_APPEND |
STATX_ATTR_IMMUTABLE | STATX_ATTR_IMMUTABLE |
STATX_ATTR_NODUMP); STATX_ATTR_NODUMP);
inode_lock_shared(inode);
generic_fillattr(idmap, request_mask, inode, stat); generic_fillattr(idmap, request_mask, inode, stat);
inode_unlock_shared(inode);
if (shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0)) if (shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0))
stat->blksize = HPAGE_PMD_SIZE; stat->blksize = HPAGE_PMD_SIZE;

View file

@ -1209,7 +1209,7 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags)
/* Zero out spare memory. */ /* Zero out spare memory. */
if (want_init_on_alloc(flags)) { if (want_init_on_alloc(flags)) {
kasan_disable_current(); kasan_disable_current();
memset((void *)p + new_size, 0, ks - new_size); memset(kasan_reset_tag(p) + new_size, 0, ks - new_size);
kasan_enable_current(); kasan_enable_current();
} }

View file

@ -917,6 +917,7 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
pgoff_t pgoff = vmg->pgoff; pgoff_t pgoff = vmg->pgoff;
pgoff_t pglen = PHYS_PFN(end - start); pgoff_t pglen = PHYS_PFN(end - start);
bool can_merge_left, can_merge_right; bool can_merge_left, can_merge_right;
bool just_expand = vmg->merge_flags & VMG_FLAG_JUST_EXPAND;
mmap_assert_write_locked(vmg->mm); mmap_assert_write_locked(vmg->mm);
VM_WARN_ON(vmg->vma); VM_WARN_ON(vmg->vma);
@ -930,7 +931,7 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
return NULL; return NULL;
can_merge_left = can_vma_merge_left(vmg); can_merge_left = can_vma_merge_left(vmg);
can_merge_right = can_vma_merge_right(vmg, can_merge_left); can_merge_right = !just_expand && can_vma_merge_right(vmg, can_merge_left);
/* If we can merge with the next VMA, adjust vmg accordingly. */ /* If we can merge with the next VMA, adjust vmg accordingly. */
if (can_merge_right) { if (can_merge_right) {
@ -953,7 +954,11 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
if (can_merge_right && !can_merge_remove_vma(next)) if (can_merge_right && !can_merge_remove_vma(next))
vmg->end = end; vmg->end = end;
vma_prev(vmg->vmi); /* Equivalent to going to the previous range */ /* In expand-only case we are already positioned at prev. */
if (!just_expand) {
/* Equivalent to going to the previous range. */
vma_prev(vmg->vmi);
}
} }
/* /*
@ -967,12 +972,14 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
} }
/* If expansion failed, reset state. Allows us to retry merge later. */ /* If expansion failed, reset state. Allows us to retry merge later. */
vmg->vma = NULL; if (!just_expand) {
vmg->start = start; vmg->vma = NULL;
vmg->end = end; vmg->start = start;
vmg->pgoff = pgoff; vmg->end = end;
if (vmg->vma == prev) vmg->pgoff = pgoff;
vma_iter_set(vmg->vmi, start); if (vmg->vma == prev)
vma_iter_set(vmg->vmi, start);
}
return NULL; return NULL;
} }

View file

@ -59,6 +59,17 @@ enum vma_merge_state {
VMA_MERGE_SUCCESS, VMA_MERGE_SUCCESS,
}; };
enum vma_merge_flags {
VMG_FLAG_DEFAULT = 0,
/*
* If we can expand, simply do so. We know there is nothing to merge to
* the right. Does not reset state upon failure to merge. The VMA
* iterator is assumed to be positioned at the previous VMA, rather than
* at the gap.
*/
VMG_FLAG_JUST_EXPAND = 1 << 0,
};
/* Represents a VMA merge operation. */ /* Represents a VMA merge operation. */
struct vma_merge_struct { struct vma_merge_struct {
struct mm_struct *mm; struct mm_struct *mm;
@ -75,6 +86,7 @@ struct vma_merge_struct {
struct mempolicy *policy; struct mempolicy *policy;
struct vm_userfaultfd_ctx uffd_ctx; struct vm_userfaultfd_ctx uffd_ctx;
struct anon_vma_name *anon_name; struct anon_vma_name *anon_name;
enum vma_merge_flags merge_flags;
enum vma_merge_state state; enum vma_merge_state state;
}; };
@ -99,6 +111,7 @@ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma,
.flags = flags_, \ .flags = flags_, \
.pgoff = pgoff_, \ .pgoff = pgoff_, \
.state = VMA_MERGE_START, \ .state = VMA_MERGE_START, \
.merge_flags = VMG_FLAG_DEFAULT, \
} }
#define VMG_VMA_STATE(name, vmi_, prev_, vma_, start_, end_) \ #define VMG_VMA_STATE(name, vmi_, prev_, vma_, start_, end_) \
@ -118,6 +131,7 @@ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma,
.uffd_ctx = vma_->vm_userfaultfd_ctx, \ .uffd_ctx = vma_->vm_userfaultfd_ctx, \
.anon_name = anon_vma_name(vma_), \ .anon_name = anon_vma_name(vma_), \
.state = VMA_MERGE_START, \ .state = VMA_MERGE_START, \
.merge_flags = VMG_FLAG_DEFAULT, \
} }
#ifdef CONFIG_DEBUG_VM_MAPLE_TREE #ifdef CONFIG_DEBUG_VM_MAPLE_TREE
@ -241,15 +255,9 @@ static inline void vms_abort_munmap_vmas(struct vma_munmap_struct *vms,
* failure method of leaving a gap where the MAP_FIXED mapping failed. * failure method of leaving a gap where the MAP_FIXED mapping failed.
*/ */
mas_set_range(mas, vms->start, vms->end - 1); mas_set_range(mas, vms->start, vms->end - 1);
if (unlikely(mas_store_gfp(mas, NULL, GFP_KERNEL))) { mas_store_gfp(mas, NULL, GFP_KERNEL|__GFP_NOFAIL);
pr_warn_once("%s: (%d) Unable to abort munmap() operation\n", /* Clean up the insertion of the unfortunate gap */
current->comm, current->pid); vms_complete_munmap_vmas(vms, mas_detach);
/* Leaving vmas detached and in-tree may hamper recovery */
reattach_vmas(mas_detach);
} else {
/* Clean up the insertion of the unfortunate gap */
vms_complete_munmap_vmas(vms, mas_detach);
}
} }
int int

View file

@ -94,6 +94,7 @@
#define ARM_CPU_PART_NEOVERSE_V3 0xD84 #define ARM_CPU_PART_NEOVERSE_V3 0xD84
#define ARM_CPU_PART_CORTEX_X925 0xD85 #define ARM_CPU_PART_CORTEX_X925 0xD85
#define ARM_CPU_PART_CORTEX_A725 0xD87 #define ARM_CPU_PART_CORTEX_A725 0xD87
#define ARM_CPU_PART_NEOVERSE_N3 0xD8E
#define APM_CPU_PART_XGENE 0x000 #define APM_CPU_PART_XGENE 0x000
#define APM_CPU_VAR_POTENZA 0x00 #define APM_CPU_VAR_POTENZA 0x00
@ -176,6 +177,7 @@
#define MIDR_NEOVERSE_V3 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V3) #define MIDR_NEOVERSE_V3 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V3)
#define MIDR_CORTEX_X925 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X925) #define MIDR_CORTEX_X925 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X925)
#define MIDR_CORTEX_A725 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A725) #define MIDR_CORTEX_A725 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A725)
#define MIDR_NEOVERSE_N3 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_N3)
#define MIDR_THUNDERX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX) #define MIDR_THUNDERX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX)
#define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX) #define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX)
#define MIDR_THUNDERX_83XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_83XX) #define MIDR_THUNDERX_83XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_83XX)

View file

@ -36,6 +36,20 @@
#define EFER_FFXSR (1<<_EFER_FFXSR) #define EFER_FFXSR (1<<_EFER_FFXSR)
#define EFER_AUTOIBRS (1<<_EFER_AUTOIBRS) #define EFER_AUTOIBRS (1<<_EFER_AUTOIBRS)
/*
* Architectural memory types that are common to MTRRs, PAT, VMX MSRs, etc.
* Most MSRs support/allow only a subset of memory types, but the values
* themselves are common across all relevant MSRs.
*/
#define X86_MEMTYPE_UC 0ull /* Uncacheable, a.k.a. Strong Uncacheable */
#define X86_MEMTYPE_WC 1ull /* Write Combining */
/* RESERVED 2 */
/* RESERVED 3 */
#define X86_MEMTYPE_WT 4ull /* Write Through */
#define X86_MEMTYPE_WP 5ull /* Write Protected */
#define X86_MEMTYPE_WB 6ull /* Write Back */
#define X86_MEMTYPE_UC_MINUS 7ull /* Weak Uncacheabled (PAT only) */
/* FRED MSRs */ /* FRED MSRs */
#define MSR_IA32_FRED_RSP0 0x1cc /* Level 0 stack pointer */ #define MSR_IA32_FRED_RSP0 0x1cc /* Level 0 stack pointer */
#define MSR_IA32_FRED_RSP1 0x1cd /* Level 1 stack pointer */ #define MSR_IA32_FRED_RSP1 0x1cd /* Level 1 stack pointer */
@ -365,6 +379,12 @@
#define MSR_IA32_CR_PAT 0x00000277 #define MSR_IA32_CR_PAT 0x00000277
#define PAT_VALUE(p0, p1, p2, p3, p4, p5, p6, p7) \
((X86_MEMTYPE_ ## p0) | (X86_MEMTYPE_ ## p1 << 8) | \
(X86_MEMTYPE_ ## p2 << 16) | (X86_MEMTYPE_ ## p3 << 24) | \
(X86_MEMTYPE_ ## p4 << 32) | (X86_MEMTYPE_ ## p5 << 40) | \
(X86_MEMTYPE_ ## p6 << 48) | (X86_MEMTYPE_ ## p7 << 56))
#define MSR_IA32_DEBUGCTLMSR 0x000001d9 #define MSR_IA32_DEBUGCTLMSR 0x000001d9
#define MSR_IA32_LASTBRANCHFROMIP 0x000001db #define MSR_IA32_LASTBRANCHFROMIP 0x000001db
#define MSR_IA32_LASTBRANCHTOIP 0x000001dc #define MSR_IA32_LASTBRANCHTOIP 0x000001dc
@ -1159,15 +1179,6 @@
#define MSR_IA32_VMX_VMFUNC 0x00000491 #define MSR_IA32_VMX_VMFUNC 0x00000491
#define MSR_IA32_VMX_PROCBASED_CTLS3 0x00000492 #define MSR_IA32_VMX_PROCBASED_CTLS3 0x00000492
/* VMX_BASIC bits and bitmasks */
#define VMX_BASIC_VMCS_SIZE_SHIFT 32
#define VMX_BASIC_TRUE_CTLS (1ULL << 55)
#define VMX_BASIC_64 0x0001000000000000LLU
#define VMX_BASIC_MEM_TYPE_SHIFT 50
#define VMX_BASIC_MEM_TYPE_MASK 0x003c000000000000LLU
#define VMX_BASIC_MEM_TYPE_WB 6LLU
#define VMX_BASIC_INOUT 0x0040000000000000LLU
/* Resctrl MSRs: */ /* Resctrl MSRs: */
/* - Intel: */ /* - Intel: */
#define MSR_IA32_L3_QOS_CFG 0xc81 #define MSR_IA32_L3_QOS_CFG 0xc81
@ -1185,11 +1196,6 @@
#define MSR_IA32_SMBA_BW_BASE 0xc0000280 #define MSR_IA32_SMBA_BW_BASE 0xc0000280
#define MSR_IA32_EVT_CFG_BASE 0xc0000400 #define MSR_IA32_EVT_CFG_BASE 0xc0000400
/* MSR_IA32_VMX_MISC bits */
#define MSR_IA32_VMX_MISC_INTEL_PT (1ULL << 14)
#define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29)
#define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE 0x1F
/* AMD-V MSRs */ /* AMD-V MSRs */
#define MSR_VM_CR 0xc0010114 #define MSR_VM_CR 0xc0010114
#define MSR_VM_IGNNE 0xc0010115 #define MSR_VM_IGNNE 0xc0010115
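To make the new PAT_VALUE() helper above concrete, here is a stand-alone sketch of its expansion (the entry layout chosen is arbitrary, not necessarily the kernel's default PAT programming): each of the eight entries lands in one byte of the 64-bit MSR value, entry 0 in the lowest byte.

```c
#include <stdio.h>
#include <stdint.h>

#define X86_MEMTYPE_UC		0ull	/* Uncacheable */
#define X86_MEMTYPE_WC		1ull	/* Write Combining */
#define X86_MEMTYPE_WT		4ull	/* Write Through */
#define X86_MEMTYPE_WP		5ull	/* Write Protected */
#define X86_MEMTYPE_WB		6ull	/* Write Back */
#define X86_MEMTYPE_UC_MINUS	7ull	/* Weak Uncacheable (PAT only) */

#define PAT_VALUE(p0, p1, p2, p3, p4, p5, p6, p7)			\
	((X86_MEMTYPE_ ## p0)       | (X86_MEMTYPE_ ## p1 << 8)  |	\
	 (X86_MEMTYPE_ ## p2 << 16) | (X86_MEMTYPE_ ## p3 << 24) |	\
	 (X86_MEMTYPE_ ## p4 << 32) | (X86_MEMTYPE_ ## p5 << 40) |	\
	 (X86_MEMTYPE_ ## p6 << 48) | (X86_MEMTYPE_ ## p7 << 56))

int main(void)
{
	/* Arbitrary example layout, one memory type per PAT entry/byte. */
	uint64_t pat = PAT_VALUE(WB, WT, UC_MINUS, UC, WB, WT, UC_MINUS, UC);

	printf("PAT MSR value: 0x%016llx\n", (unsigned long long)pat);
	/* Prints 0x0007040600070406: WB=6, WT=4, UC-=7, UC=0, repeated. */
	return 0;
}
```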

View file

@ -439,6 +439,7 @@ struct kvm_sync_regs {
#define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4) #define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4)
#define KVM_X86_QUIRK_FIX_HYPERCALL_INSN (1 << 5) #define KVM_X86_QUIRK_FIX_HYPERCALL_INSN (1 << 5)
#define KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS (1 << 6) #define KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS (1 << 6)
#define KVM_X86_QUIRK_SLOT_ZAP_ALL (1 << 7)
#define KVM_STATE_NESTED_FORMAT_VMX 0 #define KVM_STATE_NESTED_FORMAT_VMX 0
#define KVM_STATE_NESTED_FORMAT_SVM 1 #define KVM_STATE_NESTED_FORMAT_SVM 1

View file

@ -11,6 +11,9 @@
#ifndef __NR_getpgid #ifndef __NR_getpgid
#define __NR_getpgid 132 #define __NR_getpgid 132
#endif #endif
#ifndef __NR_capget
#define __NR_capget 184
#endif
#ifndef __NR_gettid #ifndef __NR_gettid
#define __NR_gettid 224 #define __NR_gettid 224
#endif #endif

View file

@ -11,6 +11,9 @@
#ifndef __NR_getpgid #ifndef __NR_getpgid
#define __NR_getpgid 121 #define __NR_getpgid 121
#endif #endif
#ifndef __NR_capget
#define __NR_capget 125
#endif
#ifndef __NR_gettid #ifndef __NR_gettid
#define __NR_gettid 186 #define __NR_gettid 186
#endif #endif

View file

@ -36,4 +36,19 @@
#define GENMASK_ULL(h, l) \ #define GENMASK_ULL(h, l) \
(GENMASK_INPUT_CHECK(h, l) + __GENMASK_ULL(h, l)) (GENMASK_INPUT_CHECK(h, l) + __GENMASK_ULL(h, l))
#if !defined(__ASSEMBLY__)
/*
* Missing asm support
*
* __GENMASK_U128() depends on _BIT128() which would not work
* in the asm code, as it shifts an 'unsigned __int128' data
* type instead of direct representation of 128 bit constants
* such as long and unsigned long. The fundamental problem is
* that a 128 bit constant will get silently truncated by the
* gcc compiler.
*/
#define GENMASK_U128(h, l) \
(GENMASK_INPUT_CHECK(h, l) + __GENMASK_U128(h, l))
#endif
#endif /* __LINUX_BITS_H */ #endif /* __LINUX_BITS_H */

View file

@ -9,16 +9,7 @@
#pragma GCC diagnostic push #pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wpacked" #pragma GCC diagnostic ignored "-Wpacked"
#pragma GCC diagnostic ignored "-Wattributes" #pragma GCC diagnostic ignored "-Wattributes"
#include <vdso/unaligned.h>
#define __get_unaligned_t(type, ptr) ({ \
const struct { type x; } __packed *__pptr = (typeof(__pptr))(ptr); \
__pptr->x; \
})
#define __put_unaligned_t(type, val, ptr) do { \
struct { type x; } __packed *__pptr = (typeof(__pptr))(ptr); \
__pptr->x = (val); \
} while (0)
#define get_unaligned(ptr) __get_unaligned_t(typeof(*(ptr)), (ptr)) #define get_unaligned(ptr) __get_unaligned_t(typeof(*(ptr)), (ptr))
#define put_unaligned(val, ptr) __put_unaligned_t(typeof(*(ptr)), (val), (ptr)) #define put_unaligned(val, ptr) __put_unaligned_t(typeof(*(ptr)), (val), (ptr))

View file

@ -12,4 +12,7 @@
(((~_ULL(0)) - (_ULL(1) << (l)) + 1) & \ (((~_ULL(0)) - (_ULL(1) << (l)) + 1) & \
(~_ULL(0) >> (__BITS_PER_LONG_LONG - 1 - (h)))) (~_ULL(0) >> (__BITS_PER_LONG_LONG - 1 - (h))))
#define __GENMASK_U128(h, l) \
((_BIT128((h)) << 1) - (_BIT128(l)))
#endif /* _UAPI_LINUX_BITS_H */ #endif /* _UAPI_LINUX_BITS_H */

View file

@ -28,6 +28,23 @@
#define _BITUL(x) (_UL(1) << (x)) #define _BITUL(x) (_UL(1) << (x))
#define _BITULL(x) (_ULL(1) << (x)) #define _BITULL(x) (_ULL(1) << (x))
#if !defined(__ASSEMBLY__)
/*
* Missing asm support
*
* __BIT128() would not work in the asm code, as it shifts an
* 'unsigned __int128' data type as direct representation of
* 128 bit constants is not supported in the gcc compiler, as
* they get silently truncated.
*
* TODO: Please revisit this implementation when gcc compiler
* starts representing 128 bit constants directly like long
* and unsigned long etc. Subsequently drop the comment for
* GENMASK_U128() which would then start supporting asm code.
*/
#define _BIT128(x) ((unsigned __int128)(1) << (x))
#endif
#define __ALIGN_KERNEL(x, a) __ALIGN_KERNEL_MASK(x, (__typeof__(x))(a) - 1) #define __ALIGN_KERNEL(x, a) __ALIGN_KERNEL_MASK(x, (__typeof__(x))(a) - 1)
#define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask)) #define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask))
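A short user-space sketch of the new 128-bit mask helpers added above (editor's illustration; GENMASK_INPUT_CHECK() is left out, so only the raw _BIT128()/__GENMASK_U128() arithmetic is exercised):

```c
#include <stdio.h>

#define _BIT128(x)		((unsigned __int128)(1) << (x))
#define __GENMASK_U128(h, l)	((_BIT128((h)) << 1) - (_BIT128(l)))

int main(void)
{
	/* Bits 127..64 set, bits 63..0 clear. */
	unsigned __int128 m = __GENMASK_U128(127, 64);

	printf("high 64 bits: 0x%016llx\n", (unsigned long long)(m >> 64));
	printf("low 64 bits:  0x%016llx\n", (unsigned long long)m);
	return 0;
}
```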

View file

@ -0,0 +1,15 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __VDSO_UNALIGNED_H
#define __VDSO_UNALIGNED_H
#define __get_unaligned_t(type, ptr) ({ \
const struct { type x; } __packed *__pptr = (typeof(__pptr))(ptr); \
__pptr->x; \
})
#define __put_unaligned_t(type, val, ptr) do { \
struct { type x; } __packed *__pptr = (typeof(__pptr))(ptr); \
__pptr->x = (val); \
} while (0)
#endif /* __VDSO_UNALIGNED_H */
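For context on the helpers that moved into vdso/unaligned.h, a minimal user-space sketch of __get_unaligned_t() (the kernel wraps it as get_unaligned(); the small main() here is purely illustrative):

```c
#include <stdio.h>
#include <stdint.h>

#define __packed __attribute__((packed))

#define __get_unaligned_t(type, ptr) ({					\
	const struct { type x; } __packed *__pptr = (typeof(__pptr))(ptr);	\
	__pptr->x;								\
})

int main(void)
{
	unsigned char buf[8] = { 0x78, 0x56, 0x34, 0x12, 0x00, 0x00, 0x00, 0x00 };

	/* Read a 32-bit value starting at an odd (unaligned) address. */
	uint32_t v = __get_unaligned_t(uint32_t, buf + 1);

	printf("0x%08x\n", v);	/* 0x00123456 on a little-endian host */
	return 0;
}
```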

View file

@ -704,8 +704,8 @@ ifeq ($(BUILD_BPF_SKEL),1)
BUILD_BPF_SKEL := 0 BUILD_BPF_SKEL := 0
else else
CLANG_VERSION := $(shell $(CLANG) --version | head -1 | sed 's/.*clang version \([[:digit:]]\+.[[:digit:]]\+.[[:digit:]]\+\).*/\1/g') CLANG_VERSION := $(shell $(CLANG) --version | head -1 | sed 's/.*clang version \([[:digit:]]\+.[[:digit:]]\+.[[:digit:]]\+\).*/\1/g')
ifeq ($(call version-lt3,$(CLANG_VERSION),16.0.6),1) ifeq ($(call version-lt3,$(CLANG_VERSION),12.0.1),1)
$(warning Warning: Disabled BPF skeletons as at least $(CLANG) version 16.0.6 is reported to be a working setup with the current of BPF based perf features) $(warning Warning: Disabled BPF skeletons as reliable BTF generation needs at least $(CLANG) version 12.0.1)
BUILD_BPF_SKEL := 0 BUILD_BPF_SKEL := 0
endif endif
endif endif

View file

@ -1399,7 +1399,7 @@ static const struct syscall_fmt syscall_fmts[] = {
.arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, }, .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
{ .name = "waitid", .errpid = true, { .name = "waitid", .errpid = true,
.arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, }, .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
{ .name = "write", .errpid = true, { .name = "write",
.arg = { [1] = { .scnprintf = SCA_BUF /* buf */, .from_user = true, }, }, }, .arg = { [1] = { .scnprintf = SCA_BUF /* buf */, .from_user = true, }, }, },
}; };

View file

@ -22,6 +22,7 @@ FILES=(
"include/vdso/bits.h" "include/vdso/bits.h"
"include/linux/const.h" "include/linux/const.h"
"include/vdso/const.h" "include/vdso/const.h"
"include/vdso/unaligned.h"
"include/linux/hash.h" "include/linux/hash.h"
"include/linux/list-sort.h" "include/linux/list-sort.h"
"include/uapi/linux/hw_breakpoint.h" "include/uapi/linux/hw_breakpoint.h"

View file

@@ -19,35 +19,74 @@
 TEST_RESULT=0
 # skip if not supported
-BLACKFUNC=`head -n 1 /sys/kernel/debug/kprobes/blacklist 2> /dev/null | cut -f2`
-if [ -z "$BLACKFUNC" ]; then
+BLACKFUNC_LIST=`head -n 5 /sys/kernel/debug/kprobes/blacklist 2> /dev/null | cut -f2`
+if [ -z "$BLACKFUNC_LIST" ]; then
 	print_overall_skipped
 	exit 0
 fi
+# try to find vmlinux with DWARF debug info
+VMLINUX_FILE=$(perf probe -v random_probe |& grep "Using.*for symbols" | sed -r 's/^Using (.*) for symbols$/\1/')
 # remove all previously added probes
 clear_all_probes
 ### adding blacklisted function
-# functions from blacklist should be skipped by perf probe
-! $CMD_PERF probe $BLACKFUNC > $LOGS_DIR/adding_blacklisted.log 2> $LOGS_DIR/adding_blacklisted.err
-PERF_EXIT_CODE=$?
 REGEX_SCOPE_FAIL="Failed to find scope of probe point"
 REGEX_SKIP_MESSAGE=" is blacklisted function, skip it\."
-REGEX_NOT_FOUND_MESSAGE="Probe point \'$BLACKFUNC\' not found."
+REGEX_NOT_FOUND_MESSAGE="Probe point \'$RE_EVENT\' not found."
 REGEX_ERROR_MESSAGE="Error: Failed to add events."
 REGEX_INVALID_ARGUMENT="Failed to write event: Invalid argument"
 REGEX_SYMBOL_FAIL="Failed to find symbol at $RE_ADDRESS"
-REGEX_OUT_SECTION="$BLACKFUNC is out of \.\w+, skip it"
-../common/check_all_lines_matched.pl "$REGEX_SKIP_MESSAGE" "$REGEX_NOT_FOUND_MESSAGE" "$REGEX_ERROR_MESSAGE" "$REGEX_SCOPE_FAIL" "$REGEX_INVALID_ARGUMENT" "$REGEX_SYMBOL_FAIL" "$REGEX_OUT_SECTION" < $LOGS_DIR/adding_blacklisted.err
-CHECK_EXIT_CODE=$?
-print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "adding blacklisted function $BLACKFUNC"
-(( TEST_RESULT += $? ))
+REGEX_OUT_SECTION="$RE_EVENT is out of \.\w+, skip it"
+REGEX_MISSING_DECL_LINE="A function DIE doesn't have decl_line. Maybe broken DWARF?"
+BLACKFUNC=""
+SKIP_DWARF=0
+for BLACKFUNC in $BLACKFUNC_LIST; do
+	echo "Probing $BLACKFUNC"
+	# functions from blacklist should be skipped by perf probe
+	! $CMD_PERF probe $BLACKFUNC > $LOGS_DIR/adding_blacklisted.log 2> $LOGS_DIR/adding_blacklisted.err
+	PERF_EXIT_CODE=$?
+	# check for bad DWARF polluting the result
+	../common/check_all_patterns_found.pl "$REGEX_MISSING_DECL_LINE" >/dev/null < $LOGS_DIR/adding_blacklisted.err
+	if [ $? -eq 0 ]; then
+		SKIP_DWARF=1
+		echo "Result polluted by broken DWARF, trying another probe"
+		# confirm that the broken DWARF comes from assembler
+		if [ -n "$VMLINUX_FILE" ]; then
+			readelf -wi "$VMLINUX_FILE" |
+			awk -v probe="$BLACKFUNC" '/DW_AT_language/ { comp_lang = $0 }
+				$0 ~ probe { if (comp_lang) { print comp_lang }; exit }' |
+			grep -q "MIPS assembler"
+			CHECK_EXIT_CODE=$?
+			if [ $CHECK_EXIT_CODE -ne 0 ]; then
+				SKIP_DWARF=0 # broken DWARF while available
+				break
+			fi
+		fi
+	else
+		../common/check_all_lines_matched.pl "$REGEX_SKIP_MESSAGE" "$REGEX_NOT_FOUND_MESSAGE" "$REGEX_ERROR_MESSAGE" "$REGEX_SCOPE_FAIL" "$REGEX_INVALID_ARGUMENT" "$REGEX_SYMBOL_FAIL" "$REGEX_OUT_SECTION" < $LOGS_DIR/adding_blacklisted.err
+		CHECK_EXIT_CODE=$?
+		SKIP_DWARF=0
+		break
+	fi
+done
+if [ $SKIP_DWARF -eq 1 ]; then
+	print_testcase_skipped "adding blacklisted function $BLACKFUNC"
+else
+	print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "adding blacklisted function $BLACKFUNC"
+	(( TEST_RESULT += $? ))
+fi
 ### listing not-added probe


@@ -288,6 +288,10 @@ int sys_enter_rename(struct syscall_enter_args *args)
 	augmented_args->arg.size = PERF_ALIGN(oldpath_len + 1, sizeof(u64));
 	len += augmented_args->arg.size;
+	/* Every read from userspace is limited to value size */
+	if (augmented_args->arg.size > sizeof(augmented_args->arg.value))
+		return 1; /* Failure: don't filter */
 	struct augmented_arg *arg2 = (void *)&augmented_args->arg.value + augmented_args->arg.size;
 	newpath_len = augmented_arg__read_str(arg2, newpath_arg, sizeof(augmented_args->arg.value));
@@ -315,6 +319,10 @@ int sys_enter_renameat2(struct syscall_enter_args *args)
 	augmented_args->arg.size = PERF_ALIGN(oldpath_len + 1, sizeof(u64));
 	len += augmented_args->arg.size;
+	/* Every read from userspace is limited to value size */
+	if (augmented_args->arg.size > sizeof(augmented_args->arg.value))
+		return 1; /* Failure: don't filter */
 	struct augmented_arg *arg2 = (void *)&augmented_args->arg.value + augmented_args->arg.size;
 	newpath_len = augmented_arg__read_str(arg2, newpath_arg, sizeof(augmented_args->arg.value));
@@ -423,8 +431,9 @@ static bool pid_filter__has(struct pids_filtered *pids, pid_t pid)
 static int augment_sys_enter(void *ctx, struct syscall_enter_args *args)
 {
 	bool augmented, do_output = false;
-	int zero = 0, size, aug_size, index, output = 0,
+	int zero = 0, size, aug_size, index,
 	    value_size = sizeof(struct augmented_arg) - offsetof(struct augmented_arg, value);
+	u64 output = 0; /* has to be u64, otherwise it won't pass the verifier */
 	unsigned int nr, *beauty_map;
 	struct beauty_payload_enter *payload;
 	void *arg, *payload_offset;
@@ -477,6 +486,8 @@ static int augment_sys_enter(void *ctx, struct syscall_enter_args *args)
 			augmented = true;
 		} else if (size < 0 && size >= -6) { /* buffer */
 			index = -(size + 1);
+			barrier_var(index); // Prevent clang (noticed with v18) from removing the &= 7 trick.
+			index &= 7; // Satisfy the bounds checking with the verifier in some kernels.
 			aug_size = args->args[index];
 			if (aug_size > TRACE_AUG_MAX_BUF)
@@ -488,10 +499,17 @@ static int augment_sys_enter(void *ctx, struct syscall_enter_args *args)
 			}
 		}
+		/* Augmented data size is limited to sizeof(augmented_arg->unnamed union with value field) */
+		if (aug_size > value_size)
+			aug_size = value_size;
 		/* write data to payload */
 		if (augmented) {
 			int written = offsetof(struct augmented_arg, value) + aug_size;
+			if (written < 0 || written > sizeof(struct augmented_arg))
+				return 1;
 			((struct augmented_arg *)payload_offset)->size = aug_size;
 			output += written;
 			payload_offset += written;
@@ -499,7 +517,7 @@ static int augment_sys_enter(void *ctx, struct syscall_enter_args *args)
 		}
 	}
-	if (!do_output)
+	if (!do_output || (sizeof(struct syscall_enter_args) + output) > sizeof(struct beauty_payload_enter))
 		return 1;
 	return augmented__beauty_output(ctx, payload, sizeof(struct syscall_enter_args) + output);
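
The additions above all follow one pattern: any length that feeds a copy from user space has to be clamped to a bound the BPF verifier can prove before the program will load on stricter kernels. A stand-alone sketch of that clamp, with hypothetical struct and program names and assuming an ordinary clang -target bpf / libbpf build (this is not perf's actual code):

/* Hypothetical sketch: bound a user-controlled length before copying so the
 * verifier can prove the write stays inside the destination buffer. */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

char _license[] SEC("license") = "GPL";

struct payload {
	__u32 size;
	char value[256];	/* power-of-two size so the mask below works */
};

static int copy_user_buf(struct payload *p, const void *user_ptr, __u32 len)
{
	len &= sizeof(p->value) - 1;	/* mask trick, like the "index &= 7" above */
	if (len > sizeof(p->value))	/* explicit clamp, like the aug_size check */
		len = sizeof(p->value);
	p->size = len;
	return bpf_probe_read_user(p->value, len, user_ptr);
}

SEC("raw_tracepoint/sys_enter")
int sketch_prog(void *ctx)
{
	struct payload p = {};

	/* NULL source only to keep the sketch self-contained; a real program
	 * would pass a pointer taken from the syscall arguments. */
	return copy_user_buf(&p, NULL, 300) ? 1 : 0;
}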


@@ -7,13 +7,9 @@
 #include "debug.h"
 #include <errno.h>
 #include <string.h>
+#include <unistd.h>
 #include <linux/capability.h>
 #include <sys/syscall.h>
-#include <unistd.h>
-#ifndef SYS_capget
-#define SYS_capget 90
-#endif
 #define MAX_LINUX_CAPABILITY_U32S _LINUX_CAPABILITY_U32S_3
@@ -21,9 +17,9 @@ bool perf_cap__capable(int cap, bool *used_root)
 {
 	struct __user_cap_header_struct header = {
 		.version = _LINUX_CAPABILITY_VERSION_3,
-		.pid = getpid(),
+		.pid = 0,
 	};
-	struct __user_cap_data_struct data[MAX_LINUX_CAPABILITY_U32S];
+	struct __user_cap_data_struct data[MAX_LINUX_CAPABILITY_U32S] = {};
 	__u32 cap_val;
 	*used_root = false;
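
The body change is easier to see outside of perf: with .pid = 0, capget(2) reports on the calling thread, so getpid() is no longer needed, and zero-initializing the data array avoids reading uninitialized memory if the syscall fails. A stand-alone, hedged sketch of the same raw-syscall probe (illustrative names, not perf_cap__capable() itself):

/* Hypothetical sketch: query the caller's effective capabilities via the raw
 * capget syscall; pid = 0 means "the calling thread". */
#include <stdbool.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/capability.h>

static bool has_cap(int cap)
{
	struct __user_cap_header_struct header = {
		.version = _LINUX_CAPABILITY_VERSION_3,
		.pid = 0,
	};
	struct __user_cap_data_struct data[_LINUX_CAPABILITY_U32S_3] = {};

	if (syscall(SYS_capget, &header, data) == -1)
		return false;
	return data[CAP_TO_INDEX(cap)].effective & CAP_TO_MASK(cap);
}

int main(void)
{
	printf("CAP_SYS_ADMIN: %d\n", has_cap(CAP_SYS_ADMIN));
	return 0;
}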


@@ -19,6 +19,7 @@
 #include "util/bpf-filter.h"
 #include "util/env.h"
 #include "util/kvm-stat.h"
+#include "util/stat.h"
 #include "util/kwork.h"
 #include "util/sample.h"
 #include "util/lock-contention.h"
@@ -1355,6 +1356,7 @@ PyMODINIT_FUNC PyInit_perf(void)
 unsigned int scripting_max_stack = PERF_MAX_STACK_DEPTH;
+#ifdef HAVE_KVM_STAT_SUPPORT
 bool kvm_entry_event(struct evsel *evsel __maybe_unused)
 {
 	return false;
@@ -1384,6 +1386,7 @@ void exit_event_decode_key(struct perf_kvm_stat *kvm __maybe_unused,
 			   char *decode __maybe_unused)
 {
 }
+#endif // HAVE_KVM_STAT_SUPPORT
 int find_scripts(char **scripts_array __maybe_unused, char **scripts_path_array __maybe_unused,
 		 int num __maybe_unused, int pathlen __maybe_unused)


@@ -46,6 +46,11 @@ static const char *const *syscalltbl_native = syscalltbl_mips_n64;
 #include <asm/syscalls.c>
 const int syscalltbl_native_max_id = SYSCALLTBL_LOONGARCH_MAX_ID;
 static const char *const *syscalltbl_native = syscalltbl_loongarch;
+#else
+const int syscalltbl_native_max_id = 0;
+static const char *const syscalltbl_native[] = {
+	[0] = "unknown",
+};
 #endif
 struct syscall {
@@ -182,6 +187,11 @@ int syscalltbl__id(struct syscalltbl *tbl, const char *name)
 	return audit_name_to_syscall(name, tbl->audit_machine);
 }
+int syscalltbl__id_at_idx(struct syscalltbl *tbl __maybe_unused, int idx)
+{
+	return idx;
+}
 int syscalltbl__strglobmatch_next(struct syscalltbl *tbl __maybe_unused,
 				  const char *syscall_glob __maybe_unused, int *idx __maybe_unused)
 {
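
The fallback block gives architectures without a generated syscall table something to compile and link against: id-to-name lookups resolve to "unknown" while name-to-id lookups fall back to libaudit. A stand-alone sketch of how such a one-entry table behaves (illustrative helper names, not the perf syscalltbl API):

/* Hypothetical sketch: a one-entry fallback table, so id -> name lookups
 * always yield "unknown" and name -> id lookups fail into another path. */
#include <stdio.h>
#include <string.h>

static const int syscalltbl_native_max_id = 0;
static const char *const syscalltbl_native[] = { [0] = "unknown" };

static const char *syscall_name(int id)
{
	return (id >= 0 && id <= syscalltbl_native_max_id) ? syscalltbl_native[id] : "?";
}

static int syscall_id(const char *name)
{
	for (int i = 0; i <= syscalltbl_native_max_id; i++)
		if (!strcmp(syscalltbl_native[i], name))
			return i;
	return -1; /* perf would fall back to audit_name_to_syscall() here */
}

int main(void)
{
	printf("id 0 -> %s, \"openat\" -> %d\n", syscall_name(0), syscall_id("openat"));
	return 0;
}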


@@ -320,7 +320,7 @@ u32 bpf_cpumask_weight(const struct cpumask *cpumask) __ksym;
 /*
  * Access a cpumask in read-only mode (typically to check bits).
  */
-const struct cpumask *cast_mask(struct bpf_cpumask *mask)
+static __always_inline const struct cpumask *cast_mask(struct bpf_cpumask *mask)
 {
 	return (const struct cpumask *)mask;
 }
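
This header is included by every test scheduler, so the helper is better off as static __always_inline: each including BPF object then just inlines its own copy where it is used, instead of carrying a separate externally visible function. A reduced, hypothetical header sketch of the pattern (a stand-in, not the real common.bpf.h):

/* Hypothetical sketch: keep shared BPF-header helpers static __always_inline
 * so they stay header-only in every object that includes them. */
#ifndef COMMON_SKETCH_BPF_H
#define COMMON_SKETCH_BPF_H

#ifndef __always_inline
#define __always_inline inline __attribute__((always_inline))
#endif

struct bpf_cpumask;	/* opaque kernel-side types, declarations only */
struct cpumask;

/* Read-only view of a bpf_cpumask, usable where a plain cpumask is expected. */
static __always_inline const struct cpumask *cast_mask(struct bpf_cpumask *mask)
{
	return (const struct cpumask *)mask;
}

#endif /* COMMON_SKETCH_BPF_H */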


@@ -18,7 +18,7 @@ bool test_uffdio_wp = true;
 unsigned long long *count_verify;
 uffd_test_ops_t *uffd_test_ops;
 uffd_test_case_ops_t *uffd_test_case_ops;
-pthread_barrier_t ready_for_fork;
+atomic_bool ready_for_fork;
 static int uffd_mem_fd_create(off_t mem_size, bool hugetlb)
 {
@@ -519,8 +519,7 @@ void *uffd_poll_thread(void *arg)
 	pollfd[1].fd = pipefd[cpu*2];
 	pollfd[1].events = POLLIN;
-	/* Ready for parent thread to fork */
-	pthread_barrier_wait(&ready_for_fork);
+	ready_for_fork = true;
 	for (;;) {
 		ret = poll(pollfd, 2, -1);


@@ -33,6 +33,7 @@
 #include <inttypes.h>
 #include <stdint.h>
 #include <sys/random.h>
+#include <stdatomic.h>
 #include "../kselftest.h"
 #include "vm_util.h"
@@ -104,7 +105,7 @@ extern bool map_shared;
 extern bool test_uffdio_wp;
 extern unsigned long long *count_verify;
 extern volatile bool test_uffdio_copy_eexist;
-extern pthread_barrier_t ready_for_fork;
+extern atomic_bool ready_for_fork;
 extern uffd_test_ops_t anon_uffd_test_ops;
 extern uffd_test_ops_t shmem_uffd_test_ops;


@@ -241,8 +241,7 @@ static void *fork_event_consumer(void *data)
 	fork_event_args *args = data;
 	struct uffd_msg msg = { 0 };
-	/* Ready for parent thread to fork */
-	pthread_barrier_wait(&ready_for_fork);
+	ready_for_fork = true;
 	/* Read until a full msg received */
 	while (uffd_read_msg(args->parent_uffd, &msg));
@@ -311,12 +310,11 @@ static int pagemap_test_fork(int uffd, bool with_event, bool test_pin)
 	/* Prepare a thread to resolve EVENT_FORK */
 	if (with_event) {
-		pthread_barrier_init(&ready_for_fork, NULL, 2);
+		ready_for_fork = false;
 		if (pthread_create(&thread, NULL, fork_event_consumer, &args))
 			err("pthread_create()");
-		/* Wait for child thread to start before forking */
-		pthread_barrier_wait(&ready_for_fork);
-		pthread_barrier_destroy(&ready_for_fork);
+		while (!ready_for_fork)
+			; /* Wait for the poll_thread to start executing before forking */
 	}
 	child = fork();
@@ -781,7 +779,7 @@ static void uffd_sigbus_test_common(bool wp)
 	char c;
 	struct uffd_args args = { 0 };
-	pthread_barrier_init(&ready_for_fork, NULL, 2);
+	ready_for_fork = false;
 	fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
@@ -798,9 +796,8 @@ static void uffd_sigbus_test_common(bool wp)
 	if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args))
 		err("uffd_poll_thread create");
-	/* Wait for child thread to start before forking */
-	pthread_barrier_wait(&ready_for_fork);
-	pthread_barrier_destroy(&ready_for_fork);
+	while (!ready_for_fork)
+		; /* Wait for the poll_thread to start executing before forking */
 	pid = fork();
 	if (pid < 0)
@@ -841,7 +838,7 @@ static void uffd_events_test_common(bool wp)
 	char c;
 	struct uffd_args args = { 0 };
-	pthread_barrier_init(&ready_for_fork, NULL, 2);
+	ready_for_fork = false;
 	fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
 	if (uffd_register(uffd, area_dst, nr_pages * page_size,
@@ -852,9 +849,8 @@ static void uffd_events_test_common(bool wp)
 	if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args))
 		err("uffd_poll_thread create");
-	/* Wait for child thread to start before forking */
-	pthread_barrier_wait(&ready_for_fork);
-	pthread_barrier_destroy(&ready_for_fork);
+	while (!ready_for_fork)
+		; /* Wait for the poll_thread to start executing before forking */
 	pid = fork();
 	if (pid < 0)
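
All of the converted call sites above reduce to the same handshake: the helper thread publishes a flag once it is running, and the parent spins on that flag before calling fork(), so no pthread barrier object has to be shared across the fork. A stand-alone sketch of the handshake with C11 atomics (illustrative names, build with -pthread; this is not the selftest itself):

/* Hypothetical sketch: replace a pthread_barrier_t handshake with a C11
 * atomic flag; the parent busy-waits until the worker signals readiness. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool ready_for_fork;

static void *worker(void *arg)
{
	(void)arg;
	/* ... set up poll fds, uffd registration, etc. ... */
	ready_for_fork = true;	/* tell the parent it is now safe to fork() */
	/* ... event loop would run here ... */
	return NULL;
}

int main(void)
{
	pthread_t t;

	ready_for_fork = false;
	if (pthread_create(&t, NULL, worker, NULL))
		return 1;
	while (!ready_for_fork)
		;	/* spin until the worker has started executing */
	/* fork() would be safe here: no barrier state is held across it */
	pthread_join(t, NULL);
	printf("worker started\n");
	return 0;
}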


@@ -184,7 +184,7 @@ auto-test-targets := \
 testcase-targets := $(addsuffix .o,$(addprefix $(SCXOBJ_DIR)/,$(auto-test-targets)))
-$(SCXOBJ_DIR)/runner.o: runner.c | $(SCXOBJ_DIR)
+$(SCXOBJ_DIR)/runner.o: runner.c | $(SCXOBJ_DIR) $(BPFOBJ)
 	$(CC) $(CFLAGS) -c $< -o $@
 # Create all of the test targets object files, whose testcase objects will be


@@ -51,8 +51,8 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(create_dsq_init)
 SEC(".struct_ops.link")
 struct sched_ext_ops create_dsq_ops = {
-	.init_task = create_dsq_init_task,
-	.exit_task = create_dsq_exit_task,
-	.init = create_dsq_init,
+	.init_task = (void *) create_dsq_init_task,
+	.exit_task = (void *) create_dsq_exit_task,
+	.init = (void *) create_dsq_init,
 	.name = "create_dsq",
 };
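
This and the following testcases all gain the same (void *) casts. The BPF_STRUCT_OPS() macros expand each callback into a BPF program whose C prototype is a generic context-array wrapper, which no longer matches the member types in struct sched_ext_ops, and recent clang versions reject the mismatched initializers; casting through void * keeps the source clean while libbpf still wires the program into the struct_ops map by name. A reduced, hypothetical model of the mismatch (simplified stand-ins, not the real scx macros or ops struct):

/* Hypothetical, reduced model of why the casts are needed: the macro emits a
 * wrapper with a generic prototype that no longer matches the ops member. */
#include <stdio.h>

typedef unsigned long long u64;

struct sched_ext_ops_sketch {
	int (*init_task)(void *p, void *args);	/* the "real" callback type */
	const char *name;
};

/* Stand-in for BPF_STRUCT_OPS(): the emitted symbol takes a u64 ctx array. */
#define STRUCT_OPS_SKETCH(fn) int fn(u64 *ctx)

STRUCT_OPS_SKETCH(create_dsq_init_task)
{
	(void)ctx;
	return 0;
}

/* Without the (void *) cast this initializer triggers an
 * incompatible-function-pointer-type diagnostic on recent clang. */
static struct sched_ext_ops_sketch create_dsq_ops_sketch = {
	.init_task = (void *) create_dsq_init_task,
	.name = "create_dsq",
};

int main(void)
{
	printf("ops %s registered, init_task at %p\n",
	       create_dsq_ops_sketch.name, (void *)create_dsq_ops_sketch.init_task);
	return 0;
}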


@@ -35,8 +35,8 @@ void BPF_STRUCT_OPS(ddsp_bogus_dsq_fail_exit, struct scx_exit_info *ei)
 SEC(".struct_ops.link")
 struct sched_ext_ops ddsp_bogus_dsq_fail_ops = {
-	.select_cpu = ddsp_bogus_dsq_fail_select_cpu,
-	.exit = ddsp_bogus_dsq_fail_exit,
+	.select_cpu = (void *) ddsp_bogus_dsq_fail_select_cpu,
+	.exit = (void *) ddsp_bogus_dsq_fail_exit,
 	.name = "ddsp_bogus_dsq_fail",
 	.timeout_ms = 1000U,
 };


@@ -32,8 +32,8 @@ void BPF_STRUCT_OPS(ddsp_vtimelocal_fail_exit, struct scx_exit_info *ei)
 SEC(".struct_ops.link")
 struct sched_ext_ops ddsp_vtimelocal_fail_ops = {
-	.select_cpu = ddsp_vtimelocal_fail_select_cpu,
-	.exit = ddsp_vtimelocal_fail_exit,
+	.select_cpu = (void *) ddsp_vtimelocal_fail_select_cpu,
+	.exit = (void *) ddsp_vtimelocal_fail_exit,
 	.name = "ddsp_vtimelocal_fail",
 	.timeout_ms = 1000U,
 };


@@ -56,10 +56,10 @@ void BPF_STRUCT_OPS(dsp_local_on_exit, struct scx_exit_info *ei)
 SEC(".struct_ops.link")
 struct sched_ext_ops dsp_local_on_ops = {
-	.select_cpu = dsp_local_on_select_cpu,
-	.enqueue = dsp_local_on_enqueue,
-	.dispatch = dsp_local_on_dispatch,
-	.exit = dsp_local_on_exit,
+	.select_cpu = (void *) dsp_local_on_select_cpu,
+	.enqueue = (void *) dsp_local_on_enqueue,
+	.dispatch = (void *) dsp_local_on_dispatch,
+	.exit = (void *) dsp_local_on_exit,
 	.name = "dsp_local_on",
 	.timeout_ms = 1000U,
 };


@@ -12,10 +12,18 @@
 char _license[] SEC("license") = "GPL";
+u32 exit_kind;
+void BPF_STRUCT_OPS_SLEEPABLE(enq_last_no_enq_fails_exit, struct scx_exit_info *info)
+{
+	exit_kind = info->kind;
+}
 SEC(".struct_ops.link")
 struct sched_ext_ops enq_last_no_enq_fails_ops = {
 	.name = "enq_last_no_enq_fails",
 	/* Need to define ops.enqueue() with SCX_OPS_ENQ_LAST */
 	.flags = SCX_OPS_ENQ_LAST,
+	.exit = (void *) enq_last_no_enq_fails_exit,
 	.timeout_ms = 1000U,
 };


@@ -31,8 +31,12 @@ static enum scx_test_status run(void *ctx)
 	struct bpf_link *link;
 	link = bpf_map__attach_struct_ops(skel->maps.enq_last_no_enq_fails_ops);
-	if (link) {
-		SCX_ERR("Incorrectly succeeded in to attaching scheduler");
+	if (!link) {
+		SCX_ERR("Incorrectly failed at attaching scheduler");
+		return SCX_TEST_FAIL;
+	}
+	if (!skel->bss->exit_kind) {
+		SCX_ERR("Incorrectly stayed loaded");
 		return SCX_TEST_FAIL;
 	}
@@ -50,7 +54,7 @@ static void cleanup(void *ctx)
 struct scx_test enq_last_no_enq_fails = {
 	.name = "enq_last_no_enq_fails",
-	.description = "Verify we fail to load a scheduler if we specify "
+	.description = "Verify we eject a scheduler if we specify "
 		       "the SCX_OPS_ENQ_LAST flag without defining "
 		       "ops.enqueue()",
 	.setup = setup,


@@ -36,8 +36,8 @@ void BPF_STRUCT_OPS(enq_select_cpu_fails_enqueue, struct task_struct *p,
 SEC(".struct_ops.link")
 struct sched_ext_ops enq_select_cpu_fails_ops = {
-	.select_cpu = enq_select_cpu_fails_select_cpu,
-	.enqueue = enq_select_cpu_fails_enqueue,
+	.select_cpu = (void *) enq_select_cpu_fails_select_cpu,
+	.enqueue = (void *) enq_select_cpu_fails_enqueue,
 	.name = "enq_select_cpu_fails",
 	.timeout_ms = 1000U,
 };


@@ -15,6 +15,8 @@ UEI_DEFINE(uei);
 #define EXIT_CLEANLY() scx_bpf_exit(exit_point, "%d", exit_point)
+#define DSQ_ID 0
 s32 BPF_STRUCT_OPS(exit_select_cpu, struct task_struct *p,
 		   s32 prev_cpu, u64 wake_flags)
 {
@@ -31,7 +33,7 @@ void BPF_STRUCT_OPS(exit_enqueue, struct task_struct *p, u64 enq_flags)
 	if (exit_point == EXIT_ENQUEUE)
 		EXIT_CLEANLY();
-	scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
+	scx_bpf_dispatch(p, DSQ_ID, SCX_SLICE_DFL, enq_flags);
 }
 void BPF_STRUCT_OPS(exit_dispatch, s32 cpu, struct task_struct *p)
@@ -39,7 +41,7 @@ void BPF_STRUCT_OPS(exit_dispatch, s32 cpu, struct task_struct *p)
 	if (exit_point == EXIT_DISPATCH)
 		EXIT_CLEANLY();
-	scx_bpf_consume(SCX_DSQ_GLOBAL);
+	scx_bpf_consume(DSQ_ID);
 }
 void BPF_STRUCT_OPS(exit_enable, struct task_struct *p)
@@ -67,18 +69,18 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(exit_init)
 	if (exit_point == EXIT_INIT)
 		EXIT_CLEANLY();
-	return 0;
+	return scx_bpf_create_dsq(DSQ_ID, -1);
 }
 SEC(".struct_ops.link")
 struct sched_ext_ops exit_ops = {
-	.select_cpu = exit_select_cpu,
-	.enqueue = exit_enqueue,
-	.dispatch = exit_dispatch,
-	.init_task = exit_init_task,
-	.enable = exit_enable,
-	.exit = exit_exit,
-	.init = exit_init,
+	.select_cpu = (void *) exit_select_cpu,
+	.enqueue = (void *) exit_enqueue,
+	.dispatch = (void *) exit_dispatch,
+	.init_task = (void *) exit_init_task,
+	.enable = (void *) exit_enable,
+	.exit = (void *) exit_exit,
+	.init = (void *) exit_init,
 	.name = "exit",
 	.timeout_ms = 1000U,
 };


@@ -46,16 +46,16 @@ void BPF_STRUCT_OPS_SLEEPABLE(hotplug_cpu_offline, s32 cpu)
 SEC(".struct_ops.link")
 struct sched_ext_ops hotplug_cb_ops = {
-	.cpu_online = hotplug_cpu_online,
-	.cpu_offline = hotplug_cpu_offline,
-	.exit = hotplug_exit,
+	.cpu_online = (void *) hotplug_cpu_online,
+	.cpu_offline = (void *) hotplug_cpu_offline,
+	.exit = (void *) hotplug_exit,
 	.name = "hotplug_cbs",
 	.timeout_ms = 1000U,
 };
 SEC(".struct_ops.link")
 struct sched_ext_ops hotplug_nocb_ops = {
-	.exit = hotplug_exit,
+	.exit = (void *) hotplug_exit,
 	.name = "hotplug_nocbs",
 	.timeout_ms = 1000U,
 };


@@ -45,9 +45,9 @@ void BPF_STRUCT_OPS(cnt_disable, struct task_struct *p)
 SEC(".struct_ops.link")
 struct sched_ext_ops init_enable_count_ops = {
-	.init_task = cnt_init_task,
-	.exit_task = cnt_exit_task,
-	.enable = cnt_enable,
-	.disable = cnt_disable,
+	.init_task = (void *) cnt_init_task,
+	.exit_task = (void *) cnt_exit_task,
+	.enable = (void *) cnt_enable,
+	.disable = (void *) cnt_disable,
 	.name = "init_enable_count",
 };


@@ -131,34 +131,34 @@ void BPF_STRUCT_OPS(maximal_exit, struct scx_exit_info *info)
 SEC(".struct_ops.link")
 struct sched_ext_ops maximal_ops = {
-	.select_cpu = maximal_select_cpu,
-	.enqueue = maximal_enqueue,
-	.dequeue = maximal_dequeue,
-	.dispatch = maximal_dispatch,
-	.runnable = maximal_runnable,
-	.running = maximal_running,
-	.stopping = maximal_stopping,
-	.quiescent = maximal_quiescent,
-	.yield = maximal_yield,
-	.core_sched_before = maximal_core_sched_before,
-	.set_weight = maximal_set_weight,
-	.set_cpumask = maximal_set_cpumask,
-	.update_idle = maximal_update_idle,
-	.cpu_acquire = maximal_cpu_acquire,
-	.cpu_release = maximal_cpu_release,
-	.cpu_online = maximal_cpu_online,
-	.cpu_offline = maximal_cpu_offline,
-	.init_task = maximal_init_task,
-	.enable = maximal_enable,
-	.exit_task = maximal_exit_task,
-	.disable = maximal_disable,
-	.cgroup_init = maximal_cgroup_init,
-	.cgroup_exit = maximal_cgroup_exit,
-	.cgroup_prep_move = maximal_cgroup_prep_move,
-	.cgroup_move = maximal_cgroup_move,
-	.cgroup_cancel_move = maximal_cgroup_cancel_move,
-	.cgroup_set_weight = maximal_cgroup_set_weight,
-	.init = maximal_init,
-	.exit = maximal_exit,
+	.select_cpu = (void *) maximal_select_cpu,
+	.enqueue = (void *) maximal_enqueue,
+	.dequeue = (void *) maximal_dequeue,
+	.dispatch = (void *) maximal_dispatch,
+	.runnable = (void *) maximal_runnable,
+	.running = (void *) maximal_running,
+	.stopping = (void *) maximal_stopping,
+	.quiescent = (void *) maximal_quiescent,
+	.yield = (void *) maximal_yield,
+	.core_sched_before = (void *) maximal_core_sched_before,
+	.set_weight = (void *) maximal_set_weight,
+	.set_cpumask = (void *) maximal_set_cpumask,
+	.update_idle = (void *) maximal_update_idle,
+	.cpu_acquire = (void *) maximal_cpu_acquire,
+	.cpu_release = (void *) maximal_cpu_release,
+	.cpu_online = (void *) maximal_cpu_online,
+	.cpu_offline = (void *) maximal_cpu_offline,
+	.init_task = (void *) maximal_init_task,
+	.enable = (void *) maximal_enable,
+	.exit_task = (void *) maximal_exit_task,
+	.disable = (void *) maximal_disable,
+	.cgroup_init = (void *) maximal_cgroup_init,
+	.cgroup_exit = (void *) maximal_cgroup_exit,
+	.cgroup_prep_move = (void *) maximal_cgroup_prep_move,
+	.cgroup_move = (void *) maximal_cgroup_move,
+	.cgroup_cancel_move = (void *) maximal_cgroup_cancel_move,
+	.cgroup_set_weight = (void *) maximal_cgroup_set_weight,
+	.init = (void *) maximal_init,
+	.exit = (void *) maximal_exit,
 	.name = "maximal",
 };


@@ -29,8 +29,8 @@ bool BPF_STRUCT_OPS(maybe_null_success_yield, struct task_struct *from,
 SEC(".struct_ops.link")
 struct sched_ext_ops maybe_null_success = {
-	.dispatch = maybe_null_success_dispatch,
-	.yield = maybe_null_success_yield,
-	.enable = maybe_null_running,
+	.dispatch = (void *) maybe_null_success_dispatch,
+	.yield = (void *) maybe_null_success_yield,
+	.enable = (void *) maybe_null_running,
 	.name = "minimal",
 };


@@ -19,7 +19,7 @@ void BPF_STRUCT_OPS(maybe_null_fail_dispatch, s32 cpu, struct task_struct *p)
 SEC(".struct_ops.link")
 struct sched_ext_ops maybe_null_fail = {
-	.dispatch = maybe_null_fail_dispatch,
-	.enable = maybe_null_running,
+	.dispatch = (void *) maybe_null_fail_dispatch,
+	.enable = (void *) maybe_null_running,
 	.name = "maybe_null_fail_dispatch",
 };


@@ -22,7 +22,7 @@ bool BPF_STRUCT_OPS(maybe_null_fail_yield, struct task_struct *from,
 SEC(".struct_ops.link")
 struct sched_ext_ops maybe_null_fail = {
-	.yield = maybe_null_fail_yield,
-	.enable = maybe_null_running,
+	.yield = (void *) maybe_null_fail_yield,
+	.enable = (void *) maybe_null_running,
 	.name = "maybe_null_fail_yield",
 };


@@ -28,6 +28,6 @@ void BPF_STRUCT_OPS(prog_run_exit, struct scx_exit_info *ei)
 SEC(".struct_ops.link")
 struct sched_ext_ops prog_run_ops = {
-	.exit = prog_run_exit,
+	.exit = (void *) prog_run_exit,
 	.name = "prog_run",
 };


@@ -35,6 +35,6 @@ void BPF_STRUCT_OPS(select_cpu_dfl_enqueue, struct task_struct *p,
 SEC(".struct_ops.link")
 struct sched_ext_ops select_cpu_dfl_ops = {
-	.enqueue = select_cpu_dfl_enqueue,
+	.enqueue = (void *) select_cpu_dfl_enqueue,
 	.name = "select_cpu_dfl",
 };


@@ -82,8 +82,8 @@ s32 BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_init_task,
 SEC(".struct_ops.link")
 struct sched_ext_ops select_cpu_dfl_nodispatch_ops = {
-	.select_cpu = select_cpu_dfl_nodispatch_select_cpu,
-	.enqueue = select_cpu_dfl_nodispatch_enqueue,
-	.init_task = select_cpu_dfl_nodispatch_init_task,
+	.select_cpu = (void *) select_cpu_dfl_nodispatch_select_cpu,
+	.enqueue = (void *) select_cpu_dfl_nodispatch_enqueue,
+	.init_task = (void *) select_cpu_dfl_nodispatch_init_task,
 	.name = "select_cpu_dfl_nodispatch",
 };


@@ -35,7 +35,7 @@ s32 BPF_STRUCT_OPS(select_cpu_dispatch_select_cpu, struct task_struct *p,
 SEC(".struct_ops.link")
 struct sched_ext_ops select_cpu_dispatch_ops = {
-	.select_cpu = select_cpu_dispatch_select_cpu,
+	.select_cpu = (void *) select_cpu_dispatch_select_cpu,
 	.name = "select_cpu_dispatch",
 	.timeout_ms = 1000U,
 };


@@ -30,8 +30,8 @@ void BPF_STRUCT_OPS(select_cpu_dispatch_bad_dsq_exit, struct scx_exit_info *ei)
 SEC(".struct_ops.link")
 struct sched_ext_ops select_cpu_dispatch_bad_dsq_ops = {
-	.select_cpu = select_cpu_dispatch_bad_dsq_select_cpu,
-	.exit = select_cpu_dispatch_bad_dsq_exit,
+	.select_cpu = (void *) select_cpu_dispatch_bad_dsq_select_cpu,
+	.exit = (void *) select_cpu_dispatch_bad_dsq_exit,
 	.name = "select_cpu_dispatch_bad_dsq",
 	.timeout_ms = 1000U,
 };


@@ -31,8 +31,8 @@ void BPF_STRUCT_OPS(select_cpu_dispatch_dbl_dsp_exit, struct scx_exit_info *ei)
 SEC(".struct_ops.link")
 struct sched_ext_ops select_cpu_dispatch_dbl_dsp_ops = {
-	.select_cpu = select_cpu_dispatch_dbl_dsp_select_cpu,
-	.exit = select_cpu_dispatch_dbl_dsp_exit,
+	.select_cpu = (void *) select_cpu_dispatch_dbl_dsp_select_cpu,
+	.exit = (void *) select_cpu_dispatch_dbl_dsp_exit,
 	.name = "select_cpu_dispatch_dbl_dsp",
 	.timeout_ms = 1000U,
 };


@@ -81,12 +81,12 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(select_cpu_vtime_init)
 SEC(".struct_ops.link")
 struct sched_ext_ops select_cpu_vtime_ops = {
-	.select_cpu = select_cpu_vtime_select_cpu,
-	.dispatch = select_cpu_vtime_dispatch,
-	.running = select_cpu_vtime_running,
-	.stopping = select_cpu_vtime_stopping,
-	.enable = select_cpu_vtime_enable,
-	.init = select_cpu_vtime_init,
+	.select_cpu = (void *) select_cpu_vtime_select_cpu,
+	.dispatch = (void *) select_cpu_vtime_dispatch,
+	.running = (void *) select_cpu_vtime_running,
+	.stopping = (void *) select_cpu_vtime_stopping,
+	.enable = (void *) select_cpu_vtime_enable,
+	.init = (void *) select_cpu_vtime_init,
 	.name = "select_cpu_vtime",
 	.timeout_ms = 1000U,
 };


@@ -1522,6 +1522,45 @@ static bool test_copy_vma(void)
 	return true;
 }
+static bool test_expand_only_mode(void)
+{
+	unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
+	struct mm_struct mm = {};
+	VMA_ITERATOR(vmi, &mm, 0);
+	struct vm_area_struct *vma_prev, *vma;
+	VMG_STATE(vmg, &mm, &vmi, 0x5000, 0x9000, flags, 5);
+	/*
+	 * Place a VMA prior to the one we're expanding so we assert that we do
+	 * not erroneously try to traverse to the previous VMA even though we
+	 * have, through the use of VMG_FLAG_JUST_EXPAND, indicated we do not
+	 * need to do so.
+	 */
+	alloc_and_link_vma(&mm, 0, 0x2000, 0, flags);
+	/*
+	 * We will be positioned at the prev VMA, but looking to expand to
+	 * 0x9000.
+	 */
+	vma_iter_set(&vmi, 0x3000);
+	vma_prev = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags);
+	vmg.prev = vma_prev;
+	vmg.merge_flags = VMG_FLAG_JUST_EXPAND;
+	vma = vma_merge_new_range(&vmg);
+	ASSERT_NE(vma, NULL);
+	ASSERT_EQ(vma, vma_prev);
+	ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
+	ASSERT_EQ(vma->vm_start, 0x3000);
+	ASSERT_EQ(vma->vm_end, 0x9000);
+	ASSERT_EQ(vma->vm_pgoff, 3);
+	ASSERT_TRUE(vma_write_started(vma));
+	ASSERT_EQ(vma_iter_addr(&vmi), 0x3000);
+	cleanup_mm(&mm, &vmi);
+	return true;
+}
 int main(void)
 {
 	int num_tests = 0, num_fail = 0;
@@ -1553,6 +1592,7 @@ int main(void)
 	TEST(vmi_prealloc_fail);
 	TEST(merge_extend);
 	TEST(copy_vma);
+	TEST(expand_only_mode);
 #undef TEST