Mirror of https://github.com/torvalds/linux.git (synced 2024-11-01 04:53:36 +01:00)

Compare commits: 45 commits, 81c3570a29...0867eaca3d
Commits in this range:
0867eaca3d, 4236f91380, c1e939a21e, daa9f66fe1, 7fbaacafbc, 9251e3e93c,
d5b2ee0fe8, 704573851b, 01626a1823, 1834300798, 58a039e679, f2330b650e,
3673167a3a, 5bb1f4c934, e8133a7799, c4d91e225f, b125a0def2, 41e192ad27,
d95fb348f0, bc0a2f3a73, d949d1d14f, 14611508cb, 1db272864f, b7c5f9a1fb,
79f3d123ca, 281dd25c1a, 985da552a9, f64e67e5d3, 7c18d48110, df745e2509,
cc7d859434, d658d59471, c31f2ee5cd, 7724abf0ca, 0e7ffff1b8, cb7e509c4e,
d28d17a845, 895669fd0d, 4f7f417042, 9b3c11a867, 2b059d0d1e, 3cc4e13bb1,
117932eea9, 8a1cb53e6c, 70c9904c29
57 changed files with 595 additions and 383 deletions
@@ -23,177 +23,166 @@ applications can additionally seal security critical data at runtime.

A similar feature already exists in the XNU kernel with the
VM_FLAGS_PERMANENT flag [1] and on OpenBSD with the mimmutable syscall [2].

SYSCALL
=======
mseal syscall signature
-----------------------
   ``int mseal(void \* addr, size_t len, unsigned long flags)``

   **addr**/**len**: virtual memory address range.

   The address range set by **addr**/**len** must meet:
      - The start address must be in an allocated VMA.
      - The start address must be page aligned.
      - The end address (**addr** + **len**) must be in an allocated VMA.
      - no gap (unallocated memory) between start and end address.

   The ``len`` will be page aligned implicitly by the kernel.

   **flags**: reserved for future use.

   **Return values**:
      - **0**: Success.
      - **-EINVAL**:
         * Invalid input ``flags``.
         * The start address (``addr``) is not page aligned.
         * Address range (``addr`` + ``len``) overflow.
      - **-ENOMEM**:
         * The start address (``addr``) is not allocated.
         * The end address (``addr`` + ``len``) is not allocated.
         * A gap (unallocated memory) between start and end address.
      - **-EPERM**:
         * sealing is supported only on 64-bit CPUs; 32-bit is not supported.

   **Note about error return**:
      - For the above error cases, users can expect the given memory range to be
        unmodified, i.e. no partial update.
      - There might be other internal errors/cases not listed here, e.g. an
        error during merging/splitting VMAs, or the process reaching the
        maximum number of supported VMAs. In those cases, partial updates to
        the given memory range could happen. However, those cases should be
        rare.

   **Architecture support**:
      mseal only works on 64-bit CPUs, not 32-bit CPUs.

   **Idempotent**:
      users can call mseal multiple times. mseal on an already sealed memory
      range is a no-op (not an error).

   **No munseal**:
      Once a mapping is sealed, it can't be unsealed. The kernel should never
      have a munseal; this is consistent with other sealing features, e.g.
      F_SEAL_SEAL for files.
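The return-value contract and the idempotency rule above are easy to exercise
from userspace. The following minimal sketch is illustrative only and is not
part of the patch: it assumes a 6.10+ kernel and hand-defines ``__NR_mseal``
(462 on x86_64 per the syscall table) only when the libc headers do not
already provide it::

   #define _GNU_SOURCE
   #include <errno.h>
   #include <stdio.h>
   #include <string.h>
   #include <sys/mman.h>
   #include <sys/syscall.h>
   #include <unistd.h>

   #ifndef __NR_mseal
   #define __NR_mseal 462          /* assumption: x86_64 syscall number */
   #endif

   static long my_mseal(void *addr, size_t len, unsigned long flags)
   {
       return syscall(__NR_mseal, addr, len, flags);
   }

   int main(void)
   {
       long page = sysconf(_SC_PAGESIZE);
       char *p = mmap(NULL, page, PROT_READ | PROT_WRITE,
                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
       if (p == MAP_FAILED)
           return 1;

       /* Unaligned start address: expect -1 with errno == EINVAL. */
       if (my_mseal(p + 1, page - 1, 0) == -1)
           printf("unaligned seal: %s\n", strerror(errno));

       /* Sealing is idempotent: both calls return 0. */
       printf("seal #1: %ld\n", my_mseal(p, page, 0));
       printf("seal #2: %ld\n", my_mseal(p, page, 0));

       /* The mapping is now sealed: munmap() fails with EPERM. */
       if (munmap(p, page) == -1)
           printf("munmap: %s\n", strerror(errno));
       return 0;
   }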
Blocked mm syscall for sealed mapping
-------------------------------------
   It might be important to note: **once a mapping is sealed, it will
   stay in the process's memory until the process terminates**.

   Example::

         ptr = mmap(0, 4096, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
         rc = mseal(ptr, 4096, 0);
         /* munmap will fail */
         rc = munmap(ptr, 4096);
         assert(rc < 0);

   Blocked mm syscalls:
      - munmap
      - mmap
      - mremap
      - mprotect and pkey_mprotect
      - some destructive madvise behaviors: MADV_DONTNEED, MADV_FREE,
        MADV_DONTNEED_LOCKED, MADV_DONTFORK, MADV_WIPEONFORK

   The first set of syscalls to block is munmap, mremap, mmap. They can
   either leave an empty space in the address space, therefore allowing
   replacement with a new mapping with a new set of attributes, or can
   overwrite the existing mapping with another mapping.

   mprotect and pkey_mprotect are blocked because they change the
   protection bits (RWX) of the mapping.

   Certain destructive madvise behaviors, specifically MADV_DONTNEED,
   MADV_FREE, MADV_DONTNEED_LOCKED, and MADV_WIPEONFORK, can introduce
   risks when applied to anonymous memory by threads lacking write
   permissions. Consequently, these operations are prohibited under such
   conditions. The aforementioned behaviors have the potential to modify
   region contents by discarding pages, effectively performing a memset(0)
   operation on the anonymous memory.

   Size expansion, via mremap(), does not appear to pose any specific risks
   to sealed VMAs. It is included anyway because the use case is unclear.
   In any case, users can rely on merging to expand a sealed VMA.

   The kernel will return -EPERM for blocked syscalls.

   When a blocked syscall returns -EPERM due to sealing, the memory regions
   may or may not be changed, depending on the syscall being blocked:

      - munmap: munmap is atomic. If one of the VMAs in the given range is
        sealed, none of the VMAs are updated.
      - mprotect, pkey_mprotect, madvise: a partial update might happen, e.g.
        when mprotect covers multiple VMAs, it might update the beginning
        VMAs before reaching the sealed VMA and returning -EPERM.
      - mmap and mremap: undefined behavior.

   To give an example, assume the following code sequence:

      - ptr = mmap(null, 8192, PROT_NONE);
      - munmap(ptr + 4096, 4096);
      - ret1 = mprotect(ptr, 8192, PROT_READ);
      - mseal(ptr, 4096);
      - ret2 = mprotect(ptr, 8192, PROT_NONE);

   ret1 will be -ENOMEM; the page at ptr is updated to PROT_READ.

   ret2 will be -EPERM; the page remains PROT_READ.
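The partial-update behaviour of mprotect() listed above can also be observed
directly. The sketch below is illustrative and not from the patch; it splits
one anonymous mapping into two VMAs, seals only the second one, and then
calls mprotect() across both. The call fails with EPERM, but the first
(unsealed) VMA may already have been updated. It reuses the hypothetical
``my_mseal()`` wrapper from the previous sketch::

   /* Reuses the my_mseal() wrapper and headers from the sketch above. */
   int demo_partial_update(void)
   {
       long page = sysconf(_SC_PAGESIZE);
       char *p = mmap(NULL, 2 * page, PROT_READ | PROT_WRITE,
                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
       if (p == MAP_FAILED)
           return 1;

       /* Give the second page different protection so it becomes its own VMA. */
       mprotect(p + page, page, PROT_READ);

       /* Seal only the second VMA. */
       my_mseal(p + page, page, 0);

       /*
        * mprotect() over both VMAs fails with EPERM at the sealed VMA, but
        * the first page may already have been changed to PROT_NONE.
        */
       if (mprotect(p, 2 * page, PROT_NONE) == -1)
           printf("mprotect across sealed VMA: %s\n", strerror(errno));
       return 0;
   }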
Use cases
=========
- glibc:
  The dynamic linker, during loading of ELF executables, can apply sealing to
  mapping segments.

- Chrome browser: protect some security-sensitive data structures.

When not to use mseal
=====================
Applications can apply sealing to any virtual memory region from userspace,
but it is *crucial to thoroughly analyze the mapping's lifetime* prior to
applying the sealing. This is because the sealed mapping *won't be unmapped*
until the process terminates or the exec system call is invoked.

For example:

- aio/shm
  aio/shm can call mmap()/munmap() on behalf of userspace, e.g. ksys_shmdt()
  in shm.c. The lifetime of those mappings is not tied to the lifetime of the
  process. If those memories are sealed from userspace, then munmap() will
  fail, causing leaks in VMA address space during the lifetime of the process.

- ptr allocated by malloc (heap)
  Don't use mseal on a ptr returned from malloc().
  malloc() is implemented by an allocator, e.g. by glibc. The heap manager
  might allocate the ptr from brk or from a mapping created by mmap.
  If an app calls mseal on a ptr returned from malloc(), this can affect the
  heap manager's ability to manage the mappings; the outcome is
  non-deterministic.
  Example::

     ptr = malloc(size);
     /* don't call mseal on a ptr returned from malloc. */
     mseal(ptr, size);
     /* free will succeed, but the allocator can't shrink the heap below ptr */
     free(ptr);

  Currently, userspace applications can seal parts of the heap by calling
  malloc() and mseal(). Let's assume the following calls from user space:

     - ptr = malloc(size);
     - mprotect(ptr, size, RO);
     - mseal(ptr, size);
     - free(ptr);

  Technically, before mseal() was added, the user could change the protection
  of the heap by calling mprotect(RO). As long as the user changes the
  protection back to RW before free(), the memory range can be reused.

  Adding mseal() into the picture, however, the heap is then sealed
  partially; the user can still free it, but the memory remains RO. If the
  address is re-used by the heap manager for another malloc, the process
  might crash soon after. Therefore, it is important not to apply sealing to
  any memory that might get recycled.

  Furthermore, even if the application never calls free() for the ptr, the
  heap manager may invoke the brk system call to shrink the size of the heap.
  In the kernel, the brk-shrink will call munmap(). Consequently, depending
  on the location of the ptr, the outcome of brk-shrink is nondeterministic.
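In contrast to the heap case, sealing works well for memory whose lifetime
the application fully controls. A common pattern is to place long-lived,
security-sensitive data in its own anonymous mapping, drop write permission
once it is initialized, and then seal it so that neither the protection nor
the mapping itself can later be changed. The sketch below is illustrative,
not taken from the patch, with error handling omitted; ``my_mseal()`` is the
hypothetical wrapper defined earlier::

   struct config { int hardened; /* ... */ };

   static const struct config *publish_sealed_config(void)
   {
       long page = sysconf(_SC_PAGESIZE);
       struct config *cfg = mmap(NULL, page, PROT_READ | PROT_WRITE,
                                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
       if (cfg == MAP_FAILED)
           return NULL;

       cfg->hardened = 1;               /* initialize while still writable */

       mprotect(cfg, page, PROT_READ);  /* make it read-only ...            */
       my_mseal(cfg, page, 0);          /* ... and lock that in for good    */

       /*
        * From here on, mprotect(RW), munmap() and mmap(MAP_FIXED) over this
        * page all fail with EPERM; the data stays mapped and read-only
        * until the process exits or execs.
        */
       return cfg;
   }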
mseal doesn't block
===================
In a nutshell, mseal blocks certain mm syscalls from modifying some of a
VMA's attributes, such as the protection bits (RWX). A sealed mapping doesn't
mean the memory is immutable.

As Jann Horn pointed out in [3], there are still a few ways to write to RO
memory, which is, in a way, by design. Those cases are not covered by
mseal(). If applications want to block such cases, sandbox tools (such as
seccomp, LSM, etc.) might be considered.

Those cases are:

- Write to read-only memory through the /proc/self/mem interface (FOLL_FORCE).
- Write to read-only memory through ptrace (such as PTRACE_POKETEXT).
- userfaultfd.

The idea that inspired this patch comes from Stephen Röttger's work in V8
CFI [4]. Chrome browser in ChromeOS will be the first user of this API.
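The FOLL_FORCE path above is easy to demonstrate: even a sealed, read-only
mapping can still be modified by the owning process through /proc/self/mem,
which is exactly the class of write that mseal() does not claim to stop. A
minimal illustrative sketch, reusing the earlier helpers and additionally
requiring <fcntl.h>, <stdint.h> and <stdio.h>::

   /* Assumes p points to one page that is already PROT_READ and sealed. */
   static void poke_through_proc_self_mem(char *p)
   {
       int fd = open("/proc/self/mem", O_RDWR);
       char byte = 0x42;

       if (fd < 0)
           return;
       /* A direct store would fault with SIGSEGV; this write succeeds. */
       if (pwrite(fd, &byte, 1, (off_t)(uintptr_t)p) == 1)
           printf("wrote through /proc/self/mem despite mseal()\n");
       close(fd);
   }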
Reference
=========
- [1] https://github.com/apple-oss-distributions/xnu/blob/1031c584a5e37aff177559b9f69dbd3c8c3fd30a/osfmk/mach/vm_statistics.h#L274
- [2] https://man.openbsd.org/mimmutable.2
- [3] https://lore.kernel.org/lkml/CAG48ez3ShUYey+ZAFsU2i1RpQn0a5eOs2hzQ426FkcgnfUGLvA@mail.gmail.com
- [4] https://docs.google.com/document/d/1O2jwK4dxI3nRcOJuPYkonhTkNQfbmwdvxQMyXgeaRHo/edit#heading=h.bvaojj9fu6hc
@@ -261,12 +261,6 @@ static noinstr bool handle_bug(struct pt_regs *regs)
    int ud_type;
    u32 imm;

-   /*
-    * Normally @regs are unpoisoned by irqentry_enter(), but handle_bug()
-    * is a rare case that uses @regs without passing them to
-    * irqentry_enter().
-    */
-   kmsan_unpoison_entry_regs(regs);
    ud_type = decode_bug(regs->ip, &imm);
    if (ud_type == BUG_NONE)
        return handled;

@@ -275,6 +269,12 @@ static noinstr bool handle_bug(struct pt_regs *regs)
     * All lies, just get the WARN/BUG out.
     */
    instrumentation_begin();
+   /*
+    * Normally @regs are unpoisoned by irqentry_enter(), but handle_bug()
+    * is a rare case that uses @regs without passing them to
+    * irqentry_enter().
+    */
+   kmsan_unpoison_entry_regs(regs);
    /*
     * Since we're emulating a CALL with exceptions, restore the interrupt
     * state to what it was at the exception site.
@@ -674,6 +674,16 @@ EXPORT_SYMBOL_GPL(tpm_chip_register);
 */
void tpm_chip_unregister(struct tpm_chip *chip)
{
+#ifdef CONFIG_TCG_TPM2_HMAC
+   int rc;
+
+   rc = tpm_try_get_ops(chip);
+   if (!rc) {
+       tpm2_end_auth_session(chip);
+       tpm_put_ops(chip);
+   }
+#endif
+
    tpm_del_legacy_sysfs(chip);
    if (tpm_is_hwrng_enabled(chip))
        hwrng_unregister(&chip->hwrng);

@@ -27,6 +27,9 @@ static ssize_t tpm_dev_transmit(struct tpm_chip *chip, struct tpm_space *space,
    struct tpm_header *header = (void *)buf;
    ssize_t ret, len;

+   if (chip->flags & TPM_CHIP_FLAG_TPM2)
+       tpm2_end_auth_session(chip);
+
    ret = tpm2_prepare_space(chip, space, buf, bufsiz);
    /* If the command is not implemented by the TPM, synthesize a
     * response with a TPM2_RC_COMMAND_CODE return for user-space.

@@ -379,10 +379,12 @@ int tpm_pm_suspend(struct device *dev)

    rc = tpm_try_get_ops(chip);
    if (!rc) {
-       if (chip->flags & TPM_CHIP_FLAG_TPM2)
+       if (chip->flags & TPM_CHIP_FLAG_TPM2) {
+           tpm2_end_auth_session(chip);
            tpm2_shutdown(chip, TPM2_SU_STATE);
-       else
+       } else {
            rc = tpm1_pm_suspend(chip, tpm_suspend_pcr);
+       }

        tpm_put_ops(chip);
    }
@@ -333,6 +333,9 @@ void tpm_buf_append_hmac_session(struct tpm_chip *chip, struct tpm_buf *buf,
    }

#ifdef CONFIG_TCG_TPM2_HMAC
+   /* The first write to /dev/tpm{rm0} will flush the session. */
+   attributes |= TPM2_SA_CONTINUE_SESSION;
+
    /*
     * The Architecture Guide requires us to strip trailing zeros
     * before computing the HMAC

@@ -484,7 +487,8 @@ static void tpm2_KDFe(u8 z[EC_PT_SZ], const char *str, u8 *pt_u, u8 *pt_v,
    sha256_final(&sctx, out);
}

-static void tpm_buf_append_salt(struct tpm_buf *buf, struct tpm_chip *chip)
+static void tpm_buf_append_salt(struct tpm_buf *buf, struct tpm_chip *chip,
+                struct tpm2_auth *auth)
{
    struct crypto_kpp *kpp;
    struct kpp_request *req;

@@ -543,7 +547,7 @@ static void tpm_buf_append_salt(struct tpm_buf *buf, struct tpm_chip *chip)
    sg_set_buf(&s[0], chip->null_ec_key_x, EC_PT_SZ);
    sg_set_buf(&s[1], chip->null_ec_key_y, EC_PT_SZ);
    kpp_request_set_input(req, s, EC_PT_SZ*2);
-   sg_init_one(d, chip->auth->salt, EC_PT_SZ);
+   sg_init_one(d, auth->salt, EC_PT_SZ);
    kpp_request_set_output(req, d, EC_PT_SZ);
    crypto_kpp_compute_shared_secret(req);
    kpp_request_free(req);

@@ -554,8 +558,7 @@ static void tpm_buf_append_salt(struct tpm_buf *buf, struct tpm_chip *chip)
     * This works because KDFe fully consumes the secret before it
     * writes the salt
     */
-   tpm2_KDFe(chip->auth->salt, "SECRET", x, chip->null_ec_key_x,
-         chip->auth->salt);
+   tpm2_KDFe(auth->salt, "SECRET", x, chip->null_ec_key_x, auth->salt);

out:
    crypto_free_kpp(kpp);

@@ -853,7 +856,9 @@ int tpm_buf_check_hmac_response(struct tpm_chip *chip, struct tpm_buf *buf,
    if (rc)
        /* manually close the session if it wasn't consumed */
        tpm2_flush_context(chip, auth->handle);
-   memzero_explicit(auth, sizeof(*auth));
+
+   kfree_sensitive(auth);
+   chip->auth = NULL;
    } else {
        /* reset for next use */
        auth->session = TPM_HEADER_SIZE;

@@ -881,7 +886,8 @@ void tpm2_end_auth_session(struct tpm_chip *chip)
        return;

    tpm2_flush_context(chip, auth->handle);
-   memzero_explicit(auth, sizeof(*auth));
+   kfree_sensitive(auth);
+   chip->auth = NULL;
}
EXPORT_SYMBOL(tpm2_end_auth_session);

@@ -915,33 +921,37 @@ static int tpm2_parse_start_auth_session(struct tpm2_auth *auth,

static int tpm2_load_null(struct tpm_chip *chip, u32 *null_key)
{
-   int rc;
    unsigned int offset = 0; /* dummy offset for null seed context */
    u8 name[SHA256_DIGEST_SIZE + 2];
+   u32 tmp_null_key;
+   int rc;

    rc = tpm2_load_context(chip, chip->null_key_context, &offset,
-              null_key);
-   if (rc != -EINVAL)
-       return rc;
+              &tmp_null_key);
+   if (rc != -EINVAL) {
+       if (!rc)
+           *null_key = tmp_null_key;
+       goto err;
+   }

-   /* an integrity failure may mean the TPM has been reset */
-   dev_err(&chip->dev, "NULL key integrity failure!\n");
-   /* check the null name against what we know */
-   tpm2_create_primary(chip, TPM2_RH_NULL, NULL, name);
-   if (memcmp(name, chip->null_key_name, sizeof(name)) == 0)
-       /* name unchanged, assume transient integrity failure */
-       return rc;
-   /*
-    * Fatal TPM failure: the NULL seed has actually changed, so
-    * the TPM must have been illegally reset. All in-kernel TPM
-    * operations will fail because the NULL primary can't be
-    * loaded to salt the sessions, but disable the TPM anyway so
-    * userspace programmes can't be compromised by it.
-    */
-   dev_err(&chip->dev, "NULL name has changed, disabling TPM due to interference\n");
+   /* Try to re-create null key, given the integrity failure: */
+   rc = tpm2_create_primary(chip, TPM2_RH_NULL, &tmp_null_key, name);
+   if (rc)
+       goto err;
+
+   /* Return null key if the name has not been changed: */
+   if (!memcmp(name, chip->null_key_name, sizeof(name))) {
+       *null_key = tmp_null_key;
+       return 0;
+   }
+
+   /* Deduce from the name change TPM interference: */
+   dev_err(&chip->dev, "null key integrity check failed\n");
+   tpm2_flush_context(chip, tmp_null_key);
    chip->flags |= TPM_CHIP_FLAG_DISABLE;

-   return rc;
+err:
+   return rc ? -ENODEV : 0;
}

/**

@@ -958,16 +968,20 @@ static int tpm2_load_null(struct tpm_chip *chip, u32 *null_key)
 */
int tpm2_start_auth_session(struct tpm_chip *chip)
{
+   struct tpm2_auth *auth;
    struct tpm_buf buf;
-   struct tpm2_auth *auth = chip->auth;
-   int rc;
    u32 null_key;
+   int rc;

-   if (!auth) {
-       dev_warn_once(&chip->dev, "auth session is not active\n");
+   if (chip->auth) {
+       dev_warn_once(&chip->dev, "auth session is active\n");
        return 0;
    }

+   auth = kzalloc(sizeof(*auth), GFP_KERNEL);
+   if (!auth)
+       return -ENOMEM;
+
    rc = tpm2_load_null(chip, &null_key);
    if (rc)
        goto out;

@@ -988,7 +1002,7 @@ int tpm2_start_auth_session(struct tpm_chip *chip)
    tpm_buf_append(&buf, auth->our_nonce, sizeof(auth->our_nonce));

    /* append encrypted salt and squirrel away unencrypted in auth */
-   tpm_buf_append_salt(&buf, chip);
+   tpm_buf_append_salt(&buf, chip, auth);
    /* session type (HMAC, audit or policy) */
    tpm_buf_append_u8(&buf, TPM2_SE_HMAC);

@@ -1010,10 +1024,13 @@ int tpm2_start_auth_session(struct tpm_chip *chip)

    tpm_buf_destroy(&buf);

-   if (rc)
-       goto out;
+   if (rc == TPM2_RC_SUCCESS) {
+       chip->auth = auth;
+       return 0;
+   }

-out:
+out:
+   kfree_sensitive(auth);
    return rc;
}
EXPORT_SYMBOL(tpm2_start_auth_session);

@@ -1347,18 +1364,21 @@ static int tpm2_create_null_primary(struct tpm_chip *chip)
 *
 * Derive and context save the null primary and allocate memory in the
 * struct tpm_chip for the authorizations.
 *
 * Return:
 * * 0		- OK
 * * -errno	- A system error
 * * TPM_RC	- A TPM error
 */
int tpm2_sessions_init(struct tpm_chip *chip)
{
    int rc;

    rc = tpm2_create_null_primary(chip);
-   if (rc)
-       dev_err(&chip->dev, "TPM: security failed (NULL seed derivation): %d\n", rc);
-
-   chip->auth = kmalloc(sizeof(*chip->auth), GFP_KERNEL);
-   if (!chip->auth)
-       return -ENOMEM;
+   if (rc) {
+       dev_err(&chip->dev, "null key creation failed with %d\n", rc);
+       return rc;
+   }

    return rc;
}
@@ -2441,6 +2441,15 @@ static const struct usb_device_id uvc_ids[] = {
      .driver_info = (kernel_ulong_t)&(const struct uvc_device_info){
        .uvc_version = 0x010a,
      } },
+   { .match_flags = USB_DEVICE_ID_MATCH_DEVICE
+       | USB_DEVICE_ID_MATCH_INT_INFO,
+     .idVendor = 0x0408,
+     .idProduct = 0x4033,
+     .bInterfaceClass = USB_CLASS_VIDEO,
+     .bInterfaceSubClass = 1,
+     .bInterfaceProtocol = UVC_PC_PROTOCOL_15,
+     .driver_info = (kernel_ulong_t)&(const struct uvc_device_info)
+       { .uvc_version = 0x010a, } },
    /* LogiLink Wireless Webcam */
    { .match_flags = USB_DEVICE_ID_MATCH_DEVICE
        | USB_DEVICE_ID_MATCH_INT_INFO,
@@ -3651,7 +3651,7 @@ static int do_device_access(struct sdeb_store_info *sip, struct scsi_cmnd *scp,
    enum dma_data_direction dir;
    struct scsi_data_buffer *sdb = &scp->sdb;
    u8 *fsp;
-   int i;
+   int i, total = 0;

    /*
     * Even though reads are inherently atomic (in this driver), we expect

@@ -3688,18 +3688,16 @@ static int do_device_access(struct sdeb_store_info *sip, struct scsi_cmnd *scp,
                   fsp + (block * sdebug_sector_size),
                   sdebug_sector_size, sg_skip, do_write);
        sdeb_data_sector_unlock(sip, do_write);
-       if (ret != sdebug_sector_size) {
-           ret += (i * sdebug_sector_size);
+       total += ret;
+       if (ret != sdebug_sector_size)
            break;
-       }
        sg_skip += sdebug_sector_size;
        if (++block >= sdebug_store_sectors)
            block = 0;
    }
-   ret = num * sdebug_sector_size;
    sdeb_data_unlock(sip, atomic);

-   return ret;
+   return total;
}

/* Returns number of bytes copied or -1 if error. */
@@ -8219,7 +8219,7 @@ static void ufshcd_update_rtc(struct ufs_hba *hba)

    err = ufshcd_query_attr(hba, UPIU_QUERY_OPCODE_WRITE_ATTR, QUERY_ATTR_IDN_SECONDS_PASSED,
                0, 0, &val);
-   ufshcd_rpm_put_sync(hba);
+   ufshcd_rpm_put(hba);

    if (err)
        dev_err(hba->dev, "%s: Failed to update rtc %d\n", __func__, err);
@@ -401,6 +401,7 @@ void nilfs_clear_folio_dirty(struct folio *folio)

    folio_clear_uptodate(folio);
+   folio_clear_mappedtodisk(folio);
    folio_clear_checked(folio);

    head = folio_buffers(folio);
    if (head) {
@@ -1787,6 +1787,14 @@ int ocfs2_remove_inode_range(struct inode *inode,
        return 0;

    if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+       int id_count = ocfs2_max_inline_data_with_xattr(inode->i_sb, di);
+
+       if (byte_start > id_count || byte_start + byte_len > id_count) {
+           ret = -EINVAL;
+           mlog_errno(ret);
+           goto out;
+       }
+
        ret = ocfs2_truncate_inline(inode, di_bh, byte_start,
                        byte_start + byte_len, 0);
        if (ret) {
@@ -692,6 +692,34 @@ void dup_userfaultfd_complete(struct list_head *fcs)
    }
}

+void dup_userfaultfd_fail(struct list_head *fcs)
+{
+   struct userfaultfd_fork_ctx *fctx, *n;
+
+   /*
+    * An error has occurred on fork, we will tear memory down, but have
+    * allocated memory for fctx's and raised reference counts for both the
+    * original and child contexts (and on the mm for each as a result).
+    *
+    * These would ordinarily be taken care of by a user handling the event,
+    * but we are no longer doing so, so manually clean up here.
+    *
+    * mm tear down will take care of cleaning up VMA contexts.
+    */
+   list_for_each_entry_safe(fctx, n, fcs, list) {
+       struct userfaultfd_ctx *octx = fctx->orig;
+       struct userfaultfd_ctx *ctx = fctx->new;
+
+       atomic_dec(&octx->mmap_changing);
+       VM_BUG_ON(atomic_read(&octx->mmap_changing) < 0);
+       userfaultfd_ctx_put(octx);
+       userfaultfd_ctx_put(ctx);
+
+       list_del(&fctx->list);
+       kfree(fctx);
+   }
+}
+
void mremap_userfaultfd_prep(struct vm_area_struct *vma,
                 struct vm_userfaultfd_ctx *vm_ctx)
{
@@ -54,12 +54,11 @@ static inline long mm_ksm_zero_pages(struct mm_struct *mm)
    return atomic_long_read(&mm->ksm_zero_pages);
}

-static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
+static inline void ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
{
    /* Adding mm to ksm is best effort on fork. */
    if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
-       return __ksm_enter(mm);
-
-   return 0;
+       __ksm_enter(mm);
}

static inline int ksm_execve(struct mm_struct *mm)

@@ -107,9 +106,8 @@ static inline int ksm_disable(struct mm_struct *mm)
    return 0;
}

-static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
+static inline void ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
{
-   return 0;
}

static inline int ksm_execve(struct mm_struct *mm)
@@ -249,6 +249,7 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma,

extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *);
extern void dup_userfaultfd_complete(struct list_head *);
+void dup_userfaultfd_fail(struct list_head *);

extern void mremap_userfaultfd_prep(struct vm_area_struct *,
                    struct vm_userfaultfd_ctx *);

@@ -351,6 +352,10 @@ static inline void dup_userfaultfd_complete(struct list_head *l)
{
}

+static inline void dup_userfaultfd_fail(struct list_head *l)
+{
+}
+
static inline void mremap_userfaultfd_prep(struct vm_area_struct *vma,
                       struct vm_userfaultfd_ctx *ctx)
{
@@ -24,6 +24,23 @@
DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE);
EXPORT_SYMBOL(cgroup_bpf_enabled_key);

+/*
+ * cgroup bpf destruction makes heavy use of work items and there can be a lot
+ * of concurrent destructions.  Use a separate workqueue so that cgroup bpf
+ * destruction work items don't end up filling up max_active of system_wq
+ * which may lead to deadlock.
+ */
+static struct workqueue_struct *cgroup_bpf_destroy_wq;
+
+static int __init cgroup_bpf_wq_init(void)
+{
+   cgroup_bpf_destroy_wq = alloc_workqueue("cgroup_bpf_destroy", 0, 1);
+   if (!cgroup_bpf_destroy_wq)
+       panic("Failed to alloc workqueue for cgroup bpf destroy.\n");
+   return 0;
+}
+core_initcall(cgroup_bpf_wq_init);
+
/* __always_inline is necessary to prevent indirect call through run_prog
 * function pointer.
 */

@@ -334,7 +351,7 @@ static void cgroup_bpf_release_fn(struct percpu_ref *ref)
    struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt);

    INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release);
-   queue_work(system_wq, &cgrp->bpf.release_work);
+   queue_work(cgroup_bpf_destroy_wq, &cgrp->bpf.release_work);
}

/* Get underlying bpf_prog of bpf_prog_list entry, regardless if it's through
@@ -5789,7 +5789,7 @@ static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
{
    struct cgroup *cgroup;
    int ret = false;
-   int level = 1;
+   int level = 0;

    lockdep_assert_held(&cgroup_mutex);

@@ -5797,7 +5797,7 @@ static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
        if (cgroup->nr_descendants >= cgroup->max_descendants)
            goto fail;

-       if (level > cgroup->max_depth)
+       if (level >= cgroup->max_depth)
            goto fail;

        level++;
@@ -653,11 +653,6 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
    mm->exec_vm = oldmm->exec_vm;
    mm->stack_vm = oldmm->stack_vm;

-   retval = ksm_fork(mm, oldmm);
-   if (retval)
-       goto out;
-   khugepaged_fork(mm, oldmm);
-
    /* Use __mt_dup() to efficiently build an identical maple tree. */
    retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL);
    if (unlikely(retval))

@@ -760,6 +755,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
    vma_iter_free(&vmi);
    if (!retval) {
        mt_set_in_rcu(vmi.mas.tree);
+       ksm_fork(mm, oldmm);
+       khugepaged_fork(mm, oldmm);
    } else if (mpnt) {
        /*
         * The entire maple tree has already been duplicated. If the

@@ -775,7 +772,10 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
    mmap_write_unlock(mm);
    flush_tlb_mm(oldmm);
    mmap_write_unlock(oldmm);
-   dup_userfaultfd_complete(&uf);
+   if (!retval)
+       dup_userfaultfd_complete(&uf);
+   else
+       dup_userfaultfd_fail(&uf);
fail_uprobe_end:
    uprobe_end_dup_mmap();
    return retval;
@@ -459,9 +459,7 @@ int walk_system_ram_res_rev(u64 start, u64 end, void *arg,
            rams_size += 16;
        }

-       rams[i].start = res.start;
-       rams[i++].end = res.end;
-
+       rams[i++] = res;
        start = res.end + 1;
    }
@@ -862,7 +862,8 @@ static DEFINE_MUTEX(scx_ops_enable_mutex);
DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled);
DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED);
-static atomic_t scx_ops_bypass_depth = ATOMIC_INIT(0);
+static int scx_ops_bypass_depth;
+static DEFINE_RAW_SPINLOCK(__scx_ops_bypass_lock);
static bool scx_ops_init_task_enabled;
static bool scx_switching_all;
DEFINE_STATIC_KEY_FALSE(__scx_switched_all);

@@ -4298,18 +4299,20 @@ bool task_should_scx(struct task_struct *p)
 */
static void scx_ops_bypass(bool bypass)
{
-   int depth, cpu;
+   int cpu;
+   unsigned long flags;

+   raw_spin_lock_irqsave(&__scx_ops_bypass_lock, flags);
    if (bypass) {
-       depth = atomic_inc_return(&scx_ops_bypass_depth);
-       WARN_ON_ONCE(depth <= 0);
-       if (depth != 1)
-           return;
+       scx_ops_bypass_depth++;
+       WARN_ON_ONCE(scx_ops_bypass_depth <= 0);
+       if (scx_ops_bypass_depth != 1)
+           goto unlock;
    } else {
-       depth = atomic_dec_return(&scx_ops_bypass_depth);
-       WARN_ON_ONCE(depth < 0);
-       if (depth != 0)
-           return;
+       scx_ops_bypass_depth--;
+       WARN_ON_ONCE(scx_ops_bypass_depth < 0);
+       if (scx_ops_bypass_depth != 0)
+           goto unlock;
    }

    /*

@@ -4326,7 +4329,7 @@ static void scx_ops_bypass(bool bypass)
        struct rq_flags rf;
        struct task_struct *p, *n;

-       rq_lock_irqsave(rq, &rf);
+       rq_lock(rq, &rf);

        if (bypass) {
            WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING);

@@ -4362,11 +4365,13 @@ static void scx_ops_bypass(bool bypass)
            sched_enq_and_set_task(&ctx);
        }

-       rq_unlock_irqrestore(rq, &rf);
+       rq_unlock(rq, &rf);

        /* resched to restore ticks and idle state */
        resched_cpu(cpu);
    }
+unlock:
+   raw_spin_unlock_irqrestore(&__scx_ops_bypass_lock, flags);
}

static void free_exit_info(struct scx_exit_info *ei)
@@ -141,7 +141,7 @@ static void test_kmalloc_redzone_access(struct kunit *test)
{
    struct kmem_cache *s = test_kmem_cache_create("TestSlub_RZ_kmalloc", 32,
                SLAB_KMALLOC|SLAB_STORE_USER|SLAB_RED_ZONE);
-   u8 *p = __kmalloc_cache_noprof(s, GFP_KERNEL, 18);
+   u8 *p = alloc_hooks(__kmalloc_cache_noprof(s, GFP_KERNEL, 18));

    kasan_disable_current();
@@ -1085,7 +1085,6 @@ config HMM_MIRROR
    depends on MMU

config GET_FREE_REGION
-   depends on SPARSEMEM
    bool

config DEVICE_PRIVATE
mm/memory.c (15 changed lines):

@@ -4187,6 +4187,8 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

+static DECLARE_WAIT_QUEUE_HEAD(swapcache_wq);
+
/*
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.

@@ -4199,6 +4201,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
{
    struct vm_area_struct *vma = vmf->vma;
    struct folio *swapcache, *folio = NULL;
+   DECLARE_WAITQUEUE(wait, current);
    struct page *page;
    struct swap_info_struct *si = NULL;
    rmap_t rmap_flags = RMAP_NONE;

@@ -4297,7 +4300,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
                 * Relax a bit to prevent rapid
                 * repeated page faults.
                 */
+               add_wait_queue(&swapcache_wq, &wait);
                schedule_timeout_uninterruptible(1);
+               remove_wait_queue(&swapcache_wq, &wait);
                goto out_page;
            }
            need_clear_cache = true;

@@ -4604,8 +4609,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
    pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
    /* Clear the swap cache pin for direct swapin after PTL unlock */
-   if (need_clear_cache)
+   if (need_clear_cache) {
        swapcache_clear(si, entry, nr_pages);
+       if (waitqueue_active(&swapcache_wq))
+           wake_up(&swapcache_wq);
+   }
    if (si)
        put_swap_device(si);
    return ret;

@@ -4620,8 +4628,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
        folio_unlock(swapcache);
        folio_put(swapcache);
    }
-   if (need_clear_cache)
+   if (need_clear_cache) {
        swapcache_clear(si, entry, nr_pages);
+       if (waitqueue_active(&swapcache_wq))
+           wake_up(&swapcache_wq);
+   }
    if (si)
        put_swap_device(si);
    return ret;
mm/mmap.c (84 changed lines):

@@ -1418,6 +1418,13 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
        vmg.flags = vm_flags;
    }

+   /*
+    * clear PTEs while the vma is still in the tree so that rmap
+    * cannot race with the freeing later in the truncate scenario.
+    * This is also needed for call_mmap(), which is why vm_ops
+    * close function is called.
+    */
+   vms_clean_up_area(&vms, &mas_detach);
    vma = vma_merge_new_range(&vmg);
    if (vma)
        goto expanded;

@@ -1439,11 +1446,6 @@ unsigned long mmap_region(struct file *file, unsigned long addr,

    if (file) {
        vma->vm_file = get_file(file);
-       /*
-        * call_mmap() may map PTE, so ensure there are no existing PTEs
-        * and call the vm_ops close function if one exists.
-        */
-       vms_clean_up_area(&vms, &mas_detach);
        error = call_mmap(file, vma);
        if (error)
            goto unmap_and_free_vma;

@@ -1640,6 +1642,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
    unsigned long populate = 0;
    unsigned long ret = -EINVAL;
    struct file *file;
+   vm_flags_t vm_flags;

    pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/mm/remap_file_pages.rst.\n",
             current->comm, current->pid);

@@ -1656,12 +1659,60 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
    if (pgoff + (size >> PAGE_SHIFT) < pgoff)
        return ret;

-   if (mmap_write_lock_killable(mm))
+   if (mmap_read_lock_killable(mm))
        return -EINTR;

+   /*
+    * Look up VMA under read lock first so we can perform the security
+    * without holding locks (which can be problematic). We reacquire a
+    * write lock later and check nothing changed underneath us.
+    */
+   vma = vma_lookup(mm, start);
+
+   if (!vma || !(vma->vm_flags & VM_SHARED)) {
+       mmap_read_unlock(mm);
+       return -EINVAL;
+   }
+
+   prot |= vma->vm_flags & VM_READ ? PROT_READ : 0;
+   prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0;
+   prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0;
+
+   flags &= MAP_NONBLOCK;
+   flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
+   if (vma->vm_flags & VM_LOCKED)
+       flags |= MAP_LOCKED;
+
+   /* Save vm_flags used to calculate prot and flags, and recheck later. */
+   vm_flags = vma->vm_flags;
+   file = get_file(vma->vm_file);
+
+   mmap_read_unlock(mm);
+
+   /* Call outside mmap_lock to be consistent with other callers. */
+   ret = security_mmap_file(file, prot, flags);
+   if (ret) {
+       fput(file);
+       return ret;
+   }
+
+   ret = -EINVAL;
+
+   /* OK security check passed, take write lock + let it rip. */
+   if (mmap_write_lock_killable(mm)) {
+       fput(file);
+       return -EINTR;
+   }
+
    vma = vma_lookup(mm, start);

-   if (!vma || !(vma->vm_flags & VM_SHARED))
+   if (!vma)
        goto out;

+   /* Make sure things didn't change under us. */
+   if (vma->vm_flags != vm_flags)
+       goto out;
+   if (vma->vm_file != file)
+       goto out;
+
    if (start + size > vma->vm_end) {

@@ -1689,25 +1740,11 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
        goto out;
    }

-   prot |= vma->vm_flags & VM_READ ? PROT_READ : 0;
-   prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0;
-   prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0;
-
-   flags &= MAP_NONBLOCK;
-   flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
-   if (vma->vm_flags & VM_LOCKED)
-       flags |= MAP_LOCKED;
-
-   file = get_file(vma->vm_file);
-   ret = security_mmap_file(vma->vm_file, prot, flags);
-   if (ret)
-       goto out_fput;
    ret = do_mmap(vma->vm_file, start, size,
              prot, flags, 0, pgoff, &populate, NULL);
-out_fput:
-   fput(file);
out:
    mmap_write_unlock(mm);
+   fput(file);
    if (populate)
        mm_populate(ret, populate);
    if (!IS_ERR_VALUE(ret))

@@ -1754,7 +1791,8 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
    VMG_STATE(vmg, mm, vmi, addr, addr + len, flags, PHYS_PFN(addr));

    vmg.prev = vma;
-   vma_iter_next_range(vmi);
+   /* vmi is positioned at prev, which this mode expects. */
+   vmg.merge_flags = VMG_FLAG_JUST_EXPAND;

    if (vma_merge_new_range(&vmg))
        goto out;
@@ -349,7 +349,7 @@ static void __init numa_clear_kernel_node_hotplug(void)
    for_each_reserved_mem_region(mb_region) {
        int nid = memblock_get_region_node(mb_region);

-       if (nid != MAX_NUMNODES)
+       if (numa_valid_node(nid))
            node_set(nid, reserved_nodemask);
    }
@@ -2893,12 +2893,12 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
            page = __rmqueue(zone, order, migratetype, alloc_flags);

            /*
-            * If the allocation fails, allow OOM handling access
-            * to HIGHATOMIC reserves as failing now is worse than
-            * failing a high-order atomic allocation in the
-            * future.
+            * If the allocation fails, allow OOM handling and
+            * order-0 (atomic) allocs access to HIGHATOMIC
+            * reserves as failing now is worse than failing a
+            * high-order atomic allocation in the future.
             */
-           if (!page && (alloc_flags & ALLOC_OOM))
+           if (!page && (alloc_flags & (ALLOC_OOM|ALLOC_NON_BLOCK)))
                page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);

            if (!page) {
@@ -744,7 +744,8 @@ struct folio *folio_walk_start(struct folio_walk *fw,
    pud = pudp_get(pudp);
    if (pud_none(pud))
        goto not_found;
-   if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && pud_leaf(pud)) {
+   if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) &&
+       (!pud_present(pud) || pud_leaf(pud))) {
        ptl = pud_lock(vma->vm_mm, pudp);
        pud = pudp_get(pudp);

@@ -753,6 +754,10 @@ struct folio *folio_walk_start(struct folio_walk *fw,
        fw->pudp = pudp;
        fw->pud = pud;

+       /*
+        * TODO: FW_MIGRATION support for PUD migration entries
+        * once there are relevant users.
+        */
        if (!pud_present(pud) || pud_devmap(pud) || pud_special(pud)) {
            spin_unlock(ptl);
            goto not_found;

@@ -769,12 +774,13 @@ struct folio *folio_walk_start(struct folio_walk *fw,
    }

pmd_table:
-   VM_WARN_ON_ONCE(pud_leaf(*pudp));
+   VM_WARN_ON_ONCE(!pud_present(pud) || pud_leaf(pud));
    pmdp = pmd_offset(pudp, addr);
    pmd = pmdp_get_lockless(pmdp);
    if (pmd_none(pmd))
        goto not_found;
-   if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && pmd_leaf(pmd)) {
+   if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) &&
+       (!pmd_present(pmd) || pmd_leaf(pmd))) {
        ptl = pmd_lock(vma->vm_mm, pmdp);
        pmd = pmdp_get(pmdp);

@@ -786,7 +792,7 @@ struct folio *folio_walk_start(struct folio_walk *fw,
        if (pmd_none(pmd)) {
            spin_unlock(ptl);
            goto not_found;
-       } else if (!pmd_leaf(pmd)) {
+       } else if (pmd_present(pmd) && !pmd_leaf(pmd)) {
            spin_unlock(ptl);
            goto pte_table;
        } else if (pmd_present(pmd)) {

@@ -812,7 +818,7 @@ struct folio *folio_walk_start(struct folio_walk *fw,
    }

pte_table:
-   VM_WARN_ON_ONCE(pmd_leaf(pmdp_get_lockless(pmdp)));
+   VM_WARN_ON_ONCE(!pmd_present(pmd) || pmd_leaf(pmd));
    ptep = pte_offset_map_lock(vma->vm_mm, pmdp, addr, &ptl);
    if (!ptep)
        goto not_found;
@@ -1166,7 +1166,9 @@ static int shmem_getattr(struct mnt_idmap *idmap,
        stat->attributes_mask |= (STATX_ATTR_APPEND |
            STATX_ATTR_IMMUTABLE |
            STATX_ATTR_NODUMP);
+   inode_lock_shared(inode);
    generic_fillattr(idmap, request_mask, inode, stat);
+   inode_unlock_shared(inode);

    if (shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0))
        stat->blksize = HPAGE_PMD_SIZE;
@@ -1209,7 +1209,7 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags)
        /* Zero out spare memory. */
        if (want_init_on_alloc(flags)) {
            kasan_disable_current();
-           memset((void *)p + new_size, 0, ks - new_size);
+           memset(kasan_reset_tag(p) + new_size, 0, ks - new_size);
            kasan_enable_current();
        }
mm/vma.c (23 changed lines):

@@ -917,6 +917,7 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
    pgoff_t pgoff = vmg->pgoff;
    pgoff_t pglen = PHYS_PFN(end - start);
    bool can_merge_left, can_merge_right;
+   bool just_expand = vmg->merge_flags & VMG_FLAG_JUST_EXPAND;

    mmap_assert_write_locked(vmg->mm);
    VM_WARN_ON(vmg->vma);

@@ -930,7 +931,7 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
        return NULL;

    can_merge_left = can_vma_merge_left(vmg);
-   can_merge_right = can_vma_merge_right(vmg, can_merge_left);
+   can_merge_right = !just_expand && can_vma_merge_right(vmg, can_merge_left);

    /* If we can merge with the next VMA, adjust vmg accordingly. */
    if (can_merge_right) {

@@ -953,7 +954,11 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
        if (can_merge_right && !can_merge_remove_vma(next))
            vmg->end = end;

-       vma_prev(vmg->vmi); /* Equivalent to going to the previous range */
+       /* In expand-only case we are already positioned at prev. */
+       if (!just_expand) {
+           /* Equivalent to going to the previous range. */
+           vma_prev(vmg->vmi);
+       }
    }

    /*

@@ -967,12 +972,14 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
    }

    /* If expansion failed, reset state. Allows us to retry merge later. */
-   vmg->vma = NULL;
-   vmg->start = start;
-   vmg->end = end;
-   vmg->pgoff = pgoff;
-   if (vmg->vma == prev)
-       vma_iter_set(vmg->vmi, start);
+   if (!just_expand) {
+       vmg->vma = NULL;
+       vmg->start = start;
+       vmg->end = end;
+       vmg->pgoff = pgoff;
+       if (vmg->vma == prev)
+           vma_iter_set(vmg->vmi, start);
+   }

    return NULL;
}
mm/vma.h (26 changed lines):

@@ -59,6 +59,17 @@ enum vma_merge_state {
    VMA_MERGE_SUCCESS,
};

+enum vma_merge_flags {
+   VMG_FLAG_DEFAULT = 0,
+   /*
+    * If we can expand, simply do so. We know there is nothing to merge to
+    * the right. Does not reset state upon failure to merge. The VMA
+    * iterator is assumed to be positioned at the previous VMA, rather than
+    * at the gap.
+    */
+   VMG_FLAG_JUST_EXPAND = 1 << 0,
+};
+
/* Represents a VMA merge operation. */
struct vma_merge_struct {
    struct mm_struct *mm;

@@ -75,6 +86,7 @@ struct vma_merge_struct {
    struct mempolicy *policy;
    struct vm_userfaultfd_ctx uffd_ctx;
    struct anon_vma_name *anon_name;
+   enum vma_merge_flags merge_flags;
    enum vma_merge_state state;
};

@@ -99,6 +111,7 @@ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma,
        .flags = flags_,                \
        .pgoff = pgoff_,                \
        .state = VMA_MERGE_START,       \
+       .merge_flags = VMG_FLAG_DEFAULT, \
    }

#define VMG_VMA_STATE(name, vmi_, prev_, vma_, start_, end_) \

@@ -118,6 +131,7 @@ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma,
        .uffd_ctx = vma_->vm_userfaultfd_ctx,   \
        .anon_name = anon_vma_name(vma_),       \
        .state = VMA_MERGE_START,               \
+       .merge_flags = VMG_FLAG_DEFAULT,        \
    }

#ifdef CONFIG_DEBUG_VM_MAPLE_TREE

@@ -241,15 +255,9 @@ static inline void vms_abort_munmap_vmas(struct vma_munmap_struct *vms,
     * failure method of leaving a gap where the MAP_FIXED mapping failed.
     */
    mas_set_range(mas, vms->start, vms->end - 1);
-   if (unlikely(mas_store_gfp(mas, NULL, GFP_KERNEL))) {
-       pr_warn_once("%s: (%d) Unable to abort munmap() operation\n",
-                current->comm, current->pid);
-       /* Leaving vmas detached and in-tree may hamper recovery */
-       reattach_vmas(mas_detach);
-   } else {
-       /* Clean up the insertion of the unfortunate gap */
-       vms_complete_munmap_vmas(vms, mas_detach);
-   }
+   mas_store_gfp(mas, NULL, GFP_KERNEL|__GFP_NOFAIL);
+   /* Clean up the insertion of the unfortunate gap */
+   vms_complete_munmap_vmas(vms, mas_detach);
}

int
@@ -320,7 +320,7 @@ u32 bpf_cpumask_weight(const struct cpumask *cpumask) __ksym;
/*
 * Access a cpumask in read-only mode (typically to check bits).
 */
-const struct cpumask *cast_mask(struct bpf_cpumask *mask)
+static __always_inline const struct cpumask *cast_mask(struct bpf_cpumask *mask)
{
    return (const struct cpumask *)mask;
}
@@ -18,7 +18,7 @@ bool test_uffdio_wp = true;
unsigned long long *count_verify;
uffd_test_ops_t *uffd_test_ops;
uffd_test_case_ops_t *uffd_test_case_ops;
-pthread_barrier_t ready_for_fork;
+atomic_bool ready_for_fork;

static int uffd_mem_fd_create(off_t mem_size, bool hugetlb)
{

@@ -519,8 +519,7 @@ void *uffd_poll_thread(void *arg)
    pollfd[1].fd = pipefd[cpu*2];
    pollfd[1].events = POLLIN;

-   /* Ready for parent thread to fork */
-   pthread_barrier_wait(&ready_for_fork);
+   ready_for_fork = true;

    for (;;) {
        ret = poll(pollfd, 2, -1);
@@ -33,6 +33,7 @@
#include <inttypes.h>
#include <stdint.h>
#include <sys/random.h>
+#include <stdatomic.h>

#include "../kselftest.h"
#include "vm_util.h"

@@ -104,7 +105,7 @@ extern bool map_shared;
extern bool test_uffdio_wp;
extern unsigned long long *count_verify;
extern volatile bool test_uffdio_copy_eexist;
-extern pthread_barrier_t ready_for_fork;
+extern atomic_bool ready_for_fork;

extern uffd_test_ops_t anon_uffd_test_ops;
extern uffd_test_ops_t shmem_uffd_test_ops;
@@ -241,8 +241,7 @@ static void *fork_event_consumer(void *data)
    fork_event_args *args = data;
    struct uffd_msg msg = { 0 };

-   /* Ready for parent thread to fork */
-   pthread_barrier_wait(&ready_for_fork);
+   ready_for_fork = true;

    /* Read until a full msg received */
    while (uffd_read_msg(args->parent_uffd, &msg));

@@ -311,12 +310,11 @@ static int pagemap_test_fork(int uffd, bool with_event, bool test_pin)

    /* Prepare a thread to resolve EVENT_FORK */
    if (with_event) {
-       pthread_barrier_init(&ready_for_fork, NULL, 2);
+       ready_for_fork = false;
        if (pthread_create(&thread, NULL, fork_event_consumer, &args))
            err("pthread_create()");
-       /* Wait for child thread to start before forking */
-       pthread_barrier_wait(&ready_for_fork);
-       pthread_barrier_destroy(&ready_for_fork);
+       while (!ready_for_fork)
+           ; /* Wait for the poll_thread to start executing before forking */
    }

    child = fork();

@@ -781,7 +779,7 @@ static void uffd_sigbus_test_common(bool wp)
    char c;
    struct uffd_args args = { 0 };

-   pthread_barrier_init(&ready_for_fork, NULL, 2);
+   ready_for_fork = false;

    fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);

@@ -798,9 +796,8 @@ static void uffd_sigbus_test_common(bool wp)
    if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args))
        err("uffd_poll_thread create");

-   /* Wait for child thread to start before forking */
-   pthread_barrier_wait(&ready_for_fork);
-   pthread_barrier_destroy(&ready_for_fork);
+   while (!ready_for_fork)
+       ; /* Wait for the poll_thread to start executing before forking */

    pid = fork();
    if (pid < 0)

@@ -841,7 +838,7 @@ static void uffd_events_test_common(bool wp)
    char c;
    struct uffd_args args = { 0 };

-   pthread_barrier_init(&ready_for_fork, NULL, 2);
+   ready_for_fork = false;

    fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
    if (uffd_register(uffd, area_dst, nr_pages * page_size,

@@ -852,9 +849,8 @@ static void uffd_events_test_common(bool wp)
    if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args))
        err("uffd_poll_thread create");

-   /* Wait for child thread to start before forking */
-   pthread_barrier_wait(&ready_for_fork);
-   pthread_barrier_destroy(&ready_for_fork);
+   while (!ready_for_fork)
+       ; /* Wait for the poll_thread to start executing before forking */

    pid = fork();
    if (pid < 0)
@@ -184,7 +184,7 @@ auto-test-targets := \

testcase-targets := $(addsuffix .o,$(addprefix $(SCXOBJ_DIR)/,$(auto-test-targets)))

-$(SCXOBJ_DIR)/runner.o: runner.c | $(SCXOBJ_DIR)
+$(SCXOBJ_DIR)/runner.o: runner.c | $(SCXOBJ_DIR) $(BPFOBJ)
	$(CC) $(CFLAGS) -c $< -o $@

# Create all of the test targets object files, whose testcase objects will be
@ -51,8 +51,8 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(create_dsq_init)
|
|||
|
||||
SEC(".struct_ops.link")
|
||||
struct sched_ext_ops create_dsq_ops = {
|
||||
.init_task = create_dsq_init_task,
|
||||
.exit_task = create_dsq_exit_task,
|
||||
.init = create_dsq_init,
|
||||
.init_task = (void *) create_dsq_init_task,
|
||||
.exit_task = (void *) create_dsq_exit_task,
|
||||
.init = (void *) create_dsq_init,
|
||||
.name = "create_dsq",
|
||||
};
|
||||
|
|
|
@@ -35,8 +35,8 @@ void BPF_STRUCT_OPS(ddsp_bogus_dsq_fail_exit, struct scx_exit_info *ei)
 
 SEC(".struct_ops.link")
 struct sched_ext_ops ddsp_bogus_dsq_fail_ops = {
-	.select_cpu		= ddsp_bogus_dsq_fail_select_cpu,
-	.exit			= ddsp_bogus_dsq_fail_exit,
+	.select_cpu		= (void *) ddsp_bogus_dsq_fail_select_cpu,
+	.exit			= (void *) ddsp_bogus_dsq_fail_exit,
 	.name			= "ddsp_bogus_dsq_fail",
 	.timeout_ms		= 1000U,
 };
@@ -32,8 +32,8 @@ void BPF_STRUCT_OPS(ddsp_vtimelocal_fail_exit, struct scx_exit_info *ei)
 
 SEC(".struct_ops.link")
 struct sched_ext_ops ddsp_vtimelocal_fail_ops = {
-	.select_cpu		= ddsp_vtimelocal_fail_select_cpu,
-	.exit			= ddsp_vtimelocal_fail_exit,
+	.select_cpu		= (void *) ddsp_vtimelocal_fail_select_cpu,
+	.exit			= (void *) ddsp_vtimelocal_fail_exit,
 	.name			= "ddsp_vtimelocal_fail",
 	.timeout_ms		= 1000U,
 };
@@ -56,10 +56,10 @@ void BPF_STRUCT_OPS(dsp_local_on_exit, struct scx_exit_info *ei)
 
 SEC(".struct_ops.link")
 struct sched_ext_ops dsp_local_on_ops = {
-	.select_cpu		= dsp_local_on_select_cpu,
-	.enqueue		= dsp_local_on_enqueue,
-	.dispatch		= dsp_local_on_dispatch,
-	.exit			= dsp_local_on_exit,
+	.select_cpu		= (void *) dsp_local_on_select_cpu,
+	.enqueue		= (void *) dsp_local_on_enqueue,
+	.dispatch		= (void *) dsp_local_on_dispatch,
+	.exit			= (void *) dsp_local_on_exit,
 	.name			= "dsp_local_on",
 	.timeout_ms		= 1000U,
 };
@@ -12,10 +12,18 @@
 
 char _license[] SEC("license") = "GPL";
 
+u32 exit_kind;
+
+void BPF_STRUCT_OPS_SLEEPABLE(enq_last_no_enq_fails_exit, struct scx_exit_info *info)
+{
+	exit_kind = info->kind;
+}
+
 SEC(".struct_ops.link")
 struct sched_ext_ops enq_last_no_enq_fails_ops = {
 	.name			= "enq_last_no_enq_fails",
 	/* Need to define ops.enqueue() with SCX_OPS_ENQ_LAST */
 	.flags			= SCX_OPS_ENQ_LAST,
+	.exit			= (void *) enq_last_no_enq_fails_exit,
 	.timeout_ms		= 1000U,
 };
@@ -31,8 +31,12 @@ static enum scx_test_status run(void *ctx)
 	struct bpf_link *link;
 
 	link = bpf_map__attach_struct_ops(skel->maps.enq_last_no_enq_fails_ops);
-	if (link) {
-		SCX_ERR("Incorrectly succeeded in to attaching scheduler");
+	if (!link) {
+		SCX_ERR("Incorrectly failed at attaching scheduler");
 		return SCX_TEST_FAIL;
 	}
+	if (!skel->bss->exit_kind) {
+		SCX_ERR("Incorrectly stayed loaded");
+		return SCX_TEST_FAIL;
+	}
 
@@ -50,7 +54,7 @@ static void cleanup(void *ctx)
 
 struct scx_test enq_last_no_enq_fails = {
 	.name = "enq_last_no_enq_fails",
-	.description = "Verify we fail to load a scheduler if we specify "
+	.description = "Verify we eject a scheduler if we specify "
 		       "the SCX_OPS_ENQ_LAST flag without defining "
 		       "ops.enqueue()",
 	.setup = setup,
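Taken together, the two enq_last_no_enq_fails hunks flip the expectation: attaching now succeeds and the kernel is expected to eject the scheduler afterwards, which the test detects by having the BPF exit callback record the exit kind in a global and reading it back through the skeleton. Roughly (illustrative fragment, not the exact test code):

	/* BPF side: remember why the scheduler was unloaded. */
	u32 exit_kind;

	void BPF_STRUCT_OPS_SLEEPABLE(demo_exit, struct scx_exit_info *info)
	{
		exit_kind = info->kind;
	}

	/* User side: attach must work, then the recorded exit kind must be set. */
	link = bpf_map__attach_struct_ops(skel->maps.demo_ops);
	if (!link)
		return SCX_TEST_FAIL;
	if (!skel->bss->exit_kind)
		return SCX_TEST_FAIL;
	bpf_link__destroy(link);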
@@ -36,8 +36,8 @@ void BPF_STRUCT_OPS(enq_select_cpu_fails_enqueue, struct task_struct *p,
 
 SEC(".struct_ops.link")
 struct sched_ext_ops enq_select_cpu_fails_ops = {
-	.select_cpu		= enq_select_cpu_fails_select_cpu,
-	.enqueue		= enq_select_cpu_fails_enqueue,
+	.select_cpu		= (void *) enq_select_cpu_fails_select_cpu,
+	.enqueue		= (void *) enq_select_cpu_fails_enqueue,
 	.name			= "enq_select_cpu_fails",
 	.timeout_ms		= 1000U,
 };
@@ -15,6 +15,8 @@ UEI_DEFINE(uei);
 
 #define EXIT_CLEANLY() scx_bpf_exit(exit_point, "%d", exit_point)
 
+#define DSQ_ID 0
+
 s32 BPF_STRUCT_OPS(exit_select_cpu, struct task_struct *p,
 		   s32 prev_cpu, u64 wake_flags)
 {
 
@@ -31,7 +33,7 @@ void BPF_STRUCT_OPS(exit_enqueue, struct task_struct *p, u64 enq_flags)
 	if (exit_point == EXIT_ENQUEUE)
 		EXIT_CLEANLY();
 
-	scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
+	scx_bpf_dispatch(p, DSQ_ID, SCX_SLICE_DFL, enq_flags);
 }
 
 void BPF_STRUCT_OPS(exit_dispatch, s32 cpu, struct task_struct *p)
 
@@ -39,7 +41,7 @@ void BPF_STRUCT_OPS(exit_dispatch, s32 cpu, struct task_struct *p)
 	if (exit_point == EXIT_DISPATCH)
 		EXIT_CLEANLY();
 
-	scx_bpf_consume(SCX_DSQ_GLOBAL);
+	scx_bpf_consume(DSQ_ID);
 }
 
 void BPF_STRUCT_OPS(exit_enable, struct task_struct *p)
 
@@ -67,18 +69,18 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(exit_init)
 	if (exit_point == EXIT_INIT)
 		EXIT_CLEANLY();
 
-	return 0;
+	return scx_bpf_create_dsq(DSQ_ID, -1);
 }
 
 SEC(".struct_ops.link")
 struct sched_ext_ops exit_ops = {
-	.select_cpu		= exit_select_cpu,
-	.enqueue		= exit_enqueue,
-	.dispatch		= exit_dispatch,
-	.init_task		= exit_init_task,
-	.enable			= exit_enable,
-	.exit			= exit_exit,
-	.init			= exit_init,
+	.select_cpu		= (void *) exit_select_cpu,
+	.enqueue		= (void *) exit_enqueue,
+	.dispatch		= (void *) exit_dispatch,
+	.init_task		= (void *) exit_init_task,
+	.enable			= (void *) exit_enable,
+	.exit			= (void *) exit_exit,
+	.init			= (void *) exit_init,
 	.name			= "exit",
 	.timeout_ms		= 1000U,
 };
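The exit test now creates its own dispatch queue from ops.init and uses it instead of SCX_DSQ_GLOBAL. The essential shape of that change, condensed from the four hunks above (hypothetical callback names):

	#define DSQ_ID 0

	s32 BPF_STRUCT_OPS_SLEEPABLE(demo_init)
	{
		/* -1 means no NUMA node preference; the DSQ lives for the
		 * lifetime of the scheduler. */
		return scx_bpf_create_dsq(DSQ_ID, -1);
	}

	void BPF_STRUCT_OPS(demo_enqueue, struct task_struct *p, u64 enq_flags)
	{
		scx_bpf_dispatch(p, DSQ_ID, SCX_SLICE_DFL, enq_flags);
	}

	void BPF_STRUCT_OPS(demo_dispatch, s32 cpu, struct task_struct *prev)
	{
		scx_bpf_consume(DSQ_ID);	/* pull work from the private DSQ */
	}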
@@ -46,16 +46,16 @@ void BPF_STRUCT_OPS_SLEEPABLE(hotplug_cpu_offline, s32 cpu)
 
 SEC(".struct_ops.link")
 struct sched_ext_ops hotplug_cb_ops = {
-	.cpu_online		= hotplug_cpu_online,
-	.cpu_offline		= hotplug_cpu_offline,
-	.exit			= hotplug_exit,
+	.cpu_online		= (void *) hotplug_cpu_online,
+	.cpu_offline		= (void *) hotplug_cpu_offline,
+	.exit			= (void *) hotplug_exit,
 	.name			= "hotplug_cbs",
 	.timeout_ms		= 1000U,
 };
 
 SEC(".struct_ops.link")
 struct sched_ext_ops hotplug_nocb_ops = {
-	.exit			= hotplug_exit,
+	.exit			= (void *) hotplug_exit,
 	.name			= "hotplug_nocbs",
 	.timeout_ms		= 1000U,
 };
@@ -45,9 +45,9 @@ void BPF_STRUCT_OPS(cnt_disable, struct task_struct *p)
 
 SEC(".struct_ops.link")
 struct sched_ext_ops init_enable_count_ops = {
-	.init_task	= cnt_init_task,
-	.exit_task	= cnt_exit_task,
-	.enable		= cnt_enable,
-	.disable	= cnt_disable,
+	.init_task	= (void *) cnt_init_task,
+	.exit_task	= (void *) cnt_exit_task,
+	.enable		= (void *) cnt_enable,
+	.disable	= (void *) cnt_disable,
 	.name		= "init_enable_count",
 };
@@ -131,34 +131,34 @@ void BPF_STRUCT_OPS(maximal_exit, struct scx_exit_info *info)
 
 SEC(".struct_ops.link")
 struct sched_ext_ops maximal_ops = {
-	.select_cpu		= maximal_select_cpu,
-	.enqueue		= maximal_enqueue,
-	.dequeue		= maximal_dequeue,
-	.dispatch		= maximal_dispatch,
-	.runnable		= maximal_runnable,
-	.running		= maximal_running,
-	.stopping		= maximal_stopping,
-	.quiescent		= maximal_quiescent,
-	.yield			= maximal_yield,
-	.core_sched_before	= maximal_core_sched_before,
-	.set_weight		= maximal_set_weight,
-	.set_cpumask		= maximal_set_cpumask,
-	.update_idle		= maximal_update_idle,
-	.cpu_acquire		= maximal_cpu_acquire,
-	.cpu_release		= maximal_cpu_release,
-	.cpu_online		= maximal_cpu_online,
-	.cpu_offline		= maximal_cpu_offline,
-	.init_task		= maximal_init_task,
-	.enable			= maximal_enable,
-	.exit_task		= maximal_exit_task,
-	.disable		= maximal_disable,
-	.cgroup_init		= maximal_cgroup_init,
-	.cgroup_exit		= maximal_cgroup_exit,
-	.cgroup_prep_move	= maximal_cgroup_prep_move,
-	.cgroup_move		= maximal_cgroup_move,
-	.cgroup_cancel_move	= maximal_cgroup_cancel_move,
-	.cgroup_set_weight	= maximal_cgroup_set_weight,
-	.init			= maximal_init,
-	.exit			= maximal_exit,
+	.select_cpu		= (void *) maximal_select_cpu,
+	.enqueue		= (void *) maximal_enqueue,
+	.dequeue		= (void *) maximal_dequeue,
+	.dispatch		= (void *) maximal_dispatch,
+	.runnable		= (void *) maximal_runnable,
+	.running		= (void *) maximal_running,
+	.stopping		= (void *) maximal_stopping,
+	.quiescent		= (void *) maximal_quiescent,
+	.yield			= (void *) maximal_yield,
+	.core_sched_before	= (void *) maximal_core_sched_before,
+	.set_weight		= (void *) maximal_set_weight,
+	.set_cpumask		= (void *) maximal_set_cpumask,
+	.update_idle		= (void *) maximal_update_idle,
+	.cpu_acquire		= (void *) maximal_cpu_acquire,
+	.cpu_release		= (void *) maximal_cpu_release,
+	.cpu_online		= (void *) maximal_cpu_online,
+	.cpu_offline		= (void *) maximal_cpu_offline,
+	.init_task		= (void *) maximal_init_task,
+	.enable			= (void *) maximal_enable,
+	.exit_task		= (void *) maximal_exit_task,
+	.disable		= (void *) maximal_disable,
+	.cgroup_init		= (void *) maximal_cgroup_init,
+	.cgroup_exit		= (void *) maximal_cgroup_exit,
+	.cgroup_prep_move	= (void *) maximal_cgroup_prep_move,
+	.cgroup_move		= (void *) maximal_cgroup_move,
+	.cgroup_cancel_move	= (void *) maximal_cgroup_cancel_move,
+	.cgroup_set_weight	= (void *) maximal_cgroup_set_weight,
+	.init			= (void *) maximal_init,
+	.exit			= (void *) maximal_exit,
 	.name			= "maximal",
 };
@@ -29,8 +29,8 @@ bool BPF_STRUCT_OPS(maybe_null_success_yield, struct task_struct *from,
 
 SEC(".struct_ops.link")
 struct sched_ext_ops maybe_null_success = {
-	.dispatch		= maybe_null_success_dispatch,
-	.yield			= maybe_null_success_yield,
-	.enable			= maybe_null_running,
+	.dispatch		= (void *) maybe_null_success_dispatch,
+	.yield			= (void *) maybe_null_success_yield,
+	.enable			= (void *) maybe_null_running,
 	.name			= "minimal",
 };
@@ -19,7 +19,7 @@ void BPF_STRUCT_OPS(maybe_null_fail_dispatch, s32 cpu, struct task_struct *p)
 
 SEC(".struct_ops.link")
 struct sched_ext_ops maybe_null_fail = {
-	.dispatch		= maybe_null_fail_dispatch,
-	.enable			= maybe_null_running,
+	.dispatch		= (void *) maybe_null_fail_dispatch,
+	.enable			= (void *) maybe_null_running,
 	.name			= "maybe_null_fail_dispatch",
 };
@@ -22,7 +22,7 @@ bool BPF_STRUCT_OPS(maybe_null_fail_yield, struct task_struct *from,
 
 SEC(".struct_ops.link")
 struct sched_ext_ops maybe_null_fail = {
-	.yield			= maybe_null_fail_yield,
-	.enable			= maybe_null_running,
+	.yield			= (void *) maybe_null_fail_yield,
+	.enable			= (void *) maybe_null_running,
 	.name			= "maybe_null_fail_yield",
 };
@@ -28,6 +28,6 @@ void BPF_STRUCT_OPS(prog_run_exit, struct scx_exit_info *ei)
 
 SEC(".struct_ops.link")
 struct sched_ext_ops prog_run_ops = {
-	.exit			= prog_run_exit,
+	.exit			= (void *) prog_run_exit,
 	.name			= "prog_run",
 };
@@ -35,6 +35,6 @@ void BPF_STRUCT_OPS(select_cpu_dfl_enqueue, struct task_struct *p,
 
 SEC(".struct_ops.link")
 struct sched_ext_ops select_cpu_dfl_ops = {
-	.enqueue		= select_cpu_dfl_enqueue,
+	.enqueue		= (void *) select_cpu_dfl_enqueue,
 	.name			= "select_cpu_dfl",
 };
@@ -82,8 +82,8 @@ s32 BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_init_task,
 
 SEC(".struct_ops.link")
 struct sched_ext_ops select_cpu_dfl_nodispatch_ops = {
-	.select_cpu		= select_cpu_dfl_nodispatch_select_cpu,
-	.enqueue		= select_cpu_dfl_nodispatch_enqueue,
-	.init_task		= select_cpu_dfl_nodispatch_init_task,
+	.select_cpu		= (void *) select_cpu_dfl_nodispatch_select_cpu,
+	.enqueue		= (void *) select_cpu_dfl_nodispatch_enqueue,
+	.init_task		= (void *) select_cpu_dfl_nodispatch_init_task,
 	.name			= "select_cpu_dfl_nodispatch",
 };
@@ -35,7 +35,7 @@ s32 BPF_STRUCT_OPS(select_cpu_dispatch_select_cpu, struct task_struct *p,
 
 SEC(".struct_ops.link")
 struct sched_ext_ops select_cpu_dispatch_ops = {
-	.select_cpu		= select_cpu_dispatch_select_cpu,
+	.select_cpu		= (void *) select_cpu_dispatch_select_cpu,
 	.name			= "select_cpu_dispatch",
 	.timeout_ms		= 1000U,
 };
@@ -30,8 +30,8 @@ void BPF_STRUCT_OPS(select_cpu_dispatch_bad_dsq_exit, struct scx_exit_info *ei)
 
 SEC(".struct_ops.link")
 struct sched_ext_ops select_cpu_dispatch_bad_dsq_ops = {
-	.select_cpu		= select_cpu_dispatch_bad_dsq_select_cpu,
-	.exit			= select_cpu_dispatch_bad_dsq_exit,
+	.select_cpu		= (void *) select_cpu_dispatch_bad_dsq_select_cpu,
+	.exit			= (void *) select_cpu_dispatch_bad_dsq_exit,
 	.name			= "select_cpu_dispatch_bad_dsq",
 	.timeout_ms		= 1000U,
 };
@@ -31,8 +31,8 @@ void BPF_STRUCT_OPS(select_cpu_dispatch_dbl_dsp_exit, struct scx_exit_info *ei)
 
 SEC(".struct_ops.link")
 struct sched_ext_ops select_cpu_dispatch_dbl_dsp_ops = {
-	.select_cpu		= select_cpu_dispatch_dbl_dsp_select_cpu,
-	.exit			= select_cpu_dispatch_dbl_dsp_exit,
+	.select_cpu		= (void *) select_cpu_dispatch_dbl_dsp_select_cpu,
+	.exit			= (void *) select_cpu_dispatch_dbl_dsp_exit,
 	.name			= "select_cpu_dispatch_dbl_dsp",
 	.timeout_ms		= 1000U,
 };
@@ -81,12 +81,12 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(select_cpu_vtime_init)
 
 SEC(".struct_ops.link")
 struct sched_ext_ops select_cpu_vtime_ops = {
-	.select_cpu		= select_cpu_vtime_select_cpu,
-	.dispatch		= select_cpu_vtime_dispatch,
-	.running		= select_cpu_vtime_running,
-	.stopping		= select_cpu_vtime_stopping,
-	.enable			= select_cpu_vtime_enable,
-	.init			= select_cpu_vtime_init,
+	.select_cpu		= (void *) select_cpu_vtime_select_cpu,
+	.dispatch		= (void *) select_cpu_vtime_dispatch,
+	.running		= (void *) select_cpu_vtime_running,
+	.stopping		= (void *) select_cpu_vtime_stopping,
+	.enable			= (void *) select_cpu_vtime_enable,
+	.init			= (void *) select_cpu_vtime_init,
 	.name			= "select_cpu_vtime",
 	.timeout_ms		= 1000U,
 };
@@ -1522,6 +1522,45 @@ static bool test_copy_vma(void)
 	return true;
 }
 
+static bool test_expand_only_mode(void)
+{
+	unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
+	struct mm_struct mm = {};
+	VMA_ITERATOR(vmi, &mm, 0);
+	struct vm_area_struct *vma_prev, *vma;
+	VMG_STATE(vmg, &mm, &vmi, 0x5000, 0x9000, flags, 5);
+
+	/*
+	 * Place a VMA prior to the one we're expanding so we assert that we do
+	 * not erroneously try to traverse to the previous VMA even though we
+	 * have, through the use of VMG_FLAG_JUST_EXPAND, indicated we do not
+	 * need to do so.
+	 */
+	alloc_and_link_vma(&mm, 0, 0x2000, 0, flags);
+
+	/*
+	 * We will be positioned at the prev VMA, but looking to expand to
+	 * 0x9000.
+	 */
+	vma_iter_set(&vmi, 0x3000);
+	vma_prev = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags);
+	vmg.prev = vma_prev;
+	vmg.merge_flags = VMG_FLAG_JUST_EXPAND;
+
+	vma = vma_merge_new_range(&vmg);
+	ASSERT_NE(vma, NULL);
+	ASSERT_EQ(vma, vma_prev);
+	ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
+	ASSERT_EQ(vma->vm_start, 0x3000);
+	ASSERT_EQ(vma->vm_end, 0x9000);
+	ASSERT_EQ(vma->vm_pgoff, 3);
+	ASSERT_TRUE(vma_write_started(vma));
+	ASSERT_EQ(vma_iter_addr(&vmi), 0x3000);
+
+	cleanup_mm(&mm, &vmi);
+	return true;
+}
+
 int main(void)
 {
 	int num_tests = 0, num_fail = 0;
@@ -1553,6 +1592,7 @@ int main(void)
 	TEST(vmi_prealloc_fail);
 	TEST(merge_extend);
 	TEST(copy_vma);
+	TEST(expand_only_mode);
 
 #undef TEST
 