Mirror of https://github.com/torvalds/linux.git (synced 2024-11-01 04:53:36 +01:00)
Merge tag 'mm-hotfixes-stable-2024-10-28-21-50' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Pull misc fixes from Andrew Morton:
 "21 hotfixes. 13 are cc:stable. 13 are MM and 8 are non-MM.

  No particular theme here - mainly singletons, a couple of doubletons.
  Please see the changelogs"

* tag 'mm-hotfixes-stable-2024-10-28-21-50' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (21 commits)
  mm: avoid unconditional one-tick sleep when swapcache_prepare fails
  mseal: update mseal.rst
  mm: split critical region in remap_file_pages() and invoke LSMs in between
  selftests/mm: fix deadlock for fork after pthread_create with atomic_bool
  Revert "selftests/mm: replace atomic_bool with pthread_barrier_t"
  Revert "selftests/mm: fix deadlock for fork after pthread_create on ARM"
  tools: testing: add expand-only mode VMA test
  mm/vma: add expand-only VMA merge mode and optimise do_brk_flags()
  resource,kexec: walk_system_ram_res_rev must retain resource flags
  nilfs2: fix kernel bug due to missing clearing of checked flag
  mm: numa_clear_kernel_node_hotplug: Add NUMA_NO_NODE check for node id
  ocfs2: pass u64 to ocfs2_truncate_inline maybe overflow
  mm: shmem: fix data-race in shmem_getattr()
  mm: mark mas allocation in vms_abort_munmap_vmas as __GFP_NOFAIL
  x86/traps: move kmsan check after instrumentation_begin
  resource: remove dependency on SPARSEMEM from GET_FREE_REGION
  mm/mmap: fix race in mmap_region() with ftruncate()
  mm/page_alloc: let GFP_ATOMIC order-0 allocs access highatomic reserves
  fork: only invoke khugepaged, ksm hooks if no error
  fork: do not invoke uffd on fork if error occurs
  ...
commit 9251e3e93c

22 changed files with 363 additions and 229 deletions
@@ -23,177 +23,166 @@ applications can additionally seal security critical data at runtime.

A similar feature already exists in the XNU kernel with the
VM_FLAGS_PERMANENT flag [1] and on OpenBSD with the mimmutable syscall [2].

SYSCALL
=======
mseal syscall signature
-----------------------
``int mseal(void \* addr, size_t len, unsigned long flags)``

**addr**/**len**: virtual memory address range.

The address range set by **addr**/**len** must meet:

- The start address must be in an allocated VMA.
- The start address must be page aligned.
- The end address (**addr** + **len**) must be in an allocated VMA.
- no gap (unallocated memory) between start and end address.

The ``len`` will be page aligned implicitly by the kernel.

**flags**: reserved for future use.

**Return values**:

- **0**: Success.

- **-EINVAL**:

  * Invalid input ``flags``.
  * The start address (``addr``) is not page aligned.
  * Address range (``addr`` + ``len``) overflow.

- **-ENOMEM**:

  * The start address (``addr``) is not allocated.
  * The end address (``addr`` + ``len``) is not allocated.
  * A gap (unallocated memory) between start and end address.

- **-EPERM**:

  * sealing is supported only on 64-bit CPUs, 32-bit is not supported.

**Note about error return**:

- For the above error cases, users can expect the given memory range to be
  unmodified, i.e. no partial update.
- There might be other internal errors/cases not listed here, e.g. an
  error during merging/splitting VMAs, or the process reaching the maximum
  number of supported VMAs. In those cases, partial updates to the given
  memory range could happen. However, those cases should be rare.
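As an illustration of the signature and error cases above, here is a minimal
userspace sketch. It is not part of the kernel documentation; it assumes a
recent header set that defines ``__NR_mseal`` (there may be no dedicated glibc
wrapper yet) and calls the syscall via syscall(2)::

    /* Illustrative only: exercise mseal() and the documented errors. */
    #include <assert.h>
    #include <errno.h>
    #include <sys/mman.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static int my_mseal(void *addr, size_t len, unsigned long flags)
    {
            return syscall(__NR_mseal, addr, len, flags);
    }

    int main(void)
    {
            long page = sysconf(_SC_PAGESIZE);
            void *ptr = mmap(NULL, page, PROT_READ,
                             MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);

            assert(ptr != MAP_FAILED);

            /* Success: the range lies in one allocated, page-aligned VMA. */
            assert(my_mseal(ptr, page, 0) == 0);

            /* Reserved flags must be zero: rejected with EINVAL. */
            assert(my_mseal(ptr, page, ~0UL) == -1 && errno == EINVAL);

            /* Unaligned start address: rejected with EINVAL. */
            assert(my_mseal((char *)ptr + 1, page - 1, 0) == -1 &&
                   errno == EINVAL);

            return 0;
    }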
**Architecture support**:
mseal only works on 64-bit CPUs, not 32-bit CPUs.

**Idempotent**:
users can call mseal multiple times. mseal on already sealed memory
is a no-action (not an error).

**no munseal**
Once a mapping is sealed, it can't be unsealed. The kernel should never
have munseal; this is consistent with other sealing features, e.g.
F_SEAL_SEAL for file.

Blocked mm syscall for sealed mapping
-------------------------------------
It might be important to note: **once the mapping is sealed, it will
stay in the process's memory until the process terminates**.

Example::

        ptr = mmap(0, 4096, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
        rc = mseal(ptr, 4096, 0);
        /* munmap will fail */
        rc = munmap(ptr, 4096);
        assert(rc < 0);
Blocked mm syscall:

- munmap
- mmap
- mremap
- mprotect and pkey_mprotect
- some destructive madvise behaviors: MADV_DONTNEED, MADV_FREE,
  MADV_DONTNEED_LOCKED, MADV_DONTFORK, MADV_WIPEONFORK

**Blocked operations after sealing**:

Unmapping, moving to another location, and shrinking the size,
via munmap() and mremap(), can leave an empty space, which can
therefore be replaced with a VMA with a new set of attributes.
The first set of syscalls to block is munmap, mremap, mmap. They can
either leave an empty space in the address space, therefore allowing
replacement with a new mapping with a new set of attributes, or can
overwrite the existing mapping with another mapping.

Moving or expanding a different VMA into the current location,
via mremap().
mprotect and pkey_mprotect are blocked because they change the
protection bits (RWX) of the mapping.

Modifying a VMA via mmap(MAP_FIXED).
Certain destructive madvise behaviors, specifically MADV_DONTNEED,
MADV_FREE, MADV_DONTNEED_LOCKED, and MADV_WIPEONFORK, can introduce
risks when applied to anonymous memory by threads lacking write
permissions. Consequently, these operations are prohibited under such
conditions. The aforementioned behaviors have the potential to modify
region contents by discarding pages, effectively performing a memset(0)
operation on the anonymous memory.

Size expansion, via mremap(), does not appear to pose any
specific risks to sealed VMAs. It is included anyway because
the use case is unclear. In any case, users can rely on
merging to expand a sealed VMA.
Kernel will return -EPERM for blocked syscalls.

mprotect() and pkey_mprotect().
When a blocked syscall returns -EPERM due to sealing, the memory regions may
or may not be changed, depending on the syscall being blocked:

Some destructive madvise() behaviors (e.g. MADV_DONTNEED)
for anonymous memory, when users don't have write permission to the
memory. Those behaviors can alter region contents by discarding pages,
effectively a memset(0) for anonymous memory.

- munmap: munmap is atomic. If one of the VMAs in the given range is
  sealed, none of the VMAs are updated.
- mprotect, pkey_mprotect, madvise: partial update might happen, e.g.
  when mprotect spans multiple VMAs, mprotect might update the beginning
  VMAs before reaching the sealed VMA and return -EPERM.
- mmap and mremap: undefined behavior.

Kernel will return -EPERM for blocked operations.

For blocked operations, one can expect the given address is unmodified,
i.e. no partial update. Note, this is different from existing mm
system call behaviors, where partial updates are made till an error is
found and returned to userspace. To give an example (see the sketch after
this section):

Assume the following code sequence:

- ptr = mmap(NULL, 8192, PROT_NONE);
- munmap(ptr + 4096, 4096);
- ret1 = mprotect(ptr, 8192, PROT_READ);
- mseal(ptr, 4096);
- ret2 = mprotect(ptr, 8192, PROT_NONE);

ret1 will be -ENOMEM, the page at ptr is updated to PROT_READ.

ret2 will be -EPERM, the page remains PROT_READ.
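The same sequence, written out as a compilable sketch. This is illustrative
only (not part of the kernel documentation); it assumes ``__NR_mseal`` from
<sys/syscall.h> and a 4096-byte page size::

    /* Sketch of the atomicity example above. */
    #include <assert.h>
    #include <errno.h>
    #include <sys/mman.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    void atomicity_example(void)
    {
            char *ptr = mmap(NULL, 8192, PROT_NONE,
                             MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);

            assert(ptr != MAP_FAILED);

            /* Punch a hole: the second page is now unallocated. */
            munmap(ptr + 4096, 4096);

            /* Existing mm behavior: partial update. The first page becomes
             * PROT_READ, then ENOMEM is reported for the hole. */
            assert(mprotect(ptr, 8192, PROT_READ) == -1 && errno == ENOMEM);

            /* Seal the first page. */
            assert(syscall(__NR_mseal, ptr, 4096, 0) == 0);

            /* Sealing is checked up front: nothing is changed, EPERM is
             * returned, and the first page stays PROT_READ. */
            assert(mprotect(ptr, 8192, PROT_NONE) == -1 && errno == EPERM);
    }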
**Note**:

- mseal() only works on 64-bit CPUs, not 32-bit CPUs.

- users can call mseal() multiple times; mseal() on already sealed memory
  is a no-action (not an error).

- munseal() is not supported.

Use cases
=========
- glibc:
  The dynamic linker, during loading ELF executables, can apply sealing to
  non-writable mapping segments (a sketch follows this list).

- Chrome browser: protect some security sensitive data structures.
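To make the glibc use case concrete, here is a minimal sketch of sealing a
read-only file mapping, roughly in the spirit of what a loader could do. It is
illustrative only, not how the real dynamic linker is implemented, and it
assumes ``__NR_mseal`` is available::

    /* Minimal sketch: map a file read-only and seal the mapping, so its
     * protection can never change and it can never be unmapped before exit.
     */
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <sys/stat.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int map_and_seal(const char *path)
    {
            struct stat st;
            int fd = open(path, O_RDONLY);
            void *p;

            if (fd < 0 || fstat(fd, &st) < 0)
                    return -1;

            p = mmap(NULL, (size_t)st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
            close(fd);
            if (p == MAP_FAILED)
                    return -1;

            /* After this, mprotect()/munmap()/mremap() on the range fail
             * with EPERM until the process exits or execs. */
            if (syscall(__NR_mseal, p, (size_t)st.st_size, 0) < 0) {
                    perror("mseal");
                    return -1;
            }
            return 0;
    }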
Notes on which memory to seal:
==============================

It might be important to note that sealing changes the lifetime of a mapping,
i.e. the sealed mapping won’t be unmapped till the process terminates or the
exec system call is invoked. Applications can apply sealing to any virtual
memory region from userspace, but it is crucial to thoroughly analyze the
mapping's lifetime prior to applying the sealing.

When not to use mseal
=====================
Applications can apply sealing to any virtual memory region from userspace,
but it is *crucial to thoroughly analyze the mapping's lifetime* prior to
applying the sealing. This is because the sealed mapping *won’t be unmapped*
until the process terminates or the exec system call is invoked.

For example:

- aio/shm

  aio/shm can call mmap and munmap on behalf of userspace, e.g.
  ksys_shmdt() in shm.c. The lifetimes of those mappings are not tied to
  the lifetime of the process. If those mappings are sealed from userspace,
  then munmap will fail, causing leaks in VMA address space during the
  lifetime of the process.
- aio/shm

  aio/shm can call mmap()/munmap() on behalf of userspace, e.g. ksys_shmdt() in
  shm.c. The lifetime of those mappings is not tied to the lifetime of the
  process. If those mappings are sealed from userspace, then munmap() will fail,
  causing leaks in VMA address space during the lifetime of the process.

- ptr allocated by malloc (heap)

  Don't use mseal on a memory ptr returned from malloc().
  malloc() is implemented by an allocator, e.g. by glibc. The heap manager
  might allocate a ptr from brk or from a mapping created by mmap.
  If an app calls mseal on a ptr returned from malloc(), this can affect
  the heap manager's ability to manage the mappings; the outcome is
  non-deterministic.

  Example::

        ptr = malloc(size);
        /* don't call mseal on a ptr returned from malloc. */
        mseal(ptr, size);
        /* free will succeed, but the allocator can't shrink the heap lower than ptr */
        free(ptr);

- Brk (heap)

  Currently, userspace applications can seal parts of the heap by calling
  malloc() and mseal().

mseal doesn't block
===================
In a nutshell, mseal blocks certain mm syscalls from modifying some of a VMA's
attributes, such as protection bits (RWX). Sealed mappings don't mean the
memory is immutable. Let's assume the following calls from user space:
- ptr = malloc(size);
- mprotect(ptr, size, RO);
- mseal(ptr, size);
- free(ptr);

Technically, before mseal() was added, the user could change the protection of
the heap by calling mprotect(RO). As long as the user changes the protection
back to RW before free(), the memory range can be reused.

Adding mseal() into the picture, however, the heap is then sealed partially;
the user can still free it, but the memory remains RO. If the address
is re-used by the heap manager for another malloc, the process might crash
soon after. Therefore, it is important not to apply sealing to any memory
that might get recycled. A sketch of this hazard follows below.

Furthermore, even if the application never calls free() for the ptr,
the heap manager may invoke the brk system call to shrink the size of the
heap. In the kernel, the brk-shrink will call munmap(). Consequently,
depending on the location of the ptr, the outcome of brk-shrink is
nondeterministic.
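The call sequence above, written out as an anti-pattern sketch. It is
illustrative only (not part of the kernel documentation); it assumes
``__NR_mseal`` and, for brevity, that ptr happens to be page aligned::

    /* Anti-pattern sketch: sealing heap memory returned by malloc().
     * After free(), the allocator may hand the same (still RO, still
     * sealed) range to a later malloc(), and the process can crash on
     * the next write to it.
     */
    #include <stdlib.h>
    #include <sys/mman.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    void heap_sealing_hazard(size_t size)
    {
            void *ptr = malloc(size);

            if (!ptr)
                    return;

            /* Drop write permission on the pages backing ptr
             * (page-aligned ranges are assumed here for brevity). */
            mprotect(ptr, size, PROT_READ);

            /* Seal it: the protection can now never go back to RW. */
            syscall(__NR_mseal, ptr, size, 0);

            /* free() succeeds, but the allocator is left with an RO,
             * sealed region it may recycle or try to shrink via
             * brk/munmap, with nondeterministic results. Don't do this. */
            free(ptr);
    }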
Additional notes:
=================
As Jann Horn pointed out in [3], there are still a few ways to write
to RO memory, which is, in a way, by design. Those cases are not covered
by mseal(). If applications want to block such cases, sandbox tools (such as
seccomp, LSM, etc) might be considered.

Those cases are (see the sketch after this list):

- Write to read-only memory through the /proc/self/mem interface (FOLL_FORCE).
- Write to read-only memory through ptrace (such as PTRACE_POKETEXT).
- userfaultfd.
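For illustration, a sketch of the first bypass in the list above: writing
through /proc/self/mem (FOLL_FORCE) to a page that is mapped read-only. This
is not part of the kernel documentation; sealing the page would not block
this path::

    /* Sketch: write to a read-only (even sealed) mapping through
     * /proc/self/mem, which uses FOLL_FORCE.
     */
    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int poke_ro_page(void)
    {
            char *page = mmap(NULL, 4096, PROT_READ,
                              MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
            int fd;

            if (page == MAP_FAILED)
                    return -1;

            fd = open("/proc/self/mem", O_RDWR);
            if (fd < 0)
                    return -1;

            /* pwrite() at the page's own virtual address bypasses the
             * PROT_READ protection; mseal() does not change this. */
            if (pwrite(fd, "x", 1, (off_t)(uintptr_t)page) != 1)
                    perror("pwrite");

            close(fd);
            return 0;
    }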
The idea that inspired this patch comes from Stephen Röttger’s work in V8
CFI [4]. Chrome browser in ChromeOS will be the first user of this API.

Reference
=========
- [1] https://github.com/apple-oss-distributions/xnu/blob/1031c584a5e37aff177559b9f69dbd3c8c3fd30a/osfmk/mach/vm_statistics.h#L274
- [2] https://man.openbsd.org/mimmutable.2
- [3] https://lore.kernel.org/lkml/CAG48ez3ShUYey+ZAFsU2i1RpQn0a5eOs2hzQ426FkcgnfUGLvA@mail.gmail.com
- [4] https://docs.google.com/document/d/1O2jwK4dxI3nRcOJuPYkonhTkNQfbmwdvxQMyXgeaRHo/edit#heading=h.bvaojj9fu6hc
@@ -261,12 +261,6 @@ static noinstr bool handle_bug(struct pt_regs *regs)
	int ud_type;
	u32 imm;

	/*
	 * Normally @regs are unpoisoned by irqentry_enter(), but handle_bug()
	 * is a rare case that uses @regs without passing them to
	 * irqentry_enter().
	 */
	kmsan_unpoison_entry_regs(regs);
	ud_type = decode_bug(regs->ip, &imm);
	if (ud_type == BUG_NONE)
		return handled;

@@ -275,6 +269,12 @@ static noinstr bool handle_bug(struct pt_regs *regs)
	 * All lies, just get the WARN/BUG out.
	 */
	instrumentation_begin();
	/*
	 * Normally @regs are unpoisoned by irqentry_enter(), but handle_bug()
	 * is a rare case that uses @regs without passing them to
	 * irqentry_enter().
	 */
	kmsan_unpoison_entry_regs(regs);
	/*
	 * Since we're emulating a CALL with exceptions, restore the interrupt
	 * state to what it was at the exception site.
@@ -401,6 +401,7 @@ void nilfs_clear_folio_dirty(struct folio *folio)

	folio_clear_uptodate(folio);
	folio_clear_mappedtodisk(folio);
	folio_clear_checked(folio);

	head = folio_buffers(folio);
	if (head) {
@@ -1787,6 +1787,14 @@ int ocfs2_remove_inode_range(struct inode *inode,
		return 0;

	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
		int id_count = ocfs2_max_inline_data_with_xattr(inode->i_sb, di);

		if (byte_start > id_count || byte_start + byte_len > id_count) {
			ret = -EINVAL;
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_truncate_inline(inode, di_bh, byte_start,
					    byte_start + byte_len, 0);
		if (ret) {
@ -692,6 +692,34 @@ void dup_userfaultfd_complete(struct list_head *fcs)
|
|||
}
|
||||
}
|
||||
|
||||
void dup_userfaultfd_fail(struct list_head *fcs)
|
||||
{
|
||||
struct userfaultfd_fork_ctx *fctx, *n;
|
||||
|
||||
/*
|
||||
* An error has occurred on fork, we will tear memory down, but have
|
||||
* allocated memory for fctx's and raised reference counts for both the
|
||||
* original and child contexts (and on the mm for each as a result).
|
||||
*
|
||||
* These would ordinarily be taken care of by a user handling the event,
|
||||
* but we are no longer doing so, so manually clean up here.
|
||||
*
|
||||
* mm tear down will take care of cleaning up VMA contexts.
|
||||
*/
|
||||
list_for_each_entry_safe(fctx, n, fcs, list) {
|
||||
struct userfaultfd_ctx *octx = fctx->orig;
|
||||
struct userfaultfd_ctx *ctx = fctx->new;
|
||||
|
||||
atomic_dec(&octx->mmap_changing);
|
||||
VM_BUG_ON(atomic_read(&octx->mmap_changing) < 0);
|
||||
userfaultfd_ctx_put(octx);
|
||||
userfaultfd_ctx_put(ctx);
|
||||
|
||||
list_del(&fctx->list);
|
||||
kfree(fctx);
|
||||
}
|
||||
}
|
||||
|
||||
void mremap_userfaultfd_prep(struct vm_area_struct *vma,
|
||||
struct vm_userfaultfd_ctx *vm_ctx)
|
||||
{
|
||||
|
|
|
@ -54,12 +54,11 @@ static inline long mm_ksm_zero_pages(struct mm_struct *mm)
|
|||
return atomic_long_read(&mm->ksm_zero_pages);
|
||||
}
|
||||
|
||||
static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
|
||||
static inline void ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
|
||||
{
|
||||
/* Adding mm to ksm is best effort on fork. */
|
||||
if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
|
||||
return __ksm_enter(mm);
|
||||
|
||||
return 0;
|
||||
__ksm_enter(mm);
|
||||
}
|
||||
|
||||
static inline int ksm_execve(struct mm_struct *mm)
|
||||
|
@ -107,9 +106,8 @@ static inline int ksm_disable(struct mm_struct *mm)
|
|||
return 0;
|
||||
}
|
||||
|
||||
static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
|
||||
static inline void ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline int ksm_execve(struct mm_struct *mm)
|
||||
|
|
|
@ -249,6 +249,7 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma,
|
|||
|
||||
extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *);
|
||||
extern void dup_userfaultfd_complete(struct list_head *);
|
||||
void dup_userfaultfd_fail(struct list_head *);
|
||||
|
||||
extern void mremap_userfaultfd_prep(struct vm_area_struct *,
|
||||
struct vm_userfaultfd_ctx *);
|
||||
|
@ -351,6 +352,10 @@ static inline void dup_userfaultfd_complete(struct list_head *l)
|
|||
{
|
||||
}
|
||||
|
||||
static inline void dup_userfaultfd_fail(struct list_head *l)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void mremap_userfaultfd_prep(struct vm_area_struct *vma,
|
||||
struct vm_userfaultfd_ctx *ctx)
|
||||
{
|
||||
|
|
|
@ -653,11 +653,6 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
|
|||
mm->exec_vm = oldmm->exec_vm;
|
||||
mm->stack_vm = oldmm->stack_vm;
|
||||
|
||||
retval = ksm_fork(mm, oldmm);
|
||||
if (retval)
|
||||
goto out;
|
||||
khugepaged_fork(mm, oldmm);
|
||||
|
||||
/* Use __mt_dup() to efficiently build an identical maple tree. */
|
||||
retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL);
|
||||
if (unlikely(retval))
|
||||
|
@ -760,6 +755,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
|
|||
vma_iter_free(&vmi);
|
||||
if (!retval) {
|
||||
mt_set_in_rcu(vmi.mas.tree);
|
||||
ksm_fork(mm, oldmm);
|
||||
khugepaged_fork(mm, oldmm);
|
||||
} else if (mpnt) {
|
||||
/*
|
||||
* The entire maple tree has already been duplicated. If the
|
||||
|
@ -775,7 +772,10 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
|
|||
mmap_write_unlock(mm);
|
||||
flush_tlb_mm(oldmm);
|
||||
mmap_write_unlock(oldmm);
|
||||
dup_userfaultfd_complete(&uf);
|
||||
if (!retval)
|
||||
dup_userfaultfd_complete(&uf);
|
||||
else
|
||||
dup_userfaultfd_fail(&uf);
|
||||
fail_uprobe_end:
|
||||
uprobe_end_dup_mmap();
|
||||
return retval;
|
||||
|
|
|
@@ -459,9 +459,7 @@ int walk_system_ram_res_rev(u64 start, u64 end, void *arg,
			rams_size += 16;
		}

		rams[i].start = res.start;
		rams[i++].end = res.end;

		rams[i++] = res;
		start = res.end + 1;
	}
|
||||
|
|
|
@@ -1085,7 +1085,6 @@ config HMM_MIRROR
	depends on MMU

config GET_FREE_REGION
	depends on SPARSEMEM
	bool

config DEVICE_PRIVATE
mm/memory.c (15 changes)
@ -4187,6 +4187,8 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
|
|||
}
|
||||
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
|
||||
|
||||
static DECLARE_WAIT_QUEUE_HEAD(swapcache_wq);
|
||||
|
||||
/*
|
||||
* We enter with non-exclusive mmap_lock (to exclude vma changes,
|
||||
* but allow concurrent faults), and pte mapped but not yet locked.
|
||||
|
@ -4199,6 +4201,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
|
|||
{
|
||||
struct vm_area_struct *vma = vmf->vma;
|
||||
struct folio *swapcache, *folio = NULL;
|
||||
DECLARE_WAITQUEUE(wait, current);
|
||||
struct page *page;
|
||||
struct swap_info_struct *si = NULL;
|
||||
rmap_t rmap_flags = RMAP_NONE;
|
||||
|
@ -4297,7 +4300,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
|
|||
* Relax a bit to prevent rapid
|
||||
* repeated page faults.
|
||||
*/
|
||||
add_wait_queue(&swapcache_wq, &wait);
|
||||
schedule_timeout_uninterruptible(1);
|
||||
remove_wait_queue(&swapcache_wq, &wait);
|
||||
goto out_page;
|
||||
}
|
||||
need_clear_cache = true;
|
||||
|
@ -4604,8 +4609,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
|
|||
pte_unmap_unlock(vmf->pte, vmf->ptl);
|
||||
out:
|
||||
/* Clear the swap cache pin for direct swapin after PTL unlock */
|
||||
if (need_clear_cache)
|
||||
if (need_clear_cache) {
|
||||
swapcache_clear(si, entry, nr_pages);
|
||||
if (waitqueue_active(&swapcache_wq))
|
||||
wake_up(&swapcache_wq);
|
||||
}
|
||||
if (si)
|
||||
put_swap_device(si);
|
||||
return ret;
|
||||
|
@ -4620,8 +4628,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
|
|||
folio_unlock(swapcache);
|
||||
folio_put(swapcache);
|
||||
}
|
||||
if (need_clear_cache)
|
||||
if (need_clear_cache) {
|
||||
swapcache_clear(si, entry, nr_pages);
|
||||
if (waitqueue_active(&swapcache_wq))
|
||||
wake_up(&swapcache_wq);
|
||||
}
|
||||
if (si)
|
||||
put_swap_device(si);
|
||||
return ret;
|
||||
|
|
mm/mmap.c (84 changes)
@ -1418,6 +1418,13 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
|
|||
vmg.flags = vm_flags;
|
||||
}
|
||||
|
||||
/*
|
||||
* clear PTEs while the vma is still in the tree so that rmap
|
||||
* cannot race with the freeing later in the truncate scenario.
|
||||
* This is also needed for call_mmap(), which is why vm_ops
|
||||
* close function is called.
|
||||
*/
|
||||
vms_clean_up_area(&vms, &mas_detach);
|
||||
vma = vma_merge_new_range(&vmg);
|
||||
if (vma)
|
||||
goto expanded;
|
||||
|
@ -1439,11 +1446,6 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
|
|||
|
||||
if (file) {
|
||||
vma->vm_file = get_file(file);
|
||||
/*
|
||||
* call_mmap() may map PTE, so ensure there are no existing PTEs
|
||||
* and call the vm_ops close function if one exists.
|
||||
*/
|
||||
vms_clean_up_area(&vms, &mas_detach);
|
||||
error = call_mmap(file, vma);
|
||||
if (error)
|
||||
goto unmap_and_free_vma;
|
||||
|
@ -1640,6 +1642,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
|
|||
unsigned long populate = 0;
|
||||
unsigned long ret = -EINVAL;
|
||||
struct file *file;
|
||||
vm_flags_t vm_flags;
|
||||
|
||||
pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/mm/remap_file_pages.rst.\n",
|
||||
current->comm, current->pid);
|
||||
|
@ -1656,12 +1659,60 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
|
|||
if (pgoff + (size >> PAGE_SHIFT) < pgoff)
|
||||
return ret;
|
||||
|
||||
if (mmap_write_lock_killable(mm))
|
||||
if (mmap_read_lock_killable(mm))
|
||||
return -EINTR;
|
||||
|
||||
/*
|
||||
* Look up VMA under read lock first so we can perform the security
|
||||
* without holding locks (which can be problematic). We reacquire a
|
||||
* write lock later and check nothing changed underneath us.
|
||||
*/
|
||||
vma = vma_lookup(mm, start);
|
||||
|
||||
if (!vma || !(vma->vm_flags & VM_SHARED)) {
|
||||
mmap_read_unlock(mm);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
prot |= vma->vm_flags & VM_READ ? PROT_READ : 0;
|
||||
prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0;
|
||||
prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0;
|
||||
|
||||
flags &= MAP_NONBLOCK;
|
||||
flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
|
||||
if (vma->vm_flags & VM_LOCKED)
|
||||
flags |= MAP_LOCKED;
|
||||
|
||||
/* Save vm_flags used to calculate prot and flags, and recheck later. */
|
||||
vm_flags = vma->vm_flags;
|
||||
file = get_file(vma->vm_file);
|
||||
|
||||
mmap_read_unlock(mm);
|
||||
|
||||
/* Call outside mmap_lock to be consistent with other callers. */
|
||||
ret = security_mmap_file(file, prot, flags);
|
||||
if (ret) {
|
||||
fput(file);
|
||||
return ret;
|
||||
}
|
||||
|
||||
ret = -EINVAL;
|
||||
|
||||
/* OK security check passed, take write lock + let it rip. */
|
||||
if (mmap_write_lock_killable(mm)) {
|
||||
fput(file);
|
||||
return -EINTR;
|
||||
}
|
||||
|
||||
vma = vma_lookup(mm, start);
|
||||
|
||||
if (!vma || !(vma->vm_flags & VM_SHARED))
|
||||
if (!vma)
|
||||
goto out;
|
||||
|
||||
/* Make sure things didn't change under us. */
|
||||
if (vma->vm_flags != vm_flags)
|
||||
goto out;
|
||||
if (vma->vm_file != file)
|
||||
goto out;
|
||||
|
||||
if (start + size > vma->vm_end) {
|
||||
|
@ -1689,25 +1740,11 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
|
|||
goto out;
|
||||
}
|
||||
|
||||
prot |= vma->vm_flags & VM_READ ? PROT_READ : 0;
|
||||
prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0;
|
||||
prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0;
|
||||
|
||||
flags &= MAP_NONBLOCK;
|
||||
flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
|
||||
if (vma->vm_flags & VM_LOCKED)
|
||||
flags |= MAP_LOCKED;
|
||||
|
||||
file = get_file(vma->vm_file);
|
||||
ret = security_mmap_file(vma->vm_file, prot, flags);
|
||||
if (ret)
|
||||
goto out_fput;
|
||||
ret = do_mmap(vma->vm_file, start, size,
|
||||
prot, flags, 0, pgoff, &populate, NULL);
|
||||
out_fput:
|
||||
fput(file);
|
||||
out:
|
||||
mmap_write_unlock(mm);
|
||||
fput(file);
|
||||
if (populate)
|
||||
mm_populate(ret, populate);
|
||||
if (!IS_ERR_VALUE(ret))
|
||||
|
@ -1754,7 +1791,8 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
|
|||
VMG_STATE(vmg, mm, vmi, addr, addr + len, flags, PHYS_PFN(addr));
|
||||
|
||||
vmg.prev = vma;
|
||||
vma_iter_next_range(vmi);
|
||||
/* vmi is positioned at prev, which this mode expects. */
|
||||
vmg.merge_flags = VMG_FLAG_JUST_EXPAND;
|
||||
|
||||
if (vma_merge_new_range(&vmg))
|
||||
goto out;
|
||||
|
|
|
@@ -349,7 +349,7 @@ static void __init numa_clear_kernel_node_hotplug(void)
	for_each_reserved_mem_region(mb_region) {
		int nid = memblock_get_region_node(mb_region);

		if (nid != MAX_NUMNODES)
		if (numa_valid_node(nid))
			node_set(nid, reserved_nodemask);
	}
@@ -2893,12 +2893,12 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
		page = __rmqueue(zone, order, migratetype, alloc_flags);

		/*
		 * If the allocation fails, allow OOM handling access
		 * to HIGHATOMIC reserves as failing now is worse than
		 * failing a high-order atomic allocation in the
		 * future.
		 * If the allocation fails, allow OOM handling and
		 * order-0 (atomic) allocs access to HIGHATOMIC
		 * reserves as failing now is worse than failing a
		 * high-order atomic allocation in the future.
		 */
		if (!page && (alloc_flags & ALLOC_OOM))
		if (!page && (alloc_flags & (ALLOC_OOM|ALLOC_NON_BLOCK)))
			page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);

		if (!page) {
@ -744,7 +744,8 @@ struct folio *folio_walk_start(struct folio_walk *fw,
|
|||
pud = pudp_get(pudp);
|
||||
if (pud_none(pud))
|
||||
goto not_found;
|
||||
if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && pud_leaf(pud)) {
|
||||
if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) &&
|
||||
(!pud_present(pud) || pud_leaf(pud))) {
|
||||
ptl = pud_lock(vma->vm_mm, pudp);
|
||||
pud = pudp_get(pudp);
|
||||
|
||||
|
@ -753,6 +754,10 @@ struct folio *folio_walk_start(struct folio_walk *fw,
|
|||
fw->pudp = pudp;
|
||||
fw->pud = pud;
|
||||
|
||||
/*
|
||||
* TODO: FW_MIGRATION support for PUD migration entries
|
||||
* once there are relevant users.
|
||||
*/
|
||||
if (!pud_present(pud) || pud_devmap(pud) || pud_special(pud)) {
|
||||
spin_unlock(ptl);
|
||||
goto not_found;
|
||||
|
@ -769,12 +774,13 @@ struct folio *folio_walk_start(struct folio_walk *fw,
|
|||
}
|
||||
|
||||
pmd_table:
|
||||
VM_WARN_ON_ONCE(pud_leaf(*pudp));
|
||||
VM_WARN_ON_ONCE(!pud_present(pud) || pud_leaf(pud));
|
||||
pmdp = pmd_offset(pudp, addr);
|
||||
pmd = pmdp_get_lockless(pmdp);
|
||||
if (pmd_none(pmd))
|
||||
goto not_found;
|
||||
if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && pmd_leaf(pmd)) {
|
||||
if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) &&
|
||||
(!pmd_present(pmd) || pmd_leaf(pmd))) {
|
||||
ptl = pmd_lock(vma->vm_mm, pmdp);
|
||||
pmd = pmdp_get(pmdp);
|
||||
|
||||
|
@ -786,7 +792,7 @@ struct folio *folio_walk_start(struct folio_walk *fw,
|
|||
if (pmd_none(pmd)) {
|
||||
spin_unlock(ptl);
|
||||
goto not_found;
|
||||
} else if (!pmd_leaf(pmd)) {
|
||||
} else if (pmd_present(pmd) && !pmd_leaf(pmd)) {
|
||||
spin_unlock(ptl);
|
||||
goto pte_table;
|
||||
} else if (pmd_present(pmd)) {
|
||||
|
@ -812,7 +818,7 @@ struct folio *folio_walk_start(struct folio_walk *fw,
|
|||
}
|
||||
|
||||
pte_table:
|
||||
VM_WARN_ON_ONCE(pmd_leaf(pmdp_get_lockless(pmdp)));
|
||||
VM_WARN_ON_ONCE(!pmd_present(pmd) || pmd_leaf(pmd));
|
||||
ptep = pte_offset_map_lock(vma->vm_mm, pmdp, addr, &ptl);
|
||||
if (!ptep)
|
||||
goto not_found;
|
||||
|
|
|
@@ -1166,7 +1166,9 @@ static int shmem_getattr(struct mnt_idmap *idmap,
		stat->attributes_mask |= (STATX_ATTR_APPEND |
					STATX_ATTR_IMMUTABLE |
					STATX_ATTR_NODUMP);
	inode_lock_shared(inode);
	generic_fillattr(idmap, request_mask, inode, stat);
	inode_unlock_shared(inode);

	if (shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0))
		stat->blksize = HPAGE_PMD_SIZE;
mm/vma.c (23 changes)
@ -917,6 +917,7 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
|
|||
pgoff_t pgoff = vmg->pgoff;
|
||||
pgoff_t pglen = PHYS_PFN(end - start);
|
||||
bool can_merge_left, can_merge_right;
|
||||
bool just_expand = vmg->merge_flags & VMG_FLAG_JUST_EXPAND;
|
||||
|
||||
mmap_assert_write_locked(vmg->mm);
|
||||
VM_WARN_ON(vmg->vma);
|
||||
|
@ -930,7 +931,7 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
|
|||
return NULL;
|
||||
|
||||
can_merge_left = can_vma_merge_left(vmg);
|
||||
can_merge_right = can_vma_merge_right(vmg, can_merge_left);
|
||||
can_merge_right = !just_expand && can_vma_merge_right(vmg, can_merge_left);
|
||||
|
||||
/* If we can merge with the next VMA, adjust vmg accordingly. */
|
||||
if (can_merge_right) {
|
||||
|
@ -953,7 +954,11 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
|
|||
if (can_merge_right && !can_merge_remove_vma(next))
|
||||
vmg->end = end;
|
||||
|
||||
vma_prev(vmg->vmi); /* Equivalent to going to the previous range */
|
||||
/* In expand-only case we are already positioned at prev. */
|
||||
if (!just_expand) {
|
||||
/* Equivalent to going to the previous range. */
|
||||
vma_prev(vmg->vmi);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -967,12 +972,14 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
|
|||
}
|
||||
|
||||
/* If expansion failed, reset state. Allows us to retry merge later. */
|
||||
vmg->vma = NULL;
|
||||
vmg->start = start;
|
||||
vmg->end = end;
|
||||
vmg->pgoff = pgoff;
|
||||
if (vmg->vma == prev)
|
||||
vma_iter_set(vmg->vmi, start);
|
||||
if (!just_expand) {
|
||||
vmg->vma = NULL;
|
||||
vmg->start = start;
|
||||
vmg->end = end;
|
||||
vmg->pgoff = pgoff;
|
||||
if (vmg->vma == prev)
|
||||
vma_iter_set(vmg->vmi, start);
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
|
mm/vma.h (26 changes)
@ -59,6 +59,17 @@ enum vma_merge_state {
|
|||
VMA_MERGE_SUCCESS,
|
||||
};
|
||||
|
||||
enum vma_merge_flags {
|
||||
VMG_FLAG_DEFAULT = 0,
|
||||
/*
|
||||
* If we can expand, simply do so. We know there is nothing to merge to
|
||||
* the right. Does not reset state upon failure to merge. The VMA
|
||||
* iterator is assumed to be positioned at the previous VMA, rather than
|
||||
* at the gap.
|
||||
*/
|
||||
VMG_FLAG_JUST_EXPAND = 1 << 0,
|
||||
};
|
||||
|
||||
/* Represents a VMA merge operation. */
|
||||
struct vma_merge_struct {
|
||||
struct mm_struct *mm;
|
||||
|
@ -75,6 +86,7 @@ struct vma_merge_struct {
|
|||
struct mempolicy *policy;
|
||||
struct vm_userfaultfd_ctx uffd_ctx;
|
||||
struct anon_vma_name *anon_name;
|
||||
enum vma_merge_flags merge_flags;
|
||||
enum vma_merge_state state;
|
||||
};
|
||||
|
||||
|
@ -99,6 +111,7 @@ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma,
|
|||
.flags = flags_, \
|
||||
.pgoff = pgoff_, \
|
||||
.state = VMA_MERGE_START, \
|
||||
.merge_flags = VMG_FLAG_DEFAULT, \
|
||||
}
|
||||
|
||||
#define VMG_VMA_STATE(name, vmi_, prev_, vma_, start_, end_) \
|
||||
|
@ -118,6 +131,7 @@ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma,
|
|||
.uffd_ctx = vma_->vm_userfaultfd_ctx, \
|
||||
.anon_name = anon_vma_name(vma_), \
|
||||
.state = VMA_MERGE_START, \
|
||||
.merge_flags = VMG_FLAG_DEFAULT, \
|
||||
}
|
||||
|
||||
#ifdef CONFIG_DEBUG_VM_MAPLE_TREE
|
||||
|
@ -241,15 +255,9 @@ static inline void vms_abort_munmap_vmas(struct vma_munmap_struct *vms,
|
|||
* failure method of leaving a gap where the MAP_FIXED mapping failed.
|
||||
*/
|
||||
mas_set_range(mas, vms->start, vms->end - 1);
|
||||
if (unlikely(mas_store_gfp(mas, NULL, GFP_KERNEL))) {
|
||||
pr_warn_once("%s: (%d) Unable to abort munmap() operation\n",
|
||||
current->comm, current->pid);
|
||||
/* Leaving vmas detached and in-tree may hamper recovery */
|
||||
reattach_vmas(mas_detach);
|
||||
} else {
|
||||
/* Clean up the insertion of the unfortunate gap */
|
||||
vms_complete_munmap_vmas(vms, mas_detach);
|
||||
}
|
||||
mas_store_gfp(mas, NULL, GFP_KERNEL|__GFP_NOFAIL);
|
||||
/* Clean up the insertion of the unfortunate gap */
|
||||
vms_complete_munmap_vmas(vms, mas_detach);
|
||||
}
|
||||
|
||||
int
|
||||
|
|
|
@ -18,7 +18,7 @@ bool test_uffdio_wp = true;
|
|||
unsigned long long *count_verify;
|
||||
uffd_test_ops_t *uffd_test_ops;
|
||||
uffd_test_case_ops_t *uffd_test_case_ops;
|
||||
pthread_barrier_t ready_for_fork;
|
||||
atomic_bool ready_for_fork;
|
||||
|
||||
static int uffd_mem_fd_create(off_t mem_size, bool hugetlb)
|
||||
{
|
||||
|
@ -519,8 +519,7 @@ void *uffd_poll_thread(void *arg)
|
|||
pollfd[1].fd = pipefd[cpu*2];
|
||||
pollfd[1].events = POLLIN;
|
||||
|
||||
/* Ready for parent thread to fork */
|
||||
pthread_barrier_wait(&ready_for_fork);
|
||||
ready_for_fork = true;
|
||||
|
||||
for (;;) {
|
||||
ret = poll(pollfd, 2, -1);
|
||||
|
|
|
@ -33,6 +33,7 @@
|
|||
#include <inttypes.h>
|
||||
#include <stdint.h>
|
||||
#include <sys/random.h>
|
||||
#include <stdatomic.h>
|
||||
|
||||
#include "../kselftest.h"
|
||||
#include "vm_util.h"
|
||||
|
@ -104,7 +105,7 @@ extern bool map_shared;
|
|||
extern bool test_uffdio_wp;
|
||||
extern unsigned long long *count_verify;
|
||||
extern volatile bool test_uffdio_copy_eexist;
|
||||
extern pthread_barrier_t ready_for_fork;
|
||||
extern atomic_bool ready_for_fork;
|
||||
|
||||
extern uffd_test_ops_t anon_uffd_test_ops;
|
||||
extern uffd_test_ops_t shmem_uffd_test_ops;
|
||||
|
|
|
@ -241,8 +241,7 @@ static void *fork_event_consumer(void *data)
|
|||
fork_event_args *args = data;
|
||||
struct uffd_msg msg = { 0 };
|
||||
|
||||
/* Ready for parent thread to fork */
|
||||
pthread_barrier_wait(&ready_for_fork);
|
||||
ready_for_fork = true;
|
||||
|
||||
/* Read until a full msg received */
|
||||
while (uffd_read_msg(args->parent_uffd, &msg));
|
||||
|
@ -311,12 +310,11 @@ static int pagemap_test_fork(int uffd, bool with_event, bool test_pin)
|
|||
|
||||
/* Prepare a thread to resolve EVENT_FORK */
|
||||
if (with_event) {
|
||||
pthread_barrier_init(&ready_for_fork, NULL, 2);
|
||||
ready_for_fork = false;
|
||||
if (pthread_create(&thread, NULL, fork_event_consumer, &args))
|
||||
err("pthread_create()");
|
||||
/* Wait for child thread to start before forking */
|
||||
pthread_barrier_wait(&ready_for_fork);
|
||||
pthread_barrier_destroy(&ready_for_fork);
|
||||
while (!ready_for_fork)
|
||||
; /* Wait for the poll_thread to start executing before forking */
|
||||
}
|
||||
|
||||
child = fork();
|
||||
|
@ -781,7 +779,7 @@ static void uffd_sigbus_test_common(bool wp)
|
|||
char c;
|
||||
struct uffd_args args = { 0 };
|
||||
|
||||
pthread_barrier_init(&ready_for_fork, NULL, 2);
|
||||
ready_for_fork = false;
|
||||
|
||||
fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
|
||||
|
||||
|
@ -798,9 +796,8 @@ static void uffd_sigbus_test_common(bool wp)
|
|||
if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args))
|
||||
err("uffd_poll_thread create");
|
||||
|
||||
/* Wait for child thread to start before forking */
|
||||
pthread_barrier_wait(&ready_for_fork);
|
||||
pthread_barrier_destroy(&ready_for_fork);
|
||||
while (!ready_for_fork)
|
||||
; /* Wait for the poll_thread to start executing before forking */
|
||||
|
||||
pid = fork();
|
||||
if (pid < 0)
|
||||
|
@ -841,7 +838,7 @@ static void uffd_events_test_common(bool wp)
|
|||
char c;
|
||||
struct uffd_args args = { 0 };
|
||||
|
||||
pthread_barrier_init(&ready_for_fork, NULL, 2);
|
||||
ready_for_fork = false;
|
||||
|
||||
fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
|
||||
if (uffd_register(uffd, area_dst, nr_pages * page_size,
|
||||
|
@ -852,9 +849,8 @@ static void uffd_events_test_common(bool wp)
|
|||
if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args))
|
||||
err("uffd_poll_thread create");
|
||||
|
||||
/* Wait for child thread to start before forking */
|
||||
pthread_barrier_wait(&ready_for_fork);
|
||||
pthread_barrier_destroy(&ready_for_fork);
|
||||
while (!ready_for_fork)
|
||||
; /* Wait for the poll_thread to start executing before forking */
|
||||
|
||||
pid = fork();
|
||||
if (pid < 0)
|
||||
|
|
|
@ -1522,6 +1522,45 @@ static bool test_copy_vma(void)
|
|||
return true;
|
||||
}
|
||||
|
||||
static bool test_expand_only_mode(void)
|
||||
{
|
||||
unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
|
||||
struct mm_struct mm = {};
|
||||
VMA_ITERATOR(vmi, &mm, 0);
|
||||
struct vm_area_struct *vma_prev, *vma;
|
||||
VMG_STATE(vmg, &mm, &vmi, 0x5000, 0x9000, flags, 5);
|
||||
|
||||
/*
|
||||
* Place a VMA prior to the one we're expanding so we assert that we do
|
||||
* not erroneously try to traverse to the previous VMA even though we
|
||||
* have, through the use of VMG_FLAG_JUST_EXPAND, indicated we do not
|
||||
* need to do so.
|
||||
*/
|
||||
alloc_and_link_vma(&mm, 0, 0x2000, 0, flags);
|
||||
|
||||
/*
|
||||
* We will be positioned at the prev VMA, but looking to expand to
|
||||
* 0x9000.
|
||||
*/
|
||||
vma_iter_set(&vmi, 0x3000);
|
||||
vma_prev = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags);
|
||||
vmg.prev = vma_prev;
|
||||
vmg.merge_flags = VMG_FLAG_JUST_EXPAND;
|
||||
|
||||
vma = vma_merge_new_range(&vmg);
|
||||
ASSERT_NE(vma, NULL);
|
||||
ASSERT_EQ(vma, vma_prev);
|
||||
ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
|
||||
ASSERT_EQ(vma->vm_start, 0x3000);
|
||||
ASSERT_EQ(vma->vm_end, 0x9000);
|
||||
ASSERT_EQ(vma->vm_pgoff, 3);
|
||||
ASSERT_TRUE(vma_write_started(vma));
|
||||
ASSERT_EQ(vma_iter_addr(&vmi), 0x3000);
|
||||
|
||||
cleanup_mm(&mm, &vmi);
|
||||
return true;
|
||||
}
|
||||
|
||||
int main(void)
|
||||
{
|
||||
int num_tests = 0, num_fail = 0;
|
||||
|
@ -1553,6 +1592,7 @@ int main(void)
|
|||
TEST(vmi_prealloc_fail);
|
||||
TEST(merge_extend);
|
||||
TEST(copy_vma);
|
||||
TEST(expand_only_mode);
|
||||
|
||||
#undef TEST
|
||||
|
||||
|
|