Compare commits

...

60 commits

Author SHA1 Message Date
Tyler
85494ae7a9
Merge 504eaa2570 into 0fc810ae3a 2024-10-30 18:52:55 -05:00
Linus Torvalds
0fc810ae3a x86/uaccess: Avoid barrier_nospec() in 64-bit copy_from_user()
The barrier_nospec() in 64-bit copy_from_user() is slow. Instead use
pointer masking to force the user pointer to all 1's for an invalid
address.
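
As a rough, self-contained C sketch of the masking idea (the actual
kernel implementation is branch-free code in the uaccess path;
USER_PTR_LIMIT and mask_user_ptr() are illustrative names only):

  /* If the pointer is above the user-space limit, force it to all 1's so
   * the access faults deterministically, leaving no speculation window
   * that barrier_nospec() would otherwise have to close. */
  #define USER_PTR_LIMIT 0x00007fffffffffffUL   /* illustrative value */

  static inline unsigned long mask_user_ptr(unsigned long addr)
  {
          return addr > USER_PTR_LIMIT ? ~0UL : addr;
  }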

The kernel test robot reports a 2.6% improvement in the per_thread_ops
benchmark [1].

This is a variation on a patch originally by Josh Poimboeuf [2].

Link: https://lore.kernel.org/202410281344.d02c72a2-oliver.sang@intel.com [1]
Link: https://lore.kernel.org/5b887fe4c580214900e21f6c61095adf9a142735.1730166635.git.jpoimboe@kernel.org [2]
Tested-and-reviewed-by: Josh Poimboeuf <jpoimboe@kernel.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2024-10-30 11:38:10 -10:00
Linus Torvalds
14b7d43c5c perf tools fixes for v6.12: 2nd batch
- Update more header copies with the kernel sources, including const.h,
   msr-index.h, arm64's cputype.h, kvm's, bits.h and unaligned.h
 
 - The return from 'write' isn't a pid, fix cut'n'paste error in 'perf
   trace'.
 
 - Fix up the python binding build on architectures without
   HAVE_KVM_STAT_SUPPORT.
 
 - Add some more bounds checks to augmented_raw_syscalls.bpf.c (used to
   collect syscall pointer arguments in 'perf trace') to make the
   resulting bytecode pass the kernel BPF verifier, allowing us to go
   back to accepting clang 12.0.1 as the minimum version required for
   compiling BPF sources.
 
 - Add __NR_capget for x86 to fix a regression on running perf + intel PT
   (hw tracing) as non-root setting up the capabilities as described in
   https://www.kernel.org/doc/html/latest/admin-guide/perf-security.html.
 
 - Fix missing syscalltbl in architectures not explicitly listed, noticed
   on ARM 32-bit, which still needs a .tbl generator for the syscall
   id<->name tables; that should be added for v6.13.
 
 - Handle 'perf test' failure when handling broken DWARF for ASM files.
 
 Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

Merge tag 'perf-tools-fixes-for-v6.12-2-2024-10-30' of git://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools

Pull perf tools fixes from Arnaldo Carvalho de Melo:

 - Update more header copies with the kernel sources, including const.h,
   msr-index.h, arm64's cputype.h, kvm's, bits.h and unaligned.h

 - The return from 'write' isn't a pid, fix cut'n'paste error in 'perf
   trace'

 - Fix up the python binding build on architectures without
   HAVE_KVM_STAT_SUPPORT

 - Add some more bounds checks to augmented_raw_syscalls.bpf.c (used to
   collect syscall pointer arguments in 'perf trace') to make the
   resulting bytecode pass the kernel BPF verifier, allowing us to go
   back to accepting clang 12.0.1 as the minimum version required for
   compiling BPF sources

 - Add __NR_capget for x86 to fix a regression on running perf + intel
   PT (hw tracing) as non-root setting up the capabilities as described
   in https://www.kernel.org/doc/html/latest/admin-guide/perf-security.html

 - Fix missing syscalltbl in architectures not explicitly listed,
   noticed on ARM 32-bit, which still needs a .tbl generator for the
   syscall id<->name tables; that should be added for v6.13

 - Handle 'perf test' failure when handling broken DWARF for ASM files

* tag 'perf-tools-fixes-for-v6.12-2-2024-10-30' of git://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools:
  perf cap: Add __NR_capget to arch/x86 unistd
  tools headers: Update the linux/unaligned.h copy with the kernel sources
  tools headers arm64: Sync arm64's cputype.h with the kernel sources
  tools headers: Synchronize {uapi/}linux/bits.h with the kernel sources
  tools arch x86: Sync the msr-index.h copy with the kernel sources
  perf python: Fix up the build on architectures without HAVE_KVM_STAT_SUPPORT
  perf test: Handle perftool-testsuite_probe failure due to broken DWARF
  tools headers UAPI: Sync kvm headers with the kernel sources
  perf trace: Fix non-listed archs in the syscalltbl routines
  perf build: Change the clang check back to 12.0.1
  perf trace augmented_raw_syscalls: Add more checks to pass the verifier
  perf trace augmented_raw_syscalls: Add extra array index bounds checking to satisfy some BPF verifiers
  perf trace: The return from 'write' isn't a pid
  tools headers UAPI: Sync linux/const.h with the kernel headers
2024-10-30 11:17:47 -10:00
Linus Torvalds
4236f91380 SCSI fixes on 20241030
Two small fixes, both in drivers (ufs and scsi_debug).
 
 Signed-off-by: James E.J. Bottomley <James.Bottomley@HansenPartnership.com>

Merge tag 'scsi-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/jejb/scsi

Pull SCSI fixes from James Bottomley:
 "Two small fixes, both in drivers (ufs and scsi_debug)"

* tag 'scsi-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/jejb/scsi:
  scsi: ufs: core: Fix another deadlock during RTC update
  scsi: scsi_debug: Fix do_device_access() handling of unexpected SG copy length
2024-10-30 08:16:23 -10:00
Linus Torvalds
c1e939a21e cgroup: Fixes for v6.12-rc5
- cgroup_bpf_release_fn() could saturate system_wq with
   cgrp->bpf.release_work which can then form a circular dependency leading
   to deadlocks. Fix by using a dedicated workqueue. The system_wq's max
   concurrency limit is being increased separately.
 
 - Fix theoretical off-by-one bug when enforcing max cgroup hierarchy depth.

Merge tag 'cgroup-for-6.12-rc5-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup fixes from Tejun Heo:

 - cgroup_bpf_release_fn() could saturate system_wq with
   cgrp->bpf.release_work which can then form a circular dependency
   leading to deadlocks. Fix by using a dedicated workqueue. The
   system_wq's max concurrency limit is being increased separately.

 - Fix theoretical off-by-one bug when enforcing max cgroup hierarchy
   depth

* tag 'cgroup-for-6.12-rc5-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  cgroup: Fix potential overflow issue when checking max_depth
  cgroup/bpf: use a dedicated workqueue for cgroup bpf destruction
2024-10-29 16:41:30 -10:00
Linus Torvalds
daa9f66fe1 sched_ext: Fixes for v6.12-rc5
- Instances of scx_ops_bypass() could race each other leading to
   misbehavior. Fix by protecting the operation with a spinlock.
 
 - selftest and userspace header fixes.

Merge tag 'sched_ext-for-6.12-rc5-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext

Pull sched_ext fixes from Tejun Heo:

 - Instances of scx_ops_bypass() could race each other leading to
   misbehavior. Fix by protecting the operation with a spinlock.

 - selftest and userspace header fixes

* tag 'sched_ext-for-6.12-rc5-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext:
  sched_ext: Fix enq_last_no_enq_fails selftest
  sched_ext: Make cast_mask() inline
  scx: Fix raciness in scx_ops_bypass()
  scx: Fix exit selftest to use custom DSQ
  sched_ext: Fix function pointer type mismatches in BPF selftests
  selftests/sched_ext: add order-only dependency of runner.o on BPFOBJ
2024-10-29 16:35:40 -10:00
Linus Torvalds
7fbaacafbc slab fixes for 6.12-rc6

Merge tag 'slab-for-6.12-rc6' of git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab

Pull slab fixes from Vlastimil Babka:

 - Fix for a slub_kunit test warning with MEM_ALLOC_PROFILING_DEBUG (Pei
   Xiao)

 - Fix for a MTE-based KASAN BUG in krealloc() (Qun-Wei Lin)

* tag 'slab-for-6.12-rc6' of git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab:
  mm: krealloc: Fix MTE false alarm in __do_krealloc
  slub/kunit: fix a WARNING due to unwrapped __kmalloc_cache_noprof
2024-10-29 16:24:02 -10:00
Linus Torvalds
9251e3e93c 21 hotfixes. 13 are cc:stable. 13 are MM and 8 are non-MM.
No particular theme here - mainly singletons, a couple of doubletons.
 Please see the changelogs.

Merge tag 'mm-hotfixes-stable-2024-10-28-21-50' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Pull misc fixes from Andrew Morton:
 "21 hotfixes. 13 are cc:stable. 13 are MM and 8 are non-MM.

  No particular theme here - mainly singletons, a couple of doubletons.
  Please see the changelogs"

* tag 'mm-hotfixes-stable-2024-10-28-21-50' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (21 commits)
  mm: avoid unconditional one-tick sleep when swapcache_prepare fails
  mseal: update mseal.rst
  mm: split critical region in remap_file_pages() and invoke LSMs in between
  selftests/mm: fix deadlock for fork after pthread_create with atomic_bool
  Revert "selftests/mm: replace atomic_bool with pthread_barrier_t"
  Revert "selftests/mm: fix deadlock for fork after pthread_create on ARM"
  tools: testing: add expand-only mode VMA test
  mm/vma: add expand-only VMA merge mode and optimise do_brk_flags()
  resource,kexec: walk_system_ram_res_rev must retain resource flags
  nilfs2: fix kernel bug due to missing clearing of checked flag
  mm: numa_clear_kernel_node_hotplug: Add NUMA_NO_NODE check for node id
  ocfs2: pass u64 to ocfs2_truncate_inline maybe overflow
  mm: shmem: fix data-race in shmem_getattr()
  mm: mark mas allocation in vms_abort_munmap_vmas as __GFP_NOFAIL
  x86/traps: move kmsan check after instrumentation_begin
  resource: remove dependency on SPARSEMEM from GET_FREE_REGION
  mm/mmap: fix race in mmap_region() with ftruncate()
  mm/page_alloc: let GFP_ATOMIC order-0 allocs access highatomic reserves
  fork: only invoke khugepaged, ksm hooks if no error
  fork: do not invoke uffd on fork if error occurs
  ...
2024-10-29 16:19:15 -10:00
Linus Torvalds
d5b2ee0fe8 Hi
Addresses a significant boot-time delay issue:
 
 https://bugzilla.kernel.org/show_bug.cgi?id=219229
 
 BR, Jarkko

Merge tag 'tpmdd-next-6.12-rc6' of git://git.kernel.org/pub/scm/linux/kernel/git/jarkko/linux-tpmdd

Pull tpm fix from Jarkko Sakkinen:
 "Address a significant boot-time delay issue"

Link: https://bugzilla.kernel.org/show_bug.cgi?id=219229

* tag 'tpmdd-next-6.12-rc6' of git://git.kernel.org/pub/scm/linux/kernel/git/jarkko/linux-tpmdd:
  tpm: Lazily flush the auth session
  tpm: Rollback tpm2_load_null()
  tpm: Return tpm2_sessions_init() when null key creation fails
2024-10-29 16:04:24 -10:00
Qun-Wei Lin
704573851b mm: krealloc: Fix MTE false alarm in __do_krealloc
This patch addresses an issue introduced by commit 1a83a716ec ("mm:
krealloc: consider spare memory for __GFP_ZERO") which causes MTE
(Memory Tagging Extension) to falsely report a slab-out-of-bounds error.

The problem occurs when zeroing out spare memory in __do_krealloc. The
original code only considered software-based KASAN and did not account
for MTE. It does not reset the KASAN tag before calling memset, leading
to a mismatch between the pointer tag and the memory tag, resulting
in a false positive.
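
A hedged sketch of the resulting fix in __do_krealloc() (simplified; p is
the reallocated object, ks its allocated size, and the surrounding
conditions are approximate):

  /* Reset the KASAN/MTE tag on the pointer before zeroing the spare
   * bytes, so the pointer tag matches the memory granules that memset()
   * touches. */
  if ((flags & __GFP_ZERO) && new_size < ks)
          memset(kasan_reset_tag(p) + new_size, 0, ks - new_size);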

Example of the error:
==================================================================
swapper/0: BUG: KASAN: slab-out-of-bounds in __memset+0x84/0x188
swapper/0: Write at addr f4ffff8005f0fdf0 by task swapper/0/1
swapper/0: Pointer tag: [f4], memory tag: [fe]
swapper/0:
swapper/0: CPU: 4 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.12.
swapper/0: Hardware name: MT6991(ENG) (DT)
swapper/0: Call trace:
swapper/0:  dump_backtrace+0xfc/0x17c
swapper/0:  show_stack+0x18/0x28
swapper/0:  dump_stack_lvl+0x40/0xa0
swapper/0:  print_report+0x1b8/0x71c
swapper/0:  kasan_report+0xec/0x14c
swapper/0:  __do_kernel_fault+0x60/0x29c
swapper/0:  do_bad_area+0x30/0xdc
swapper/0:  do_tag_check_fault+0x20/0x34
swapper/0:  do_mem_abort+0x58/0x104
swapper/0:  el1_abort+0x3c/0x5c
swapper/0:  el1h_64_sync_handler+0x80/0xcc
swapper/0:  el1h_64_sync+0x68/0x6c
swapper/0:  __memset+0x84/0x188
swapper/0:  btf_populate_kfunc_set+0x280/0x3d8
swapper/0:  __register_btf_kfunc_id_set+0x43c/0x468
swapper/0:  register_btf_kfunc_id_set+0x48/0x60
swapper/0:  register_nf_nat_bpf+0x1c/0x40
swapper/0:  nf_nat_init+0xc0/0x128
swapper/0:  do_one_initcall+0x184/0x464
swapper/0:  do_initcall_level+0xdc/0x1b0
swapper/0:  do_initcalls+0x70/0xc0
swapper/0:  do_basic_setup+0x1c/0x28
swapper/0:  kernel_init_freeable+0x144/0x1b8
swapper/0:  kernel_init+0x20/0x1a8
swapper/0:  ret_from_fork+0x10/0x20
==================================================================

Fixes: 1a83a716ec ("mm: krealloc: consider spare memory for __GFP_ZERO")
Signed-off-by: Qun-Wei Lin <qun-wei.lin@mediatek.com>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
2024-10-29 10:40:53 +01:00
Barry Song
01626a1823 mm: avoid unconditional one-tick sleep when swapcache_prepare fails
Commit 13ddaf26be ("mm/swap: fix race when skipping swapcache")
introduced an unconditional one-tick sleep when `swapcache_prepare()`
fails, which has led to reports of UI stuttering on latency-sensitive
Android devices.  To address this, we can use a waitqueue to wake up tasks
that fail `swapcache_prepare()` sooner, instead of always sleeping for a
full tick.  While tasks may occasionally be woken by an unrelated
`do_swap_page()`, this method is preferable to two scenarios: rapid
re-entry into page faults, which can cause livelocks, and multiple
millisecond sleeps, which visibly degrade user experience.

Oven's testing shows that a single waitqueue resolves the UI stuttering
issue.  If a 'thundering herd' problem becomes apparent later, a waitqueue
hash similar to `folio_wait_table[PAGE_WAIT_TABLE_SIZE]` for page bit
locks can be introduced.
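
A hedged sketch of the waitqueue pattern described above (function name
and timeout value are illustrative; the waitqueue_active() guard matches
the note below):

  static DECLARE_WAIT_QUEUE_HEAD(swapcache_wq);

  /* Fault path, when swapcache_prepare() fails: */
  static void swapcache_wait_for_owner(void)
  {
          DEFINE_WAIT(wait);

          prepare_to_wait(&swapcache_wq, &wait, TASK_UNINTERRUPTIBLE);
          schedule_timeout(1);   /* still bounded, but usually woken sooner */
          finish_wait(&swapcache_wq, &wait);
  }

  /* Owner path, after completing the racing swap-in: */
  if (waitqueue_active(&swapcache_wq))
          wake_up(&swapcache_wq);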

[v-songbaohua@oppo.com: wake_up only when swapcache_wq waitqueue is active]
  Link: https://lkml.kernel.org/r/20241008130807.40833-1-21cnbao@gmail.com
Link: https://lkml.kernel.org/r/20240926211936.75373-1-21cnbao@gmail.com
Fixes: 13ddaf26be ("mm/swap: fix race when skipping swapcache")
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
Reported-by: Oven Liyang <liyangouwen1@oppo.com>
Tested-by: Oven Liyang <liyangouwen1@oppo.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:41 -07:00
Jeff Xu
1834300798 mseal: update mseal.rst
Pedro Falcato's optimization [1] for checking sealed VMAs, which replaces
the can_modify_mm() function with an in-loop check, necessitates an update
to the mseal.rst documentation to reflect this change.

Furthermore, the document has received offline comments regarding the code
sample and suggestions for sentence clarification to enhance reader
comprehension.

[1] https://lore.kernel.org/linux-mm/20240817-mseal-depessimize-v3-0-d8d2e037df30@gmail.com/

Update the doc after the in-loop change: mprotect/madvise can leave the
range partially updated, while munmap remains atomic.

Fix indentation and clarify some sections to improve readability.

Link: https://lkml.kernel.org/r/20241008040942.1478931-2-jeffxu@chromium.org
Fixes: df2a7df9a9 ("mm/munmap: replace can_modify_mm with can_modify_vma")
Fixes: 4a2dd02b09 ("mm/mprotect: replace can_modify_mm with can_modify_vma")
Fixes: 38075679b5 ("mm/mremap: replace can_modify_mm with can_modify_vma")
Fixes: 23c57d1fa2 ("mseal: replace can_modify_mm_madv with a vma variant")
Signed-off-by: Jeff Xu <jeffxu@chromium.org>
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Elliott Hughes <enh@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guenter Roeck <groeck@chromium.org>
Cc: Jann Horn <jannh@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Jorge Lucangeli Obes <jorgelo@chromium.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Muhammad Usama Anjum <usama.anjum@collabora.com>
Cc: Pedro Falcato <pedro.falcato@gmail.com>
Cc: Stephen Röttger <sroettger@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: "Theo de Raadt" <deraadt@openbsd.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:41 -07:00
Kirill A. Shutemov
58a039e679 mm: split critical region in remap_file_pages() and invoke LSMs in between
Commit ea7e2d5e49 ("mm: call the security_mmap_file() LSM hook in
remap_file_pages()") fixed a security issue: it added an LSM check when
trying to remap file pages, so that LSMs have the opportunity to evaluate
such an action, as for other memory operations such as mmap() and
mprotect().

However, that commit called security_mmap_file() inside the mmap_lock
lock, while the other calls do it before taking the lock, after commit
8b3ec6814c ("take security_mmap_file() outside of ->mmap_sem").

This caused a lock inversion issue with IMA, which was taking the
mmap_lock and i_mutex lock in the opposite order when the
remap_file_pages() system call was called.

Solve the issue by splitting the critical region in remap_file_pages() into
two regions: the first takes a read lock of mmap_lock, retrieves the VMA
and the file descriptor associated, and calculates the 'prot' and 'flags'
variables; the second takes a write lock on mmap_lock, checks that the VMA
flags and the VMA file descriptor are the same as the ones obtained in the
first critical region (otherwise the system call fails), and calls
do_mmap().

In between, after releasing the read lock and before taking the write
lock, call security_mmap_file(), solving the lock inversion issue.
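
A hedged outline of the resulting two-region structure (heavily
simplified; error paths and the exact do_mmap() arguments are omitted):

  mmap_read_lock(mm);
  vma = vma_lookup(mm, start);
  file = vma ? vma->vm_file : NULL;       /* remember the VMA's identity */
  vm_flags = vma ? vma->vm_flags : 0;
  /* ... derive 'prot' and 'flags' from the VMA ... */
  mmap_read_unlock(mm);

  ret = security_mmap_file(file, prot, flags);    /* no mmap_lock held */
  if (ret)
          return ret;

  if (mmap_write_lock_killable(mm))
          return -EINTR;
  vma = vma_lookup(mm, start);
  if (!vma || vma->vm_file != file || vma->vm_flags != vm_flags)
          ret = -EINVAL;          /* the VMA changed in between */
  else
          ret = do_mmap(file, start, size, prot, flags, /* ... */);
  mmap_write_unlock(mm);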

Link: https://lkml.kernel.org/r/20241018161415.3845146-1-roberto.sassu@huaweicloud.com
Fixes: ea7e2d5e49 ("mm: call the security_mmap_file() LSM hook in remap_file_pages()")
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Roberto Sassu <roberto.sassu@huawei.com>
Reported-by: syzbot+1cd571a672400ef3a930@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/linux-security-module/66f7b10e.050a0220.46d20.0036.GAE@google.com/
Tested-by: Roberto Sassu <roberto.sassu@huawei.com>
Reviewed-by: Roberto Sassu <roberto.sassu@huawei.com>
Reviewed-by: Jann Horn <jannh@google.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
Reviewed-by: Paul Moore <paul@paul-moore.com>
Tested-by: syzbot+1cd571a672400ef3a930@syzkaller.appspotmail.com
Cc: Jarkko Sakkinen <jarkko@kernel.org>
Cc: Dmitry Kasatkin <dmitry.kasatkin@gmail.com>
Cc: Eric Snowberg <eric.snowberg@oracle.com>
Cc: James Morris <jmorris@namei.org>
Cc: Mimi Zohar <zohar@linux.ibm.com>
Cc: "Serge E. Hallyn" <serge@hallyn.com>
Cc: Shu Han <ebpqwerty472123@gmail.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:41 -07:00
Edward Liaw
f2330b650e selftests/mm: fix deadlock for fork after pthread_create with atomic_bool
Some additional synchronization is needed on Android ARM64; we see a
deadlock with pthread_create when the parent thread races forward before
the child has a chance to start doing work.

Link: https://lkml.kernel.org/r/20241018171734.2315053-4-edliaw@google.com
Fixes: cff2945827 ("selftests/mm: extend and rename uffd pagemap test")
Signed-off-by: Edward Liaw <edliaw@google.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:41 -07:00
Edward Liaw
3673167a3a Revert "selftests/mm: replace atomic_bool with pthread_barrier_t"
This reverts commit e61ef21e27.

uffd_poll_thread may be called by other tests that do not initialize the
pthread_barrier, so this approach is not correct.  This will revert to
using atomic_bool instead.

Link: https://lkml.kernel.org/r/20241018171734.2315053-3-edliaw@google.com
Fixes: e61ef21e27 ("selftests/mm: replace atomic_bool with pthread_barrier_t")
Signed-off-by: Edward Liaw <edliaw@google.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:41 -07:00
Edward Liaw
5bb1f4c934 Revert "selftests/mm: fix deadlock for fork after pthread_create on ARM"
Patch series "selftests/mm: revert pthread_barrier change"

On Android arm, pthread_create followed by a fork caused a deadlock in
the case where the fork required work to be completed by the created
thread.

The previous patches incorrectly assumed that the parent would
always initialize the pthread_barrier for the child thread.  This
reverts the change and replaces the fix for wp-fork-with-event with the
original use of atomic_bool.


This patch (of 3):

This reverts commit e142cc87ac.

fork_event_consumer may be called by other tests that do not initialize
the pthread_barrier, so this approach is not correct.  The subsequent
patch will revert to using atomic_bool instead.

Link: https://lkml.kernel.org/r/20241018171734.2315053-1-edliaw@google.com
Link: https://lkml.kernel.org/r/20241018171734.2315053-2-edliaw@google.com
Fixes: e142cc87ac ("fix deadlock for fork after pthread_create on ARM")
Signed-off-by: Edward Liaw <edliaw@google.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:40 -07:00
Lorenzo Stoakes
e8133a7799 tools: testing: add expand-only mode VMA test
Add a test to assert that VMG_FLAG_JUST_EXPAND functions as expected - that
is, when the VMA iterator is positioned at the previous VMA and no VMAs
follow it, we observe an expansion with all state as expected.

Explicitly place a prior VMA that would otherwise fail this test if the
mode were not enabled (as it would traverse to the previous-previous VMA).

Link: https://lkml.kernel.org/r/d2f88330254a6448092412bf7dfe077a579ab0dc.1729174352.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Jann Horn <jannh@google.com>
Cc: kernel test robot <oliver.sang@intel.com>
Cc: Liam R. Howlett <Liam.Howlett@Oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:40 -07:00
Lorenzo Stoakes
c4d91e225f mm/vma: add expand-only VMA merge mode and optimise do_brk_flags()
Patch series "introduce VMA merge mode to improve brk() performance".

A ~5% performance regression was discovered on the
aim9.brk_test.ops_per_sec by the linux kernel test bot [0].

In the past to satisfy brk() performance we duplicated VMA expansion code
and special-cased do_brk_flags().  This is however horrid and undoes work
to abstract this logic, so in resolving the issue I have endeavoured to
avoid this.

Investigating further, I was able to observe that the use of a
vma_iter_next_range() and vma_prev() pair causes an unnecessary maple
tree walk.  In addition, there is work that we do that is simply
unnecessary for brk().

Therefore, add a special VMA merge mode VMG_FLAG_JUST_EXPAND to avoid
doing any of this - it assumes the VMA iterator is pointing at the
previous VMA and skips logic that brk() does not require.

This mostly eliminates the performance regression, reducing it to ~2%,
which is in the realm of noise.  In addition, the will-it-scale test brk2,
written to be more representative of real-world brk() usage, shows a
modest performance improvement - which gives me confidence that we are not
meaningfully regressing real workloads here.

This series includes a test asserting that the 'just expand' mode works as
expected.

With many thanks to Oliver Sang for helping with performance testing of
candidate patch sets!

[0]:https://lore.kernel.org/linux-mm/202409301043.629bea78-oliver.sang@intel.com


This patch (of 2):

We know in advance that do_brk_flags() wants only to perform a VMA
expansion (if the prior VMA is compatible), and that we assume no
mergeable VMA follows it.

These are the semantics of this function prior to the recent rewrite of
the VMA merging logic, however we are now doing more work than necessary -
positioning the VMA iterator at the prior VMA and performing tasks that
are not required.

Add a new field to the vmg struct to permit merge flags and add a new
merge flag VMG_FLAG_JUST_EXPAND which implies this behaviour, and have
do_brk_flags() use this.
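
A hedged sketch of how do_brk_flags() might opt in (the struct and field
names here follow the series description and could differ in detail from
the final code):

  /* The iterator already sits at the previous VMA, so ask the merge
   * machinery to attempt only an expansion of that VMA. */
  struct vma_merge_struct vmg = {
          .vmi         = vmi,
          .start       = addr,
          .end         = addr + len,
          .flags       = flags,
          .merge_flags = VMG_FLAG_JUST_EXPAND,
  };

  vma = vma_merge_new_range(&vmg);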

This fixes a reported performance regression in a brk() benchmarking suite.

Link: https://lkml.kernel.org/r/cover.1729174352.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/4e65d4395e5841c5acf8470dbcb714016364fd39.1729174352.git.lorenzo.stoakes@oracle.com
Fixes: cacded5e42 ("mm: avoid using vma_merge() for new VMAs")
Reported-by: kernel test robot <oliver.sang@intel.com>
Closes: https://lore.kernel.org/linux-mm/202409301043.629bea78-oliver.sang@intel.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
Cc: Jann Horn <jannh@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:40 -07:00
Gregory Price
b125a0def2 resource,kexec: walk_system_ram_res_rev must retain resource flags
walk_system_ram_res_rev() erroneously discards resource flags when passing
the information to the callback.

This causes systems with IORESOURCE_SYSRAM_DRIVER_MANAGED memory to have
these resources selected during kexec to store kexec buffers if that
memory happens to be placed above normal system RAM.

This leads to undefined behavior after reboot.  If the kexec buffer is
never touched, nothing happens.  If the kexec buffer is touched, it could
lead to a crash (like below) or undefined behavior.

Tested on a system with CXL memory expanders with driver managed memory,
TPM enabled, and CONFIG_IMA_KEXEC=y.  Adding printk's showed the flags
were being discarded and as a result the check for
IORESOURCE_SYSRAM_DRIVER_MANAGED passes.

find_next_iomem_res: name(System RAM (kmem))
		     start(10000000000)
		     end(1034fffffff)
		     flags(83000200)

locate_mem_hole_top_down: start(10000000000) end(1034fffffff) flags(0)

[.] BUG: unable to handle page fault for address: ffff89834ffff000
[.] #PF: supervisor read access in kernel mode
[.] #PF: error_code(0x0000) - not-present page
[.] PGD c04c8bf067 P4D c04c8bf067 PUD c04c8be067 PMD 0
[.] Oops: 0000 [#1] SMP
[.] RIP: 0010:ima_restore_measurement_list+0x95/0x4b0
[.] RSP: 0018:ffffc900000d3a80 EFLAGS: 00010286
[.] RAX: 0000000000001000 RBX: 0000000000000000 RCX: ffff89834ffff000
[.] RDX: 0000000000000018 RSI: ffff89834ffff000 RDI: ffff89834ffff018
[.] RBP: ffffc900000d3ba0 R08: 0000000000000020 R09: ffff888132b8a900
[.] R10: 4000000000000000 R11: 000000003a616d69 R12: 0000000000000000
[.] R13: ffffffff8404ac28 R14: 0000000000000000 R15: ffff89834ffff000
[.] FS:  0000000000000000(0000) GS:ffff893d44640000(0000) knlGS:0000000000000000
[.] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[.] ata5: SATA link down (SStatus 0 SControl 300)
[.] CR2: ffff89834ffff000 CR3: 000001034d00f001 CR4: 0000000000770ef0
[.] PKRU: 55555554
[.] Call Trace:
[.]  <TASK>
[.]  ? __die+0x78/0xc0
[.]  ? page_fault_oops+0x2a8/0x3a0
[.]  ? exc_page_fault+0x84/0x130
[.]  ? asm_exc_page_fault+0x22/0x30
[.]  ? ima_restore_measurement_list+0x95/0x4b0
[.]  ? template_desc_init_fields+0x317/0x410
[.]  ? crypto_alloc_tfm_node+0x9c/0xc0
[.]  ? init_ima_lsm+0x30/0x30
[.]  ima_load_kexec_buffer+0x72/0xa0
[.]  ima_init+0x44/0xa0
[.]  __initstub__kmod_ima__373_1201_init_ima7+0x1e/0xb0
[.]  ? init_ima_lsm+0x30/0x30
[.]  do_one_initcall+0xad/0x200
[.]  ? idr_alloc_cyclic+0xaa/0x110
[.]  ? new_slab+0x12c/0x420
[.]  ? new_slab+0x12c/0x420
[.]  ? number+0x12a/0x430
[.]  ? sysvec_apic_timer_interrupt+0xa/0x80
[.]  ? asm_sysvec_apic_timer_interrupt+0x16/0x20
[.]  ? parse_args+0xd4/0x380
[.]  ? parse_args+0x14b/0x380
[.]  kernel_init_freeable+0x1c1/0x2b0
[.]  ? rest_init+0xb0/0xb0
[.]  kernel_init+0x16/0x1a0
[.]  ret_from_fork+0x2f/0x40
[.]  ? rest_init+0xb0/0xb0
[.]  ret_from_fork_asm+0x11/0x20
[.]  </TASK>

Link: https://lore.kernel.org/all/20231114091658.228030-1-bhe@redhat.com/
Link: https://lkml.kernel.org/r/20241017190347.5578-1-gourry@gourry.net
Fixes: 7acf164b25 ("resource: add walk_system_ram_res_rev()")
Signed-off-by: Gregory Price <gourry@gourry.net>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Acked-by: Baoquan He <bhe@redhat.com>
Cc: AKASHI Takahiro <takahiro.akashi@linaro.org>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Cc: Mika Westerberg <mika.westerberg@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:40 -07:00
Ryusuke Konishi
41e192ad27 nilfs2: fix kernel bug due to missing clearing of checked flag
Syzbot reported that in directory operations after nilfs2 detects
filesystem corruption and degrades to read-only,
__block_write_begin_int(), which is called to prepare block writes, may
fail the BUG_ON check for accesses exceeding the folio/page size,
triggering a kernel bug.

This was found to be because the "checked" flag of a page/folio was not
cleared when it was discarded by nilfs2's own routine, which causes the
sanity check of directory entries to be skipped when the directory
page/folio is reloaded.  So, fix that.
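
A hedged sketch of the fix in nilfs2's own folio discard routine
(placement and the surrounding state-clearing are simplified):

  /* Clear the "checked" state along with the rest of the folio state,
   * so the directory-entry sanity check runs again when the folio is
   * reloaded after the discard. */
  folio_clear_uptodate(folio);
  folio_clear_mappedtodisk(folio);
  folio_clear_checked(folio);     /* the missing clearing */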

This became necessary when the use of nilfs2's own page discard routine
was extended beyond metadata files.

Link: https://lkml.kernel.org/r/20241017193359.5051-1-konishi.ryusuke@gmail.com
Fixes: 8c26c4e269 ("nilfs2: fix issue with flush kernel thread after remount in RO mode because of driver's internal error or metadata corruption")
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Reported-by: syzbot+d6ca2daf692c7a82f959@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=d6ca2daf692c7a82f959
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:40 -07:00
Nobuhiro Iwamatsu
d95fb348f0 mm: numa_clear_kernel_node_hotplug: Add NUMA_NO_NODE check for node id
The memory blocks acquired for the reserved region may include blocks
outside of memory management.  In this case, the nid variable is set to
NUMA_NO_NODE (-1), so an error occurs in node_set().  Add a check using
numa_valid_node() to numa_clear_kernel_node_hotplug() that skips
node_set() when nid is set to NUMA_NO_NODE.
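
A hedged sketch of the added check in numa_clear_kernel_node_hotplug()
(the surrounding loop is simplified):

  /* Reserved blocks outside memory management carry nid == NUMA_NO_NODE;
   * skip them instead of feeding -1 into node_set(). */
  for_each_reserved_mem_region(mb_region) {
          int nid = memblock_get_region_node(mb_region);

          if (!numa_valid_node(nid))
                  continue;
          node_set(nid, reserved_nodemask);
  }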

Link: https://lkml.kernel.org/r/1729070461-13576-1-git-send-email-nobuhiro1.iwamatsu@toshiba.co.jp
Fixes: 8748270821 ("mm: introduce numa_memblks")
Signed-off-by: Nobuhiro Iwamatsu <nobuhiro1.iwamatsu@toshiba.co.jp>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Suggested-by: Yuji Ishikawa <yuji2.ishikawa@toshiba.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:40 -07:00
Edward Adam Davis
bc0a2f3a73 ocfs2: pass u64 to ocfs2_truncate_inline maybe overflow
Syzbot reported a kernel BUG in ocfs2_truncate_inline.  There are two
reasons for this: first, the parameter value passed is greater than
ocfs2_max_inline_data_with_xattr; second, the start and end parameters of
ocfs2_truncate_inline are "unsigned int".

So, we need to add a sanity check for byte_start and byte_len right before
ocfs2_truncate_inline() in ocfs2_remove_inode_range(), if they are greater
than ocfs2_max_inline_data_with_xattr return -EINVAL.
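
A hedged sketch of the sanity check in ocfs2_remove_inode_range()
(variable names follow the description above):

  /* Reject ranges that cannot fit in the inline-data area before they
   * are narrowed into ocfs2_truncate_inline()'s unsigned int params. */
  if (byte_start > ocfs2_max_inline_data_with_xattr(inode->i_sb, di) ||
      byte_start + byte_len > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) {
          ret = -EINVAL;
          goto out;
  }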

Link: https://lkml.kernel.org/r/tencent_D48DB5122ADDAEDDD11918CFB68D93258C07@qq.com
Fixes: 1afc32b952 ("ocfs2: Write support for inline data")
Signed-off-by: Edward Adam Davis <eadavis@qq.com>
Reported-by: syzbot+81092778aac03460d6b7@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=81092778aac03460d6b7
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Changwei Ge <gechangwei@live.cn>
Cc: Gang He <ghe@suse.com>
Cc: Jun Piao <piaojun@huawei.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:40 -07:00
Jeongjun Park
d949d1d14f mm: shmem: fix data-race in shmem_getattr()
I got the following KCSAN report during syzbot testing:

==================================================================
BUG: KCSAN: data-race in generic_fillattr / inode_set_ctime_current

write to 0xffff888102eb3260 of 4 bytes by task 6565 on cpu 1:
 inode_set_ctime_to_ts include/linux/fs.h:1638 [inline]
 inode_set_ctime_current+0x169/0x1d0 fs/inode.c:2626
 shmem_mknod+0x117/0x180 mm/shmem.c:3443
 shmem_create+0x34/0x40 mm/shmem.c:3497
 lookup_open fs/namei.c:3578 [inline]
 open_last_lookups fs/namei.c:3647 [inline]
 path_openat+0xdbc/0x1f00 fs/namei.c:3883
 do_filp_open+0xf7/0x200 fs/namei.c:3913
 do_sys_openat2+0xab/0x120 fs/open.c:1416
 do_sys_open fs/open.c:1431 [inline]
 __do_sys_openat fs/open.c:1447 [inline]
 __se_sys_openat fs/open.c:1442 [inline]
 __x64_sys_openat+0xf3/0x120 fs/open.c:1442
 x64_sys_call+0x1025/0x2d60 arch/x86/include/generated/asm/syscalls_64.h:258
 do_syscall_x64 arch/x86/entry/common.c:52 [inline]
 do_syscall_64+0x54/0x120 arch/x86/entry/common.c:83
 entry_SYSCALL_64_after_hwframe+0x76/0x7e

read to 0xffff888102eb3260 of 4 bytes by task 3498 on cpu 0:
 inode_get_ctime_nsec include/linux/fs.h:1623 [inline]
 inode_get_ctime include/linux/fs.h:1629 [inline]
 generic_fillattr+0x1dd/0x2f0 fs/stat.c:62
 shmem_getattr+0x17b/0x200 mm/shmem.c:1157
 vfs_getattr_nosec fs/stat.c:166 [inline]
 vfs_getattr+0x19b/0x1e0 fs/stat.c:207
 vfs_statx_path fs/stat.c:251 [inline]
 vfs_statx+0x134/0x2f0 fs/stat.c:315
 vfs_fstatat+0xec/0x110 fs/stat.c:341
 __do_sys_newfstatat fs/stat.c:505 [inline]
 __se_sys_newfstatat+0x58/0x260 fs/stat.c:499
 __x64_sys_newfstatat+0x55/0x70 fs/stat.c:499
 x64_sys_call+0x141f/0x2d60 arch/x86/include/generated/asm/syscalls_64.h:263
 do_syscall_x64 arch/x86/entry/common.c:52 [inline]
 do_syscall_64+0x54/0x120 arch/x86/entry/common.c:83
 entry_SYSCALL_64_after_hwframe+0x76/0x7e

value changed: 0x2755ae53 -> 0x27ee44d3

Reported by Kernel Concurrency Sanitizer on:
CPU: 0 UID: 0 PID: 3498 Comm: udevd Not tainted 6.11.0-rc6-syzkaller-00326-gd1f2d51b711a-dirty #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 08/06/2024
==================================================================

When generic_fillattr() is called without holding the read lock, a data
race can occur on inode member variables, which can cause unexpected
behavior.

Since there is no special protection when shmem_getattr() calls
generic_fillattr(), a data race can be triggered by functions such as
shmem_unlink() or shmem_mknod().  This can cause unexpected results, so
merely documenting the race in a comment is not enough.

Therefore, when calling generic_fillattr() from shmem_getattr(), it is
appropriate to protect the inode using inode_lock_shared() and
inode_unlock_shared() to prevent the data race.
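
A hedged sketch of the locking added around the generic_fillattr() call
in shmem_getattr() (surrounding code omitted):

  /* Hold the shared inode lock so a concurrent shmem_mknod() or
   * shmem_unlink() updating timestamps cannot race with this read. */
  inode_lock_shared(inode);
  generic_fillattr(idmap, request_mask, inode, stat);
  inode_unlock_shared(inode);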

Link: https://lkml.kernel.org/r/20240909123558.70229-1-aha310510@gmail.com
Fixes: 44a30220bc ("shmem: recalculate file inode when fstat")
Signed-off-by: Jeongjun Park <aha310510@gmail.com>
Reported-by: syzbot <syzkaller@googlegroup.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:39 -07:00
Jann Horn
14611508cb mm: mark mas allocation in vms_abort_munmap_vmas as __GFP_NOFAIL
vms_abort_munmap_vmas() is a recovery path where, on entry, some VMAs have
already been torn down halfway (in a way we can't undo) but are still
present in the maple tree.

At this point, we *must* remove the VMAs from the VMA tree, otherwise we
get UAF.

Because removing VMA tree nodes can require memory allocation, the
existing code has an error path which tries to handle this by reattaching
the VMAs; but that can't be done safely.

A nicer way to fix it would probably be to preallocate enough maple tree
nodes for the removal before the point of no return, or something like
that; but for now, fix it the easy and kinda ugly way, by marking this
allocation __GFP_NOFAIL.
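
A hedged sketch of the change (the maple-tree store in the abort path
now carries __GFP_NOFAIL; the surrounding loop is omitted):

  /* The VMAs are already half torn down; removing them from the tree
   * must not fail, so let the allocator block until it succeeds. */
  mas_set_range(mas, vma->vm_start, vma->vm_end - 1);
  mas_store_gfp(mas, NULL, GFP_KERNEL | __GFP_NOFAIL);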

Link: https://lkml.kernel.org/r/20241016-fix-munmap-abort-v1-1-601c94b2240d@google.com
Fixes: 4f87153e82 ("mm: change failure of MAP_FIXED to restoring the gap on failure")
Signed-off-by: Jann Horn <jannh@google.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:39 -07:00
Sabyrzhan Tasbolatov
1db272864f x86/traps: move kmsan check after instrumentation_begin
During an x86_64 kernel build with CONFIG_KMSAN, objtool warns as follows:

  AR      built-in.a
  AR      vmlinux.a
  LD      vmlinux.o
vmlinux.o: warning: objtool: handle_bug+0x4: call to
    kmsan_unpoison_entry_regs() leaves .noinstr.text section
  OBJCOPY modules.builtin.modinfo
  GEN     modules.builtin
  MODPOST Module.symvers
  CC      .vmlinux.export.o

Moving kmsan_unpoison_entry_regs() _after_ instrumentation_begin() fixes
the warning.

The decode_bug(regs->ip, &imm) call is left before the KMSAN unpoisoning;
it has a return condition, and if we move it after
instrumentation_begin() it results in the warning "return with
instrumentation enabled".  Hence, I'm concerned that regs will not be
KMSAN-unpoisoned if `ud_type == BUG_NONE` is true.
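
A hedged sketch of the reordering in handle_bug() (simplified):

  static noinstr void handle_bug(struct pt_regs *regs)
  {
          /* ... decode_bug() and early exits ... */
          instrumentation_begin();
          /* Call the instrumented KMSAN helper only once instrumentation
           * is legal; objtool then stops flagging .noinstr.text. */
          kmsan_unpoison_entry_regs(regs);
          /* ... report the bug ... */
          instrumentation_end();
  }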

Link: https://lkml.kernel.org/r/20241016152407.3149001-1-snovitoll@gmail.com
Fixes: ba54d194f8 ("x86/traps: avoid KMSAN bugs originating from handle_bug()")
Signed-off-by: Sabyrzhan Tasbolatov <snovitoll@gmail.com>
Reviewed-by: Alexander Potapenko <glider@google.com>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:39 -07:00
Huang Ying
b7c5f9a1fb resource: remove dependency on SPARSEMEM from GET_FREE_REGION
We want to use the functions (get_free_mem_region()) configured via
GET_FREE_REGION in resource kunit tests.  However, GET_FREE_REGION
depends on SPARSEMEM now.  This means the resource kunit tests cannot be
built on architectures lacking SPARSEMEM, and it causes a config warning
as follows,

  WARNING: unmet direct dependencies detected for GET_FREE_REGION
  Depends on [n]: SPARSEMEM [=n]
  Selected by [y]:
  - RESOURCE_KUNIT_TEST [=y] && RUNTIME_TESTING_MENU [=y] && KUNIT [=y]

When get_free_mem_region() was introduced the only consumers were those
looking to pass the address range to memremap_pages().  That address
range needed to be mindful of the maximum addressable platform physical
address, which at the time only SPARSEMEM defined, via MAX_PHYSMEM_BITS.

Given that memremap_pages() also depended on SPARSEMEM via ZONE_DEVICE,
it was easier to just depend on that definition than invent a general
MAX_PHYSMEM_BITS concept outside of SPARSEMEM.

Turns out that decision was buggy and did not account for KASAN
consumption of physical address space.  That problem was resolved
recently with commit ea72ce5da2 ("x86/kaslr: Expose and use the end
of the physical memory address space"), and GET_FREE_REGION dropped its
MAX_PHYSMEM_BITS dependency.

Then commit 99185c10d5 ("resource, kunit: add test case for
region_intersects()"), went ahead and fixed up the only remaining
dependency on SPARSEMEM which was usage of the PA_SECTION_SHIFT macro
for setting the default alignment.  A PAGE_SIZE fallback is fine in the
SPARSEMEM=n case.

With those build dependencies gone, GET_FREE_REGION no longer depends on
SPARSEMEM.  So, remove the dependency on SPARSEMEM from GET_FREE_REGION
to fix the build issues.

Link: https://lkml.kernel.org/r/20241016014730.339369-1-ying.huang@intel.com
Link: https://lore.kernel.org/lkml/20240922225041.603186-1-linux@roeck-us.net/
Link: https://lkml.kernel.org/r/20241015051554.294734-1-ying.huang@intel.com
Fixes: 99185c10d5 ("resource, kunit: add test case for region_intersects()")
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Co-developed-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Tested-by: Guenter Roeck <linux@roeck-us.net>
Acked-by: David Hildenbrand <david@redhat.com>
Tested-by: Nathan Chancellor <nathan@kernel.org> # build
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Jonathan Cameron <jonathan.cameron@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:39 -07:00
Liam R. Howlett
79f3d123ca mm/mmap: fix race in mmap_region() with ftruncate()
Avoiding the zeroing of the vma tree in mmap_region() introduced a race
with truncate in the page table walk.  To avoid any races, create a hole
in the rmap during the operation by clearing the pagetable entries earlier
under the mmap write lock and (critically) before the new vma is installed
into the vma tree.  The result is that the old vma(s) are left in the vma
tree, but free_pgtables() removes them from the rmap and clears the ptes
while holding the necessary locks.

This change extends the fix required for hugetlbfs and the call_mmap()
function by moving the cleanup higher in the function and running it
unconditionally.

Link: https://lkml.kernel.org/r/20241016013455.2241533-1-Liam.Howlett@oracle.com
Fixes: f8d112a4e6 ("mm/mmap: avoid zeroing vma tree in mmap_region()")
Signed-off-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
Reported-by: Jann Horn <jannh@google.com>
Closes: https://lore.kernel.org/all/CAG48ez0ZpGzxi=-5O_uGQ0xKXOmbjeQ0LjZsRJ1Qtf2X5eOr1w@mail.gmail.com/
Reviewed-by: Jann Horn <jannh@google.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:39 -07:00
Matt Fleming
281dd25c1a mm/page_alloc: let GFP_ATOMIC order-0 allocs access highatomic reserves
Under memory pressure it's possible for GFP_ATOMIC order-0 allocations to
fail even though free pages are available in the highatomic reserves. 
GFP_ATOMIC allocations cannot trigger unreserve_highatomic_pageblock()
since it's only run from reclaim.

Given that such allocations will pass the watermarks in
__zone_watermark_unusable_free(), it makes sense to fallback to highatomic
reserves the same way that ALLOC_OOM can.

This fixes order-0 page allocation failures observed on Cloudflare's fleet
when handling network packets:

  kswapd1: page allocation failure: order:0, mode:0x820(GFP_ATOMIC),
  nodemask=(null),cpuset=/,mems_allowed=0-7
  CPU: 10 PID: 696 Comm: kswapd1 Kdump: loaded Tainted: G           O 6.6.43-CUSTOM #1
  Hardware name: MACHINE
  Call Trace:
   <IRQ>
   dump_stack_lvl+0x3c/0x50
   warn_alloc+0x13a/0x1c0
   __alloc_pages_slowpath.constprop.0+0xc9d/0xd10
   __alloc_pages+0x327/0x340
   __napi_alloc_skb+0x16d/0x1f0
   bnxt_rx_page_skb+0x96/0x1b0 [bnxt_en]
   bnxt_rx_pkt+0x201/0x15e0 [bnxt_en]
   __bnxt_poll_work+0x156/0x2b0 [bnxt_en]
   bnxt_poll+0xd9/0x1c0 [bnxt_en]
   __napi_poll+0x2b/0x1b0
   bpf_trampoline_6442524138+0x7d/0x1000
   __napi_poll+0x5/0x1b0
   net_rx_action+0x342/0x740
   handle_softirqs+0xcf/0x2b0
   irq_exit_rcu+0x6c/0x90
   sysvec_apic_timer_interrupt+0x72/0x90
   </IRQ>

[mfleming@cloudflare.com: update comment]
  Link: https://lkml.kernel.org/r/20241015125158.3597702-1-matt@readmodwrite.com
Link: https://lkml.kernel.org/r/20241011120737.3300370-1-matt@readmodwrite.com
Link: https://lore.kernel.org/all/CAGis_TWzSu=P7QJmjD58WWiu3zjMTVKSzdOwWE8ORaGytzWJwQ@mail.gmail.com/
Fixes: 1d91df85f3 ("mm/page_alloc: handle a missing case for memalloc_nocma_{save/restore} APIs")
Signed-off-by: Matt Fleming <mfleming@cloudflare.com>
Suggested-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:39 -07:00
Lorenzo Stoakes
985da552a9 fork: only invoke khugepaged, ksm hooks if no error
There is no reason to invoke these hooks early against an mm that is in an
incomplete state.

The change in commit d240629148 ("fork: use __mt_dup() to duplicate
maple tree in dup_mmap()") makes this more pertinent as we may be in a
state where entries in the maple tree are not yet consistent.

Their placement early in dup_mmap() only appears to have been meaningful
for early error checking, and since functionally it'd require a very small
allocation to fail (in practice 'too small to fail') that'd only occur in
the most dire circumstances, meaning the fork would fail or be OOM'd in
any case.

Since both khugepaged and KSM tracking are there to provide optimisations
to memory performance rather than critical functionality, it doesn't
really matter all that much if, under such dire memory pressure, we fail
to register an mm with these.

As a result, we follow the example of commit d2081b2bf8 ("mm:
khugepaged: make khugepaged_enter() void function") and make ksm_fork() a
void function also.

We only expose the mm to these functions once we are done with them and
only if no error occurred in the fork operation.
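
A hedged sketch of the resulting ordering at the end of dup_mmap()
(simplified):

  /* Expose the new mm to the optimisation machinery only once the
   * duplication has fully succeeded. */
  if (!retval) {
          ksm_fork(mm, oldmm);            /* now returns void */
          khugepaged_fork(mm, oldmm);     /* already void */
  }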

Link: https://lkml.kernel.org/r/e0cb8b840c9d1d5a6e84d4f8eff5f3f2022aa10c.1729014377.git.lorenzo.stoakes@oracle.com
Fixes: d240629148 ("fork: use __mt_dup() to duplicate maple tree in dup_mmap()")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reported-by: Jann Horn <jannh@google.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Jann Horn <jannh@google.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Linus Torvalds <torvalds@linuxfoundation.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:39 -07:00
Lorenzo Stoakes
f64e67e5d3 fork: do not invoke uffd on fork if error occurs
Patch series "fork: do not expose incomplete mm on fork".

During fork we may place the virtual memory address space into an
inconsistent state before the fork operation is complete.

In addition, we may encounter an error during the fork operation that
indicates that the virtual memory address space is invalidated.

As a result, we should not be exposing it in any way to external machinery
that might interact with the mm or VMAs, machinery that is not designed to
deal with incomplete state.

We specifically update the fork logic to defer khugepaged and ksm to the
end of the operation and only to be invoked if no error arose, and
disallow uffd from observing fork events should an error have occurred.


This patch (of 2):

Currently on fork we expose the virtual address space of a process to
userland unconditionally if uffd is registered in VMAs, regardless of
whether an error arose in the fork.

This is performed in dup_userfaultfd_complete() which is invoked
unconditionally, and performs two duties - invoking registered handlers
for the UFFD_EVENT_FORK event via dup_fctx(), and clearing down
userfaultfd_fork_ctx objects established in dup_userfaultfd().

This is problematic, because the virtual address space may not yet be
correctly initialised if an error arose.

The change in commit d240629148 ("fork: use __mt_dup() to duplicate
maple tree in dup_mmap()") makes this more pertinent as we may be in a
state where entries in the maple tree are not yet consistent.

We address this by, on fork error, ensuring that we roll back state that
we would otherwise expect to clean up through the event being handled by
userland and perform the memory freeing duty otherwise performed by
dup_userfaultfd_complete().

We do this by implementing a new function, dup_userfaultfd_fail(), which
performs the same loop, only decrementing reference counts.

Note that we perform mmgrab() on the parent and child mm's, however
userfaultfd_ctx_put() will mmdrop() this once the reference count drops to
zero, so we will avoid memory leaks correctly here.
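
A hedged sketch of dup_userfaultfd_fail() based on the description above
(structure and field names approximate):

  void dup_userfaultfd_fail(struct list_head *fcs)
  {
          struct userfaultfd_fork_ctx *fctx, *n;

          /* Drop the references that UFFD_EVENT_FORK delivery would have
           * consumed; userfaultfd_ctx_put() mmdrop()s at refcount zero. */
          list_for_each_entry_safe(fctx, n, fcs, list) {
                  userfaultfd_ctx_put(fctx->orig);
                  userfaultfd_ctx_put(fctx->new);
                  kfree(fctx);
          }
  }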

Link: https://lkml.kernel.org/r/cover.1729014377.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/d3691d58bb58712b6fb3df2be441d175bd3cdf07.1729014377.git.lorenzo.stoakes@oracle.com
Fixes: d240629148 ("fork: use __mt_dup() to duplicate maple tree in dup_mmap()")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reported-by: Jann Horn <jannh@google.com>
Reviewed-by: Jann Horn <jannh@google.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Linus Torvalds <torvalds@linuxfoundation.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:38 -07:00
David Hildenbrand
7c18d48110 mm/pagewalk: fix usage of pmd_leaf()/pud_leaf() without present check
pmd_leaf()/pud_leaf() only implies a pmd_present()/pud_present() check on
some architectures.  We really should check for
pmd_present()/pud_present() first.
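
A hedged sketch of the corrected ordering (illustrative; folio_walk_start()
handles more cases than shown):

  pmd_t pmd = pmdp_get_lockless(pmdp);

  /* A migration entry is !present yet can still satisfy pmd_leaf() on
   * some architectures, so test presence before trusting pmd_leaf(). */
  if (pmd_present(pmd) && pmd_leaf(pmd)) {
          /* handle the present huge mapping */
  } else {
          /* non-present or page-table entry: fall through to the pte
           * level / migration-entry handling */
  }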

This should explain the report we got on ppc64 (which has
CONFIG_PGTABLE_HAS_HUGE_LEAVES set in the config) that triggered:
	VM_WARN_ON_ONCE(pmd_leaf(pmdp_get_lockless(pmdp)));

Likely we had a PMD migration entry for which pmd_leaf() did not trigger. 
We raced with restoring the PMD migration entry, and suddenly saw a
pmd_leaf().  In this case, pte_offset_map_lock() saved us from more
trouble, because it rechecks the PMD value, but we would not have
processed the migration entry -- which is not too bad because the only
user of FW_MIGRATION is KSM for unsharing, and KSM only applies to small
folios.

Further, we shouldn't re-read the PMD/PUD value for our warning; the
primary purpose of the VM_WARN_ON_ONCE() is to find spurious use of
pmd_leaf()/pud_leaf() without CONFIG_PGTABLE_HAS_HUGE_LEAVES.

As a side note, we are currently not implementing FW_MIGRATION support for
PUD migration entries, which likely should exist due to hugetlb.  Add a
TODO so this won't fall through the cracks if more FW_MIGRATION users get
added.

I was able to write a quick reproducer and verify that the issue no
longer triggers with this fix.

https://gitlab.com/davidhildenbrand/scratchspace/-/blob/main/reproducers/move-pages-pmd-leaf.c

Without this fix after a couple of seconds in a VM with 2 NUMA nodes:

[   54.333753] ------------[ cut here ]------------
[   54.334901] WARNING: CPU: 20 PID: 1704 at mm/pagewalk.c:815 folio_walk_start+0x48f/0x6e0
[   54.336455] Modules linked in: ...
[   54.345009] CPU: 20 UID: 0 PID: 1704 Comm: move-pages-pmd- Not tainted 6.12.0-rc2+ #81
[   54.346529] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-2.fc40 04/01/2014
[   54.348191] RIP: 0010:folio_walk_start+0x48f/0x6e0
[   54.349134] Code: b5 ad 48 8d 35 00 00 00 00 e8 6d 59 d7 ff e8 08 74 da ff e9 9c fe ff ff 4c 8b 7c 24 08 4c 89 ff e8 26 2b be 00 e9 8a fe ff ff <0f> 0b e9 ec fe ff ff f7 c2 ff 0f 00 00 0f 85 81 fe ff ff 48 8b 02
[   54.352660] RSP: 0018:ffffb7e4c430bc78 EFLAGS: 00010282
[   54.353679] RAX: 80000002a3e008e7 RBX: ffff9946039aa580 RCX: ffff994380000000
[   54.355056] RDX: ffff994606aec000 RSI: 00007f004b000000 RDI: 0000000000000000
[   54.356440] RBP: 00007f004b000000 R08: 0000000000000591 R09: 0000000000000001
[   54.357820] R10: 0000000000000200 R11: 0000000000000001 R12: ffffb7e4c430bd10
[   54.359198] R13: ffff994606aec2c0 R14: 0000000000000002 R15: ffff994604a89b00
[   54.360564] FS:  00007f004ae006c0(0000) GS:ffff9947f7400000(0000) knlGS:0000000000000000
[   54.362111] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   54.363242] CR2: 00007f004adffe58 CR3: 0000000281e12005 CR4: 0000000000770ef0
[   54.364615] PKRU: 55555554
[   54.365153] Call Trace:
[   54.365646]  <TASK>
[   54.366073]  ? __warn.cold+0xb7/0x14d
[   54.366796]  ? folio_walk_start+0x48f/0x6e0
[   54.367628]  ? report_bug+0xff/0x140
[   54.368324]  ? handle_bug+0x58/0x90
[   54.369019]  ? exc_invalid_op+0x17/0x70
[   54.369771]  ? asm_exc_invalid_op+0x1a/0x20
[   54.370606]  ? folio_walk_start+0x48f/0x6e0
[   54.371415]  ? folio_walk_start+0x9e/0x6e0
[   54.372227]  do_pages_move+0x1c5/0x680
[   54.372972]  kernel_move_pages+0x1a1/0x2b0
[   54.373804]  __x64_sys_move_pages+0x25/0x30

Link: https://lkml.kernel.org/r/20241015111236.1290921-1-david@redhat.com
Fixes: aa39ca6940 ("mm/pagewalk: introduce folio_walk_start() + folio_walk_end()")
Signed-off-by: David Hildenbrand <david@redhat.com>
Reported-by: syzbot+7d917f67c05066cec295@syzkaller.appspotmail.com
Closes: https://lkml.kernel.org/r/670d3248.050a0220.3e960.0064.GAE@google.com
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Jann Horn <jannh@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-28 21:40:38 -07:00
Jarkko Sakkinen
df745e2509 tpm: Lazily flush the auth session
Move the allocation of chip->auth to tpm2_start_auth_session() so that
this field can be used as a flag to tell whether the auth session is
active or not.

Instead of flushing and reloading the auth session for every transaction
separately, keep the session open unless /dev/tpm0 is used.

Reported-by: Pengyu Ma <mapengyu@gmail.com>
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219229
Cc: stable@vger.kernel.org # v6.10+
Fixes: 7ca110f267 ("tpm: Address !chip->auth in tpm_buf_append_hmac_session*()")
Tested-by: Pengyu Ma <mapengyu@gmail.com>
Tested-by: Stefan Berger <stefanb@linux.ibm.com>
Reviewed-by: Stefan Berger <stefanb@linux.ibm.com>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
2024-10-29 00:46:20 +02:00
Ian Rogers
a5384c4267 perf cap: Add __NR_capget to arch/x86 unistd
As there are duplicated kernel headers in tools/include, libc can pick
up the wrong definitions.  This was causing the wrong system call to be
used for capget in perf.

Reported-by: Adrian Hunter <adrian.hunter@intel.com>
Fixes: e25ebda78e ("perf cap: Tidy up and improve capability testing")
Closes: https://lore.kernel.org/lkml/cc7d6bdf-1aeb-4179-9029-4baf50b59342@intel.com/
Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/20241026055448.312247-1-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-10-28 13:04:52 -03:00
Arnaldo Carvalho de Melo
55f1b540d8 tools headers: Update the linux/unaligned.h copy with the kernel sources
To pick up the changes in:

  7f053812da ("random: vDSO: minimize and simplify header includes")

That required adding a copy of include/vdso/unaligned.h and checking it
in tools/perf/check-headers.sh.

Addressing this perf tools build warning:

  Warning: Kernel ABI header differences:
    diff -u tools/include/linux/unaligned.h include/linux/unaligned.h

Please see tools/include/uapi/README for further details.

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Ian Rogers <irogers@google.com>
Cc: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/lkml/Zx-uHvAbPAESofEN@x1
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-10-28 12:34:28 -03:00
Arnaldo Carvalho de Melo
93e4b86b3e tools headers arm64: Sync arm64's cputype.h with the kernel sources
To get the changes in:

  924725707d ("arm64: cputype: Add Neoverse-N3 definitions")

That causes this perf source code to be rebuilt:

  CC      /tmp/build/perf-tools/util/arm-spe.o

The changes in the above patch add MIDR_NEOVERSE_N3, which probably needs
changes in arm-spe.c, so we probably need to add it to that array?  Or
maybe we need to leave this for later, when this is all tested on those
machines?

  static const struct midr_range neoverse_spe[] = {
          MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1),
          MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2),
          MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1),
          {},
  };

Mark Rutland recommended the following about arm-spe.c in a previous
update to this file:

"I would not touch this for now -- someone would have to go audit the
TRMs to check that those other cores have the same encoding, and I think
it'd be better to do that as a follow-up."

That addresses this perf build warning:

  Warning: Kernel ABI header differences:
    diff -u tools/arch/arm64/include/asm/cputype.h arch/arm64/include/asm/cputype.h

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/lkml/Zx-dffKdGsgkhG96@x1
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-10-28 12:33:50 -03:00
Arnaldo Carvalho de Melo
21a3a3d015 tools headers: Synchronize {uapi/}linux/bits.h with the kernel sources
To pick up the changes in this cset:

  947697c6f0 ("uapi: Define GENMASK_U128")

This addresses these perf build warnings:

  Warning: Kernel ABI header differences:
    diff -u tools/include/uapi/linux/bits.h include/uapi/linux/bits.h
    diff -u tools/include/linux/bits.h include/linux/bits.h

Please see tools/include/uapi/README for further details.

Acked-by: Yury Norov <yury.norov@gmail.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/lkml/Zx-ZVH7bHqtFn8Dv@x1
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-10-28 12:32:25 -03:00
Jarkko Sakkinen
cc7d859434 tpm: Rollback tpm2_load_null()
Do not continue on tpm2_create_primary() failure in tpm2_load_null().

Cc: stable@vger.kernel.org # v6.10+
Fixes: eb24c9788c ("tpm: disable the TPM if NULL name changes")
Reviewed-by: Stefan Berger <stefanb@linux.ibm.com>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
2024-10-28 17:22:01 +02:00
Jarkko Sakkinen
d658d59471 tpm: Return tpm2_sessions_init() when null key creation fails
Do not continue tpm2_sessions_init() further if the null key pair creation
fails.

Cc: stable@vger.kernel.org # v6.10+
Fixes: d2add27cf2 ("tpm: Add NULL primary creation")
Reviewed-by: Stefan Berger <stefanb@linux.ibm.com>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
2024-10-28 17:22:01 +02:00
Tejun Heo
c31f2ee5cd sched_ext: Fix enq_last_no_enq_fails selftest
cc9877fb76 ("sched_ext: Improve error reporting during loading") changed
how load failures are reported so that more error context can be
communicated. This breaks the enq_last_no_enq_fails test as attach no longer
fails. The scheduler is guaranteed to be ejected on attach completion with
full error information. Update enq_last_no_enq_fails so that it checks that
the scheduler is ejected using ops.exit().

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Vishal Chourasia <vishalc@linux.ibm.com>
Link: http://lkml.kernel.org/r/Zxknp7RAVNjmdJSc@linux.ibm.com
Fixes: cc9877fb76 ("sched_ext: Improve error reporting during loading")
2024-10-25 12:20:29 -10:00
Tejun Heo
7724abf0ca sched_ext: Make cast_mask() inline
cast_mask() doesn't do any actual work and is defined in a header file.
Force it to be inline. When it is not inlined and the function is not used,
it can cause verification failures like the following:

  # tools/testing/selftests/sched_ext/runner -t minimal
  ===== START =====
  TEST: minimal
  DESCRIPTION: Verify we can load a fully minimal scheduler
  OUTPUT:
  libbpf: prog 'cast_mask': missing BPF prog type, check ELF section name '.text'
  libbpf: prog 'cast_mask': failed to load: -22
  libbpf: failed to load object 'minimal'
  libbpf: failed to load BPF skeleton 'minimal': -22
  ERR: minimal.c:20
  Failed to open and load skel
  not ok 1 minimal #
  =====  END  =====
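
The shape of the fix, sketched (not the verbatim header change;
__always_inline is the usual way to force this in BPF code):

  /* Before: a plain header-defined helper. When clang doesn't inline
   * it, it lands in .text as its own symbol, which libbpf then rejects
   * as a program with a missing BPF prog type. */
  static struct cpumask *cast_mask(struct bpf_cpumask *mask)
  {
          return (struct cpumask *)mask;
  }

  /* After: force inlining so no standalone .text symbol is emitted. */
  static __always_inline struct cpumask *cast_mask(struct bpf_cpumask *mask)
  {
          return (struct cpumask *)mask;
  }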

Signed-off-by: Tejun Heo <tj@kernel.org>
Fixes: a748db0c8c ("tools/sched_ext: Receive misc updates from SCX repo")
2024-10-25 12:19:44 -10:00
David Vernet
0e7ffff1b8 scx: Fix raciness in scx_ops_bypass()
scx_ops_bypass() can currently race on the ops enable / disable path as
follows:

1. scx_ops_bypass(true) called on enable path, bypass depth is set to 1
2. An op on the init path exits, which schedules scx_ops_disable_workfn()
3. scx_ops_bypass(false) is called on the disable path, and bypass depth
   is decremented to 0
4. kthread is scheduled to execute scx_ops_disable_workfn()
5. scx_ops_bypass(true) called, bypass depth set to 1
6. scx_ops_bypass() races when iterating over CPUs

While it's not safe to take any blocking locks on the bypass path, it is
safe to take a raw spinlock which cannot be preempted. This patch therefore
updates scx_ops_bypass() to use a raw spinlock to synchronize, and changes
scx_ops_bypass_depth to be a regular int.
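
A sketch of the resulting synchronization (illustrative lock and
variable names, not the exact kernel code):

  static DEFINE_RAW_SPINLOCK(bypass_lock);
  static int scx_ops_bypass_depth;        /* plain int, guarded by lock */

  static void scx_ops_bypass(bool bypass)
  {
          unsigned long flags;

          raw_spin_lock_irqsave(&bypass_lock, flags);
          scx_ops_bypass_depth += bypass ? 1 : -1;
          WARN_ON_ONCE(scx_ops_bypass_depth < 0);

          if ((bypass && scx_ops_bypass_depth == 1) ||
              (!bypass && scx_ops_bypass_depth == 0)) {
                  /* ... iterate over CPUs; no other bypass toggle can
                   * race with us while the raw spinlock is held ... */
          }
          raw_spin_unlock_irqrestore(&bypass_lock, flags);
  }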

Without this change, we observe the following warnings when running the
'exit' sched_ext selftest (sometimes requires a couple of runs):

.[root@virtme-ng sched_ext]# ./runner -t exit
===== START =====
TEST: exit
...
[   14.935078] WARNING: CPU: 2 PID: 360 at kernel/sched/ext.c:4332 scx_ops_bypass+0x1ca/0x280
[   14.935126] Modules linked in:
[   14.935150] CPU: 2 UID: 0 PID: 360 Comm: sched_ext_ops_h Not tainted 6.11.0-virtme #24
[   14.935192] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Arch Linux 1.16.3-1-1 04/01/2014
[   14.935242] Sched_ext: exit (enabling+all)
[   14.935244] RIP: 0010:scx_ops_bypass+0x1ca/0x280
[   14.935300] Code: ff ff ff e8 48 96 10 00 fb e9 08 ff ff ff c6 05 7b 34 e8 01 01 90 48 c7 c7 89 86 88 87 e8 be 1d f8 ff 90 0f 0b 90 90 eb 95 90 <0f> 0b 90 41 8b 84 24 24 0a 00 00 eb 97 90 0f 0b 90 41 8b 84 24 24
[   14.935394] RSP: 0018:ffffb706c0957ce0 EFLAGS: 00010002
[   14.935424] RAX: 0000000000000009 RBX: 0000000000000001 RCX: 00000000e3fb8b2a
[   14.935465] RDX: 0000000000000001 RSI: 0000000000000004 RDI: ffffffff88a4c080
[   14.935512] RBP: 0000000000009b56 R08: 0000000000000004 R09: 00000003f12e520a
[   14.935555] R10: ffffffff863a9795 R11: 0000000000000000 R12: ffff8fc5fec31300
[   14.935598] R13: ffff8fc5fec31318 R14: 0000000000000286 R15: 0000000000000018
[   14.935642] FS:  0000000000000000(0000) GS:ffff8fc5fe680000(0000) knlGS:0000000000000000
[   14.935684] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   14.935721] CR2: 0000557d92890b88 CR3: 000000002464a000 CR4: 0000000000750ef0
[   14.935765] PKRU: 55555554
[   14.935782] Call Trace:
[   14.935802]  <TASK>
[   14.935823]  ? __warn+0xce/0x220
[   14.935850]  ? scx_ops_bypass+0x1ca/0x280
[   14.935881]  ? report_bug+0xc1/0x160
[   14.935909]  ? handle_bug+0x61/0x90
[   14.935934]  ? exc_invalid_op+0x1a/0x50
[   14.935959]  ? asm_exc_invalid_op+0x1a/0x20
[   14.935984]  ? raw_spin_rq_lock_nested+0x15/0x30
[   14.936019]  ? scx_ops_bypass+0x1ca/0x280
[   14.936046]  ? srso_alias_return_thunk+0x5/0xfbef5
[   14.936081]  ? __pfx_scx_ops_disable_workfn+0x10/0x10
[   14.936111]  scx_ops_disable_workfn+0x146/0xac0
[   14.936142]  ? finish_task_switch+0xa9/0x2c0
[   14.936172]  ? srso_alias_return_thunk+0x5/0xfbef5
[   14.936211]  ? __pfx_scx_ops_disable_workfn+0x10/0x10
[   14.936244]  kthread_worker_fn+0x101/0x2c0
[   14.936268]  ? __pfx_kthread_worker_fn+0x10/0x10
[   14.936299]  kthread+0xec/0x110
[   14.936327]  ? __pfx_kthread+0x10/0x10
[   14.936351]  ret_from_fork+0x37/0x50
[   14.936374]  ? __pfx_kthread+0x10/0x10
[   14.936400]  ret_from_fork_asm+0x1a/0x30
[   14.936427]  </TASK>
[   14.936443] irq event stamp: 21002
[   14.936467] hardirqs last  enabled at (21001): [<ffffffff863aa35f>] resched_cpu+0x9f/0xd0
[   14.936521] hardirqs last disabled at (21002): [<ffffffff863dd0ba>] scx_ops_bypass+0x11a/0x280
[   14.936571] softirqs last  enabled at (20642): [<ffffffff863683d7>] __irq_exit_rcu+0x67/0xd0
[   14.936622] softirqs last disabled at (20637): [<ffffffff863683d7>] __irq_exit_rcu+0x67/0xd0
[   14.936672] ---[ end trace 0000000000000000 ]---
[   14.953282] sched_ext: BPF scheduler "exit" disabled (unregistered from BPF)
[   14.953352] ------------[ cut here ]------------
[   14.953383] WARNING: CPU: 2 PID: 360 at kernel/sched/ext.c:4335 scx_ops_bypass+0x1d8/0x280
[   14.953428] Modules linked in:
[   14.953453] CPU: 2 UID: 0 PID: 360 Comm: sched_ext_ops_h Tainted: G        W          6.11.0-virtme #24
[   14.953505] Tainted: [W]=WARN
[   14.953527] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Arch Linux 1.16.3-1-1 04/01/2014
[   14.953574] RIP: 0010:scx_ops_bypass+0x1d8/0x280
[   14.953603] Code: c6 05 7b 34 e8 01 01 90 48 c7 c7 89 86 88 87 e8 be 1d f8 ff 90 0f 0b 90 90 eb 95 90 0f 0b 90 41 8b 84 24 24 0a 00 00 eb 97 90 <0f> 0b 90 41 8b 84 24 24 0a 00 00 eb 92 f3 0f 1e fa 49 8d 84 24 f0
[   14.953693] RSP: 0018:ffffb706c0957ce0 EFLAGS: 00010046
[   14.953722] RAX: 0000000000000001 RBX: 0000000000000000 RCX: 0000000000000001
[   14.953763] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8fc5fec31318
[   14.953804] RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000000000000
[   14.953845] R10: ffffffff863a9795 R11: 0000000000000000 R12: ffff8fc5fec31300
[   14.953888] R13: ffff8fc5fec31318 R14: 0000000000000286 R15: 0000000000000018
[   14.953934] FS:  0000000000000000(0000) GS:ffff8fc5fe680000(0000) knlGS:0000000000000000
[   14.953974] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   14.954009] CR2: 0000557d92890b88 CR3: 000000002464a000 CR4: 0000000000750ef0
[   14.954052] PKRU: 55555554
[   14.954068] Call Trace:
[   14.954085]  <TASK>
[   14.954102]  ? __warn+0xce/0x220
[   14.954126]  ? scx_ops_bypass+0x1d8/0x280
[   14.954150]  ? report_bug+0xc1/0x160
[   14.954178]  ? handle_bug+0x61/0x90
[   14.954203]  ? exc_invalid_op+0x1a/0x50
[   14.954226]  ? asm_exc_invalid_op+0x1a/0x20
[   14.954250]  ? raw_spin_rq_lock_nested+0x15/0x30
[   14.954285]  ? scx_ops_bypass+0x1d8/0x280
[   14.954311]  ? __mutex_unlock_slowpath+0x3a/0x260
[   14.954343]  scx_ops_disable_workfn+0xa3e/0xac0
[   14.954381]  ? __pfx_scx_ops_disable_workfn+0x10/0x10
[   14.954413]  kthread_worker_fn+0x101/0x2c0
[   14.954442]  ? __pfx_kthread_worker_fn+0x10/0x10
[   14.954479]  kthread+0xec/0x110
[   14.954507]  ? __pfx_kthread+0x10/0x10
[   14.954530]  ret_from_fork+0x37/0x50
[   14.954553]  ? __pfx_kthread+0x10/0x10
[   14.954576]  ret_from_fork_asm+0x1a/0x30
[   14.954603]  </TASK>
[   14.954621] irq event stamp: 21002
[   14.954644] hardirqs last  enabled at (21001): [<ffffffff863aa35f>] resched_cpu+0x9f/0xd0
[   14.954686] hardirqs last disabled at (21002): [<ffffffff863dd0ba>] scx_ops_bypass+0x11a/0x280
[   14.954735] softirqs last  enabled at (20642): [<ffffffff863683d7>] __irq_exit_rcu+0x67/0xd0
[   14.954782] softirqs last disabled at (20637): [<ffffffff863683d7>] __irq_exit_rcu+0x67/0xd0
[   14.954829] ---[ end trace 0000000000000000 ]---
[   15.022283] sched_ext: BPF scheduler "exit" disabled (unregistered from BPF)
[   15.092282] sched_ext: BPF scheduler "exit" disabled (unregistered from BPF)
[   15.149282] sched_ext: BPF scheduler "exit" disabled (unregistered from BPF)
ok 1 exit #
=====  END  =====

And with it, the test passes without issue after 1000s of runs:

.[root@virtme-ng sched_ext]# ./runner -t exit
===== START =====
TEST: exit
DESCRIPTION: Verify we can cleanly exit a scheduler in multiple places
OUTPUT:
[    7.412856] sched_ext: BPF scheduler "exit" enabled
[    7.427924] sched_ext: BPF scheduler "exit" disabled (unregistered from BPF)
[    7.466677] sched_ext: BPF scheduler "exit" enabled
[    7.475923] sched_ext: BPF scheduler "exit" disabled (unregistered from BPF)
[    7.512803] sched_ext: BPF scheduler "exit" enabled
[    7.532924] sched_ext: BPF scheduler "exit" disabled (unregistered from BPF)
[    7.586809] sched_ext: BPF scheduler "exit" enabled
[    7.595926] sched_ext: BPF scheduler "exit" disabled (unregistered from BPF)
[    7.661923] sched_ext: BPF scheduler "exit" disabled (unregistered from BPF)
[    7.723923] sched_ext: BPF scheduler "exit" disabled (unregistered from BPF)
ok 1 exit #
=====  END  =====

=============================

RESULTS:

PASSED:  1
SKIPPED: 0
FAILED:  0

Fixes: f0e1a0643a ("sched_ext: Implement BPF extensible scheduler class")
Signed-off-by: David Vernet <void@manifault.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2024-10-25 11:10:51 -10:00
Peter Wang
cb7e509c4e scsi: ufs: core: Fix another deadlock during RTC update
If ufshcd_rtc_work calls ufshcd_rpm_put_sync() and the pm's usage_count
is 0, we will enter the runtime suspend callback.  However, the runtime
suspend callback will wait to flush ufshcd_rtc_work, causing a deadlock.

Replace ufshcd_rpm_put_sync() with ufshcd_rpm_put() to avoid the
deadlock.
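
The pattern, sketched with a hypothetical driver struct (the real code
goes through the ufshcd_rpm_put*() wrappers):

  #include <linux/device.h>
  #include <linux/pm_runtime.h>
  #include <linux/workqueue.h>

  struct my_hba {                         /* hypothetical */
          struct device *dev;
          struct delayed_work rtc_work;
  };

  static void my_rtc_work_fn(struct work_struct *work)
  {
          struct my_hba *hba = container_of(work, struct my_hba,
                                            rtc_work.work);

          /* ... periodic RTC update over the UFS link ... */

          /*
           * A synchronous put may invoke the runtime-suspend callback
           * right here; if that callback flushes rtc_work, this work
           * item ends up waiting on itself. The asynchronous put only
           * marks the device idle and lets suspend run later, in
           * another context.
           */
          pm_runtime_put(hba->dev);       /* not pm_runtime_put_sync() */
  }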

Fixes: 6bf999e0eb ("scsi: ufs: core: Add UFS RTC support")
Cc: stable@vger.kernel.org #6.11.x
Signed-off-by: Peter Wang <peter.wang@mediatek.com>
Link: https://lore.kernel.org/r/20241024015453.21684-1-peter.wang@mediatek.com
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
2024-10-25 14:51:34 -04:00
John Garry
d28d17a845 scsi: scsi_debug: Fix do_device_access() handling of unexpected SG copy length
If the sg_copy_buffer() call returns less than sdebug_sector_size, then
we drop out of the copy loop. However, we still report that we copied
the full expected amount, which is incorrect.

Fix this by keeping a running total and returning that value.
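
A sketch of the fixed loop (illustrative function, not the exact
scsi_debug code):

  static int do_device_access_sketch(struct scsi_cmnd *scp, u8 *store,
                                     u32 sector_size, u32 num_sectors)
  {
          u32 total = 0;
          u32 i;

          for (i = 0; i < num_sectors; i++) {
                  size_t ret = sg_copy_buffer(scsi_sglist(scp),
                                              scsi_sg_count(scp),
                                              store + i * sector_size,
                                              sector_size,
                                              i * sector_size, true);

                  total += ret;
                  if (ret < sector_size)
                          break;  /* short copy: stop, report the truth */
          }
          return total;
  }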

Fixes: 84f3a3c01d ("scsi: scsi_debug: Atomic write support")
Reported-by: Colin Ian King <colin.i.king@gmail.com>
Suggested-by: Dan Carpenter <dan.carpenter@linaro.org>
Signed-off-by: John Garry <john.g.garry@oracle.com>
Link: https://lore.kernel.org/r/20241018101655.4207-1-john.g.garry@oracle.com
Reviewed-by: Dan Carpenter <dan.carpenter@linaro.org>
Reviewed-by: Colin Ian King <colin.i.king@gmail.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
2024-10-25 14:48:27 -04:00
David Vernet
895669fd0d scx: Fix exit selftest to use custom DSQ
In commit 63fb3ec805 ("sched_ext: Allow only user DSQs for
scx_bpf_consume(), scx_bpf_dsq_nr_queued() and bpf_iter_scx_dsq_new()"), we
updated the consume path to only accept user DSQs, thus making it invalid
to consume SCX_DSQ_GLOBAL. This selftest was doing that, so let's create a
custom DSQ and use that instead.  The test now passes:

[root@virtme-ng sched_ext]# ./runner -t exit
===== START =====
TEST: exit
DESCRIPTION: Verify we can cleanly exit a scheduler in multiple places
OUTPUT:
[   12.387229] sched_ext: BPF scheduler "exit" enabled
[   12.406064] sched_ext: BPF scheduler "exit" disabled (unregistered from BPF)
[   12.453325] sched_ext: BPF scheduler "exit" enabled
[   12.474064] sched_ext: BPF scheduler "exit" disabled (unregistered from BPF)
[   12.515241] sched_ext: BPF scheduler "exit" enabled
[   12.532064] sched_ext: BPF scheduler "exit" disabled (unregistered from BPF)
[   12.592063] sched_ext: BPF scheduler "exit" disabled (unregistered from BPF)
[   12.654063] sched_ext: BPF scheduler "exit" disabled (unregistered from BPF)
[   12.715062] sched_ext: BPF scheduler "exit" disabled (unregistered from BPF)
ok 1 exit #
=====  END  =====

Signed-off-by: David Vernet <void@manifault.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2024-10-25 07:03:36 -10:00
Vishal Chourasia
4f7f417042 sched_ext: Fix function pointer type mismatches in BPF selftests
Fix incompatible function pointer type warnings in sched_ext BPF selftests by
explicitly casting the function pointers when initializing struct_ops.
This addresses multiple -Wincompatible-function-pointer-types warnings from the
clang compiler where function signatures didn't match exactly.

The void * cast ensures the compiler accepts the function pointer
assignment despite minor type differences in the parameters.
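
For illustration, a made-up sched_ext selftest op following that
pattern ("myops" and its callback are invented names):

  SEC("struct_ops/myops_init_task")
  s32 myops_init_task(struct task_struct *p,
                      struct scx_init_task_args *args)
  {
          return 0;
  }

  SEC(".struct_ops.link")
  struct sched_ext_ops myops = {
          /* The (void *) cast silences clang's
           * -Wincompatible-function-pointer-types warning when the
           * callback's signature doesn't exactly match the struct_ops
           * member type. */
          .init_task      = (void *)myops_init_task,
          .name           = "myops",
  };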

Signed-off-by: Vishal Chourasia <vishalc@linux.ibm.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2024-10-24 06:56:17 -10:00
Arnaldo Carvalho de Melo
08a7d25255 tools arch x86: Sync the msr-index.h copy with the kernel sources
To pick up the changes from these csets:

  dc1e67f70f ("KVM VMX: Move MSR_IA32_VMX_MISC bit defines to asm/vmx.h")
  d7bfc9ffd5 ("KVM: VMX: Move MSR_IA32_VMX_BASIC bit defines to asm/vmx.h")
  beb2e44604 ("x86/cpu: KVM: Move macro to encode PAT value to common header")
  e7e80b66fb ("x86/cpu: KVM: Add common defines for architectural memory types (PAT, MTRRs, etc.)")

Those cause no changes to tooling:

  $ tools/perf/trace/beauty/tracepoints/x86_msr.sh > before
  $ cp arch/x86/include/asm/msr-index.h tools/arch/x86/include/asm/msr-index.h
  $ tools/perf/trace/beauty/tracepoints/x86_msr.sh > after
  $ diff -u before after
  $

To see how this works take a look at this previous update:

  https://git.kernel.org/torvalds/c/174372668933ede5

  1743726689 ("tools arch x86: Sync the msr-index.h copy with the kernel sources to pick IA32_MKTME_KEYID_PARTITIONING")

Just silences this perf build warning:

  Warning: Kernel ABI header differences:
    diff -u tools/arch/x86/include/asm/msr-index.h arch/x86/include/asm/msr-index.h

Please see tools/include/uapi/README for further details.

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Sean Christopherson <seanjc@google.com>
Cc: Xin Li <xin3.li@intel.com>
Link: https://lore.kernel.org/lkml/ZxpLSBzGin3vjs3b@x1
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-10-24 10:27:59 -03:00
Arnaldo Carvalho de Melo
758f181589 perf python: Fix up the build on architectures without HAVE_KVM_STAT_SUPPORT
Noticed while building on a raspbian arm 32-bit system.

There was also this other case, fixed by adding a missing util/stat.h
include with the prototypes:

  /tmp/tmp.MbiSHoF3dj/perf-6.12.0-rc3/tools/perf/util/python.c:1396:6: error: no previous prototype for ‘perf_stat__set_no_csv_summary’ [-Werror=missing-prototypes]
   1396 | void perf_stat__set_no_csv_summary(int set __maybe_unused)
        |      ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  /tmp/tmp.MbiSHoF3dj/perf-6.12.0-rc3/tools/perf/util/python.c:1400:6: error: no previous prototype for ‘perf_stat__set_big_num’ [-Werror=missing-prototypes]
   1400 | void perf_stat__set_big_num(int set __maybe_unused)
        |      ^~~~~~~~~~~~~~~~~~~~~~
  cc1: all warnings being treated as errors

On other architectures this must build due to some lucky indirect
inclusion of that header.

Fixes: 9dabf40034 ("perf python: Switch module to linking libraries from building source")
Reviewed-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/lkml/ZxllAtpmEw5fg9oy@x1
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-10-23 19:29:50 -03:00
Veronika Molnarova
06a130e42a perf test: Handle perftool-testsuite_probe failure due to broken DWARF
Test case test_adding_blacklisted ends in failure if the blacklisted
probe is of an assembler function with no DWARF available. At the same
time, probing the blacklisted function with ASM DWARF doesn't test the
blacklist itself as the failure is a result of the broken DWARF.

When the broken DWARF output is encountered, check if the probed
function was compiled by the assembler. If so, the broken DWARF message
is expected and does not indicate a perf issue; otherwise, report a
failure.  If the ASM DWARF affected the probe, try the next probe on the
blacklist. If the first 5 probes are defective due to broken DWARF, skip
the test case.

Fixes: def5480d63 ("perf testsuite probe: Add test for blacklisted kprobes handling")
Signed-off-by: Veronika Molnarova <vmolnaro@redhat.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Michael Petlan <mpetlan@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Veronika Molnarova <vmolnaro@redhat.com>
Link: https://lore.kernel.org/r/20241017161555.236769-1-vmolnaro@redhat.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-10-23 17:23:09 -03:00
Ihor Solodrai
9b3c11a867 selftests/sched_ext: add order-only dependency of runner.o on BPFOBJ
The runner.o may start building before libbpf headers are installed,
and as a result build fails. This happened a couple of times on
libbpf/ci test jobs:
  * https://github.com/libbpf/ci/actions/runs/11447667257/job/31849533100
  * https://github.com/theihor/libbpf-ci/actions/runs/11445162764/job/31841649552

Headers are installed in a recipe for $(BPFOBJ) target, and adding an
order-only dependency should ensure this doesn't happen.

Signed-off-by: Ihor Solodrai <ihor.solodrai@pm.me>
Signed-off-by: Tejun Heo <tj@kernel.org>
2024-10-23 08:56:32 -10:00
Arnaldo Carvalho de Melo
d822ca29a4 tools headers UAPI: Sync kvm headers with the kernel sources
To pick the changes in:

  aa8d1f48d3 ("KVM: x86/mmu: Introduce a quirk to control memslot zap behavior")

That doesn't change functionality in tools/perf, as no new ioctl is added
for the 'perf trace' scripts to harvest.

This addresses these perf build warnings:

  Warning: Kernel ABI header differences:
    diff -u tools/arch/x86/include/uapi/asm/kvm.h arch/x86/include/uapi/asm/kvm.h

Please see tools/include/uapi/README for further details.

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Yan Zhao <yan.y.zhao@intel.com>
Link: https://lore.kernel.org/lkml/ZxgN0O02YrAJ2qIC@x1
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-10-23 11:34:56 -03:00
Jiri Slaby
5d35634ecc perf trace: Fix non-listed archs in the syscalltbl routines
This fixes a build breakage on 32-bit arm, where the
syscalltbl__id_at_idx() function was missing.

Committer notes:

Generating a proper syscall table from a copy of
arch/arm/tools/syscall.tbl ends up being too big a patch for this rc
stage. I started doing it, but while testing I noticed some other
problems with using BPF to collect pointer args on arm7 (32-bit), so I
will maybe continue trying to make it work on the next cycle...

Fixes: 7a2fb5619c ("perf trace: Fix iteration of syscall ids in syscalltbl->entries")
Suggested-by: Howard Chu <howardchu95@gmail.com>
Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/lkml/3a592835-a14f-40be-8961-c0cee7720a94@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-10-23 11:34:56 -03:00
Howard Chu
7fbff3c0e0 perf build: Change the clang check back to 12.0.1
This serves as a revert for this patch:

  https://lore.kernel.org/linux-perf-users/ZuGL9ROeTV2uXoSp@x1/

Signed-off-by: Howard Chu <howardchu95@gmail.com>
Tested-by: James Clark <james.clark@linaro.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alan Maguire <alan.maguire@oracle.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/20241011021403.4089793-2-howardchu95@gmail.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-10-23 11:34:56 -03:00
Howard Chu
395d38419f perf trace augmented_raw_syscalls: Add more checks to pass the verifier
Add some more checks to pass the verifier in more kernels.

Signed-off-by: Howard Chu <howardchu95@gmail.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alan Maguire <alan.maguire@oracle.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/20241011021403.4089793-3-howardchu95@gmail.com
[ Reduced the patch removing things that can be done later ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-10-23 11:34:56 -03:00
Arnaldo Carvalho de Melo
ecabac70ff perf trace augmented_raw_syscalls: Add extra array index bounds checking to satisfy some BPF verifiers
In a RHEL8 kernel (4.18.0-513.11.1.el8_9.x86_64) that, as enterprise
kernels go, has backports from modern kernels, the verifier complains
about the lack of a bounds check for the index into the array of syscall
arguments, on BPF bytecode generated by clang 17, with:

  ; } else if (size < 0 && size >= -6) { /* buffer */
  116: (b7) r1 = -6
  117: (2d) if r1 > r6 goto pc-30
   R0=map_value(id=0,off=0,ks=4,vs=24688,imm=0) R1_w=inv-6 R2=map_value(id=0,off=16,ks=4,vs=8272,imm=0) R3=inv(id=0) R5=inv40 R6=inv(id=0,umin_value=18446744073709551610,var_off=(0xffffffff00000000; 0xffffffff)) R7=map_value(id=0,off=56,ks=4,vs=8272,imm=0) R8=invP6 R9=map_value(id=0,off=20,ks=4,vs=24,imm=0) R10=fp0 fp-8=mmmmmmmm fp-16=map_value fp-24=map_value fp-32=inv40 fp-40=ctx fp-48=map_value fp-56=inv1 fp-64=map_value fp-72=map_value fp-80=map_value
  ; index = -(size + 1);
  118: (a7) r6 ^= -1
  119: (67) r6 <<= 32
  120: (77) r6 >>= 32
  ; aug_size = args->args[index];
  121: (67) r6 <<= 3
  122: (79) r1 = *(u64 *)(r10 -24)
  123: (0f) r1 += r6
  last_idx 123 first_idx 116
  regs=40 stack=0 before 122: (79) r1 = *(u64 *)(r10 -24)
  regs=40 stack=0 before 121: (67) r6 <<= 3
  regs=40 stack=0 before 120: (77) r6 >>= 32
  regs=40 stack=0 before 119: (67) r6 <<= 32
  regs=40 stack=0 before 118: (a7) r6 ^= -1
  regs=40 stack=0 before 117: (2d) if r1 > r6 goto pc-30
  regs=42 stack=0 before 116: (b7) r1 = -6
   R0_w=map_value(id=0,off=0,ks=4,vs=24688,imm=0) R1_w=inv1 R2_w=map_value(id=0,off=16,ks=4,vs=8272,imm=0) R3_w=inv(id=0) R5_w=inv40 R6_rw=invP(id=0,smin_value=-2147483648,smax_value=0) R7_w=map_value(id=0,off=56,ks=4,vs=8272,imm=0) R8_w=invP6 R9_w=map_value(id=0,off=20,ks=4,vs=24,imm=0) R10=fp0 fp-8=mmmmmmmm fp-16_w=map_value fp-24_r=map_value fp-32_w=inv40 fp-40=ctx fp-48=map_value fp-56_w=inv1 fp-64_w=map_value fp-72=map_value fp-80=map_value
  parent didn't have regs=40 stack=0 marks
  last_idx 110 first_idx 98
  regs=40 stack=0 before 110: (6d) if r1 s> r6 goto pc+5
  regs=42 stack=0 before 109: (b7) r1 = 1
  regs=40 stack=0 before 108: (65) if r6 s> 0x1000 goto pc+7
  regs=40 stack=0 before 98: (55) if r6 != 0x1 goto pc+9
   R0_w=map_value(id=0,off=0,ks=4,vs=24688,imm=0) R1_w=invP12 R2_w=map_value(id=0,off=16,ks=4,vs=8272,imm=0) R3_rw=inv(id=0) R5_w=inv24 R6_rw=invP(id=0,smin_value=-2147483648,smax_value=2147483647) R7_w=map_value(id=0,off=40,ks=4,vs=8272,imm=0) R8_rw=invP4 R9_w=map_value(id=0,off=12,ks=4,vs=24,imm=0) R10=fp0 fp-8=mmmmmmmm fp-16_rw=map_value fp-24_r=map_value fp-32_rw=invP24 fp-40_r=ctx fp-48_r=map_value fp-56_w=invP1 fp-64_rw=map_value fp-72_r=map_value fp-80_r=map_value
  parent already had regs=40 stack=0 marks
  124: (79) r6 = *(u64 *)(r1 +16)
   R0=map_value(id=0,off=0,ks=4,vs=24688,imm=0) R1_w=map_value(id=0,off=0,ks=4,vs=8272,umax_value=34359738360,var_off=(0x0; 0x7fffffff8),s32_max_value=2147483640,u32_max_value=-8) R2=map_value(id=0,off=16,ks=4,vs=8272,imm=0) R3=inv(id=0) R5=inv40 R6_w=invP(id=0,umax_value=34359738360,var_off=(0x0; 0x7fffffff8),s32_max_value=2147483640,u32_max_value=-8) R7=map_value(id=0,off=56,ks=4,vs=8272,imm=0) R8=invP6 R9=map_value(id=0,off=20,ks=4,vs=24,imm=0) R10=fp0 fp-8=mmmmmmmm fp-16=map_value fp-24=map_value fp-32=inv40 fp-40=ctx fp-48=map_value fp-56=inv1 fp-64=map_value fp-72=map_value fp-80=map_value
  R1 unbounded memory access, make sure to bounds check any such access
  processed 466 insns (limit 1000000) max_states_per_insn 2 total_states 20 peak_states 20 mark_read 3

If we add this line, as used in other BPF programs, to cap that index:

   index &= 7;

The generated BPF program is considered safe by that version of the BPF
verifier, allowing perf to collect the syscall args in one more kernel
using the BPF based pointer contents collector.

With the above one-liner it works with that kernel:

  [root@dell-per740-01 ~]# uname -a
  Linux dell-per740-01.khw.eng.rdu2.dc.redhat.com 4.18.0-513.11.1.el8_9.x86_64 #1 SMP Thu Dec 7 03:06:13 EST 2023 x86_64 x86_64 x86_64 GNU/Linux
  [root@dell-per740-01 ~]# ~acme/bin/perf trace -e *sleep* sleep 1.234567890
       0.000 (1234.704 ms): sleep/3863610 nanosleep(rqtp: { .tv_sec: 1, .tv_nsec: 234567890 })                  = 0
  [root@dell-per740-01 ~]#

As well as with the one in Fedora 40:

  root@number:~# uname -a
  Linux number 6.11.3-200.fc40.x86_64 #1 SMP PREEMPT_DYNAMIC Thu Oct 10 22:31:19 UTC 2024 x86_64 GNU/Linux
  root@number:~# perf trace -e *sleep* sleep 1.234567890
       0.000 (1234.722 ms): sleep/14873 clock_nanosleep(rqtp: { .tv_sec: 1, .tv_nsec: 234567890 }, rmtp: 0x7ffe87311a40) = 0
  root@number:~#

Song Liu reported that this one-liner was being optimized out by clang
18, so I suggested, and he tested, that adding a compiler barrier before
it makes clang v18 keep it; the verifier in the kernel in Song's case
(Meta's 5.12 based kernel) was also happy with the resulting bytecode.
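
Putting the one-liner and the barrier together, the pattern looks
roughly like this (a sketch: the struct is a stand-in sized to match
the mask, not perf's actual layout, and barrier_var() is the usual
bpf_helpers.h macro):

  #include <bpf/bpf_helpers.h>

  struct args_sketch {
          unsigned long args[8];          /* sized to match the mask */
  };

  static __always_inline unsigned long read_arg(struct args_sketch *a,
                                                int size)
  {
          unsigned int index = -(size + 1);

          barrier_var(index);     /* keep clang 18 from eliding the mask */
          index &= 7;             /* verifier now sees 0..7, inside args[] */
          return a->args[index];
  }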

I'll investigate using virtme-ng[1] to have all the perf BPF based
functionality thoroughly tested over multiple kernels and clang
versions.

[1] https://kernel-recipes.org/en/2024/virtme-ng/

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alan Maguire <alan.maguire@oracle.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andrea Righi <andrea.righi@linux.dev>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Song Liu <songliubraving@fb.com>
Link: https://lore.kernel.org/lkml/Zw7JgJc0LOwSpuvx@x1
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-10-23 11:34:56 -03:00
Pei Xiao
2b059d0d1e slub/kunit: fix a WARNING due to unwrapped __kmalloc_cache_noprof
'modprobe slub_kunit' triggers a warning, as shown below. The root cause
is that __kmalloc_cache_noprof was used directly, which resulted in no
alloc_tag being allocated. This caused current->alloc_tag to be null,
leading to a warning in alloc_tag_add_check.

Let's add an alloc_hook layer to __kmalloc_cache_noprof specifically
within lib/slub_kunit.c, which is the only user of this internal slub
function outside the kmalloc implementation itself.
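
A sketch of such a wrapper (illustrative macro name; alloc_hooks() is
the accounting wrapper from <linux/alloc_tag.h>):

  #include <linux/alloc_tag.h>
  #include <linux/slab.h>

  /* Route the raw internal call through alloc_hooks() so an alloc_tag
   * is set up and alloc_tag_add_check() has nothing to warn about. */
  #define test_kmalloc_cache(s, flags, size) \
          alloc_hooks(__kmalloc_cache_noprof(s, flags, size))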

[58162.947016] WARNING: CPU: 2 PID: 6210 at
./include/linux/alloc_tag.h:125 alloc_tagging_slab_alloc_hook+0x268/0x27c
[58162.957721] Call trace:
[58162.957919]  alloc_tagging_slab_alloc_hook+0x268/0x27c
[58162.958286]  __kmalloc_cache_noprof+0x14c/0x344
[58162.958615]  test_kmalloc_redzone_access+0x50/0x10c [slub_kunit]
[58162.959045]  kunit_try_run_case+0x74/0x184 [kunit]
[58162.959401]  kunit_generic_run_threadfn_adapter+0x2c/0x4c [kunit]
[58162.959841]  kthread+0x10c/0x118
[58162.960093]  ret_from_fork+0x10/0x20
[58162.960363] ---[ end trace 0000000000000000 ]---

Signed-off-by: Pei Xiao <xiaopei01@kylinos.cn>
Fixes: a0a44d9175 ("mm, slab: don't wrap internal functions with alloc_hooks()")
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
2024-10-23 09:50:58 +02:00
Arnaldo Carvalho de Melo
39c6a35620 perf trace: The return from 'write' isn't a pid
When adding an explicit beautifier for the 'write' syscall, back when
the BPF based buffer collector was introduced, a cut'n'paste error
carried over the syscall_fmt->errpid setting from a nearby syscall
(waitid) that does return a pid.

So the write return was being suppressed by the return pretty printer.
Remove that field, reverting it back to the default return handler, which
prints positive numbers as-is and interprets negative values as errnos.

I actually introduced the problem while making Howard's original patch
work just with the 'write' syscall: we couldn't just look for any
buffers, as the ones that are filled in by the kernel couldn't use the
same sys_enter BPF collector.
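
In perf trace's syscall_fmt table the fix boils down to dropping the
copied field, roughly (illustrative, trimmed entries):

  static const struct syscall_fmt syscall_fmts[] = {
          { .name = "waitid", .errpid = true, },  /* returns a pid: keep */
          { .name = "write", },   /* returns a byte count or -errno:
                                   * default handler, no .errpid */
  };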

Fixes: b257fac12f ("perf trace: Pretty print buffer data")
Reported-by: James Clark <james.clark@linaro.org>
Link: https://lore.kernel.org/lkml/bcf50648-3c7e-4513-8717-0d14492c53b9@linaro.org
Link: https://lore.kernel.org/all/Zt8jTfzDYgBPvFCd@x1/#t
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alan Maguire <alan.maguire@oracle.com>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-10-17 10:34:43 -03:00
Arnaldo Carvalho de Melo
ab8aaab874 tools headers UAPI: Sync linux/const.h with the kernel headers
To pick up the changes in:

  947697c6f0 ("uapi: Define GENMASK_U128")

That causes no changes in tooling, just addresses this perf build
warning:

  Warning: Kernel ABI header differences:
    diff -u tools/include/uapi/linux/const.h include/uapi/linux/const.h

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Yury Norov <yury.norov@gmail.com>
Link: https://lore.kernel.org/lkml/ZwltGNJwujKu1Fgn@x1
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-10-17 10:34:43 -03:00
Xiu Jianfeng
3cc4e13bb1 cgroup: Fix potential overflow issue when checking max_depth
cgroup.max.depth is the maximum allowed descent depth below the current
cgroup. If the actual descent depth is equal or larger, an attempt to
create a new child cgroup will fail. However, because cgroup->max_depth
is of int type and has the default value INT_MAX, the condition
'level > cgroup->max_depth' will never be satisfied, and the level will
overflow after it reaches INT_MAX.

Fix it by starting the level from 0 and using '>=' instead.
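
A sketch of the pattern (not the exact kernel function):

  static bool exceeds_max_depth(struct cgroup *cgrp)
  {
          int level = 0;          /* was 1 before the fix */

          for (; cgrp; cgrp = cgroup_parent(cgrp)) {
                  /* The old check, 'level > cgrp->max_depth', never
                   * fires for max_depth == INT_MAX and lets level
                   * overflow; starting at 0 and using '>=' avoids both. */
                  if (level >= cgrp->max_depth)
                          return true;
                  level++;
          }
          return false;
  }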

It's worth mentioning that this issue is unlikely to occur in reality,
as it's impossible to have a hierarchy of depth INT_MAX, but it should
still be avoided logically.

Fixes: 1a926e0bba ("cgroup: implement hierarchy limits")
Signed-off-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Reviewed-by: Michal Koutný <mkoutny@suse.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2024-10-14 13:39:25 -10:00
Chen Ridong
117932eea9 cgroup/bpf: use a dedicated workqueue for cgroup bpf destruction
A hung_task problem shown below was found:

INFO: task kworker/0:0:8 blocked for more than 327 seconds.
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
Workqueue: events cgroup_bpf_release
Call Trace:
 <TASK>
 __schedule+0x5a2/0x2050
 ? find_held_lock+0x33/0x100
 ? wq_worker_sleeping+0x9e/0xe0
 schedule+0x9f/0x180
 schedule_preempt_disabled+0x25/0x50
 __mutex_lock+0x512/0x740
 ? cgroup_bpf_release+0x1e/0x4d0
 ? cgroup_bpf_release+0xcf/0x4d0
 ? process_scheduled_works+0x161/0x8a0
 ? cgroup_bpf_release+0x1e/0x4d0
 ? mutex_lock_nested+0x2b/0x40
 ? __pfx_delay_tsc+0x10/0x10
 mutex_lock_nested+0x2b/0x40
 cgroup_bpf_release+0xcf/0x4d0
 ? process_scheduled_works+0x161/0x8a0
 ? trace_event_raw_event_workqueue_execute_start+0x64/0xd0
 ? process_scheduled_works+0x161/0x8a0
 process_scheduled_works+0x23a/0x8a0
 worker_thread+0x231/0x5b0
 ? __pfx_worker_thread+0x10/0x10
 kthread+0x14d/0x1c0
 ? __pfx_kthread+0x10/0x10
 ret_from_fork+0x59/0x70
 ? __pfx_kthread+0x10/0x10
 ret_from_fork_asm+0x1b/0x30
 </TASK>

This issue can be reproduced by the following pressure test:
1. A large number of cpuset cgroups are deleted.
2. Set cpus on and off repeatedly.
3. Set watchdog_thresh repeatedly.
The scripts can be obtained at LINK mentioned above the signature.

The reason for this issue is that cgroup_mutex and cpu_hotplug_lock are
acquired in different tasks, which may lead to a deadlock through the
following steps:
1. A large number of cpusets are deleted asynchronously, which puts a
   large number of cgroup_bpf_release works into system_wq. The max_active
   of system_wq is WQ_DFL_ACTIVE(256). Consequently, all active works are
   cgroup_bpf_release works, and many cgroup_bpf_release works will be put
   into inactive queue. As illustrated in the diagram, there are 256 (in
   the active queue) + n (in the inactive queue) works.
2. Setting watchdog_thresh will hold cpu_hotplug_lock.read and put
   smp_call_on_cpu work into system_wq. However step 1 has already filled
   system_wq, 'sscs.work' is put into inactive queue. 'sscs.work' has
   to wait until the works that were put into the inactive queue earlier
   have executed (n cgroup_bpf_release), so it will be blocked for a while.
3. Cpu offline requires cpu_hotplug_lock.write, which is blocked by step 2.
4. Cpusets that were deleted at step 1 put cgroup_release works into
   cgroup_destroy_wq. They are competing to get cgroup_mutex all the time.
   When cgroup_mutex is acquired by the work at css_killed_work_fn, it will
   call cpuset_css_offline, which needs to acquire cpu_hotplug_lock.read.
   However, cpuset_css_offline will be blocked for step 3.
5. At this moment, there are 256 works in active queue that are
   cgroup_bpf_release, they are attempting to acquire cgroup_mutex, and as
   a result, all of them are blocked. Consequently, sscs.work can not be
   executed. Ultimately, this situation leads to four processes being
   blocked, forming a deadlock.

system_wq(step1)		WatchDog(step2)			cpu offline(step3)	cgroup_destroy_wq(step4)
...
2000+ cgroups deleted asyn
256 actives + n inactives
				__lockup_detector_reconfigure
				P(cpu_hotplug_lock.read)
				put sscs.work into system_wq
256 + n + 1(sscs.work)
sscs.work wait to be executed
				waiting for sscs.work to finish
								percpu_down_write
								P(cpu_hotplug_lock.write)
								...blocking...
											css_killed_work_fn
											P(cgroup_mutex)
											cpuset_css_offline
											P(cpu_hotplug_lock.read)
											...blocking...
256 cgroup_bpf_release
mutex_lock(&cgroup_mutex);
..blocking...

To fix the problem, place cgroup_bpf_release works on a dedicated
workqueue which can break the loop and solve the problem. System wqs are
for misc things which shouldn't create a large number of concurrent work
items. If something is going to generate >WQ_DFL_ACTIVE(256) concurrent
work items, it should use its own dedicated workqueue.
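
A sketch of that change (names follow the commit description; a
max_active of 1 is shown as one reasonable choice):

  static struct workqueue_struct *cgroup_bpf_destroy_wq;

  static int __init cgroup_bpf_wq_init(void)
  {
          cgroup_bpf_destroy_wq = alloc_workqueue("cgroup_bpf_destroy",
                                                  0, 1);
          if (!cgroup_bpf_destroy_wq)
                  panic("Failed to alloc workqueue for cgroup bpf destroy.\n");
          return 0;
  }
  core_initcall(cgroup_bpf_wq_init);

  /* ...and in the release path, queue the work there instead of on
   * system_wq:
   *
   *      queue_work(cgroup_bpf_destroy_wq, &cgrp->bpf.release_work);
   */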

Fixes: 4bfc0bb2c6 ("bpf: decouple the lifetime of cgroup_bpf from cgroup itself")
Cc: stable@vger.kernel.org # v5.3+
Link: https://lore.kernel.org/cgroups/e90c32d2-2a85-4f28-9154-09c7d320cb60@huawei.com/T/#t
Tested-by: Vishal Chourasia <vishalc@linux.ibm.com>
Signed-off-by: Chen Ridong <chenridong@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2024-10-08 08:43:22 -10:00
Tyler
504eaa2570
Update README 2024-08-25 19:11:16 +01:00
76 changed files with 776 additions and 439 deletions


@ -23,177 +23,166 @@ applications can additionally seal security critical data at runtime.
A similar feature already exists in the XNU kernel with the
VM_FLAGS_PERMANENT flag [1] and on OpenBSD with the mimmutable syscall [2].
User API
========
mseal()
-----------
The mseal() syscall has the following signature:
SYSCALL
=======
mseal syscall signature
-----------------------
``int mseal(void *addr, size_t len, unsigned long flags)``
**addr**/**len**: virtual memory address range.
The address range set by **addr**/**len** must meet:
- The start address must be in an allocated VMA.
- The start address must be page aligned.
- The end address (**addr** + **len**) must be in an allocated VMA.
- no gap (unallocated memory) between start and end address.
**addr/len**: virtual memory address range.
The ``len`` will be page aligned implicitly by the kernel.
The address range set by ``addr``/``len`` must meet:
- The start address must be in an allocated VMA.
- The start address must be page aligned.
- The end address (``addr`` + ``len``) must be in an allocated VMA.
- no gap (unallocated memory) between start and end address.
**flags**: reserved for future use.
The ``len`` will be page aligned implicitly by the kernel.
**Return values**:
- **0**: Success.
- **-EINVAL**:
* Invalid input ``flags``.
* The start address (``addr``) is not page aligned.
* Address range (``addr`` + ``len``) overflow.
- **-ENOMEM**:
* The start address (``addr``) is not allocated.
* The end address (``addr`` + ``len``) is not allocated.
* A gap (unallocated memory) between start and end address.
- **-EPERM**:
* sealing is supported only on 64-bit CPUs, 32-bit is not supported.
**flags**: reserved for future use.
**Note about error return**:
- For above error cases, users can expect the given memory range is
unmodified, i.e. no partial update.
- There might be other internal errors/cases not listed here, e.g.
error during merging/splitting VMAs, or the process reaching the maximum
number of supported VMAs. In those cases, partial updates to the given
memory range could happen. However, those cases should be rare.
**return values**:
**Architecture support**:
mseal only works on 64-bit CPUs, not 32-bit CPUs.
- ``0``: Success.
**Idempotent**:
users can call mseal multiple times; mseal on already sealed memory
is a no-op (not an error).
- ``-EINVAL``:
- Invalid input ``flags``.
- The start address (``addr``) is not page aligned.
- Address range (``addr`` + ``len``) overflow.
**no munseal**
Once a mapping is sealed, it can't be unsealed. The kernel should never
have munseal; this is consistent with other sealing features, e.g.
F_SEAL_SEAL for files.
- ``-ENOMEM``:
- The start address (``addr``) is not allocated.
- The end address (``addr`` + ``len``) is not allocated.
- A gap (unallocated memory) between start and end address.
Blocked mm syscall for sealed mapping
-------------------------------------
It might be important to note: **once the mapping is sealed, it will
stay in the process's memory until the process terminates**.
- ``-EPERM``:
- sealing is supported only on 64-bit CPUs, 32-bit is not supported.
Example::
- For above error cases, users can expect the given memory range is
unmodified, i.e. no partial update.
ptr = mmap(0, 4096, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
rc = mseal(ptr, 4096, 0);
/* munmap will fail */
rc = munmap(ptr, 4096);
assert(rc < 0);
- There might be other internal errors/cases not listed here, e.g.
error during merging/splitting VMAs, or the process reaching the max
number of supported VMAs. In those cases, partial updates to the given
memory range could happen. However, those cases should be rare.
Blocked mm syscall:
- munmap
- mmap
- mremap
- mprotect and pkey_mprotect
- some destructive madvise behaviors: MADV_DONTNEED, MADV_FREE,
MADV_DONTNEED_LOCKED, MADV_DONTFORK, MADV_WIPEONFORK
**Blocked operations after sealing**:
Unmapping, moving to another location, and shrinking the size,
via munmap() and mremap(), can leave an empty space, therefore
can be replaced with a VMA with a new set of attributes.
The first set of syscalls to block is munmap, mremap, mmap. They can
either leave an empty space in the address space, therefore allowing
replacement with a new mapping with a new set of attributes, or can
overwrite the existing mapping with another mapping.
Moving or expanding a different VMA into the current location,
via mremap().
mprotect and pkey_mprotect are blocked because they change the
protection bits (RWX) of the mapping.
Modifying a VMA via mmap(MAP_FIXED).
Certain destructive madvise behaviors, specifically MADV_DONTNEED,
MADV_FREE, MADV_DONTNEED_LOCKED, and MADV_WIPEONFORK, can introduce
risks when applied to anonymous memory by threads lacking write
permissions. Consequently, these operations are prohibited under such
conditions. The aforementioned behaviors have the potential to modify
region contents by discarding pages, effectively performing a memset(0)
operation on the anonymous memory.
Size expansion, via mremap(), does not appear to pose any
specific risks to sealed VMAs. It is included anyway because
the use case is unclear. In any case, users can rely on
merging to expand a sealed VMA.
Kernel will return -EPERM for blocked syscalls.
mprotect() and pkey_mprotect().
When a blocked syscall returns -EPERM due to sealing, the memory regions may
or may not be changed, depending on the syscall being blocked:
Some destructive madvise() behaviors (e.g. MADV_DONTNEED)
for anonymous memory, when users don't have write permission to the
memory. Those behaviors can alter region contents by discarding pages,
effectively a memset(0) for anonymous memory.
- munmap: munmap is atomic. If one of the VMAs in the given range is
sealed, none of the VMAs are updated.
- mprotect, pkey_mprotect, madvise: partial update might happen, e.g.
when mprotect spans multiple VMAs, it might update the beginning
VMAs before reaching the sealed VMA and return -EPERM.
- mmap and mremap: undefined behavior.
Kernel will return -EPERM for blocked operations.
For blocked operations, one can expect the given address is unmodified,
i.e. no partial update. Note, this is different from existing mm
system call behaviors, where partial updates are made till an error is
found and returned to userspace. To give an example:
Assume following code sequence:
- ptr = mmap(null, 8192, PROT_NONE);
- munmap(ptr + 4096, 4096);
- ret1 = mprotect(ptr, 8192, PROT_READ);
- mseal(ptr, 4096);
- ret2 = mprotect(ptr, 8192, PROT_NONE);
ret1 will be -ENOMEM; the page at ptr is updated to PROT_READ.
ret2 will be -EPERM; the page remains PROT_READ.
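
The same sequence as a compilable userspace sketch (assumes kernel
headers new enough to define SYS_mseal; mseal() has no glibc wrapper
yet):

  #include <assert.h>
  #include <stdio.h>
  #include <sys/mman.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  int main(void)
  {
          long page = sysconf(_SC_PAGESIZE);
          char *ptr = mmap(NULL, 2 * page, PROT_NONE,
                           MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
          int ret1, ret2;

          assert(ptr != MAP_FAILED);
          munmap(ptr + page, page);
          ret1 = mprotect(ptr, 2 * page, PROT_READ);
          /* fails, errno == ENOMEM; the first page is now PROT_READ */
          syscall(SYS_mseal, ptr, page, 0);
          ret2 = mprotect(ptr, 2 * page, PROT_NONE);
          /* fails, errno == EPERM; the first page stays PROT_READ */
          printf("ret1=%d ret2=%d\n", ret1, ret2);
          return 0;
  }
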
**Note**:
- mseal() only works on 64-bit CPUs, not 32-bit CPU.
- users can call mseal() multiple times; mseal() on already sealed memory
is a no-op (not an error).
- munseal() is not supported.
Use cases:
==========
Use cases
=========
- glibc:
The dynamic linker, during loading ELF executables, can apply sealing to
non-writable memory segments.
mapping segments.
- Chrome browser: protect some security sensitive data-structures.
- Chrome browser: protect some security sensitive data structures.
Notes on which memory to seal:
==============================
It might be important to note that sealing changes the lifetime of a mapping,
i.e. the sealed mapping won't be unmapped until the process terminates or the
exec system call is invoked. Applications can apply sealing to any virtual
memory region from userspace, but it is crucial to thoroughly analyze the
mapping's lifetime prior to applying the sealing.
When not to use mseal
=====================
Applications can apply sealing to any virtual memory region from userspace,
but it is *crucial to thoroughly analyze the mapping's lifetime* prior to
applying the sealing. This is because the sealed mapping *won't be unmapped*
until the process terminates or the exec system call is invoked.
For example:
- aio/shm
aio/shm can call mmap and munmap on behalf of userspace, e.g.
ksys_shmdt() in shm.c. The lifetimes of those mappings are not tied to
the lifetime of the process. If those mappings are sealed from userspace,
then munmap will fail, causing leaks in the VMA address space during the
lifetime of the process.
- aio/shm
- ptr allocated by malloc (heap)
Don't use mseal on the memory ptr return from malloc().
malloc() is implemented by an allocator, e.g. by glibc. The heap manager
might allocate a ptr from brk or from a mapping created by mmap.
If an app calls mseal on a ptr returned from malloc(), this can affect
the heap manager's ability to manage the mappings; the outcome is
non-deterministic.
aio/shm can call mmap()/munmap() on behalf of userspace, e.g. ksys_shmdt() in
shm.c. The lifetimes of those mappings are not tied to the lifetime of the
process. If those mappings are sealed from userspace, then munmap() will fail,
causing leaks in the VMA address space during the lifetime of the process.
Example::
- Brk (heap)
ptr = malloc(size);
/* don't call mseal on ptr return from malloc. */
mseal(ptr, size);
/* free will succeed, but the allocator can't shrink the heap below ptr */
free(ptr);
Currently, userspace applications can seal parts of the heap by calling
malloc() and mseal().
Let's assume the following calls from user space:
mseal doesn't block
===================
In a nutshell, mseal blocks certain mm syscalls from modifying some of a
VMA's attributes, such as the protection bits (RWX). A sealed mapping
doesn't mean the memory is immutable.
- ptr = malloc(size);
- mprotect(ptr, size, RO);
- mseal(ptr, size);
- free(ptr);
Technically, before mseal() is added, the user can change the protection of
the heap by calling mprotect(RO). As long as the user changes the protection
back to RW before free(), the memory range can be reused.
With mseal() added to the picture, however, the heap is then partially
sealed; the user can still free it, but the memory remains RO. If the
address is re-used by the heap manager for another malloc, the process
might crash soon after. Therefore, it is important not to apply sealing
to any memory that might get recycled.
Furthermore, even if the application never calls free() on the ptr,
the heap manager may invoke the brk system call to shrink the size of the
heap. In the kernel, the brk-shrink will call munmap(). Consequently,
depending on the location of the ptr, the outcome of brk-shrink is
nondeterministic.
Additional notes:
=================
As Jann Horn pointed out in [3], there are still a few ways to write
to RO memory, which is, in a way, by design. Those cases are not covered
by mseal(). If applications want to block such cases, sandbox tools (such as
seccomp, LSM, etc) might be considered.
to RO memory, which is, in a way, by design. And those could be blocked
by different security measures.
Those cases are:
- Write to read-only memory through /proc/self/mem interface.
- Write to read-only memory through ptrace (such as PTRACE_POKETEXT).
- userfaultfd.
- Write to read-only memory through /proc/self/mem interface (FOLL_FORCE).
- Write to read-only memory through ptrace (such as PTRACE_POKETEXT).
- userfaultfd.
The idea that inspired this patch comes from Stephen Röttgers work in V8
CFI [4]. Chrome browser in ChromeOS will be the first user of this API.
Reference:
==========
[1] https://github.com/apple-oss-distributions/xnu/blob/1031c584a5e37aff177559b9f69dbd3c8c3fd30a/osfmk/mach/vm_statistics.h#L274
[2] https://man.openbsd.org/mimmutable.2
[3] https://lore.kernel.org/lkml/CAG48ez3ShUYey+ZAFsU2i1RpQn0a5eOs2hzQ426FkcgnfUGLvA@mail.gmail.com
[4] https://docs.google.com/document/d/1O2jwK4dxI3nRcOJuPYkonhTkNQfbmwdvxQMyXgeaRHo/edit#heading=h.bvaojj9fu6hc
Reference
=========
- [1] https://github.com/apple-oss-distributions/xnu/blob/1031c584a5e37aff177559b9f69dbd3c8c3fd30a/osfmk/mach/vm_statistics.h#L274
- [2] https://man.openbsd.org/mimmutable.2
- [3] https://lore.kernel.org/lkml/CAG48ez3ShUYey+ZAFsU2i1RpQn0a5eOs2hzQ426FkcgnfUGLvA@mail.gmail.com
- [4] https://docs.google.com/document/d/1O2jwK4dxI3nRcOJuPYkonhTkNQfbmwdvxQMyXgeaRHo/edit#heading=h.bvaojj9fu6hc

README

@ -11,8 +11,10 @@ In order to build the documentation, use ``make htmldocs`` or
https://www.kernel.org/doc/html/latest/
There are various text files in the Documentation/ subdirectory,
several of them using the reStructuredText markup notation.
several of them using reStructuredText markup notation.
Please read the Documentation/process/changes.rst file, as it contains the
requirements for building and running the kernel, and information about
the problems which may result by upgrading your kernel.


@ -261,12 +261,6 @@ static noinstr bool handle_bug(struct pt_regs *regs)
int ud_type;
u32 imm;
/*
* Normally @regs are unpoisoned by irqentry_enter(), but handle_bug()
* is a rare case that uses @regs without passing them to
* irqentry_enter().
*/
kmsan_unpoison_entry_regs(regs);
ud_type = decode_bug(regs->ip, &imm);
if (ud_type == BUG_NONE)
return handled;
@ -275,6 +269,12 @@ static noinstr bool handle_bug(struct pt_regs *regs)
* All lies, just get the WARN/BUG out.
*/
instrumentation_begin();
/*
* Normally @regs are unpoisoned by irqentry_enter(), but handle_bug()
* is a rare case that uses @regs without passing them to
* irqentry_enter().
*/
kmsan_unpoison_entry_regs(regs);
/*
* Since we're emulating a CALL with exceptions, restore the interrupt
* state to what it was at the exception site.


@ -674,6 +674,16 @@ EXPORT_SYMBOL_GPL(tpm_chip_register);
*/
void tpm_chip_unregister(struct tpm_chip *chip)
{
#ifdef CONFIG_TCG_TPM2_HMAC
int rc;
rc = tpm_try_get_ops(chip);
if (!rc) {
tpm2_end_auth_session(chip);
tpm_put_ops(chip);
}
#endif
tpm_del_legacy_sysfs(chip);
if (tpm_is_hwrng_enabled(chip))
hwrng_unregister(&chip->hwrng);


@ -27,6 +27,9 @@ static ssize_t tpm_dev_transmit(struct tpm_chip *chip, struct tpm_space *space,
struct tpm_header *header = (void *)buf;
ssize_t ret, len;
if (chip->flags & TPM_CHIP_FLAG_TPM2)
tpm2_end_auth_session(chip);
ret = tpm2_prepare_space(chip, space, buf, bufsiz);
/* If the command is not implemented by the TPM, synthesize a
* response with a TPM2_RC_COMMAND_CODE return for user-space.


@ -379,10 +379,12 @@ int tpm_pm_suspend(struct device *dev)
rc = tpm_try_get_ops(chip);
if (!rc) {
if (chip->flags & TPM_CHIP_FLAG_TPM2)
if (chip->flags & TPM_CHIP_FLAG_TPM2) {
tpm2_end_auth_session(chip);
tpm2_shutdown(chip, TPM2_SU_STATE);
else
} else {
rc = tpm1_pm_suspend(chip, tpm_suspend_pcr);
}
tpm_put_ops(chip);
}


@ -333,6 +333,9 @@ void tpm_buf_append_hmac_session(struct tpm_chip *chip, struct tpm_buf *buf,
}
#ifdef CONFIG_TCG_TPM2_HMAC
/* The first write to /dev/tpm{rm0} will flush the session. */
attributes |= TPM2_SA_CONTINUE_SESSION;
/*
* The Architecture Guide requires us to strip trailing zeros
* before computing the HMAC
@ -484,7 +487,8 @@ static void tpm2_KDFe(u8 z[EC_PT_SZ], const char *str, u8 *pt_u, u8 *pt_v,
sha256_final(&sctx, out);
}
static void tpm_buf_append_salt(struct tpm_buf *buf, struct tpm_chip *chip)
static void tpm_buf_append_salt(struct tpm_buf *buf, struct tpm_chip *chip,
struct tpm2_auth *auth)
{
struct crypto_kpp *kpp;
struct kpp_request *req;
@ -543,7 +547,7 @@ static void tpm_buf_append_salt(struct tpm_buf *buf, struct tpm_chip *chip)
sg_set_buf(&s[0], chip->null_ec_key_x, EC_PT_SZ);
sg_set_buf(&s[1], chip->null_ec_key_y, EC_PT_SZ);
kpp_request_set_input(req, s, EC_PT_SZ*2);
sg_init_one(d, chip->auth->salt, EC_PT_SZ);
sg_init_one(d, auth->salt, EC_PT_SZ);
kpp_request_set_output(req, d, EC_PT_SZ);
crypto_kpp_compute_shared_secret(req);
kpp_request_free(req);
@ -554,8 +558,7 @@ static void tpm_buf_append_salt(struct tpm_buf *buf, struct tpm_chip *chip)
* This works because KDFe fully consumes the secret before it
* writes the salt
*/
tpm2_KDFe(chip->auth->salt, "SECRET", x, chip->null_ec_key_x,
chip->auth->salt);
tpm2_KDFe(auth->salt, "SECRET", x, chip->null_ec_key_x, auth->salt);
out:
crypto_free_kpp(kpp);
@ -853,7 +856,9 @@ int tpm_buf_check_hmac_response(struct tpm_chip *chip, struct tpm_buf *buf,
if (rc)
/* manually close the session if it wasn't consumed */
tpm2_flush_context(chip, auth->handle);
memzero_explicit(auth, sizeof(*auth));
kfree_sensitive(auth);
chip->auth = NULL;
} else {
/* reset for next use */
auth->session = TPM_HEADER_SIZE;
@ -881,7 +886,8 @@ void tpm2_end_auth_session(struct tpm_chip *chip)
return;
tpm2_flush_context(chip, auth->handle);
memzero_explicit(auth, sizeof(*auth));
kfree_sensitive(auth);
chip->auth = NULL;
}
EXPORT_SYMBOL(tpm2_end_auth_session);
@ -915,33 +921,37 @@ static int tpm2_parse_start_auth_session(struct tpm2_auth *auth,
static int tpm2_load_null(struct tpm_chip *chip, u32 *null_key)
{
int rc;
unsigned int offset = 0; /* dummy offset for null seed context */
u8 name[SHA256_DIGEST_SIZE + 2];
u32 tmp_null_key;
int rc;
rc = tpm2_load_context(chip, chip->null_key_context, &offset,
null_key);
if (rc != -EINVAL)
return rc;
&tmp_null_key);
if (rc != -EINVAL) {
if (!rc)
*null_key = tmp_null_key;
goto err;
}
/* an integrity failure may mean the TPM has been reset */
dev_err(&chip->dev, "NULL key integrity failure!\n");
/* check the null name against what we know */
tpm2_create_primary(chip, TPM2_RH_NULL, NULL, name);
if (memcmp(name, chip->null_key_name, sizeof(name)) == 0)
/* name unchanged, assume transient integrity failure */
return rc;
/*
* Fatal TPM failure: the NULL seed has actually changed, so
* the TPM must have been illegally reset. All in-kernel TPM
* operations will fail because the NULL primary can't be
* loaded to salt the sessions, but disable the TPM anyway so
* userspace programmes can't be compromised by it.
*/
dev_err(&chip->dev, "NULL name has changed, disabling TPM due to interference\n");
/* Try to re-create null key, given the integrity failure: */
rc = tpm2_create_primary(chip, TPM2_RH_NULL, &tmp_null_key, name);
if (rc)
goto err;
/* Return null key if the name has not been changed: */
if (!memcmp(name, chip->null_key_name, sizeof(name))) {
*null_key = tmp_null_key;
return 0;
}
/* Deduce from the name change TPM interference: */
dev_err(&chip->dev, "null key integrity check failed\n");
tpm2_flush_context(chip, tmp_null_key);
chip->flags |= TPM_CHIP_FLAG_DISABLE;
return rc;
err:
return rc ? -ENODEV : 0;
}
/**
@ -958,16 +968,20 @@ static int tpm2_load_null(struct tpm_chip *chip, u32 *null_key)
*/
int tpm2_start_auth_session(struct tpm_chip *chip)
{
struct tpm2_auth *auth;
struct tpm_buf buf;
struct tpm2_auth *auth = chip->auth;
int rc;
u32 null_key;
int rc;
if (!auth) {
dev_warn_once(&chip->dev, "auth session is not active\n");
if (chip->auth) {
dev_warn_once(&chip->dev, "auth session is active\n");
return 0;
}
auth = kzalloc(sizeof(*auth), GFP_KERNEL);
if (!auth)
return -ENOMEM;
rc = tpm2_load_null(chip, &null_key);
if (rc)
goto out;
@ -988,7 +1002,7 @@ int tpm2_start_auth_session(struct tpm_chip *chip)
tpm_buf_append(&buf, auth->our_nonce, sizeof(auth->our_nonce));
/* append encrypted salt and squirrel away unencrypted in auth */
tpm_buf_append_salt(&buf, chip);
tpm_buf_append_salt(&buf, chip, auth);
/* session type (HMAC, audit or policy) */
tpm_buf_append_u8(&buf, TPM2_SE_HMAC);
@ -1010,10 +1024,13 @@ int tpm2_start_auth_session(struct tpm_chip *chip)
tpm_buf_destroy(&buf);
if (rc)
goto out;
if (rc == TPM2_RC_SUCCESS) {
chip->auth = auth;
return 0;
}
out:
out:
kfree_sensitive(auth);
return rc;
}
EXPORT_SYMBOL(tpm2_start_auth_session);
@ -1347,18 +1364,21 @@ static int tpm2_create_null_primary(struct tpm_chip *chip)
*
* Derive and context save the null primary and allocate memory in the
* struct tpm_chip for the authorizations.
*
* Return:
* * 0 - OK
* * -errno - A system error
* * TPM_RC - A TPM error
*/
int tpm2_sessions_init(struct tpm_chip *chip)
{
int rc;
rc = tpm2_create_null_primary(chip);
if (rc)
dev_err(&chip->dev, "TPM: security failed (NULL seed derivation): %d\n", rc);
chip->auth = kmalloc(sizeof(*chip->auth), GFP_KERNEL);
if (!chip->auth)
return -ENOMEM;
if (rc) {
dev_err(&chip->dev, "null key creation failed with %d\n", rc);
return rc;
}
return rc;
}

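A recurring change in the tpm2-sessions.c hunks above is the switch from memzero_explicit(auth, sizeof(*auth)) on a long-lived buffer to kfree_sensitive(auth) on a heap allocation, with chip->auth cleared afterwards: the session is now allocated on demand in tpm2_start_auth_session() and scrubbed-and-freed in one step (kfree_sensitive() zeroes the allocation before freeing it). A minimal sketch of that lifecycle, condensed from the hunks with error paths trimmed:

struct tpm2_auth *auth = kzalloc(sizeof(*auth), GFP_KERNEL);
if (!auth)
        return -ENOMEM;
/* ... build and send the start-auth-session command ... */
if (rc == TPM2_RC_SUCCESS) {
        chip->auth = auth;              /* publish only on success */
        return 0;
}
kfree_sensitive(auth);                  /* zeroes, then frees */

/* and on teardown: */
tpm2_flush_context(chip, auth->handle);
kfree_sensitive(auth);
chip->auth = NULL;
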
View file

@ -3651,7 +3651,7 @@ static int do_device_access(struct sdeb_store_info *sip, struct scsi_cmnd *scp,
enum dma_data_direction dir;
struct scsi_data_buffer *sdb = &scp->sdb;
u8 *fsp;
int i;
int i, total = 0;
/*
* Even though reads are inherently atomic (in this driver), we expect
@ -3688,18 +3688,16 @@ static int do_device_access(struct sdeb_store_info *sip, struct scsi_cmnd *scp,
fsp + (block * sdebug_sector_size),
sdebug_sector_size, sg_skip, do_write);
sdeb_data_sector_unlock(sip, do_write);
if (ret != sdebug_sector_size) {
ret += (i * sdebug_sector_size);
total += ret;
if (ret != sdebug_sector_size)
break;
}
sg_skip += sdebug_sector_size;
if (++block >= sdebug_store_sectors)
block = 0;
}
ret = num * sdebug_sector_size;
sdeb_data_unlock(sip, atomic);
return ret;
return total;
}
/* Returns number of bytes copied or -1 if error. */

View file

@ -8219,7 +8219,7 @@ static void ufshcd_update_rtc(struct ufs_hba *hba)
err = ufshcd_query_attr(hba, UPIU_QUERY_OPCODE_WRITE_ATTR, QUERY_ATTR_IDN_SECONDS_PASSED,
0, 0, &val);
ufshcd_rpm_put_sync(hba);
ufshcd_rpm_put(hba);
if (err)
dev_err(hba->dev, "%s: Failed to update rtc %d\n", __func__, err);

View file

@ -401,6 +401,7 @@ void nilfs_clear_folio_dirty(struct folio *folio)
folio_clear_uptodate(folio);
folio_clear_mappedtodisk(folio);
folio_clear_checked(folio);
head = folio_buffers(folio);
if (head) {

View file

@ -1787,6 +1787,14 @@ int ocfs2_remove_inode_range(struct inode *inode,
return 0;
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
int id_count = ocfs2_max_inline_data_with_xattr(inode->i_sb, di);
if (byte_start > id_count || byte_start + byte_len > id_count) {
ret = -EINVAL;
mlog_errno(ret);
goto out;
}
ret = ocfs2_truncate_inline(inode, di_bh, byte_start,
byte_start + byte_len, 0);
if (ret) {

View file

@ -692,6 +692,34 @@ void dup_userfaultfd_complete(struct list_head *fcs)
}
}
void dup_userfaultfd_fail(struct list_head *fcs)
{
struct userfaultfd_fork_ctx *fctx, *n;
/*
* An error has occurred on fork, we will tear memory down, but have
* allocated memory for fctx's and raised reference counts for both the
* original and child contexts (and on the mm for each as a result).
*
* These would ordinarily be taken care of by a user handling the event,
* but we are no longer doing so, so manually clean up here.
*
* mm tear down will take care of cleaning up VMA contexts.
*/
list_for_each_entry_safe(fctx, n, fcs, list) {
struct userfaultfd_ctx *octx = fctx->orig;
struct userfaultfd_ctx *ctx = fctx->new;
atomic_dec(&octx->mmap_changing);
VM_BUG_ON(atomic_read(&octx->mmap_changing) < 0);
userfaultfd_ctx_put(octx);
userfaultfd_ctx_put(ctx);
list_del(&fctx->list);
kfree(fctx);
}
}
void mremap_userfaultfd_prep(struct vm_area_struct *vma,
struct vm_userfaultfd_ctx *vm_ctx)
{

View file

@ -54,12 +54,11 @@ static inline long mm_ksm_zero_pages(struct mm_struct *mm)
return atomic_long_read(&mm->ksm_zero_pages);
}
static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
static inline void ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
{
/* Adding mm to ksm is best effort on fork. */
if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
return __ksm_enter(mm);
return 0;
__ksm_enter(mm);
}
static inline int ksm_execve(struct mm_struct *mm)
@ -107,9 +106,8 @@ static inline int ksm_disable(struct mm_struct *mm)
return 0;
}
static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
static inline void ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
{
return 0;
}
static inline int ksm_execve(struct mm_struct *mm)

View file

@ -38,6 +38,7 @@
#else
#define can_do_masked_user_access() 0
#define masked_user_access_begin(src) NULL
#define mask_user_address(src) (src)
#endif
/*
@ -159,19 +160,27 @@ _inline_copy_from_user(void *to, const void __user *from, unsigned long n)
{
unsigned long res = n;
might_fault();
if (!should_fail_usercopy() && likely(access_ok(from, n))) {
if (should_fail_usercopy())
goto fail;
if (can_do_masked_user_access())
from = mask_user_address(from);
else {
if (!access_ok(from, n))
goto fail;
/*
* Ensure that bad access_ok() speculation will not
* lead to nasty side effects *after* the copy is
* finished:
*/
barrier_nospec();
instrument_copy_from_user_before(to, from, n);
res = raw_copy_from_user(to, from, n);
instrument_copy_from_user_after(to, from, n, res);
}
if (unlikely(res))
memset(to + (n - res), 0, res);
instrument_copy_from_user_before(to, from, n);
res = raw_copy_from_user(to, from, n);
instrument_copy_from_user_after(to, from, n, res);
if (likely(!res))
return 0;
fail:
memset(to + (n - res), 0, res);
return res;
}
extern __must_check unsigned long

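The uaccess.h hunk restructures _inline_copy_from_user() around address masking: when can_do_masked_user_access() is true, the pointer is sanitized with mask_user_address() instead of being checked with access_ok() followed by barrier_nospec(). The idea, roughly, is that an invalid user pointer is rewritten to a value that is guaranteed to fault, so no speculation barrier is needed. A plain-C illustration of the masking trick (the real arch implementation is an asm cmp/sbb sequence; using TASK_SIZE_MAX as the limit is an assumption of this sketch):

static inline const void __user *mask_user_address_sketch(const void __user *ptr)
{
        unsigned long p = (unsigned long)ptr;
        /* all-ones when the address is out of range, zero otherwise */
        unsigned long mask = 0UL - (p > TASK_SIZE_MAX);

        return (const void __user *)(p | mask); /* invalid => ~0, which faults */
}
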
View file

@ -249,6 +249,7 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma,
extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *);
extern void dup_userfaultfd_complete(struct list_head *);
void dup_userfaultfd_fail(struct list_head *);
extern void mremap_userfaultfd_prep(struct vm_area_struct *,
struct vm_userfaultfd_ctx *);
@ -351,6 +352,10 @@ static inline void dup_userfaultfd_complete(struct list_head *l)
{
}
static inline void dup_userfaultfd_fail(struct list_head *l)
{
}
static inline void mremap_userfaultfd_prep(struct vm_area_struct *vma,
struct vm_userfaultfd_ctx *ctx)
{

View file

@ -24,6 +24,23 @@
DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE);
EXPORT_SYMBOL(cgroup_bpf_enabled_key);
/*
* cgroup bpf destruction makes heavy use of work items and there can be a lot
* of concurrent destructions. Use a separate workqueue so that cgroup bpf
* destruction work items don't end up filling up max_active of system_wq
* which may lead to deadlock.
*/
static struct workqueue_struct *cgroup_bpf_destroy_wq;
static int __init cgroup_bpf_wq_init(void)
{
cgroup_bpf_destroy_wq = alloc_workqueue("cgroup_bpf_destroy", 0, 1);
if (!cgroup_bpf_destroy_wq)
panic("Failed to alloc workqueue for cgroup bpf destroy.\n");
return 0;
}
core_initcall(cgroup_bpf_wq_init);
/* __always_inline is necessary to prevent indirect call through run_prog
* function pointer.
*/
@ -334,7 +351,7 @@ static void cgroup_bpf_release_fn(struct percpu_ref *ref)
struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt);
INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release);
queue_work(system_wq, &cgrp->bpf.release_work);
queue_work(cgroup_bpf_destroy_wq, &cgrp->bpf.release_work);
}
/* Get underlying bpf_prog of bpf_prog_list entry, regardless if it's through

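The hunk above stops queuing cgroup-bpf release work on system_wq and introduces a dedicated workqueue created with max_active = 1, so a burst of concurrent destructions cannot fill system_wq's concurrency budget and deadlock it. The general pattern, as a kernel-style sketch (names here are illustrative):

static struct workqueue_struct *destroy_wq;

static int __init destroy_wq_init(void)
{
        /* flags = 0, max_active = 1: at most one item in flight */
        destroy_wq = alloc_workqueue("my_destroy", 0, 1);
        if (!destroy_wq)
                return -ENOMEM; /* the diff panics instead; the sketch keeps it soft */
        return 0;
}
core_initcall(destroy_wq_init);

/* then, wherever queue_work(system_wq, &work) was used: */
queue_work(destroy_wq, &work);
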
View file

@ -5789,7 +5789,7 @@ static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
{
struct cgroup *cgroup;
int ret = false;
int level = 1;
int level = 0;
lockdep_assert_held(&cgroup_mutex);
@ -5797,7 +5797,7 @@ static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
if (cgroup->nr_descendants >= cgroup->max_descendants)
goto fail;
if (level > cgroup->max_depth)
if (level >= cgroup->max_depth)
goto fail;
level++;

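The two one-line changes above make cgroup_check_hierarchy_limits() count from 0 and compare with >=, removing the +1 bias from the walk while keeping the boundary itself: a new child may sit at most max_depth levels below each ancestor. Reading the new code, with cgroup.max.depth = 2 on an ancestor, a child two levels below it passes (level is 1 when the walk reaches that ancestor) while one three levels below fails (level 2 >= 2). Condensed sketch of the walk:

int level = 0;  /* 0 at the would-be child's immediate parent */

for (cgroup = parent; cgroup; cgroup = cgroup_parent(cgroup)) {
        if (cgroup->nr_descendants >= cgroup->max_descendants)
                goto fail;
        if (level >= cgroup->max_depth) /* child would exceed this ancestor's depth */
                goto fail;
        level++;
}
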
View file

@ -653,11 +653,6 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
mm->exec_vm = oldmm->exec_vm;
mm->stack_vm = oldmm->stack_vm;
retval = ksm_fork(mm, oldmm);
if (retval)
goto out;
khugepaged_fork(mm, oldmm);
/* Use __mt_dup() to efficiently build an identical maple tree. */
retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL);
if (unlikely(retval))
@ -760,6 +755,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
vma_iter_free(&vmi);
if (!retval) {
mt_set_in_rcu(vmi.mas.tree);
ksm_fork(mm, oldmm);
khugepaged_fork(mm, oldmm);
} else if (mpnt) {
/*
* The entire maple tree has already been duplicated. If the
@ -775,7 +772,10 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
mmap_write_unlock(mm);
flush_tlb_mm(oldmm);
mmap_write_unlock(oldmm);
dup_userfaultfd_complete(&uf);
if (!retval)
dup_userfaultfd_complete(&uf);
else
dup_userfaultfd_fail(&uf);
fail_uprobe_end:
uprobe_end_dup_mmap();
return retval;

View file

@ -459,9 +459,7 @@ int walk_system_ram_res_rev(u64 start, u64 end, void *arg,
rams_size += 16;
}
rams[i].start = res.start;
rams[i++].end = res.end;
rams[i++] = res;
start = res.end + 1;
}

View file

@ -862,7 +862,8 @@ static DEFINE_MUTEX(scx_ops_enable_mutex);
DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled);
DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED);
static atomic_t scx_ops_bypass_depth = ATOMIC_INIT(0);
static int scx_ops_bypass_depth;
static DEFINE_RAW_SPINLOCK(__scx_ops_bypass_lock);
static bool scx_ops_init_task_enabled;
static bool scx_switching_all;
DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
@ -4298,18 +4299,20 @@ bool task_should_scx(struct task_struct *p)
*/
static void scx_ops_bypass(bool bypass)
{
int depth, cpu;
int cpu;
unsigned long flags;
raw_spin_lock_irqsave(&__scx_ops_bypass_lock, flags);
if (bypass) {
depth = atomic_inc_return(&scx_ops_bypass_depth);
WARN_ON_ONCE(depth <= 0);
if (depth != 1)
return;
scx_ops_bypass_depth++;
WARN_ON_ONCE(scx_ops_bypass_depth <= 0);
if (scx_ops_bypass_depth != 1)
goto unlock;
} else {
depth = atomic_dec_return(&scx_ops_bypass_depth);
WARN_ON_ONCE(depth < 0);
if (depth != 0)
return;
scx_ops_bypass_depth--;
WARN_ON_ONCE(scx_ops_bypass_depth < 0);
if (scx_ops_bypass_depth != 0)
goto unlock;
}
/*
@ -4326,7 +4329,7 @@ static void scx_ops_bypass(bool bypass)
struct rq_flags rf;
struct task_struct *p, *n;
rq_lock_irqsave(rq, &rf);
rq_lock(rq, &rf);
if (bypass) {
WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING);
@ -4362,11 +4365,13 @@ static void scx_ops_bypass(bool bypass)
sched_enq_and_set_task(&ctx);
}
rq_unlock_irqrestore(rq, &rf);
rq_unlock(rq, &rf);
/* resched to restore ticks and idle state */
resched_cpu(cpu);
}
unlock:
raw_spin_unlock_irqrestore(&__scx_ops_bypass_lock, flags);
}
static void free_exit_info(struct scx_exit_info *ei)

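The sched_ext hunks above trade the atomic scx_ops_bypass_depth counter for a plain int serialized by a raw spinlock held across the whole bypass flip; that outer irqsave is also why the per-rq locking inside drops from rq_lock_irqsave() to rq_lock(). Skeleton of the guarded-depth pattern (names assumed):

static int bypass_depth;
static DEFINE_RAW_SPINLOCK(bypass_lock);

static void bypass(bool enter)
{
        unsigned long flags;

        raw_spin_lock_irqsave(&bypass_lock, flags);
        if (enter) {
                if (++bypass_depth != 1)
                        goto unlock;    /* nested enter: nothing to do */
        } else {
                if (--bypass_depth != 0)
                        goto unlock;    /* still bypassed by someone else */
        }
        /* first enter / last exit: flip the actual state here */
unlock:
        raw_spin_unlock_irqrestore(&bypass_lock, flags);
}
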
View file

@ -141,7 +141,7 @@ static void test_kmalloc_redzone_access(struct kunit *test)
{
struct kmem_cache *s = test_kmem_cache_create("TestSlub_RZ_kmalloc", 32,
SLAB_KMALLOC|SLAB_STORE_USER|SLAB_RED_ZONE);
u8 *p = __kmalloc_cache_noprof(s, GFP_KERNEL, 18);
u8 *p = alloc_hooks(__kmalloc_cache_noprof(s, GFP_KERNEL, 18));
kasan_disable_current();

View file

@ -1085,7 +1085,6 @@ config HMM_MIRROR
depends on MMU
config GET_FREE_REGION
depends on SPARSEMEM
bool
config DEVICE_PRIVATE

View file

@ -4187,6 +4187,8 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
static DECLARE_WAIT_QUEUE_HEAD(swapcache_wq);
/*
* We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
@ -4199,6 +4201,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct folio *swapcache, *folio = NULL;
DECLARE_WAITQUEUE(wait, current);
struct page *page;
struct swap_info_struct *si = NULL;
rmap_t rmap_flags = RMAP_NONE;
@ -4297,7 +4300,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
* Relax a bit to prevent rapid
* repeated page faults.
*/
add_wait_queue(&swapcache_wq, &wait);
schedule_timeout_uninterruptible(1);
remove_wait_queue(&swapcache_wq, &wait);
goto out_page;
}
need_clear_cache = true;
@ -4604,8 +4609,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
/* Clear the swap cache pin for direct swapin after PTL unlock */
if (need_clear_cache)
if (need_clear_cache) {
swapcache_clear(si, entry, nr_pages);
if (waitqueue_active(&swapcache_wq))
wake_up(&swapcache_wq);
}
if (si)
put_swap_device(si);
return ret;
@ -4620,8 +4628,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
folio_unlock(swapcache);
folio_put(swapcache);
}
if (need_clear_cache)
if (need_clear_cache) {
swapcache_clear(si, entry, nr_pages);
if (waitqueue_active(&swapcache_wq))
wake_up(&swapcache_wq);
}
if (si)
put_swap_device(si);
return ret;

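The memory.c hunks above upgrade the blind one-jiffy nap in the swap-cache contention path to a real sleep/wake handshake: the faulting task parks itself on swapcache_wq before the timeout sleep, and the path that clears the pin wakes the queue if anyone is waiting. The bare pattern, as a kernel-style sketch:

static DECLARE_WAIT_QUEUE_HEAD(wq);

/* contended side: */
DECLARE_WAITQUEUE(wait, current);
add_wait_queue(&wq, &wait);
schedule_timeout_uninterruptible(1);    /* wakes early if wake_up() arrives */
remove_wait_queue(&wq, &wait);

/* releasing side, after dropping the resource: */
if (waitqueue_active(&wq))
        wake_up(&wq);
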
View file

@ -1418,6 +1418,13 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
vmg.flags = vm_flags;
}
/*
* clear PTEs while the vma is still in the tree so that rmap
* cannot race with the freeing later in the truncate scenario.
* This is also needed for call_mmap(), which is why vm_ops
* close function is called.
*/
vms_clean_up_area(&vms, &mas_detach);
vma = vma_merge_new_range(&vmg);
if (vma)
goto expanded;
@ -1439,11 +1446,6 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
if (file) {
vma->vm_file = get_file(file);
/*
* call_mmap() may map PTE, so ensure there are no existing PTEs
* and call the vm_ops close function if one exists.
*/
vms_clean_up_area(&vms, &mas_detach);
error = call_mmap(file, vma);
if (error)
goto unmap_and_free_vma;
@ -1640,6 +1642,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
unsigned long populate = 0;
unsigned long ret = -EINVAL;
struct file *file;
vm_flags_t vm_flags;
pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/mm/remap_file_pages.rst.\n",
current->comm, current->pid);
@ -1656,12 +1659,60 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
if (pgoff + (size >> PAGE_SHIFT) < pgoff)
return ret;
if (mmap_write_lock_killable(mm))
if (mmap_read_lock_killable(mm))
return -EINTR;
/*
* Look up VMA under read lock first so we can perform the security
* check without holding locks (which can be problematic). We reacquire a
* write lock later and check nothing changed underneath us.
*/
vma = vma_lookup(mm, start);
if (!vma || !(vma->vm_flags & VM_SHARED)) {
mmap_read_unlock(mm);
return -EINVAL;
}
prot |= vma->vm_flags & VM_READ ? PROT_READ : 0;
prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0;
prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0;
flags &= MAP_NONBLOCK;
flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
if (vma->vm_flags & VM_LOCKED)
flags |= MAP_LOCKED;
/* Save vm_flags used to calculate prot and flags, and recheck later. */
vm_flags = vma->vm_flags;
file = get_file(vma->vm_file);
mmap_read_unlock(mm);
/* Call outside mmap_lock to be consistent with other callers. */
ret = security_mmap_file(file, prot, flags);
if (ret) {
fput(file);
return ret;
}
ret = -EINVAL;
/* OK security check passed, take write lock + let it rip. */
if (mmap_write_lock_killable(mm)) {
fput(file);
return -EINTR;
}
vma = vma_lookup(mm, start);
if (!vma || !(vma->vm_flags & VM_SHARED))
if (!vma)
goto out;
/* Make sure things didn't change under us. */
if (vma->vm_flags != vm_flags)
goto out;
if (vma->vm_file != file)
goto out;
if (start + size > vma->vm_end) {
@ -1689,25 +1740,11 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
goto out;
}
prot |= vma->vm_flags & VM_READ ? PROT_READ : 0;
prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0;
prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0;
flags &= MAP_NONBLOCK;
flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
if (vma->vm_flags & VM_LOCKED)
flags |= MAP_LOCKED;
file = get_file(vma->vm_file);
ret = security_mmap_file(vma->vm_file, prot, flags);
if (ret)
goto out_fput;
ret = do_mmap(vma->vm_file, start, size,
prot, flags, 0, pgoff, &populate, NULL);
out_fput:
fput(file);
out:
mmap_write_unlock(mm);
fput(file);
if (populate)
mm_populate(ret, populate);
if (!IS_ERR_VALUE(ret))
@ -1754,7 +1791,8 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
VMG_STATE(vmg, mm, vmi, addr, addr + len, flags, PHYS_PFN(addr));
vmg.prev = vma;
vma_iter_next_range(vmi);
/* vmi is positioned at prev, which this mode expects. */
vmg.merge_flags = VMG_FLAG_JUST_EXPAND;
if (vma_merge_new_range(&vmg))
goto out;

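The remap_file_pages() rework above follows the classic optimistic-revalidation shape: gather what you need under mmap_read_lock, drop every mm lock for the part that may sleep (security_mmap_file()), then retake the write lock and verify nothing moved. Skeleton of the pattern, condensed from the hunk:

mmap_read_lock(mm);
vma = vma_lookup(mm, start);
vm_flags = vma->vm_flags;               /* snapshot for the later recheck */
file = get_file(vma->vm_file);
mmap_read_unlock(mm);

ret = security_mmap_file(file, prot, flags);    /* may sleep, no mm lock held */

mmap_write_lock(mm);
vma = vma_lookup(mm, start);
if (!vma || vma->vm_flags != vm_flags || vma->vm_file != file)
        ret = -EINVAL;                  /* changed underneath us: bail */
else
        /* ... the real work, as in the hunk ... */;
mmap_write_unlock(mm);
fput(file);
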
View file

@ -349,7 +349,7 @@ static void __init numa_clear_kernel_node_hotplug(void)
for_each_reserved_mem_region(mb_region) {
int nid = memblock_get_region_node(mb_region);
if (nid != MAX_NUMNODES)
if (numa_valid_node(nid))
node_set(nid, reserved_nodemask);
}

View file

@ -2893,12 +2893,12 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
page = __rmqueue(zone, order, migratetype, alloc_flags);
/*
* If the allocation fails, allow OOM handling access
* to HIGHATOMIC reserves as failing now is worse than
* failing a high-order atomic allocation in the
* future.
* If the allocation fails, allow OOM handling and
* order-0 (atomic) allocs access to HIGHATOMIC
* reserves as failing now is worse than failing a
* high-order atomic allocation in the future.
*/
if (!page && (alloc_flags & ALLOC_OOM))
if (!page && (alloc_flags & (ALLOC_OOM|ALLOC_NON_BLOCK)))
page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
if (!page) {

View file

@ -744,7 +744,8 @@ struct folio *folio_walk_start(struct folio_walk *fw,
pud = pudp_get(pudp);
if (pud_none(pud))
goto not_found;
if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && pud_leaf(pud)) {
if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) &&
(!pud_present(pud) || pud_leaf(pud))) {
ptl = pud_lock(vma->vm_mm, pudp);
pud = pudp_get(pudp);
@ -753,6 +754,10 @@ struct folio *folio_walk_start(struct folio_walk *fw,
fw->pudp = pudp;
fw->pud = pud;
/*
* TODO: FW_MIGRATION support for PUD migration entries
* once there are relevant users.
*/
if (!pud_present(pud) || pud_devmap(pud) || pud_special(pud)) {
spin_unlock(ptl);
goto not_found;
@ -769,12 +774,13 @@ struct folio *folio_walk_start(struct folio_walk *fw,
}
pmd_table:
VM_WARN_ON_ONCE(pud_leaf(*pudp));
VM_WARN_ON_ONCE(!pud_present(pud) || pud_leaf(pud));
pmdp = pmd_offset(pudp, addr);
pmd = pmdp_get_lockless(pmdp);
if (pmd_none(pmd))
goto not_found;
if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && pmd_leaf(pmd)) {
if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) &&
(!pmd_present(pmd) || pmd_leaf(pmd))) {
ptl = pmd_lock(vma->vm_mm, pmdp);
pmd = pmdp_get(pmdp);
@ -786,7 +792,7 @@ struct folio *folio_walk_start(struct folio_walk *fw,
if (pmd_none(pmd)) {
spin_unlock(ptl);
goto not_found;
} else if (!pmd_leaf(pmd)) {
} else if (pmd_present(pmd) && !pmd_leaf(pmd)) {
spin_unlock(ptl);
goto pte_table;
} else if (pmd_present(pmd)) {
@ -812,7 +818,7 @@ struct folio *folio_walk_start(struct folio_walk *fw,
}
pte_table:
VM_WARN_ON_ONCE(pmd_leaf(pmdp_get_lockless(pmdp)));
VM_WARN_ON_ONCE(!pmd_present(pmd) || pmd_leaf(pmd));
ptep = pte_offset_map_lock(vma->vm_mm, pmdp, addr, &ptl);
if (!ptep)
goto not_found;

View file

@ -1166,7 +1166,9 @@ static int shmem_getattr(struct mnt_idmap *idmap,
stat->attributes_mask |= (STATX_ATTR_APPEND |
STATX_ATTR_IMMUTABLE |
STATX_ATTR_NODUMP);
inode_lock_shared(inode);
generic_fillattr(idmap, request_mask, inode, stat);
inode_unlock_shared(inode);
if (shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0))
stat->blksize = HPAGE_PMD_SIZE;

View file

@ -1209,7 +1209,7 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags)
/* Zero out spare memory. */
if (want_init_on_alloc(flags)) {
kasan_disable_current();
memset((void *)p + new_size, 0, ks - new_size);
memset(kasan_reset_tag(p) + new_size, 0, ks - new_size);
kasan_enable_current();
}

View file

@ -917,6 +917,7 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
pgoff_t pgoff = vmg->pgoff;
pgoff_t pglen = PHYS_PFN(end - start);
bool can_merge_left, can_merge_right;
bool just_expand = vmg->merge_flags & VMG_FLAG_JUST_EXPAND;
mmap_assert_write_locked(vmg->mm);
VM_WARN_ON(vmg->vma);
@ -930,7 +931,7 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
return NULL;
can_merge_left = can_vma_merge_left(vmg);
can_merge_right = can_vma_merge_right(vmg, can_merge_left);
can_merge_right = !just_expand && can_vma_merge_right(vmg, can_merge_left);
/* If we can merge with the next VMA, adjust vmg accordingly. */
if (can_merge_right) {
@ -953,7 +954,11 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
if (can_merge_right && !can_merge_remove_vma(next))
vmg->end = end;
vma_prev(vmg->vmi); /* Equivalent to going to the previous range */
/* In expand-only case we are already positioned at prev. */
if (!just_expand) {
/* Equivalent to going to the previous range. */
vma_prev(vmg->vmi);
}
}
/*
@ -967,12 +972,14 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
}
/* If expansion failed, reset state. Allows us to retry merge later. */
vmg->vma = NULL;
vmg->start = start;
vmg->end = end;
vmg->pgoff = pgoff;
if (vmg->vma == prev)
vma_iter_set(vmg->vmi, start);
if (!just_expand) {
vmg->vma = NULL;
vmg->start = start;
vmg->end = end;
vmg->pgoff = pgoff;
if (vmg->vma == prev)
vma_iter_set(vmg->vmi, start);
}
return NULL;
}

View file

@ -59,6 +59,17 @@ enum vma_merge_state {
VMA_MERGE_SUCCESS,
};
enum vma_merge_flags {
VMG_FLAG_DEFAULT = 0,
/*
* If we can expand, simply do so. We know there is nothing to merge to
* the right. Does not reset state upon failure to merge. The VMA
* iterator is assumed to be positioned at the previous VMA, rather than
* at the gap.
*/
VMG_FLAG_JUST_EXPAND = 1 << 0,
};
/* Represents a VMA merge operation. */
struct vma_merge_struct {
struct mm_struct *mm;
@ -75,6 +86,7 @@ struct vma_merge_struct {
struct mempolicy *policy;
struct vm_userfaultfd_ctx uffd_ctx;
struct anon_vma_name *anon_name;
enum vma_merge_flags merge_flags;
enum vma_merge_state state;
};
@ -99,6 +111,7 @@ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma,
.flags = flags_, \
.pgoff = pgoff_, \
.state = VMA_MERGE_START, \
.merge_flags = VMG_FLAG_DEFAULT, \
}
#define VMG_VMA_STATE(name, vmi_, prev_, vma_, start_, end_) \
@ -118,6 +131,7 @@ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma,
.uffd_ctx = vma_->vm_userfaultfd_ctx, \
.anon_name = anon_vma_name(vma_), \
.state = VMA_MERGE_START, \
.merge_flags = VMG_FLAG_DEFAULT, \
}
#ifdef CONFIG_DEBUG_VM_MAPLE_TREE
@ -241,15 +255,9 @@ static inline void vms_abort_munmap_vmas(struct vma_munmap_struct *vms,
* failure method of leaving a gap where the MAP_FIXED mapping failed.
*/
mas_set_range(mas, vms->start, vms->end - 1);
if (unlikely(mas_store_gfp(mas, NULL, GFP_KERNEL))) {
pr_warn_once("%s: (%d) Unable to abort munmap() operation\n",
current->comm, current->pid);
/* Leaving vmas detached and in-tree may hamper recovery */
reattach_vmas(mas_detach);
} else {
/* Clean up the insertion of the unfortunate gap */
vms_complete_munmap_vmas(vms, mas_detach);
}
mas_store_gfp(mas, NULL, GFP_KERNEL|__GFP_NOFAIL);
/* Clean up the insertion of the unfortunate gap */
vms_complete_munmap_vmas(vms, mas_detach);
}
int

View file

@ -94,6 +94,7 @@
#define ARM_CPU_PART_NEOVERSE_V3 0xD84
#define ARM_CPU_PART_CORTEX_X925 0xD85
#define ARM_CPU_PART_CORTEX_A725 0xD87
#define ARM_CPU_PART_NEOVERSE_N3 0xD8E
#define APM_CPU_PART_XGENE 0x000
#define APM_CPU_VAR_POTENZA 0x00
@ -176,6 +177,7 @@
#define MIDR_NEOVERSE_V3 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V3)
#define MIDR_CORTEX_X925 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X925)
#define MIDR_CORTEX_A725 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A725)
#define MIDR_NEOVERSE_N3 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_N3)
#define MIDR_THUNDERX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX)
#define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX)
#define MIDR_THUNDERX_83XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_83XX)

View file

@ -36,6 +36,20 @@
#define EFER_FFXSR (1<<_EFER_FFXSR)
#define EFER_AUTOIBRS (1<<_EFER_AUTOIBRS)
/*
* Architectural memory types that are common to MTRRs, PAT, VMX MSRs, etc.
* Most MSRs support/allow only a subset of memory types, but the values
* themselves are common across all relevant MSRs.
*/
#define X86_MEMTYPE_UC 0ull /* Uncacheable, a.k.a. Strong Uncacheable */
#define X86_MEMTYPE_WC 1ull /* Write Combining */
/* RESERVED 2 */
/* RESERVED 3 */
#define X86_MEMTYPE_WT 4ull /* Write Through */
#define X86_MEMTYPE_WP 5ull /* Write Protected */
#define X86_MEMTYPE_WB 6ull /* Write Back */
#define X86_MEMTYPE_UC_MINUS 7ull /* Weak Uncacheable (PAT only) */
/* FRED MSRs */
#define MSR_IA32_FRED_RSP0 0x1cc /* Level 0 stack pointer */
#define MSR_IA32_FRED_RSP1 0x1cd /* Level 1 stack pointer */
@ -365,6 +379,12 @@
#define MSR_IA32_CR_PAT 0x00000277
#define PAT_VALUE(p0, p1, p2, p3, p4, p5, p6, p7) \
((X86_MEMTYPE_ ## p0) | (X86_MEMTYPE_ ## p1 << 8) | \
(X86_MEMTYPE_ ## p2 << 16) | (X86_MEMTYPE_ ## p3 << 24) | \
(X86_MEMTYPE_ ## p4 << 32) | (X86_MEMTYPE_ ## p5 << 40) | \
(X86_MEMTYPE_ ## p6 << 48) | (X86_MEMTYPE_ ## p7 << 56))
#define MSR_IA32_DEBUGCTLMSR 0x000001d9
#define MSR_IA32_LASTBRANCHFROMIP 0x000001db
#define MSR_IA32_LASTBRANCHTOIP 0x000001dc
@ -1159,15 +1179,6 @@
#define MSR_IA32_VMX_VMFUNC 0x00000491
#define MSR_IA32_VMX_PROCBASED_CTLS3 0x00000492
/* VMX_BASIC bits and bitmasks */
#define VMX_BASIC_VMCS_SIZE_SHIFT 32
#define VMX_BASIC_TRUE_CTLS (1ULL << 55)
#define VMX_BASIC_64 0x0001000000000000LLU
#define VMX_BASIC_MEM_TYPE_SHIFT 50
#define VMX_BASIC_MEM_TYPE_MASK 0x003c000000000000LLU
#define VMX_BASIC_MEM_TYPE_WB 6LLU
#define VMX_BASIC_INOUT 0x0040000000000000LLU
/* Resctrl MSRs: */
/* - Intel: */
#define MSR_IA32_L3_QOS_CFG 0xc81
@ -1185,11 +1196,6 @@
#define MSR_IA32_SMBA_BW_BASE 0xc0000280
#define MSR_IA32_EVT_CFG_BASE 0xc0000400
/* MSR_IA32_VMX_MISC bits */
#define MSR_IA32_VMX_MISC_INTEL_PT (1ULL << 14)
#define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29)
#define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE 0x1F
/* AMD-V MSRs */
#define MSR_VM_CR 0xc0010114
#define MSR_VM_IGNNE 0xc0010115

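The msr-index.h hunk consolidates the architectural memory-type encodings as X86_MEMTYPE_* and adds PAT_VALUE(), which token-pastes eight bare suffixes and packs one encoding per byte of the 64-bit IA32_PAT layout. Worked example of the expansion:

u64 pat = PAT_VALUE(WB, WT, UC_MINUS, UC, WB, WT, UC_MINUS, UC);
/*
 * = 6 | (4 << 8) | (7 << 16) | (0 << 24) |
 *   (6ull << 32) | (4ull << 40) | (7ull << 48) | (0ull << 56)
 * = 0x0007040600070406
 */
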
View file

@ -439,6 +439,7 @@ struct kvm_sync_regs {
#define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4)
#define KVM_X86_QUIRK_FIX_HYPERCALL_INSN (1 << 5)
#define KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS (1 << 6)
#define KVM_X86_QUIRK_SLOT_ZAP_ALL (1 << 7)
#define KVM_STATE_NESTED_FORMAT_VMX 0
#define KVM_STATE_NESTED_FORMAT_SVM 1

View file

@ -11,6 +11,9 @@
#ifndef __NR_getpgid
#define __NR_getpgid 132
#endif
#ifndef __NR_capget
#define __NR_capget 184
#endif
#ifndef __NR_gettid
#define __NR_gettid 224
#endif

View file

@ -11,6 +11,9 @@
#ifndef __NR_getpgid
#define __NR_getpgid 121
#endif
#ifndef __NR_capget
#define __NR_capget 125
#endif
#ifndef __NR_gettid
#define __NR_gettid 186
#endif

View file

@ -36,4 +36,19 @@
#define GENMASK_ULL(h, l) \
(GENMASK_INPUT_CHECK(h, l) + __GENMASK_ULL(h, l))
#if !defined(__ASSEMBLY__)
/*
* Missing asm support
*
* __GENMASK_U128() depends on _BIT128() which would not work
* in the asm code, as it shifts an 'unsigned __int128' data
* type instead of direct representation of 128 bit constants
* such as long and unsigned long. The fundamental problem is
* that a 128 bit constant will get silently truncated by the
* gcc compiler.
*/
#define GENMASK_U128(h, l) \
(GENMASK_INPUT_CHECK(h, l) + __GENMASK_U128(h, l))
#endif
#endif /* __LINUX_BITS_H */

View file

@ -9,16 +9,7 @@
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wpacked"
#pragma GCC diagnostic ignored "-Wattributes"
#define __get_unaligned_t(type, ptr) ({ \
const struct { type x; } __packed *__pptr = (typeof(__pptr))(ptr); \
__pptr->x; \
})
#define __put_unaligned_t(type, val, ptr) do { \
struct { type x; } __packed *__pptr = (typeof(__pptr))(ptr); \
__pptr->x = (val); \
} while (0)
#include <vdso/unaligned.h>
#define get_unaligned(ptr) __get_unaligned_t(typeof(*(ptr)), (ptr))
#define put_unaligned(val, ptr) __put_unaligned_t(typeof(*(ptr)), (val), (ptr))

View file

@ -12,4 +12,7 @@
(((~_ULL(0)) - (_ULL(1) << (l)) + 1) & \
(~_ULL(0) >> (__BITS_PER_LONG_LONG - 1 - (h))))
#define __GENMASK_U128(h, l) \
((_BIT128((h)) << 1) - (_BIT128(l)))
#endif /* _UAPI_LINUX_BITS_H */

View file

@ -28,6 +28,23 @@
#define _BITUL(x) (_UL(1) << (x))
#define _BITULL(x) (_ULL(1) << (x))
#if !defined(__ASSEMBLY__)
/*
* Missing asm support
*
* __BIT128() would not work in the asm code, as it shifts an
* 'unsigned __int128' data type as direct representation of
* 128 bit constants is not supported in the gcc compiler, as
* they get silently truncated.
*
* TODO: Please revisit this implementation when gcc compiler
* starts representing 128 bit constants directly like long
* and unsigned long etc. Subsequently drop the comment for
* GENMASK_U128() which would then start supporting asm code.
*/
#define _BIT128(x) ((unsigned __int128)(1) << (x))
#endif
#define __ALIGN_KERNEL(x, a) __ALIGN_KERNEL_MASK(x, (__typeof__(x))(a) - 1)
#define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask))

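Taken together, the three bits.h/const.h hunks above define GENMASK_U128(h, l) via (_BIT128(h) << 1) - _BIT128(l), i.e. "one past the top bit, minus the bottom bit". Two worked cases showing why no special-casing is needed (userspace C, assuming a compiler with __int128):

unsigned __int128 bit128(int x) { return (unsigned __int128)1 << x; }

/* bits 3..1: (1 << 4) - (1 << 1) = 16 - 2 = 14 = 0b1110 */
unsigned __int128 m1 = (bit128(3) << 1) - bit128(1);

/*
 * bits 127..0: bit128(127) << 1 wraps to 0 (unsigned overflow is
 * defined), and 0 - 1 underflows to all 128 bits set.
 */
unsigned __int128 m2 = (bit128(127) << 1) - bit128(0);
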
View file

@ -0,0 +1,15 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __VDSO_UNALIGNED_H
#define __VDSO_UNALIGNED_H
#define __get_unaligned_t(type, ptr) ({ \
const struct { type x; } __packed *__pptr = (typeof(__pptr))(ptr); \
__pptr->x; \
})
#define __put_unaligned_t(type, val, ptr) do { \
struct { type x; } __packed *__pptr = (typeof(__pptr))(ptr); \
__pptr->x = (val); \
} while (0)
#endif /* __VDSO_UNALIGNED_H */

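The new vdso/unaligned.h above carries the __get_unaligned_t()/__put_unaligned_t() helpers moved out of asm-generic (the earlier hunk replaces them there with this include). They funnel the access through a __packed single-member struct, so the compiler emits a correct unaligned load or store instead of undefined behaviour. A small userspace-flavoured usage sketch (__packed spelled out, little-endian result noted):

#define __packed __attribute__((__packed__))
/* __get_unaligned_t() / __put_unaligned_t() as defined above */

unsigned char buf[8] = { 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88 };

unsigned int v = __get_unaligned_t(unsigned int, buf + 1);  /* 0x55443322 on LE */
__put_unaligned_t(unsigned int, v, buf + 3);                /* store at odd offset */
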
View file

@ -704,8 +704,8 @@ ifeq ($(BUILD_BPF_SKEL),1)
BUILD_BPF_SKEL := 0
else
CLANG_VERSION := $(shell $(CLANG) --version | head -1 | sed 's/.*clang version \([[:digit:]]\+.[[:digit:]]\+.[[:digit:]]\+\).*/\1/g')
ifeq ($(call version-lt3,$(CLANG_VERSION),16.0.6),1)
$(warning Warning: Disabled BPF skeletons as at least $(CLANG) version 16.0.6 is reported to be a working setup with the current of BPF based perf features)
ifeq ($(call version-lt3,$(CLANG_VERSION),12.0.1),1)
$(warning Warning: Disabled BPF skeletons as reliable BTF generation needs at least $(CLANG) version 12.0.1)
BUILD_BPF_SKEL := 0
endif
endif

View file

@ -1399,7 +1399,7 @@ static const struct syscall_fmt syscall_fmts[] = {
.arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
{ .name = "waitid", .errpid = true,
.arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
{ .name = "write", .errpid = true,
{ .name = "write",
.arg = { [1] = { .scnprintf = SCA_BUF /* buf */, .from_user = true, }, }, },
};

View file

@ -22,6 +22,7 @@ FILES=(
"include/vdso/bits.h"
"include/linux/const.h"
"include/vdso/const.h"
"include/vdso/unaligned.h"
"include/linux/hash.h"
"include/linux/list-sort.h"
"include/uapi/linux/hw_breakpoint.h"

View file

@ -19,35 +19,74 @@
TEST_RESULT=0
# skip if not supported
BLACKFUNC=`head -n 1 /sys/kernel/debug/kprobes/blacklist 2> /dev/null | cut -f2`
if [ -z "$BLACKFUNC" ]; then
BLACKFUNC_LIST=`head -n 5 /sys/kernel/debug/kprobes/blacklist 2> /dev/null | cut -f2`
if [ -z "$BLACKFUNC_LIST" ]; then
print_overall_skipped
exit 0
fi
# try to find vmlinux with DWARF debug info
VMLINUX_FILE=$(perf probe -v random_probe |& grep "Using.*for symbols" | sed -r 's/^Using (.*) for symbols$/\1/')
# remove all previously added probes
clear_all_probes
### adding blacklisted function
# functions from blacklist should be skipped by perf probe
! $CMD_PERF probe $BLACKFUNC > $LOGS_DIR/adding_blacklisted.log 2> $LOGS_DIR/adding_blacklisted.err
PERF_EXIT_CODE=$?
REGEX_SCOPE_FAIL="Failed to find scope of probe point"
REGEX_SKIP_MESSAGE=" is blacklisted function, skip it\."
REGEX_NOT_FOUND_MESSAGE="Probe point \'$BLACKFUNC\' not found."
REGEX_NOT_FOUND_MESSAGE="Probe point \'$RE_EVENT\' not found."
REGEX_ERROR_MESSAGE="Error: Failed to add events."
REGEX_INVALID_ARGUMENT="Failed to write event: Invalid argument"
REGEX_SYMBOL_FAIL="Failed to find symbol at $RE_ADDRESS"
REGEX_OUT_SECTION="$BLACKFUNC is out of \.\w+, skip it"
../common/check_all_lines_matched.pl "$REGEX_SKIP_MESSAGE" "$REGEX_NOT_FOUND_MESSAGE" "$REGEX_ERROR_MESSAGE" "$REGEX_SCOPE_FAIL" "$REGEX_INVALID_ARGUMENT" "$REGEX_SYMBOL_FAIL" "$REGEX_OUT_SECTION" < $LOGS_DIR/adding_blacklisted.err
CHECK_EXIT_CODE=$?
REGEX_OUT_SECTION="$RE_EVENT is out of \.\w+, skip it"
REGEX_MISSING_DECL_LINE="A function DIE doesn't have decl_line. Maybe broken DWARF?"
print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "adding blacklisted function $BLACKFUNC"
(( TEST_RESULT += $? ))
BLACKFUNC=""
SKIP_DWARF=0
for BLACKFUNC in $BLACKFUNC_LIST; do
echo "Probing $BLACKFUNC"
# functions from blacklist should be skipped by perf probe
! $CMD_PERF probe $BLACKFUNC > $LOGS_DIR/adding_blacklisted.log 2> $LOGS_DIR/adding_blacklisted.err
PERF_EXIT_CODE=$?
# check for bad DWARF polluting the result
../common/check_all_patterns_found.pl "$REGEX_MISSING_DECL_LINE" >/dev/null < $LOGS_DIR/adding_blacklisted.err
if [ $? -eq 0 ]; then
SKIP_DWARF=1
echo "Result polluted by broken DWARF, trying another probe"
# confirm that the broken DWARF comes from assembler
if [ -n "$VMLINUX_FILE" ]; then
readelf -wi "$VMLINUX_FILE" |
awk -v probe="$BLACKFUNC" '/DW_AT_language/ { comp_lang = $0 }
$0 ~ probe { if (comp_lang) { print comp_lang }; exit }' |
grep -q "MIPS assembler"
CHECK_EXIT_CODE=$?
if [ $CHECK_EXIT_CODE -ne 0 ]; then
SKIP_DWARF=0 # broken DWARF while available
break
fi
fi
else
../common/check_all_lines_matched.pl "$REGEX_SKIP_MESSAGE" "$REGEX_NOT_FOUND_MESSAGE" "$REGEX_ERROR_MESSAGE" "$REGEX_SCOPE_FAIL" "$REGEX_INVALID_ARGUMENT" "$REGEX_SYMBOL_FAIL" "$REGEX_OUT_SECTION" < $LOGS_DIR/adding_blacklisted.err
CHECK_EXIT_CODE=$?
SKIP_DWARF=0
break
fi
done
if [ $SKIP_DWARF -eq 1 ]; then
print_testcase_skipped "adding blacklisted function $BLACKFUNC"
else
print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "adding blacklisted function $BLACKFUNC"
(( TEST_RESULT += $? ))
fi
### listing not-added probe

View file

@ -288,6 +288,10 @@ int sys_enter_rename(struct syscall_enter_args *args)
augmented_args->arg.size = PERF_ALIGN(oldpath_len + 1, sizeof(u64));
len += augmented_args->arg.size;
/* Every read from userspace is limited to value size */
if (augmented_args->arg.size > sizeof(augmented_args->arg.value))
return 1; /* Failure: don't filter */
struct augmented_arg *arg2 = (void *)&augmented_args->arg.value + augmented_args->arg.size;
newpath_len = augmented_arg__read_str(arg2, newpath_arg, sizeof(augmented_args->arg.value));
@ -315,6 +319,10 @@ int sys_enter_renameat2(struct syscall_enter_args *args)
augmented_args->arg.size = PERF_ALIGN(oldpath_len + 1, sizeof(u64));
len += augmented_args->arg.size;
/* Every read from userspace is limited to value size */
if (augmented_args->arg.size > sizeof(augmented_args->arg.value))
return 1; /* Failure: don't filter */
struct augmented_arg *arg2 = (void *)&augmented_args->arg.value + augmented_args->arg.size;
newpath_len = augmented_arg__read_str(arg2, newpath_arg, sizeof(augmented_args->arg.value));
@ -423,8 +431,9 @@ static bool pid_filter__has(struct pids_filtered *pids, pid_t pid)
static int augment_sys_enter(void *ctx, struct syscall_enter_args *args)
{
bool augmented, do_output = false;
int zero = 0, size, aug_size, index, output = 0,
int zero = 0, size, aug_size, index,
value_size = sizeof(struct augmented_arg) - offsetof(struct augmented_arg, value);
u64 output = 0; /* has to be u64, otherwise it won't pass the verifier */
unsigned int nr, *beauty_map;
struct beauty_payload_enter *payload;
void *arg, *payload_offset;
@ -477,6 +486,8 @@ static int augment_sys_enter(void *ctx, struct syscall_enter_args *args)
augmented = true;
} else if (size < 0 && size >= -6) { /* buffer */
index = -(size + 1);
barrier_var(index); // Prevent clang (noticed with v18) from removing the &= 7 trick.
index &= 7; // Satisfy the bounds checking with the verifier in some kernels.
aug_size = args->args[index];
if (aug_size > TRACE_AUG_MAX_BUF)
@ -488,10 +499,17 @@ static int augment_sys_enter(void *ctx, struct syscall_enter_args *args)
}
}
/* Augmented data size is limited to sizeof(augmented_arg->unnamed union with value field) */
if (aug_size > value_size)
aug_size = value_size;
/* write data to payload */
if (augmented) {
int written = offsetof(struct augmented_arg, value) + aug_size;
if (written < 0 || written > sizeof(struct augmented_arg))
return 1;
((struct augmented_arg *)payload_offset)->size = aug_size;
output += written;
payload_offset += written;
@ -499,7 +517,7 @@ static int augment_sys_enter(void *ctx, struct syscall_enter_args *args)
}
}
if (!do_output)
if (!do_output || (sizeof(struct syscall_enter_args) + output) > sizeof(struct beauty_payload_enter))
return 1;
return augmented__beauty_output(ctx, payload, sizeof(struct syscall_enter_args) + output);

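All of the augmented_raw_syscalls.bpf.c hunks above serve the BPF verifier's bounds tracking: array indexes are masked into a provably small range, read sizes are clamped to the destination's capacity before use, and the running output length is widened to u64 and re-checked against the payload size before the final output call. The index-clamping idiom in isolation (barrier_var() is libbpf's volatile-asm no-op that keeps clang from proving the mask redundant and deleting it):

int index = -(size + 1);        /* size: negative buffer code from the beauty map */

barrier_var(index);             /* hide the value from clang's range analysis */
index &= 7;                     /* verifier now sees 0 <= index <= 7 */
aug_size = args->args[index];
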
View file

@ -7,13 +7,9 @@
#include "debug.h"
#include <errno.h>
#include <string.h>
#include <unistd.h>
#include <linux/capability.h>
#include <sys/syscall.h>
#ifndef SYS_capget
#define SYS_capget 90
#endif
#include <unistd.h>
#define MAX_LINUX_CAPABILITY_U32S _LINUX_CAPABILITY_U32S_3
@ -21,9 +17,9 @@ bool perf_cap__capable(int cap, bool *used_root)
{
struct __user_cap_header_struct header = {
.version = _LINUX_CAPABILITY_VERSION_3,
.pid = getpid(),
.pid = 0,
};
struct __user_cap_data_struct data[MAX_LINUX_CAPABILITY_U32S];
struct __user_cap_data_struct data[MAX_LINUX_CAPABILITY_U32S] = {};
__u32 cap_val;
*used_root = false;

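The perf cap.c hunk drops both the getpid() call and the manual SYS_capget fallback define: for capget(2) a pid of 0 already designates the calling process, and zero-initializing the data array guarantees untouched u32 slots read as empty capability sets. A runnable userspace sketch of the same probe (via syscall(2); checking CAP_SYS_ADMIN is this sketch's choice):

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/capability.h>

int main(void)
{
        struct __user_cap_header_struct header = {
                .version = _LINUX_CAPABILITY_VERSION_3,
                .pid = 0,               /* 0 == the calling process */
        };
        struct __user_cap_data_struct data[_LINUX_CAPABILITY_U32S_3];

        memset(data, 0, sizeof(data));
        if (syscall(SYS_capget, &header, data))
                return 1;
        printf("CAP_SYS_ADMIN effective: %d\n",
               !!(data[CAP_TO_INDEX(CAP_SYS_ADMIN)].effective &
                  CAP_TO_MASK(CAP_SYS_ADMIN)));
        return 0;
}
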
View file

@ -19,6 +19,7 @@
#include "util/bpf-filter.h"
#include "util/env.h"
#include "util/kvm-stat.h"
#include "util/stat.h"
#include "util/kwork.h"
#include "util/sample.h"
#include "util/lock-contention.h"
@ -1355,6 +1356,7 @@ PyMODINIT_FUNC PyInit_perf(void)
unsigned int scripting_max_stack = PERF_MAX_STACK_DEPTH;
#ifdef HAVE_KVM_STAT_SUPPORT
bool kvm_entry_event(struct evsel *evsel __maybe_unused)
{
return false;
@ -1384,6 +1386,7 @@ void exit_event_decode_key(struct perf_kvm_stat *kvm __maybe_unused,
char *decode __maybe_unused)
{
}
#endif // HAVE_KVM_STAT_SUPPORT
int find_scripts(char **scripts_array __maybe_unused, char **scripts_path_array __maybe_unused,
int num __maybe_unused, int pathlen __maybe_unused)

View file

@ -46,6 +46,11 @@ static const char *const *syscalltbl_native = syscalltbl_mips_n64;
#include <asm/syscalls.c>
const int syscalltbl_native_max_id = SYSCALLTBL_LOONGARCH_MAX_ID;
static const char *const *syscalltbl_native = syscalltbl_loongarch;
#else
const int syscalltbl_native_max_id = 0;
static const char *const syscalltbl_native[] = {
[0] = "unknown",
};
#endif
struct syscall {
@ -182,6 +187,11 @@ int syscalltbl__id(struct syscalltbl *tbl, const char *name)
return audit_name_to_syscall(name, tbl->audit_machine);
}
int syscalltbl__id_at_idx(struct syscalltbl *tbl __maybe_unused, int idx)
{
return idx;
}
int syscalltbl__strglobmatch_next(struct syscalltbl *tbl __maybe_unused,
const char *syscall_glob __maybe_unused, int *idx __maybe_unused)
{

View file

@ -320,7 +320,7 @@ u32 bpf_cpumask_weight(const struct cpumask *cpumask) __ksym;
/*
* Access a cpumask in read-only mode (typically to check bits).
*/
const struct cpumask *cast_mask(struct bpf_cpumask *mask)
static __always_inline const struct cpumask *cast_mask(struct bpf_cpumask *mask)
{
return (const struct cpumask *)mask;
}

View file

@ -18,7 +18,7 @@ bool test_uffdio_wp = true;
unsigned long long *count_verify;
uffd_test_ops_t *uffd_test_ops;
uffd_test_case_ops_t *uffd_test_case_ops;
pthread_barrier_t ready_for_fork;
atomic_bool ready_for_fork;
static int uffd_mem_fd_create(off_t mem_size, bool hugetlb)
{
@ -519,8 +519,7 @@ void *uffd_poll_thread(void *arg)
pollfd[1].fd = pipefd[cpu*2];
pollfd[1].events = POLLIN;
/* Ready for parent thread to fork */
pthread_barrier_wait(&ready_for_fork);
ready_for_fork = true;
for (;;) {
ret = poll(pollfd, 2, -1);

View file

@ -33,6 +33,7 @@
#include <inttypes.h>
#include <stdint.h>
#include <sys/random.h>
#include <stdatomic.h>
#include "../kselftest.h"
#include "vm_util.h"
@ -104,7 +105,7 @@ extern bool map_shared;
extern bool test_uffdio_wp;
extern unsigned long long *count_verify;
extern volatile bool test_uffdio_copy_eexist;
extern pthread_barrier_t ready_for_fork;
extern atomic_bool ready_for_fork;
extern uffd_test_ops_t anon_uffd_test_ops;
extern uffd_test_ops_t shmem_uffd_test_ops;

View file

@ -241,8 +241,7 @@ static void *fork_event_consumer(void *data)
fork_event_args *args = data;
struct uffd_msg msg = { 0 };
/* Ready for parent thread to fork */
pthread_barrier_wait(&ready_for_fork);
ready_for_fork = true;
/* Read until a full msg received */
while (uffd_read_msg(args->parent_uffd, &msg));
@ -311,12 +310,11 @@ static int pagemap_test_fork(int uffd, bool with_event, bool test_pin)
/* Prepare a thread to resolve EVENT_FORK */
if (with_event) {
pthread_barrier_init(&ready_for_fork, NULL, 2);
ready_for_fork = false;
if (pthread_create(&thread, NULL, fork_event_consumer, &args))
err("pthread_create()");
/* Wait for child thread to start before forking */
pthread_barrier_wait(&ready_for_fork);
pthread_barrier_destroy(&ready_for_fork);
while (!ready_for_fork)
; /* Wait for the poll_thread to start executing before forking */
}
child = fork();
@ -781,7 +779,7 @@ static void uffd_sigbus_test_common(bool wp)
char c;
struct uffd_args args = { 0 };
pthread_barrier_init(&ready_for_fork, NULL, 2);
ready_for_fork = false;
fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
@ -798,9 +796,8 @@ static void uffd_sigbus_test_common(bool wp)
if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args))
err("uffd_poll_thread create");
/* Wait for child thread to start before forking */
pthread_barrier_wait(&ready_for_fork);
pthread_barrier_destroy(&ready_for_fork);
while (!ready_for_fork)
; /* Wait for the poll_thread to start executing before forking */
pid = fork();
if (pid < 0)
@ -841,7 +838,7 @@ static void uffd_events_test_common(bool wp)
char c;
struct uffd_args args = { 0 };
pthread_barrier_init(&ready_for_fork, NULL, 2);
ready_for_fork = false;
fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
if (uffd_register(uffd, area_dst, nr_pages * page_size,
@ -852,9 +849,8 @@ static void uffd_events_test_common(bool wp)
if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args))
err("uffd_poll_thread create");
/* Wait for child thread to start before forking */
pthread_barrier_wait(&ready_for_fork);
pthread_barrier_destroy(&ready_for_fork);
while (!ready_for_fork)
; /* Wait for the poll_thread to start executing before forking */
pid = fork();
if (pid < 0)

View file
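The userfaultfd selftest hunks above replace a pthread_barrier_t handshake with a C11 atomic_bool: the helper thread publishes readiness with a single store and the parent spins until it sees it, so there is no barrier object whose state gets duplicated into the child by fork() (nor any init/destroy churn between test cases). The handshake in miniature:

#include <pthread.h>
#include <stdatomic.h>
#include <unistd.h>

static atomic_bool ready_for_fork;
static pthread_t thread;

static void *worker(void *arg)
{
        /* ... set up poll fds etc ... */
        ready_for_fork = true;  /* now safe for the parent to fork */
        /* ... service loop ... */
        return NULL;
}

static int start_and_fork(void)
{
        ready_for_fork = false;
        if (pthread_create(&thread, NULL, worker, NULL))
                return -1;
        while (!ready_for_fork)
                ;       /* busy-wait, mirroring the test's pattern */
        return fork();
}
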

@ -184,7 +184,7 @@ auto-test-targets := \
testcase-targets := $(addsuffix .o,$(addprefix $(SCXOBJ_DIR)/,$(auto-test-targets)))
$(SCXOBJ_DIR)/runner.o: runner.c | $(SCXOBJ_DIR)
$(SCXOBJ_DIR)/runner.o: runner.c | $(SCXOBJ_DIR) $(BPFOBJ)
$(CC) $(CFLAGS) -c $< -o $@
# Create all of the test targets object files, whose testcase objects will be

View file

@ -51,8 +51,8 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(create_dsq_init)
SEC(".struct_ops.link")
struct sched_ext_ops create_dsq_ops = {
.init_task = create_dsq_init_task,
.exit_task = create_dsq_exit_task,
.init = create_dsq_init,
.init_task = (void *) create_dsq_init_task,
.exit_task = (void *) create_dsq_exit_task,
.init = (void *) create_dsq_init,
.name = "create_dsq",
};

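This diff, and the long run of sched_ext testcase diffs that follows, all make one mechanical change: every callback assigned into a struct sched_ext_ops initializer gains a (void *) cast. The BPF_STRUCT_OPS()/BPF_STRUCT_OPS_SLEEPABLE() wrappers emit functions whose C prototypes do not textually match the ops member types, and the cast keeps stricter compilers from rejecting the mismatch; libbpf still resolves each struct_ops member to the right program. Pattern in brief (names from the diff above):

SEC(".struct_ops.link")
struct sched_ext_ops create_dsq_ops = {
        /* cast: wrapper prototype != kernel ops member type */
        .init_task      = (void *) create_dsq_init_task,
        .init           = (void *) create_dsq_init,
        .name           = "create_dsq",
};
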
View file

@ -35,8 +35,8 @@ void BPF_STRUCT_OPS(ddsp_bogus_dsq_fail_exit, struct scx_exit_info *ei)
SEC(".struct_ops.link")
struct sched_ext_ops ddsp_bogus_dsq_fail_ops = {
.select_cpu = ddsp_bogus_dsq_fail_select_cpu,
.exit = ddsp_bogus_dsq_fail_exit,
.select_cpu = (void *) ddsp_bogus_dsq_fail_select_cpu,
.exit = (void *) ddsp_bogus_dsq_fail_exit,
.name = "ddsp_bogus_dsq_fail",
.timeout_ms = 1000U,
};

View file

@ -32,8 +32,8 @@ void BPF_STRUCT_OPS(ddsp_vtimelocal_fail_exit, struct scx_exit_info *ei)
SEC(".struct_ops.link")
struct sched_ext_ops ddsp_vtimelocal_fail_ops = {
.select_cpu = ddsp_vtimelocal_fail_select_cpu,
.exit = ddsp_vtimelocal_fail_exit,
.select_cpu = (void *) ddsp_vtimelocal_fail_select_cpu,
.exit = (void *) ddsp_vtimelocal_fail_exit,
.name = "ddsp_vtimelocal_fail",
.timeout_ms = 1000U,
};

View file

@ -56,10 +56,10 @@ void BPF_STRUCT_OPS(dsp_local_on_exit, struct scx_exit_info *ei)
SEC(".struct_ops.link")
struct sched_ext_ops dsp_local_on_ops = {
.select_cpu = dsp_local_on_select_cpu,
.enqueue = dsp_local_on_enqueue,
.dispatch = dsp_local_on_dispatch,
.exit = dsp_local_on_exit,
.select_cpu = (void *) dsp_local_on_select_cpu,
.enqueue = (void *) dsp_local_on_enqueue,
.dispatch = (void *) dsp_local_on_dispatch,
.exit = (void *) dsp_local_on_exit,
.name = "dsp_local_on",
.timeout_ms = 1000U,
};

View file

@ -12,10 +12,18 @@
char _license[] SEC("license") = "GPL";
u32 exit_kind;
void BPF_STRUCT_OPS_SLEEPABLE(enq_last_no_enq_fails_exit, struct scx_exit_info *info)
{
exit_kind = info->kind;
}
SEC(".struct_ops.link")
struct sched_ext_ops enq_last_no_enq_fails_ops = {
.name = "enq_last_no_enq_fails",
/* Need to define ops.enqueue() with SCX_OPS_ENQ_LAST */
.flags = SCX_OPS_ENQ_LAST,
.exit = (void *) enq_last_no_enq_fails_exit,
.timeout_ms = 1000U,
};

View file

@ -31,8 +31,12 @@ static enum scx_test_status run(void *ctx)
struct bpf_link *link;
link = bpf_map__attach_struct_ops(skel->maps.enq_last_no_enq_fails_ops);
if (link) {
SCX_ERR("Incorrectly succeeded in to attaching scheduler");
if (!link) {
SCX_ERR("Incorrectly failed at attaching scheduler");
return SCX_TEST_FAIL;
}
if (!skel->bss->exit_kind) {
SCX_ERR("Incorrectly stayed loaded");
return SCX_TEST_FAIL;
}
@ -50,7 +54,7 @@ static void cleanup(void *ctx)
struct scx_test enq_last_no_enq_fails = {
.name = "enq_last_no_enq_fails",
.description = "Verify we fail to load a scheduler if we specify "
.description = "Verify we eject a scheduler if we specify "
"the SCX_OPS_ENQ_LAST flag without defining "
"ops.enqueue()",
.setup = setup,

View file

@ -36,8 +36,8 @@ void BPF_STRUCT_OPS(enq_select_cpu_fails_enqueue, struct task_struct *p,
SEC(".struct_ops.link")
struct sched_ext_ops enq_select_cpu_fails_ops = {
.select_cpu = enq_select_cpu_fails_select_cpu,
.enqueue = enq_select_cpu_fails_enqueue,
.select_cpu = (void *) enq_select_cpu_fails_select_cpu,
.enqueue = (void *) enq_select_cpu_fails_enqueue,
.name = "enq_select_cpu_fails",
.timeout_ms = 1000U,
};

View file

@ -15,6 +15,8 @@ UEI_DEFINE(uei);
#define EXIT_CLEANLY() scx_bpf_exit(exit_point, "%d", exit_point)
#define DSQ_ID 0
s32 BPF_STRUCT_OPS(exit_select_cpu, struct task_struct *p,
s32 prev_cpu, u64 wake_flags)
{
@ -31,7 +33,7 @@ void BPF_STRUCT_OPS(exit_enqueue, struct task_struct *p, u64 enq_flags)
if (exit_point == EXIT_ENQUEUE)
EXIT_CLEANLY();
scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
scx_bpf_dispatch(p, DSQ_ID, SCX_SLICE_DFL, enq_flags);
}
void BPF_STRUCT_OPS(exit_dispatch, s32 cpu, struct task_struct *p)
@ -39,7 +41,7 @@ void BPF_STRUCT_OPS(exit_dispatch, s32 cpu, struct task_struct *p)
if (exit_point == EXIT_DISPATCH)
EXIT_CLEANLY();
scx_bpf_consume(SCX_DSQ_GLOBAL);
scx_bpf_consume(DSQ_ID);
}
void BPF_STRUCT_OPS(exit_enable, struct task_struct *p)
@ -67,18 +69,18 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(exit_init)
if (exit_point == EXIT_INIT)
EXIT_CLEANLY();
return 0;
return scx_bpf_create_dsq(DSQ_ID, -1);
}
SEC(".struct_ops.link")
struct sched_ext_ops exit_ops = {
.select_cpu = exit_select_cpu,
.enqueue = exit_enqueue,
.dispatch = exit_dispatch,
.init_task = exit_init_task,
.enable = exit_enable,
.exit = exit_exit,
.init = exit_init,
.select_cpu = (void *) exit_select_cpu,
.enqueue = (void *) exit_enqueue,
.dispatch = (void *) exit_dispatch,
.init_task = (void *) exit_init_task,
.enable = (void *) exit_enable,
.exit = (void *) exit_exit,
.init = (void *) exit_init,
.name = "exit",
.timeout_ms = 1000U,
};

View file

@ -46,16 +46,16 @@ void BPF_STRUCT_OPS_SLEEPABLE(hotplug_cpu_offline, s32 cpu)
SEC(".struct_ops.link")
struct sched_ext_ops hotplug_cb_ops = {
.cpu_online = hotplug_cpu_online,
.cpu_offline = hotplug_cpu_offline,
.exit = hotplug_exit,
.cpu_online = (void *) hotplug_cpu_online,
.cpu_offline = (void *) hotplug_cpu_offline,
.exit = (void *) hotplug_exit,
.name = "hotplug_cbs",
.timeout_ms = 1000U,
};
SEC(".struct_ops.link")
struct sched_ext_ops hotplug_nocb_ops = {
.exit = hotplug_exit,
.exit = (void *) hotplug_exit,
.name = "hotplug_nocbs",
.timeout_ms = 1000U,
};

View file

@ -45,9 +45,9 @@ void BPF_STRUCT_OPS(cnt_disable, struct task_struct *p)
SEC(".struct_ops.link")
struct sched_ext_ops init_enable_count_ops = {
.init_task = cnt_init_task,
.exit_task = cnt_exit_task,
.enable = cnt_enable,
.disable = cnt_disable,
.init_task = (void *) cnt_init_task,
.exit_task = (void *) cnt_exit_task,
.enable = (void *) cnt_enable,
.disable = (void *) cnt_disable,
.name = "init_enable_count",
};

View file

@ -131,34 +131,34 @@ void BPF_STRUCT_OPS(maximal_exit, struct scx_exit_info *info)
SEC(".struct_ops.link")
struct sched_ext_ops maximal_ops = {
.select_cpu = maximal_select_cpu,
.enqueue = maximal_enqueue,
.dequeue = maximal_dequeue,
.dispatch = maximal_dispatch,
.runnable = maximal_runnable,
.running = maximal_running,
.stopping = maximal_stopping,
.quiescent = maximal_quiescent,
.yield = maximal_yield,
.core_sched_before = maximal_core_sched_before,
.set_weight = maximal_set_weight,
.set_cpumask = maximal_set_cpumask,
.update_idle = maximal_update_idle,
.cpu_acquire = maximal_cpu_acquire,
.cpu_release = maximal_cpu_release,
.cpu_online = maximal_cpu_online,
.cpu_offline = maximal_cpu_offline,
.init_task = maximal_init_task,
.enable = maximal_enable,
.exit_task = maximal_exit_task,
.disable = maximal_disable,
.cgroup_init = maximal_cgroup_init,
.cgroup_exit = maximal_cgroup_exit,
.cgroup_prep_move = maximal_cgroup_prep_move,
.cgroup_move = maximal_cgroup_move,
.cgroup_cancel_move = maximal_cgroup_cancel_move,
.cgroup_set_weight = maximal_cgroup_set_weight,
.init = maximal_init,
.exit = maximal_exit,
.select_cpu = (void *) maximal_select_cpu,
.enqueue = (void *) maximal_enqueue,
.dequeue = (void *) maximal_dequeue,
.dispatch = (void *) maximal_dispatch,
.runnable = (void *) maximal_runnable,
.running = (void *) maximal_running,
.stopping = (void *) maximal_stopping,
.quiescent = (void *) maximal_quiescent,
.yield = (void *) maximal_yield,
.core_sched_before = (void *) maximal_core_sched_before,
.set_weight = (void *) maximal_set_weight,
.set_cpumask = (void *) maximal_set_cpumask,
.update_idle = (void *) maximal_update_idle,
.cpu_acquire = (void *) maximal_cpu_acquire,
.cpu_release = (void *) maximal_cpu_release,
.cpu_online = (void *) maximal_cpu_online,
.cpu_offline = (void *) maximal_cpu_offline,
.init_task = (void *) maximal_init_task,
.enable = (void *) maximal_enable,
.exit_task = (void *) maximal_exit_task,
.disable = (void *) maximal_disable,
.cgroup_init = (void *) maximal_cgroup_init,
.cgroup_exit = (void *) maximal_cgroup_exit,
.cgroup_prep_move = (void *) maximal_cgroup_prep_move,
.cgroup_move = (void *) maximal_cgroup_move,
.cgroup_cancel_move = (void *) maximal_cgroup_cancel_move,
.cgroup_set_weight = (void *) maximal_cgroup_set_weight,
.init = (void *) maximal_init,
.exit = (void *) maximal_exit,
.name = "maximal",
};

View file

@ -29,8 +29,8 @@ bool BPF_STRUCT_OPS(maybe_null_success_yield, struct task_struct *from,
SEC(".struct_ops.link")
struct sched_ext_ops maybe_null_success = {
.dispatch = maybe_null_success_dispatch,
.yield = maybe_null_success_yield,
.enable = maybe_null_running,
.dispatch = (void *) maybe_null_success_dispatch,
.yield = (void *) maybe_null_success_yield,
.enable = (void *) maybe_null_running,
.name = "minimal",
};

View file

@ -19,7 +19,7 @@ void BPF_STRUCT_OPS(maybe_null_fail_dispatch, s32 cpu, struct task_struct *p)
SEC(".struct_ops.link")
struct sched_ext_ops maybe_null_fail = {
.dispatch = maybe_null_fail_dispatch,
.enable = maybe_null_running,
.dispatch = (void *) maybe_null_fail_dispatch,
.enable = (void *) maybe_null_running,
.name = "maybe_null_fail_dispatch",
};

View file

@ -22,7 +22,7 @@ bool BPF_STRUCT_OPS(maybe_null_fail_yield, struct task_struct *from,
SEC(".struct_ops.link")
struct sched_ext_ops maybe_null_fail = {
.yield = maybe_null_fail_yield,
.enable = maybe_null_running,
.yield = (void *) maybe_null_fail_yield,
.enable = (void *) maybe_null_running,
.name = "maybe_null_fail_yield",
};

View file

@ -28,6 +28,6 @@ void BPF_STRUCT_OPS(prog_run_exit, struct scx_exit_info *ei)
SEC(".struct_ops.link")
struct sched_ext_ops prog_run_ops = {
.exit = prog_run_exit,
.exit = (void *) prog_run_exit,
.name = "prog_run",
};

View file

@ -35,6 +35,6 @@ void BPF_STRUCT_OPS(select_cpu_dfl_enqueue, struct task_struct *p,
SEC(".struct_ops.link")
struct sched_ext_ops select_cpu_dfl_ops = {
.enqueue = select_cpu_dfl_enqueue,
.enqueue = (void *) select_cpu_dfl_enqueue,
.name = "select_cpu_dfl",
};

View file

@@ -82,8 +82,8 @@ s32 BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_init_task,
 SEC(".struct_ops.link")
 struct sched_ext_ops select_cpu_dfl_nodispatch_ops = {
-	.select_cpu = select_cpu_dfl_nodispatch_select_cpu,
-	.enqueue = select_cpu_dfl_nodispatch_enqueue,
-	.init_task = select_cpu_dfl_nodispatch_init_task,
+	.select_cpu = (void *) select_cpu_dfl_nodispatch_select_cpu,
+	.enqueue = (void *) select_cpu_dfl_nodispatch_enqueue,
+	.init_task = (void *) select_cpu_dfl_nodispatch_init_task,
 	.name = "select_cpu_dfl_nodispatch",
 };


@@ -35,7 +35,7 @@ s32 BPF_STRUCT_OPS(select_cpu_dispatch_select_cpu, struct task_struct *p,
 SEC(".struct_ops.link")
 struct sched_ext_ops select_cpu_dispatch_ops = {
-	.select_cpu = select_cpu_dispatch_select_cpu,
+	.select_cpu = (void *) select_cpu_dispatch_select_cpu,
 	.name = "select_cpu_dispatch",
 	.timeout_ms = 1000U,
 };


@@ -30,8 +30,8 @@ void BPF_STRUCT_OPS(select_cpu_dispatch_bad_dsq_exit, struct scx_exit_info *ei)
 SEC(".struct_ops.link")
 struct sched_ext_ops select_cpu_dispatch_bad_dsq_ops = {
-	.select_cpu = select_cpu_dispatch_bad_dsq_select_cpu,
-	.exit = select_cpu_dispatch_bad_dsq_exit,
+	.select_cpu = (void *) select_cpu_dispatch_bad_dsq_select_cpu,
+	.exit = (void *) select_cpu_dispatch_bad_dsq_exit,
 	.name = "select_cpu_dispatch_bad_dsq",
 	.timeout_ms = 1000U,
 };


@@ -31,8 +31,8 @@ void BPF_STRUCT_OPS(select_cpu_dispatch_dbl_dsp_exit, struct scx_exit_info *ei)
 SEC(".struct_ops.link")
 struct sched_ext_ops select_cpu_dispatch_dbl_dsp_ops = {
-	.select_cpu = select_cpu_dispatch_dbl_dsp_select_cpu,
-	.exit = select_cpu_dispatch_dbl_dsp_exit,
+	.select_cpu = (void *) select_cpu_dispatch_dbl_dsp_select_cpu,
+	.exit = (void *) select_cpu_dispatch_dbl_dsp_exit,
 	.name = "select_cpu_dispatch_dbl_dsp",
 	.timeout_ms = 1000U,
 };


@@ -81,12 +81,12 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(select_cpu_vtime_init)
 SEC(".struct_ops.link")
 struct sched_ext_ops select_cpu_vtime_ops = {
-	.select_cpu = select_cpu_vtime_select_cpu,
-	.dispatch = select_cpu_vtime_dispatch,
-	.running = select_cpu_vtime_running,
-	.stopping = select_cpu_vtime_stopping,
-	.enable = select_cpu_vtime_enable,
-	.init = select_cpu_vtime_init,
+	.select_cpu = (void *) select_cpu_vtime_select_cpu,
+	.dispatch = (void *) select_cpu_vtime_dispatch,
+	.running = (void *) select_cpu_vtime_running,
+	.stopping = (void *) select_cpu_vtime_stopping,
+	.enable = (void *) select_cpu_vtime_enable,
+	.init = (void *) select_cpu_vtime_init,
 	.name = "select_cpu_vtime",
 	.timeout_ms = 1000U,
 };
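For orientation, the select_cpu_vtime test exercises virtual-time-ordered dispatch. A hedged sketch of what such an enqueue/init pair typically looks like, assuming the scx kfuncs of this kernel series (VTIME_DSQ is an illustrative DSQ id, not code from this diff):

#define VTIME_DSQ 0 /* illustrative user DSQ id, created in .init below */

void BPF_STRUCT_OPS(sketch_enqueue, struct task_struct *p, u64 enq_flags)
{
	/* Insert into a DSQ ordered by the task's vtime rather than FIFO. */
	scx_bpf_dispatch_vtime(p, VTIME_DSQ, SCX_SLICE_DFL,
			       p->scx.dsq_vtime, enq_flags);
}

s32 BPF_STRUCT_OPS_SLEEPABLE(sketch_init)
{
	/* Create the custom DSQ on any NUMA node (-1). */
	return scx_bpf_create_dsq(VTIME_DSQ, -1);
}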


@@ -1522,6 +1522,45 @@ static bool test_copy_vma(void)
 	return true;
 }
 
+static bool test_expand_only_mode(void)
+{
+	unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
+	struct mm_struct mm = {};
+	VMA_ITERATOR(vmi, &mm, 0);
+	struct vm_area_struct *vma_prev, *vma;
+	VMG_STATE(vmg, &mm, &vmi, 0x5000, 0x9000, flags, 5);
+
+	/*
+	 * Place a VMA prior to the one we're expanding so we assert that we do
+	 * not erroneously try to traverse to the previous VMA even though we
+	 * have, through the use of VMG_FLAG_JUST_EXPAND, indicated we do not
+	 * need to do so.
+	 */
+	alloc_and_link_vma(&mm, 0, 0x2000, 0, flags);
+
+	/*
+	 * We will be positioned at the prev VMA, but looking to expand to
+	 * 0x9000.
+	 */
+	vma_iter_set(&vmi, 0x3000);
+	vma_prev = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags);
+	vmg.prev = vma_prev;
+	vmg.merge_flags = VMG_FLAG_JUST_EXPAND;
+
+	vma = vma_merge_new_range(&vmg);
+	ASSERT_NE(vma, NULL);
+	ASSERT_EQ(vma, vma_prev);
+	ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
+	ASSERT_EQ(vma->vm_start, 0x3000);
+	ASSERT_EQ(vma->vm_end, 0x9000);
+	ASSERT_EQ(vma->vm_pgoff, 3);
+	ASSERT_TRUE(vma_write_started(vma));
+	ASSERT_EQ(vma_iter_addr(&vmi), 0x3000);
+
+	cleanup_mm(&mm, &vmi);
+	return true;
+}
+
 int main(void)
 {
 	int num_tests = 0, num_fail = 0;
@@ -1553,6 +1592,7 @@ int main(void)
 	TEST(vmi_prealloc_fail);
 	TEST(merge_extend);
 	TEST(copy_vma);
+	TEST(expand_only_mode);
 
 #undef TEST
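For readers outside this file: TEST(name) in this harness is a token-pasting macro, so TEST(expand_only_mode) invokes the new test_expand_only_mode() above and tallies its result into num_tests/num_fail. A sketch of the shape such a macro takes (reconstructed for illustration; the exact failure message is an assumption, not part of this diff):

#define TEST(name)							\
	do {								\
		num_tests++;						\
		if (!test_##name()) {					\
			num_fail++;					\
			fprintf(stderr, "Failed test: " #name "\n");	\
		}							\
	} while (0)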