mirror of
https://github.com/torvalds/linux.git
synced 2024-11-01 04:53:36 +01:00
sched_ext: Improve error reporting during loading
When the BPF scheduler fails, ops.exit() allows rich error reporting through scx_exit_info. Use scx.exit() path consistently for all failures which can be caused by the BPF scheduler: - scx_ops_error() is called after ops.init() and ops.cgroup_init() failure to record error information. - ops.init_task() failure now uses scx_ops_error() instead of pr_err(). - The err_disable path updated to automatically trigger scx_ops_error() to cover cases that the error message hasn't already been generated and always return 0 indicating init success so that the error is reported through ops.exit(). Signed-off-by: Tejun Heo <tj@kernel.org> Cc: David Vernet <void@manifault.com> Cc: Daniel Hodges <hodges.daniel.scott@gmail.com> Cc: Changwoo Min <multics69@gmail.com> Cc: Andrea Righi <andrea.righi@linux.dev> Cc: Dan Schatzberg <schatzberg.dan@gmail.com>
This commit is contained in:
parent
fcbc423577
commit
cc9877fb76
1 changed file with 19 additions and 11 deletions
|
@ -625,6 +625,10 @@ struct sched_ext_ops {
|
|||
/**
|
||||
* exit - Clean up after the BPF scheduler
|
||||
* @info: Exit info
|
||||
*
|
||||
* ops.exit() is also called on ops.init() failure, which is a bit
|
||||
* unusual. This is to allow rich reporting through @info on how
|
||||
* ops.init() failed.
|
||||
*/
|
||||
void (*exit)(struct scx_exit_info *info);
|
||||
|
||||
|
@ -4117,6 +4121,7 @@ static int scx_cgroup_init(void)
|
|||
css->cgroup, &args);
|
||||
if (ret) {
|
||||
css_put(css);
|
||||
scx_ops_error("ops.cgroup_init() failed (%d)", ret);
|
||||
return ret;
|
||||
}
|
||||
tg->scx_flags |= SCX_TG_INITED;
|
||||
|
@ -5041,6 +5046,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
|
|||
if (ret) {
|
||||
ret = ops_sanitize_err("init", ret);
|
||||
cpus_read_unlock();
|
||||
scx_ops_error("ops.init() failed (%d)", ret);
|
||||
goto err_disable;
|
||||
}
|
||||
}
|
||||
|
@ -5150,8 +5156,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
|
|||
spin_lock_irq(&scx_tasks_lock);
|
||||
scx_task_iter_exit(&sti);
|
||||
spin_unlock_irq(&scx_tasks_lock);
|
||||
pr_err("sched_ext: ops.init_task() failed (%d) for %s[%d] while loading\n",
|
||||
ret, p->comm, p->pid);
|
||||
scx_ops_error("ops.init_task() failed (%d) for %s[%d]",
|
||||
ret, p->comm, p->pid);
|
||||
goto err_disable_unlock_all;
|
||||
}
|
||||
|
||||
|
@ -5199,14 +5205,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
|
|||
|
||||
scx_ops_bypass(false);
|
||||
|
||||
/*
|
||||
* Returning an error code here would lose the recorded error
|
||||
* information. Exit indicating success so that the error is notified
|
||||
* through ops.exit() with all the details.
|
||||
*/
|
||||
if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) {
|
||||
WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE);
|
||||
ret = 0;
|
||||
goto err_disable;
|
||||
}
|
||||
|
||||
|
@ -5241,10 +5241,18 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
|
|||
scx_ops_bypass(false);
|
||||
err_disable:
|
||||
mutex_unlock(&scx_ops_enable_mutex);
|
||||
/* must be fully disabled before returning */
|
||||
scx_ops_disable(SCX_EXIT_ERROR);
|
||||
/*
|
||||
* Returning an error code here would not pass all the error information
|
||||
* to userspace. Record errno using scx_ops_error() for cases
|
||||
* scx_ops_error() wasn't already invoked and exit indicating success so
|
||||
* that the error is notified through ops.exit() with all the details.
|
||||
*
|
||||
* Flush scx_ops_disable_work to ensure that error is reported before
|
||||
* init completion.
|
||||
*/
|
||||
scx_ops_error("scx_ops_enable() failed (%d)", ret);
|
||||
kthread_flush_work(&scx_ops_disable_work);
|
||||
return ret;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
|
Loading…
Reference in a new issue