/* SPDX-License-Identifier: GPL-2.0 */
/*
 * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
 *
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2022 David Vernet <dvernet@meta.com>
 */
#ifndef _LINUX_SCHED_EXT_H
#define _LINUX_SCHED_EXT_H

#ifdef CONFIG_SCHED_CLASS_EXT

#include <linux/llist.h>
#include <linux/rhashtable-types.h>

enum scx_public_consts {
	SCX_OPS_NAME_LEN	= 128,

	SCX_SLICE_DFL		= 20 * 1000000,	/* 20ms */
	SCX_SLICE_INF		= U64_MAX,	/* infinite, implies nohz */
};

/*
 * DSQ (dispatch queue) IDs are 64bit of the format:
 *
 *   Bits: [63] [62 ..  0]
 *         [ B] [   ID   ]
 *
 *    B: 1 for IDs for built-in DSQs, 0 for ops-created user DSQs
 *   ID: 63 bit ID
 *
 * Built-in IDs:
 *
 *   Bits: [63] [62] [61..32] [31 ..  0]
 *         [ 1] [ L] [   R  ] [    V   ]
 *
 *    1: 1 for built-in DSQs.
 *    L: 1 for LOCAL_ON DSQ IDs, 0 for others
 *    V: For LOCAL_ON DSQ IDs, a CPU number. For others, a pre-defined value.
 */
enum scx_dsq_id_flags {
	SCX_DSQ_FLAG_BUILTIN	= 1LLU << 63,
	SCX_DSQ_FLAG_LOCAL_ON	= 1LLU << 62,

	SCX_DSQ_INVALID		= SCX_DSQ_FLAG_BUILTIN | 0,
	SCX_DSQ_GLOBAL		= SCX_DSQ_FLAG_BUILTIN | 1,
	SCX_DSQ_LOCAL		= SCX_DSQ_FLAG_BUILTIN | 2,
	SCX_DSQ_LOCAL_ON	= SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON,
	SCX_DSQ_LOCAL_CPU_MASK	= 0xffffffffLLU,
};

/*
 * A dispatch queue (DSQ) can be either a FIFO or p->scx.dsq_vtime ordered
 * queue. A built-in DSQ is always a FIFO. The built-in local DSQs are used to
 * buffer between the scheduler core and the BPF scheduler. See the
 * documentation for more details.
 */
struct scx_dispatch_q {
	raw_spinlock_t		lock;
	struct list_head	list;	/* tasks in dispatch order */
	struct rb_root		priq;	/* used to order by p->scx.dsq_vtime */
	u32			nr;	/* NOTE(review): presumably the task count — confirm against kernel/sched/ext.c */
	u32			seq;	/* used by BPF iter */
	u64			id;	/* DSQ ID, see enum scx_dsq_id_flags above */
	struct rhash_head	hash_node;
	struct llist_node	free_node;
	struct rcu_head		rcu;
};

/* scx_entity.flags */
enum scx_ent_flags {
	SCX_TASK_QUEUED		= 1 << 0, /* on ext runqueue */
	SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */
	SCX_TASK_DEQD_FOR_SLEEP	= 1 << 3, /* last dequeue was for SLEEP */

	SCX_TASK_STATE_SHIFT	= 8,	  /* bit 8 and 9 are used to carry scx_task_state */
	SCX_TASK_STATE_BITS	= 2,
	SCX_TASK_STATE_MASK	= ((1 << SCX_TASK_STATE_BITS) - 1) << SCX_TASK_STATE_SHIFT,

	SCX_TASK_CURSOR		= 1 << 31, /* iteration cursor, not a task */
};

/* scx_entity.flags & SCX_TASK_STATE_MASK */
enum scx_task_state {
	SCX_TASK_NONE,		/* ops.init_task() not called yet */
	SCX_TASK_INIT,		/* ops.init_task() succeeded, but task can be cancelled */
	SCX_TASK_READY,		/* fully initialized, but not in sched_ext */
	SCX_TASK_ENABLED,	/* fully initialized and in sched_ext */

	SCX_TASK_NR_STATES,
};

/* scx_entity.dsq_flags */
enum scx_ent_dsq_flags {
	SCX_TASK_DSQ_ON_PRIQ	= 1 << 0, /* task is queued on the priority queue of a dsq */
};

/*
 * Mask bits for scx_entity.kf_mask. Not all kfuncs can be called from
 * everywhere and the following bits track which kfunc sets are currently
 * allowed for %current. This simple per-task tracking works because SCX ops
 * nest in a limited way. BPF will likely implement a way to allow and disallow
 * kfuncs depending on the calling context which will replace this manual
 * mechanism. See scx_kf_allow().
 */
enum scx_kf_mask {
	SCX_KF_UNLOCKED		= 0,	  /* sleepable and not rq locked */
	/* ENQUEUE and DISPATCH may be nested inside CPU_RELEASE */
	SCX_KF_CPU_RELEASE	= 1 << 0, /* ops.cpu_release() */
	/* ops.dequeue (in REST) may be nested inside DISPATCH */
	SCX_KF_DISPATCH		= 1 << 1, /* ops.dispatch() */
	SCX_KF_ENQUEUE		= 1 << 2, /* ops.enqueue() and ops.select_cpu() */
	SCX_KF_SELECT_CPU	= 1 << 3, /* ops.select_cpu() */
	SCX_KF_REST		= 1 << 4, /* other rq-locked operations */

	__SCX_KF_RQ_LOCKED	= SCX_KF_CPU_RELEASE | SCX_KF_DISPATCH |
				  SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST,
	__SCX_KF_TERMINAL	= SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST,
};

enum scx_dsq_lnode_flags {
	SCX_DSQ_LNODE_ITER_CURSOR = 1 << 0,

	/* high 16 bits can be for iter cursor flags */
	__SCX_DSQ_LNODE_PRIV_SHIFT = 16,
};

struct scx_dsq_list_node {
	struct list_head	node;
	u32			flags;	/* see enum scx_dsq_lnode_flags above */
	u32			priv;	/* can be used by iter cursor */
};

/*
 * The following is embedded in task_struct and contains all fields necessary
 * for a task to be scheduled by SCX.
 */
struct sched_ext_entity {
	struct scx_dispatch_q	*dsq;
	struct scx_dsq_list_node dsq_list;	/* dispatch order */
	struct rb_node		dsq_priq;	/* p->scx.dsq_vtime order */
	u32			dsq_seq;
	u32			dsq_flags;	/* protected by DSQ lock */
	u32			flags;		/* protected by rq lock */
	u32			weight;
	s32			sticky_cpu;
	s32			holding_cpu;
	u32			kf_mask;	/* see scx_kf_mask above */
	struct task_struct	*kf_tasks[2];	/* see SCX_CALL_OP_TASK() */
	atomic_long_t		ops_state;

	struct list_head	runnable_node;	/* rq->scx.runnable_list */
	unsigned long		runnable_at;

#ifdef CONFIG_SCHED_CORE
	u64			core_sched_at;	/* see scx_prio_less() */
#endif
	u64			ddsp_dsq_id;
	u64			ddsp_enq_flags;

	/* BPF scheduler modifiable fields */

	/*
	 * Runtime budget in nsecs. This is usually set through
	 * scx_bpf_dispatch() but can also be modified directly by the BPF
	 * scheduler. Automatically decreased by SCX as the task executes. On
	 * depletion, a scheduling event is triggered.
	 *
	 * This value is cleared to zero if the task is preempted by
	 * %SCX_KICK_PREEMPT and shouldn't be used to determine how long the
	 * task ran. Use p->se.sum_exec_runtime instead.
	 */
	u64			slice;

	/*
	 * Used to order tasks when dispatching to the vtime-ordered priority
	 * queue of a dsq. This is usually set through scx_bpf_dispatch_vtime()
	 * but can also be modified directly by the BPF scheduler. Modifying it
	 * while a task is queued on a dsq may mangle the ordering and is not
	 * recommended.
	 */
	u64			dsq_vtime;

	/*
	 * If set, reject future sched_setscheduler(2) calls updating the policy
	 * to %SCHED_EXT with -%EACCES.
	 *
	 * Can be set from ops.init_task() while the BPF scheduler is being
	 * loaded (!scx_init_task_args->fork). If set and the task's policy is
	 * already %SCHED_EXT, the task's policy is rejected and forcefully
	 * reverted to %SCHED_NORMAL. The number of such events are reported
	 * through /sys/kernel/debug/sched_ext::nr_rejected. Setting this flag
	 * during fork is not allowed.
	 */
	bool			disallow;	/* reject switching into SCX */

	/* cold fields */
#ifdef CONFIG_EXT_GROUP_SCHED
	struct cgroup		*cgrp_moving_from;
#endif
	/* must be the last field, see init_scx_entity() */
	struct list_head	tasks_node;
};

void sched_ext_free(struct task_struct *p);
void print_scx_info(const char *log_lvl, struct task_struct *p);

#else	/* !CONFIG_SCHED_CLASS_EXT */

/* No-op stubs so callers need not guard on CONFIG_SCHED_CLASS_EXT. */
static inline void sched_ext_free(struct task_struct *p) {}
static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {}

#endif	/* CONFIG_SCHED_CLASS_EXT */
#endif	/* _LINUX_SCHED_EXT_H */
Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.