// SPDX-License-Identifier: GPL-2.0-only
/*
 * Generic pidhash and scalable, time-bounded PID allocator
 *
 * (C) 2002-2003 Nadia Yvette Chambers, IBM
 * (C) 2004 Nadia Yvette Chambers, Oracle
 * (C) 2002-2004 Ingo Molnar, Red Hat
 *
 * pid-structures are backing objects for tasks sharing a given ID to chain
 * against. There is very little to them aside from hashing them and
 * parking tasks using given IDs on a list.
 *
 * The hash is always changed with the tasklist_lock write-acquired,
 * and the hash is only accessed with the tasklist_lock at least
 * read-acquired, so there's no additional SMP locking needed here.
 *
 * We have a list of bitmap pages, which bitmaps represent the PID space.
 * Allocating and freeing PIDs is completely lockless. The worst-case
 * allocation scenario when all but one out of 1 million PIDs possible are
 * allocated already: the scanning of 32 list entries and at most PAGE_SIZE
 * bytes. The typical fastpath is a single successful setbit. Freeing is O(1).
 *
 * Pid namespaces:
 *    (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc.
 *    (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM
 *     Many thanks to Oleg Nesterov for comments and help
 *
 */

#include <linux/mm.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/rculist.h>
#include <linux/memblock.h>
#include <linux/pid_namespace.h>
#include <linux/init_task.h>
#include <linux/syscalls.h>
#include <linux/proc_ns.h>
#include <linux/refcount.h>
#include <linux/anon_inodes.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/idr.h>
#include <linux/pidfs.h>
#include <net/sock.h>
#include <uapi/linux/pidfd.h>

struct pid init_struct_pid = {
	.count		= REFCOUNT_INIT(1),
	.tasks		= {
		{ .first = NULL },
		{ .first = NULL },
		{ .first = NULL },
	},
	.level		= 0,
	.numbers	= { {
		.nr		= 0,
		.ns		= &init_pid_ns,
	}, }
};

int pid_max = PID_MAX_DEFAULT;

int pid_max_min = RESERVED_PIDS + 1;
int pid_max_max = PID_MAX_LIMIT;
/*
 * Pseudo filesystems start inode numbering after one. We use Reserved
 * PIDs as a natural offset.
 */
static u64 pidfs_ino = RESERVED_PIDS;

/*
 * PID-map pages start out as NULL, they get allocated upon
 * first use and are never deallocated. This way a low pid_max
 * value does not cause lots of bitmaps to be allocated, but
 * the scheme scales to up to 4 million PIDs, runtime.
 */
struct pid_namespace init_pid_ns = {
	.ns.count = REFCOUNT_INIT(2),
	.idr = IDR_INIT(init_pid_ns.idr),
	.pid_allocated = PIDNS_ADDING,
	.level = 0,
	.child_reaper = &init_task,
	.user_ns = &init_user_ns,
	.ns.inum = PROC_PID_INIT_INO,
#ifdef CONFIG_PID_NS
	.ns.ops = &pidns_operations,
#endif
#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
	.memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC,
#endif
};
EXPORT_SYMBOL_GPL(init_pid_ns);

/*
 * Note: disable interrupts while the pidmap_lock is held as an
 * interrupt might come in and do read_lock(&tasklist_lock).
 *
 * If we don't disable interrupts there is a nasty deadlock between
 * detach_pid()->free_pid() and another cpu that does
 * spin_lock(&pidmap_lock) followed by an interrupt routine that does
 * read_lock(&tasklist_lock);
 *
 * After we clean up the tasklist_lock and know there are no
 * irq handlers that take it we can leave the interrupts enabled.
 * For now it is easier to be safe than to prove it can't happen.
 */

static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
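
/*
 * Illustrative sketch (not part of the original file) of the deadlock the
 * comment above guards against, assuming pidmap_lock were ever taken with
 * interrupts enabled:
 *
 *	CPU 0					CPU 1
 *	write_lock_irq(&tasklist_lock)
 *						spin_lock(&pidmap_lock)
 *	detach_pid()
 *	  free_pid()
 *	    spin_lock(&pidmap_lock)	// spins: CPU 1 holds it
 *						<interrupt>
 *						read_lock(&tasklist_lock)
 *						// spins: CPU 0 holds it
 *
 * Neither CPU can make progress, which is why free_pid() uses
 * spin_lock_irqsave() and the other sites use spin_lock_irq().
 */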

void put_pid(struct pid *pid)
{
	struct pid_namespace *ns;

	if (!pid)
		return;

	ns = pid->numbers[pid->level].ns;
	if (refcount_dec_and_test(&pid->count)) {
		kmem_cache_free(ns->pid_cachep, pid);
		put_pid_ns(ns);
	}
}
EXPORT_SYMBOL_GPL(put_pid);

static void delayed_put_pid(struct rcu_head *rhp)
{
	struct pid *pid = container_of(rhp, struct pid, rcu);
	put_pid(pid);
}

void free_pid(struct pid *pid)
{
	/* We can be called with write_lock_irq(&tasklist_lock) held */
	int i;
	unsigned long flags;

	spin_lock_irqsave(&pidmap_lock, flags);
	for (i = 0; i <= pid->level; i++) {
		struct upid *upid = pid->numbers + i;
		struct pid_namespace *ns = upid->ns;
		switch (--ns->pid_allocated) {
		case 2:
		case 1:
			/* When all that is left in the pid namespace
			 * is the reaper, wake up the reaper. The reaper
			 * may be sleeping in zap_pid_ns_processes().
			 */
			wake_up_process(ns->child_reaper);
			break;
		case PIDNS_ADDING:
			/* Handle a fork failure of the first process */
			WARN_ON(ns->child_reaper);
			ns->pid_allocated = 0;
			break;
		}

		idr_remove(&ns->idr, upid->nr);
	}
	spin_unlock_irqrestore(&pidmap_lock, flags);

	call_rcu(&pid->rcu, delayed_put_pid);
}
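
/*
 * Illustrative sketch (not part of the original file) of the lifetime
 * rules the above implement. A pid returned by find_pid_ns()/find_vpid()
 * is only stable inside an RCU read-side section, because free_pid()
 * defers the final put through call_rcu(). A caller that wants to keep
 * the pid past the RCU section must take a reference first:
 *
 *	rcu_read_lock();
 *	pid = get_pid(find_vpid(nr));	// ref taken while lookup is valid
 *	rcu_read_unlock();
 *	...
 *	put_pid(pid);	// drops the ref; frees once the count hits zero
 */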

struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
		      size_t set_tid_size)
{
	struct pid *pid;
	enum pid_type type;
	int i, nr;
	struct pid_namespace *tmp;
	struct upid *upid;
	int retval = -ENOMEM;

	/*
	 * set_tid_size contains the size of the set_tid array. Starting at
	 * the most nested currently active PID namespace it tells alloc_pid()
	 * which PID to set for a process in that most nested PID namespace
	 * up to set_tid_size PID namespaces. It does not have to set the PID
	 * for a process in all nested PID namespaces but set_tid_size must
	 * never be greater than the current ns->level + 1.
	 */
	if (set_tid_size > ns->level + 1)
		return ERR_PTR(-EINVAL);

	pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
	if (!pid)
		return ERR_PTR(retval);

	tmp = ns;
	pid->level = ns->level;

	for (i = ns->level; i >= 0; i--) {
		int tid = 0;

		if (set_tid_size) {
			tid = set_tid[ns->level - i];

			retval = -EINVAL;
			if (tid < 1 || tid >= pid_max)
				goto out_free;
			/*
			 * Also fail if a PID != 1 is requested and
			 * no PID 1 exists.
			 */
			if (tid != 1 && !tmp->child_reaper)
				goto out_free;
			retval = -EPERM;
			if (!checkpoint_restore_ns_capable(tmp->user_ns))
				goto out_free;
			set_tid_size--;
		}

		idr_preload(GFP_KERNEL);
		spin_lock_irq(&pidmap_lock);

		if (tid) {
			nr = idr_alloc(&tmp->idr, NULL, tid,
				       tid + 1, GFP_ATOMIC);
			/*
			 * If ENOSPC is returned it means that the PID is
			 * already in use. Return EEXIST in that case.
			 */
			if (nr == -ENOSPC)
				nr = -EEXIST;
		} else {
			int pid_min = 1;
			/*
			 * init really needs pid 1, but after reaching the
			 * maximum wrap back to RESERVED_PIDS
			 */
			if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
				pid_min = RESERVED_PIDS;

			/*
			 * Store a null pointer so find_pid_ns does not find
			 * a partially initialized PID (see below).
			 */
			nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
					      pid_max, GFP_ATOMIC);
		}
		spin_unlock_irq(&pidmap_lock);
		idr_preload_end();

		if (nr < 0) {
			retval = (nr == -ENOSPC) ? -EAGAIN : nr;
			goto out_free;
		}

		pid->numbers[i].nr = nr;
		pid->numbers[i].ns = tmp;
		tmp = tmp->parent;
	}

	/*
	 * ENOMEM is not the most obvious choice especially for the case
	 * where the child subreaper has already exited and the pid
	 * namespace denies the creation of any new processes. But ENOMEM
	 * is what we have exposed to userspace for a long time and it is
	 * documented behavior for pid namespaces. So we can't easily
	 * change it even if there were an error code better suited.
	 */
	retval = -ENOMEM;

	get_pid_ns(ns);
	refcount_set(&pid->count, 1);
	spin_lock_init(&pid->lock);
	for (type = 0; type < PIDTYPE_MAX; ++type)
		INIT_HLIST_HEAD(&pid->tasks[type]);

	init_waitqueue_head(&pid->wait_pidfd);
	INIT_HLIST_HEAD(&pid->inodes);

	upid = pid->numbers + ns->level;
	spin_lock_irq(&pidmap_lock);
	if (!(ns->pid_allocated & PIDNS_ADDING))
		goto out_unlock;
	pid->stashed = NULL;
	pid->ino = ++pidfs_ino;
	for ( ; upid >= pid->numbers; --upid) {
		/* Make the PID visible to find_pid_ns. */
		idr_replace(&upid->ns->idr, pid, upid->nr);
		upid->ns->pid_allocated++;
	}
	spin_unlock_irq(&pidmap_lock);

	return pid;

out_unlock:
	spin_unlock_irq(&pidmap_lock);
	put_pid_ns(ns);

out_free:
	spin_lock_irq(&pidmap_lock);
	while (++i <= ns->level) {
		upid = pid->numbers + i;
		idr_remove(&upid->ns->idr, upid->nr);
	}

	/* On failure to allocate the first pid, reset the state */
	if (ns->pid_allocated == PIDNS_ADDING)
		idr_set_cursor(&ns->idr, 0);

	spin_unlock_irq(&pidmap_lock);

	kmem_cache_free(ns->pid_cachep, pid);
	return ERR_PTR(retval);
}
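
/*
 * Illustrative sketch (hypothetical values, not part of the original
 * file): how clone3() callers with CAP_CHECKPOINT_RESTORE pin PIDs via
 * the set_tid array. For a child created in a namespace one level below
 * the init namespace (ns->level == 1), set_tid[0] names the PID in the
 * most nested namespace and set_tid[1] the PID in its parent:
 *
 *	pid_t set_tid[] = { 1, 4321 };
 *	pid = alloc_pid(ns, set_tid, ARRAY_SIZE(set_tid));
 *
 * Requesting 1 makes the child that namespace's reaper; any other value
 * requires that the namespace already has a PID 1, and every level is
 * checked with checkpoint_restore_ns_capable() against its user ns.
 */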

void disable_pid_allocation(struct pid_namespace *ns)
{
	spin_lock_irq(&pidmap_lock);
	ns->pid_allocated &= ~PIDNS_ADDING;
	spin_unlock_irq(&pidmap_lock);
}

struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
{
	return idr_find(&ns->idr, nr);
}
EXPORT_SYMBOL_GPL(find_pid_ns);

struct pid *find_vpid(int nr)
{
	return find_pid_ns(nr, task_active_pid_ns(current));
}
EXPORT_SYMBOL_GPL(find_vpid);

static struct pid **task_pid_ptr(struct task_struct *task, enum pid_type type)
{
	return (type == PIDTYPE_PID) ?
		&task->thread_pid :
		&task->signal->pids[type];
}

/*
 * attach_pid() must be called with the tasklist_lock write-held.
 */
void attach_pid(struct task_struct *task, enum pid_type type)
{
	struct pid *pid = *task_pid_ptr(task, type);
	hlist_add_head_rcu(&task->pid_links[type], &pid->tasks[type]);
}

static void __change_pid(struct task_struct *task, enum pid_type type,
			struct pid *new)
{
	struct pid **pid_ptr = task_pid_ptr(task, type);
	struct pid *pid;
	int tmp;

	pid = *pid_ptr;

	hlist_del_rcu(&task->pid_links[type]);
	*pid_ptr = new;

	if (type == PIDTYPE_PID) {
		WARN_ON_ONCE(pid_has_task(pid, PIDTYPE_PID));
		wake_up_all(&pid->wait_pidfd);
	}

	for (tmp = PIDTYPE_MAX; --tmp >= 0; )
		if (pid_has_task(pid, tmp))
			return;

	free_pid(pid);
}

void detach_pid(struct task_struct *task, enum pid_type type)
{
	__change_pid(task, type, NULL);
}

void change_pid(struct task_struct *task, enum pid_type type,
		struct pid *pid)
{
	__change_pid(task, type, pid);
	attach_pid(task, type);
}

void exchange_tids(struct task_struct *left, struct task_struct *right)
{
	struct pid *pid1 = left->thread_pid;
	struct pid *pid2 = right->thread_pid;
	struct hlist_head *head1 = &pid1->tasks[PIDTYPE_PID];
	struct hlist_head *head2 = &pid2->tasks[PIDTYPE_PID];

	/* Swap the single entry tid lists */
	hlists_swap_heads_rcu(head1, head2);

	/* Swap the per task_struct pid */
	rcu_assign_pointer(left->thread_pid, pid2);
	rcu_assign_pointer(right->thread_pid, pid1);

	/* Swap the cached value */
	WRITE_ONCE(left->pid, pid_nr(pid2));
	WRITE_ONCE(right->pid, pid_nr(pid1));
}

/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */
void transfer_pid(struct task_struct *old, struct task_struct *new,
			enum pid_type type)
{
	WARN_ON_ONCE(type == PIDTYPE_PID);
	hlist_replace_rcu(&old->pid_links[type], &new->pid_links[type]);
}

struct task_struct *pid_task(struct pid *pid, enum pid_type type)
{
	struct task_struct *result = NULL;
	if (pid) {
		struct hlist_node *first;
		first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]),
					      lockdep_tasklist_lock_is_held());
		if (first)
			result = hlist_entry(first, struct task_struct, pid_links[(type)]);
	}
	return result;
}
EXPORT_SYMBOL(pid_task);

/*
 * Must be called under rcu_read_lock().
 */
struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
{
	RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
			 "find_task_by_pid_ns() needs rcu_read_lock() protection");
	return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
}

struct task_struct *find_task_by_vpid(pid_t vnr)
{
	return find_task_by_pid_ns(vnr, task_active_pid_ns(current));
}

struct task_struct *find_get_task_by_vpid(pid_t nr)
{
	struct task_struct *task;

	rcu_read_lock();
	task = find_task_by_vpid(nr);
	if (task)
		get_task_struct(task);
	rcu_read_unlock();

	return task;
}

struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
{
	struct pid *pid;
	rcu_read_lock();
	pid = get_pid(rcu_dereference(*task_pid_ptr(task, type)));
	rcu_read_unlock();
	return pid;
}
EXPORT_SYMBOL_GPL(get_task_pid);

struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
{
	struct task_struct *result;
	rcu_read_lock();
	result = pid_task(pid, type);
	if (result)
		get_task_struct(result);
	rcu_read_unlock();
	return result;
}
EXPORT_SYMBOL_GPL(get_pid_task);

struct pid *find_get_pid(pid_t nr)
{
	struct pid *pid;

	rcu_read_lock();
	pid = get_pid(find_vpid(nr));
	rcu_read_unlock();

	return pid;
}
EXPORT_SYMBOL_GPL(find_get_pid);
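
/*
 * Illustrative sketch (not part of the original file) of the two safe
 * ways to go from a numeric PID to a task_struct. Either do all the work
 * under rcu_read_lock(), or take a reference so the task can outlive it:
 *
 *	rcu_read_lock();
 *	task = find_task_by_vpid(vnr);	// valid only inside this section
 *	if (task)
 *		...
 *	rcu_read_unlock();
 *
 *	task = find_get_task_by_vpid(vnr);	// reference held
 *	if (task) {
 *		...
 *		put_task_struct(task);
 *	}
 */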

pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
{
	struct upid *upid;
	pid_t nr = 0;

	if (pid && ns->level <= pid->level) {
		upid = &pid->numbers[ns->level];
		if (upid->ns == ns)
			nr = upid->nr;
	}
	return nr;
}
EXPORT_SYMBOL_GPL(pid_nr_ns);

pid_t pid_vnr(struct pid *pid)
{
	return pid_nr_ns(pid, task_active_pid_ns(current));
}
EXPORT_SYMBOL_GPL(pid_vnr);

pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
			struct pid_namespace *ns)
{
	pid_t nr = 0;

	rcu_read_lock();
	if (!ns)
		ns = task_active_pid_ns(current);
	nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns);
	rcu_read_unlock();

	return nr;
}
EXPORT_SYMBOL(__task_pid_nr_ns);

struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
{
	return ns_of_pid(task_pid(tsk));
}
EXPORT_SYMBOL_GPL(task_active_pid_ns);

/*
 * Used by proc to find the first pid that is greater than or equal to nr.
 *
 * If there is a pid at nr this function is exactly the same as find_pid_ns.
 */
struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
{
	return idr_get_next(&ns->idr, &nr);
}
EXPORT_SYMBOL_GPL(find_ge_pid);

struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
{
	struct fd f;
	struct pid *pid;

	f = fdget(fd);
	if (!fd_file(f))
		return ERR_PTR(-EBADF);

	pid = pidfd_pid(fd_file(f));
	if (!IS_ERR(pid)) {
		get_pid(pid);
		*flags = fd_file(f)->f_flags;
	}

	fdput(f);
	return pid;
}
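
/*
 * Illustrative sketch (hypothetical numbers, not part of the original
 * file): one struct pid carries a different numeric value in each
 * namespace that can see it. A task that is PID 7 inside a nested
 * namespace might be PID 31496 from the init namespace, while an
 * unrelated namespace sees nothing:
 *
 *	pid_nr_ns(pid, nested_ns);	// 7
 *	pid_nr_ns(pid, &init_pid_ns);	// 31496
 *	pid_nr_ns(pid, other_ns);	// 0: pid has no number there
 *	pid_vnr(pid);			// whatever current's namespace sees
 */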

/**
 * pidfd_get_task() - Get the task associated with a pidfd
 *
 * @pidfd: pidfd for which to get the task
 * @flags: flags associated with this pidfd
 *
 * Return the task associated with @pidfd. The function takes a reference on
 * the returned task. The caller is responsible for releasing that reference.
 *
 * Return: On success, the task_struct associated with the pidfd is returned.
 *         On error, a negative errno number will be returned.
 */
struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags)
{
	unsigned int f_flags;
	struct pid *pid;
	struct task_struct *task;

	pid = pidfd_get_pid(pidfd, &f_flags);
	if (IS_ERR(pid))
		return ERR_CAST(pid);

	task = get_pid_task(pid, PIDTYPE_TGID);
	put_pid(pid);
	if (!task)
		return ERR_PTR(-ESRCH);

	*flags = f_flags;
	return task;
}

/**
 * pidfd_create() - Create a new pid file descriptor.
 *
 * @pid:   struct pid that the pidfd will reference
 * @flags: flags to pass
 *
 * This creates a new pid file descriptor with the O_CLOEXEC flag set.
 *
 * Note that this function can only be called after the fd table has
 * been unshared to avoid leaking the pidfd to the new process.
 *
 * This symbol should not be explicitly exported to loadable modules.
 *
 * Return: On success, a cloexec pidfd is returned.
 *         On error, a negative errno number will be returned.
 */
static int pidfd_create(struct pid *pid, unsigned int flags)
{
	int pidfd;
	struct file *pidfd_file;

	pidfd = pidfd_prepare(pid, flags, &pidfd_file);
	if (pidfd < 0)
		return pidfd;

	fd_install(pidfd, pidfd_file);
	return pidfd;
}

/**
 * sys_pidfd_open() - Open new pid file descriptor.
 *
 * @pid:   pid for which to retrieve a pidfd
 * @flags: flags to pass
 *
 * This creates a new pid file descriptor with the O_CLOEXEC flag set for
 * the task identified by @pid. Without PIDFD_THREAD flag the target task
 * must be a thread-group leader.
 *
 * Return: On success, a cloexec pidfd is returned.
 *         On error, a negative errno number will be returned.
 */
SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
{
	int fd;
	struct pid *p;

	if (flags & ~(PIDFD_NONBLOCK | PIDFD_THREAD))
		return -EINVAL;

	if (pid <= 0)
		return -EINVAL;

	p = find_get_pid(pid);
	if (!p)
		return -ESRCH;

	fd = pidfd_create(p, flags);

	put_pid(p);
	return fd;
}

void __init pid_idr_init(void)
{
	/* Verify no one has done anything silly: */
	BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING);

	/* bump default and minimum pid_max based on number of cpus */
	pid_max = min(pid_max_max, max_t(int, pid_max,
				PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
	pid_max_min = max_t(int, pid_max_min,
				PIDS_PER_CPU_MIN * num_possible_cpus());
	pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);

	idr_init(&init_pid_ns.idr);

	init_pid_ns.pid_cachep = kmem_cache_create("pid",
			struct_size_t(struct pid, numbers, 1),
			__alignof__(struct pid),
			SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT,
			NULL);
}
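
/*
 * Illustrative sketch (userspace, abbreviated error handling, not part of
 * the original file): pidfd_open() plus poll() gives a race-free way to
 * wait for a non-child process, shown as a raw syscall(2) in case libc
 * lacks a wrapper; target is the PID being watched:
 *
 *	#include <poll.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	int pidfd = syscall(SYS_pidfd_open, (pid_t)target, 0);
 *	struct pollfd pfd = { .fd = pidfd, .events = POLLIN };
 *	poll(&pfd, 1, -1);	// becomes readable when the target exits
 *	close(pidfd);
 */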

static struct file *__pidfd_fget(struct task_struct *task, int fd)
{
	struct file *file;
	int ret;

	ret = down_read_killable(&task->signal->exec_update_lock);
	if (ret)
		return ERR_PTR(ret);

	if (ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS))
		file = fget_task(task, fd);
	else
		file = ERR_PTR(-EPERM);

	up_read(&task->signal->exec_update_lock);

	if (!file) {
		/*
		 * It is possible that the target thread is exiting; it can be
		 * either:
		 * 1. before exit_signals(), which gives a real fd
		 * 2. before exit_files() takes the task_lock(), which gives
		 *    a real fd
		 * 3. after exit_files() releases task_lock(), ->files is NULL;
		 *    this has PF_EXITING, since it was set in exit_signals(),
		 *    __pidfd_fget() returns EBADF.
		 * In case 3 we get EBADF, but that really means ESRCH, since
		 * the task is currently exiting and has freed its files
		 * struct, so we fix it up.
		 */
		if (task->flags & PF_EXITING)
			file = ERR_PTR(-ESRCH);
		else
			file = ERR_PTR(-EBADF);
	}

	return file;
}

static int pidfd_getfd(struct pid *pid, int fd)
{
	struct task_struct *task;
	struct file *file;
	int ret;

	task = get_pid_task(pid, PIDTYPE_PID);
	if (!task)
		return -ESRCH;

	file = __pidfd_fget(task, fd);
	put_task_struct(task);
	if (IS_ERR(file))
		return PTR_ERR(file);

	ret = receive_fd(file, NULL, O_CLOEXEC);
	fput(file);

	return ret;
}

/**
 * sys_pidfd_getfd() - Get a file descriptor from another process
 *
 * @pidfd:	the pidfd file descriptor of the process
 * @fd:		the file descriptor number to get
 * @flags:	flags on how to get the fd (reserved)
 *
 * This syscall gets a copy of a file descriptor from another process
 * based on the pidfd, and file descriptor number. It requires that
 * the calling process has the ability to ptrace the process represented
 * by the pidfd. The process which is having its file descriptor copied
 * is otherwise unaffected.
 *
 * Return: On success, a cloexec file descriptor is returned.
 *         On error, a negative errno number will be returned.
 */
SYSCALL_DEFINE3(pidfd_getfd, int, pidfd, int, fd,
		unsigned int, flags)
{
	struct pid *pid;
	struct fd f;
	int ret;

	/* flags is currently unused - make sure it's unset */
	if (flags)
		return -EINVAL;

	f = fdget(pidfd);
	if (!fd_file(f))
		return -EBADF;

	pid = pidfd_pid(fd_file(f));
	if (IS_ERR(pid))
		ret = PTR_ERR(pid);
	else
		ret = pidfd_getfd(pid, fd);

	fdput(f);
	return ret;
}
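
/*
 * Illustrative sketch (userspace, abbreviated error handling, not part of
 * the original file): copying fd 3 out of a process the caller may
 * ptrace. The target's descriptor table is unchanged; the copy lands in
 * the caller with O_CLOEXEC set:
 *
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	int pidfd = syscall(SYS_pidfd_open, (pid_t)target, 0);
 *	int fd = syscall(SYS_pidfd_getfd, pidfd, 3, 0);
 *	// fd now refers to the same open file description as the
 *	// target's fd 3; close(fd) when done
 */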