// SPDX-License-Identifier: GPL-2.0
/*
 * Code related to the io_uring_register() syscall
 *
 * Copyright (C) 2023 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>

#include "io_uring.h"
#include "opdef.h"
#include "tctx.h"
#include "rsrc.h"
#include "sqpoll.h"
#include "register.h"
#include "cancel.h"
#include "kbuf.h"
#include "napi.h"
#include "eventfd.h"

#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
			   unsigned nr_args)
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	size = struct_size(p, ops, nr_args);
	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	ret = -EFAULT;
	if (copy_from_user(p, arg, size))
		goto out;
	ret = -EINVAL;
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (io_uring_op_supported(i))
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}

int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
	const struct cred *creds;

	creds = xa_erase(&ctx->personalities, id);
	if (creds) {
		put_cred(creds);
		return 0;
	}

	return -EINVAL;
}


static int io_register_personality(struct io_ring_ctx *ctx)
{
	const struct cred *creds;
	u32 id;
	int ret;

	creds = get_current_cred();

	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
	if (ret < 0) {
		put_cred(creds);
		return ret;
	}
	return id;
}

static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.registered)
		return -EBUSY;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = 0;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].register_op,
				  ctx->restrictions.register_op);
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
			break;
		default:
			ret = -EINVAL;
			goto out;
		}
	}

out:
	/* Reset all restrictions if an error happened */
	if (ret != 0)
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
	else
		ctx->restrictions.registered = true;

	kfree(res);
	return ret;
}

static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
		WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
		/*
		 * Lazy activation attempts would fail if it was polled before
		 * submitter_task is set.
		 */
		if (wq_has_sleeper(&ctx->poll_wq))
			io_activate_pollwq(ctx);
	}

	if (ctx->restrictions.registered)
		ctx->restricted = 1;

	ctx->flags &= ~IORING_SETUP_R_DISABLED;
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}

static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
					 cpumask_var_t new_mask)
{
	int ret;

	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
	} else {
		mutex_unlock(&ctx->uring_lock);
		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
		mutex_lock(&ctx->uring_lock);
	}

	return ret;
}

static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
				       void __user *arg, unsigned len)
{
	cpumask_var_t new_mask;
	int ret;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	if (len > cpumask_size())
		len = cpumask_size();

#ifdef CONFIG_COMPAT
	if (in_compat_syscall())
		ret = compat_get_bitmap(cpumask_bits(new_mask),
					(const compat_ulong_t __user *)arg,
					len * 8 /* CHAR_BIT */);
	else
#endif
		ret = copy_from_user(new_mask, arg, len);

	if (ret) {
		free_cpumask_var(new_mask);
		return -EFAULT;
	}

	ret = __io_register_iowq_aff(ctx, new_mask);
	free_cpumask_var(new_mask);
	return ret;
}

static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	return __io_register_iowq_aff(ctx, NULL);
}

static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			if (sqd->thread)
				tctx = sqd->thread->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		memset(new_count, 0, sizeof(new_count));
	}

	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		tctx = node->task->io_uring;
		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	return 0;
err:
	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}
	return ret;
}

static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	if (ctx->restricted) {
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	case IORING_REGISTER_PBUF_STATUS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_status(ctx, arg);
		break;
	case IORING_REGISTER_NAPI:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_napi(ctx, arg);
		break;
	case IORING_UNREGISTER_NAPI:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_unregister_napi(ctx, arg);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct file *file;
	bool use_registered_ring;

	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	if (use_registered_ring) {
		/*
		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
		 * need only dereference our task private array to find it.
		 */
		struct io_uring_task *tctx = current->io_uring;

		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return -EINVAL;
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		file = tctx->registered_rings[fd];
		if (unlikely(!file))
			return -EBADF;
	} else {
		file = fget(fd);
		if (unlikely(!file))
			return -EBADF;
		ret = -EOPNOTSUPP;
		if (!io_is_uring_fops(file))
			goto out_fput;
	}

	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);
	mutex_unlock(&ctx->uring_lock);
	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
out_fput:
	if (!use_registered_ring)
		fput(file);
	return ret;
}
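
/*
 * Illustrative userspace sketch (not part of the kernel build): probing which
 * opcodes the running kernel supports via IORING_REGISTER_PROBE, the opcode
 * handled by io_probe() above. This is a minimal example assuming a ring fd
 * obtained from io_uring_setup(); liburing's io_uring_get_probe() wraps the
 * same register call.
 *
 *	#include <linux/io_uring.h>
 *	#include <stdlib.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static struct io_uring_probe *probe_ops(int ring_fd)
 *	{
 *		// Room for up to 256 ops, matching the cap enforced by
 *		// __io_uring_register() for IORING_REGISTER_PROBE.
 *		size_t len = sizeof(struct io_uring_probe) +
 *			     256 * sizeof(struct io_uring_probe_op);
 *		struct io_uring_probe *p = calloc(1, len);
 *
 *		if (!p)
 *			return NULL;
 *		// The probe buffer must be zeroed: io_probe() rejects any
 *		// non-zero input with -EINVAL.
 *		if (syscall(__NR_io_uring_register, ring_fd,
 *			    IORING_REGISTER_PROBE, p, 256) < 0) {
 *			free(p);
 *			return NULL;
 *		}
 *		// p->ops[i].flags & IO_URING_OP_SUPPORTED tells whether op i
 *		// is available; p->last_op is the highest known opcode.
 *		return p;
 *	}
 */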