1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * NET4: Implementation of BSD Unix domain sockets. 4 * 5 * Authors: Alan Cox, <alan@lxorguk.ukuu.org.uk> 6 * 7 * Fixes: 8 * Linus Torvalds : Assorted bug cures. 9 * Niibe Yutaka : async I/O support. 10 * Carsten Paeth : PF_UNIX check, address fixes. 11 * Alan Cox : Limit size of allocated blocks. 12 * Alan Cox : Fixed the stupid socketpair bug. 13 * Alan Cox : BSD compatibility fine tuning. 14 * Alan Cox : Fixed a bug in connect when interrupted. 15 * Alan Cox : Sorted out a proper draft version of 16 * file descriptor passing hacked up from 17 * Mike Shaver's work. 18 * Marty Leisner : Fixes to fd passing 19 * Nick Nevin : recvmsg bugfix. 20 * Alan Cox : Started proper garbage collector 21 * Heiko EiBfeldt : Missing verify_area check 22 * Alan Cox : Started POSIXisms 23 * Andreas Schwab : Replace inode by dentry for proper 24 * reference counting 25 * Kirk Petersen : Made this a module 26 * Christoph Rohland : Elegant non-blocking accept/connect algorithm. 27 * Lots of bug fixes. 28 * Alexey Kuznetosv : Repaired (I hope) bugs introduces 29 * by above two patches. 30 * Andrea Arcangeli : If possible we block in connect(2) 31 * if the max backlog of the listen socket 32 * is been reached. This won't break 33 * old apps and it will avoid huge amount 34 * of socks hashed (this for unix_gc() 35 * performances reasons). 36 * Security fix that limits the max 37 * number of socks to 2*max_files and 38 * the number of skb queueable in the 39 * dgram receiver. 40 * Artur Skawina : Hash function optimizations 41 * Alexey Kuznetsov : Full scale SMP. Lot of bugs are introduced 8) 42 * Malcolm Beattie : Set peercred for socketpair 43 * Michal Ostrowski : Module initialization cleanup. 44 * Arnaldo C. 
Melo : Remove MOD_{INC,DEC}_USE_COUNT, 45 * the core infrastructure is doing that 46 * for all net proto families now (2.5.69+) 47 * 48 * Known differences from reference BSD that was tested: 49 * 50 * [TO FIX] 51 * ECONNREFUSED is not returned from one end of a connected() socket to the 52 * other the moment one end closes. 53 * fstat() doesn't return st_dev=0, and give the blksize as high water mark 54 * and a fake inode identifier (nor the BSD first socket fstat twice bug). 55 * [NOT TO FIX] 56 * accept() returns a path name even if the connecting socket has closed 57 * in the meantime (BSD loses the path and gives up). 58 * accept() returns 0 length path for an unbound connector. BSD returns 16 59 * and a null first byte in the path (but not for gethost/peername - BSD bug ??) 60 * socketpair(...SOCK_RAW..) doesn't panic the kernel. 61 * BSD af_unix apparently has connect forgetting to block properly. 62 * (need to check this with the POSIX spec in detail) 63 * 64 * Differences from 2.0.0-11-... (ANK) 65 * Bug fixes and improvements. 66 * - client shutdown killed server socket. 67 * - removed all useless cli/sti pairs. 68 * 69 * Semantic changes/extensions. 70 * - generic control message passing. 71 * - SCM_CREDENTIALS control message. 72 * - "Abstract" (not FS based) socket bindings. 73 * Abstract names are sequences of bytes (not zero terminated) 74 * started by 0, so that this name space does not intersect 75 * with BSD names. 
76 */ 77 78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 79 80 #include <linux/module.h> 81 #include <linux/kernel.h> 82 #include <linux/signal.h> 83 #include <linux/sched/signal.h> 84 #include <linux/errno.h> 85 #include <linux/string.h> 86 #include <linux/stat.h> 87 #include <linux/dcache.h> 88 #include <linux/namei.h> 89 #include <linux/socket.h> 90 #include <linux/un.h> 91 #include <linux/fcntl.h> 92 #include <linux/filter.h> 93 #include <linux/termios.h> 94 #include <linux/sockios.h> 95 #include <linux/net.h> 96 #include <linux/in.h> 97 #include <linux/fs.h> 98 #include <linux/slab.h> 99 #include <linux/uaccess.h> 100 #include <linux/skbuff.h> 101 #include <linux/netdevice.h> 102 #include <net/net_namespace.h> 103 #include <net/sock.h> 104 #include <net/tcp_states.h> 105 #include <net/af_unix.h> 106 #include <linux/proc_fs.h> 107 #include <linux/seq_file.h> 108 #include <net/scm.h> 109 #include <linux/init.h> 110 #include <linux/poll.h> 111 #include <linux/rtnetlink.h> 112 #include <linux/mount.h> 113 #include <net/checksum.h> 114 #include <linux/security.h> 115 #include <linux/splice.h> 116 #include <linux/freezer.h> 117 #include <linux/file.h> 118 #include <linux/btf_ids.h> 119 #include <linux/bpf-cgroup.h> 120 121 static atomic_long_t unix_nr_socks; 122 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2]; 123 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2]; 124 125 /* SMP locking strategy: 126 * hash table is protected with spinlock. 127 * each socket state is protected by separate spinlock. 
128 */ 129 #ifdef CONFIG_PROVE_LOCKING 130 #define cmp_ptr(l, r) (((l) > (r)) - ((l) < (r))) 131 132 static int unix_table_lock_cmp_fn(const struct lockdep_map *a, 133 const struct lockdep_map *b) 134 { 135 return cmp_ptr(a, b); 136 } 137 138 static int unix_state_lock_cmp_fn(const struct lockdep_map *_a, 139 const struct lockdep_map *_b) 140 { 141 const struct unix_sock *a, *b; 142 143 a = container_of(_a, struct unix_sock, lock.dep_map); 144 b = container_of(_b, struct unix_sock, lock.dep_map); 145 146 if (a->sk.sk_state == TCP_LISTEN) { 147 /* unix_stream_connect(): Before the 2nd unix_state_lock(), 148 * 149 * 1. a is TCP_LISTEN. 150 * 2. b is not a. 151 * 3. concurrent connect(b -> a) must fail. 152 * 153 * Except for 2. & 3., the b's state can be any possible 154 * value due to concurrent connect() or listen(). 155 * 156 * 2. is detected in debug_spin_lock_before(), and 3. cannot 157 * be expressed as lock_cmp_fn. 158 */ 159 switch (b->sk.sk_state) { 160 case TCP_CLOSE: 161 case TCP_ESTABLISHED: 162 case TCP_LISTEN: 163 return -1; 164 default: 165 /* Invalid case. */ 166 return 0; 167 } 168 } 169 170 /* Should never happen. Just to be symmetric. */ 171 if (b->sk.sk_state == TCP_LISTEN) { 172 switch (b->sk.sk_state) { 173 case TCP_CLOSE: 174 case TCP_ESTABLISHED: 175 return 1; 176 default: 177 return 0; 178 } 179 } 180 181 /* unix_state_double_lock(): ascending address order. */ 182 return cmp_ptr(a, b); 183 } 184 185 static int unix_recvq_lock_cmp_fn(const struct lockdep_map *_a, 186 const struct lockdep_map *_b) 187 { 188 const struct sock *a, *b; 189 190 a = container_of(_a, struct sock, sk_receive_queue.lock.dep_map); 191 b = container_of(_b, struct sock, sk_receive_queue.lock.dep_map); 192 193 /* unix_collect_skb(): listener -> embryo order. */ 194 if (a->sk_state == TCP_LISTEN && unix_sk(b)->listener == a) 195 return -1; 196 197 /* Should never happen. Just to be symmetric. 
 */
	if (b->sk_state == TCP_LISTEN && unix_sk(a)->listener == b)
		return 1;

	return 0;
}
#endif

/* Hash for a not-yet-bound socket: mix the socket's address bits with its
 * type and mask into the lower range of the table (0..UNIX_HASH_MOD).
 */
static unsigned int unix_unbound_hash(struct sock *sk)
{
	unsigned long hash = (unsigned long)sk;

	hash ^= hash >> 16;
	hash ^= hash >> 8;
	hash ^= sk->sk_type;

	return hash & UNIX_HASH_MOD;
}

/* Hash for a filesystem (pathname) socket: keyed by the inode number of
 * the bound file.
 */
static unsigned int unix_bsd_hash(struct inode *i)
{
	return i->i_ino & UNIX_HASH_MOD;
}

/* Hash for an abstract-namespace address: checksum of the whole address
 * mixed with the socket type, offset by UNIX_HASH_MOD + 1 so abstract
 * buckets never overlap the unbound/pathname range above.
 */
static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	__wsum csum = csum_partial(sunaddr, addr_len, 0);
	unsigned int hash;

	hash = (__force unsigned int)csum_fold(csum);
	hash ^= hash >> 8;
	hash ^= type;

	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
}

/* Take two bucket locks in ascending index order so that concurrent
 * double-lockers cannot deadlock against each other.
 */
static void unix_table_double_lock(struct net *net,
				   unsigned int hash1, unsigned int hash2)
{
	if (hash1 == hash2) {
		spin_lock(&net->unx.table.locks[hash1]);
		return;
	}

	if (hash1 > hash2)
		swap(hash1, hash2);

	spin_lock(&net->unx.table.locks[hash1]);
	spin_lock(&net->unx.table.locks[hash2]);
}

static void unix_table_double_unlock(struct net *net,
				     unsigned int hash1, unsigned int hash2)
{
	if (hash1 == hash2) {
		spin_unlock(&net->unx.table.locks[hash1]);
		return;
	}

	spin_unlock(&net->unx.table.locks[hash1]);
	spin_unlock(&net->unx.table.locks[hash2]);
}

#ifdef CONFIG_SECURITY_NETWORK
/* Stash the sender's LSM security ID in the skb's control block. */
static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	UNIXCB(skb).secid = scm->secid;
}

/* Copy the security ID from the skb's control block into the scm cookie. */
static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->secid = UNIXCB(skb).secid;
}

/* True if the skb carries the same security ID as the scm cookie. */
static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return (scm->secid == UNIXCB(skb).secid);
}
#else
/* No-op stubs when no LSM network hooks are built in. */
static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return true;
}
#endif /* CONFIG_SECURITY_NETWORK */

/* True if osk's peer pointer points back at sk. */
static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == sk;
}

/* sk may send to osk if osk is unconnected, or connected back to sk. */
static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
}

/* Lockless check whether sk's receive queue exceeds its backlog limit. */
static inline int unix_recvq_full_lockless(const struct sock *sk)
{
	return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
}

/* Return s's peer (or NULL) with a reference held, under s's state lock. */
struct sock *unix_peer_get(struct sock *s)
{
	struct sock *peer;

	unix_state_lock(s);
	peer = unix_peer(s);
	if (peer)
		sock_hold(peer);
	unix_state_unlock(s);
	return peer;
}
EXPORT_SYMBOL_GPL(unix_peer_get);

/* Allocate a refcounted unix_address holding a copy of sunaddr.
 * Returns NULL on allocation failure; caller owns the initial reference.
 */
static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
					     int addr_len)
{
	struct unix_address *addr;

	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
	if (!addr)
		return NULL;

	refcount_set(&addr->refcnt, 1);
	addr->len = addr_len;
	memcpy(addr->name, sunaddr, addr_len);

	return addr;
}

/* Drop one reference; free the address when the last reference goes. */
static inline void unix_release_addr(struct unix_address *addr)
{
	if (refcount_dec_and_test(&addr->refcnt))
		kfree(addr);
}

/*
 *	Check unix socket name:
 *		- should be not zero length.
 *		- if started by not zero, should be NULL terminated (FS object)
 *		- if started by zero, it is abstract name.
 */

/* Reject addresses that are too short/long or not AF_UNIX. */
static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
{
	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
	    addr_len > sizeof(*sunaddr))
		return -EINVAL;

	if (sunaddr->sun_family != AF_UNIX)
		return -EINVAL;

	return 0;
}

/* NUL-terminate a pathname address in place and return the address length
 * implied by the path's actual strlen() (family header + path + NUL).
 */
static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
{
	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
	short offset = offsetof(struct sockaddr_storage, __data);

	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));

	/* This may look like an off by one error but it is a bit more
	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
	 * sun_path[108] doesn't as such exist.  However in kernel space
	 * we are guaranteed that it is a valid memory location in our
	 * kernel address buffer because syscall functions always pass
	 * a pointer of struct sockaddr_storage which has a bigger buffer
	 * than 108.  Also, we must terminate sun_path for strlen() in
	 * getname_kernel().
	 */
	addr->__data[addr_len - offset] = 0;

	/* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
	 * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
	 * know the actual buffer.
	 */
	return strlen(addr->__data) + offset + 1;
}

/* Unhash sk from its hash-table bucket; caller holds the bucket lock. */
static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}

/* Insert sk into the bucket selected by sk->sk_hash; caller holds the
 * bucket lock.
 */
static void __unix_insert_socket(struct net *net, struct sock *sk)
{
	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
}

/* Rebind sk to a new address/hash: remove it from the old bucket, publish
 * the new address (release pairs with lockless readers of u->addr), then
 * insert into the new bucket.  Caller holds both bucket locks.
 */
static void __unix_set_addr_hash(struct net *net, struct sock *sk,
				 struct unix_address *addr, unsigned int hash)
{
	__unix_remove_socket(sk);
	smp_store_release(&unix_sk(sk)->addr, addr);

	sk->sk_hash = hash;
	__unix_insert_socket(net, sk);
}

static void unix_remove_socket(struct net *net, struct sock *sk)
{
	spin_lock(&net->unx.table.locks[sk->sk_hash]);
	__unix_remove_socket(sk);
	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
}

static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
{
	spin_lock(&net->unx.table.locks[sk->sk_hash]);
	__unix_insert_socket(net, sk);
	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
}

/* Track a filesystem-bound socket in the global bsd bucket so it can be
 * found by inode in unix_find_socket_byinode().
 */
static void unix_insert_bsd_socket(struct sock *sk)
{
	spin_lock(&bsd_socket_locks[sk->sk_hash]);
	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
}

static void unix_remove_bsd_socket(struct sock *sk)
{
	if (!hlist_unhashed(&sk->sk_bind_node)) {
		spin_lock(&bsd_socket_locks[sk->sk_hash]);
		__sk_del_bind_node(sk);
		spin_unlock(&bsd_socket_locks[sk->sk_hash]);

		sk_node_init(&sk->sk_bind_node);
	}
}

/* Find a socket whose bound address matches sunname in one bucket;
 * caller holds the bucket lock.  Returns the socket without taking a
 * reference.
 */
static struct sock *__unix_find_socket_byname(struct net *net,
					      struct sockaddr_un *sunname,
					      int len, unsigned int hash)
{
	struct sock *s;

	sk_for_each(s, &net->unx.table.buckets[hash]) {
		struct unix_sock *u = unix_sk(s);

		if (u->addr->len == len &&
		    !memcmp(u->addr->name, sunname, len))
			return s;
	}
	return NULL;
}

static inline struct
sock *unix_find_socket_byname(struct net *net,
			      struct sockaddr_un *sunname,
			      int len, unsigned int hash)
{
	struct sock *s;

	spin_lock(&net->unx.table.locks[hash]);
	s = __unix_find_socket_byname(net, sunname, len, hash);
	if (s)
		sock_hold(s);
	spin_unlock(&net->unx.table.locks[hash]);
	return s;
}

/* Find a filesystem socket bound to inode i; returns the socket with a
 * reference held, or NULL.
 */
static struct sock *unix_find_socket_byinode(struct inode *i)
{
	unsigned int hash = unix_bsd_hash(i);
	struct sock *s;

	spin_lock(&bsd_socket_locks[hash]);
	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
		struct dentry *dentry = unix_sk(s)->path.dentry;

		if (dentry && d_backing_inode(dentry) == i) {
			sock_hold(s);
			spin_unlock(&bsd_socket_locks[hash]);
			return s;
		}
	}
	spin_unlock(&bsd_socket_locks[hash]);
	return NULL;
}

/* Support code for asymmetrically connected dgram sockets
 *
 * If a datagram socket is connected to a socket not itself connected
 * to the first socket (eg, /dev/log), clients may only enqueue more
 * messages if the present receive queue of the server socket is not
 * "too large". This means there's a second writeability condition
 * poll and sendmsg need to test. The dgram recv code will do a wake
 * up on the peer_wait wait queue of a socket upon reception of a
 * datagram which needs to be propagated to sleeping would-be writers
 * since these might not have sent anything so far. This can't be
 * accomplished via poll_wait because the lifetime of the server
 * socket might be less than that of its clients if these break their
 * association with it or if the server socket is closed while clients
 * are still connected to it and there's no way to inform "a polling
 * implementation" that it should let go of a certain wait queue
 *
 * In order to propagate a wake up, a wait_queue_entry_t of the client
 * socket is enqueued on the peer_wait queue of the server socket
 * whose wake function does a wake_up on the ordinary client socket
 * wait queue. This connection is established whenever a write (or
 * poll for write) hit the flow control condition and broken when the
 * association to the server socket is dissolved or after a wake up
 * was relayed.
 */

/* Wake callback installed on the peer's peer_wait queue: detach
 * ourselves from the peer and relay the wakeup to our own wait queue.
 */
static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
				      void *key)
{
	struct unix_sock *u;
	wait_queue_head_t *u_sleep;

	u = container_of(q, struct unix_sock, peer_wake);

	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
			    q);
	u->peer_wake.private = NULL;

	/* relaying can only happen while the wq still exists */
	u_sleep = sk_sleep(&u->sk);
	if (u_sleep)
		wake_up_interruptible_poll(u_sleep, key_to_poll(key));

	return 0;
}

/* Enqueue sk's wake entry on other's peer_wait queue so a future dgram
 * receive on other can wake writers sleeping on sk.  Returns 1 if newly
 * registered, 0 if it was already registered.
 */
static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
{
	struct unix_sock *u, *u_other;
	int rc;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	rc = 0;
	spin_lock(&u_other->peer_wait.lock);

	if (!u->peer_wake.private) {
		u->peer_wake.private = other;
		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);

		rc = 1;
	}

	spin_unlock(&u_other->peer_wait.lock);
	return rc;
}

/* Undo unix_dgram_peer_wake_connect(): remove sk's wake entry from
 * other's peer_wait queue, if it is still registered there.
 */
static void unix_dgram_peer_wake_disconnect(struct sock *sk,
					    struct sock *other)
{
	struct unix_sock *u, *u_other;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	spin_lock(&u_other->peer_wait.lock);

	if (u->peer_wake.private == other) {
		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
		u->peer_wake.private = NULL;
	}

	spin_unlock(&u_other->peer_wait.lock);
}

/* Drop the peer_wait registration and wake any writers sleeping on sk. */
static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
						   struct sock *other)
{
	unix_dgram_peer_wake_disconnect(sk, other);
	wake_up_interruptible_poll(sk_sleep(sk),
				   EPOLLOUT |
				   EPOLLWRNORM |
				   EPOLLWRBAND);
}

/* Decide whether a would-be writer must sleep waiting for the peer to
 * drain its queue.  Returns 1 if the caller should wait (wake entry
 * registered), 0 if it may proceed.
 *
 * preconditions:
 *	- unix_peer(sk) == other
 *	- association is stable
 */
static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
{
	int connected;

	connected = unix_dgram_peer_wake_connect(sk, other);

	/* If other is SOCK_DEAD, we want to make sure we signal
	 * POLLOUT, such that a subsequent write() can get a
	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
	 * to other and its full, we will hang waiting for POLLOUT.
	 */
	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
		return 1;

	if (connected)
		unix_dgram_peer_wake_disconnect(sk, other);

	return 0;
}

/* Writable unless listening, or more than a quarter of sndbuf is already
 * consumed by in-flight data (wmem_alloc * 4 <= sndbuf).
 */
static int unix_writable(const struct sock *sk, unsigned char state)
{
	return state != TCP_LISTEN &&
	       (refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf);
}

/* sk_write_space callback: wake pollers/writers once sk becomes writable. */
static void unix_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	if (unix_writable(sk, READ_ONCE(sk->sk_state))) {
		wq = rcu_dereference(sk->sk_wq);
		if (skwq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait,
				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}
	rcu_read_unlock();
}

/* When dgram socket disconnects (or changes its peer), we clear its receive
 * queue of packets arrived from previous peer.  First, it allows to do
 * flow control based only on wmem_alloc; second, sk connected to peer
 * may receive messages only from that peer. */
static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
{
	if (!skb_queue_empty(&sk->sk_receive_queue)) {
		skb_queue_purge(&sk->sk_receive_queue);
		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);

		/* If one link of bidirectional dgram pipe is disconnected,
		 * we signal error. Messages are lost. Do not make this,
		 * when peer was not connected to us.
		 */
		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
			WRITE_ONCE(other->sk_err, ECONNRESET);
			sk_error_report(other);
		}
	}
}

/* sk_destruct callback (set in unix_create1()): final teardown once the
 * last reference to the sock is gone.
 */
static void unix_sock_destructor(struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);

	skb_queue_purge(&sk->sk_receive_queue);

	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_info("Attempt to release alive unix socket: %p\n", sk);
		return;
	}

	if (u->addr)
		unix_release_addr(u->addr);

	atomic_long_dec(&unix_nr_socks);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
#ifdef UNIX_REFCNT_DEBUG
	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
		 atomic_long_read(&unix_nr_socks));
#endif
}

/* Tear down a unix socket: unhash it, mark it closed/orphaned, notify
 * the peer, flush queued skbs (releasing embryo sockets of a listener),
 * and drop the filesystem path and final reference.  embrion is nonzero
 * when releasing a never-accepted embryo from a listener's queue.
 */
static void unix_release_sock(struct sock *sk, int embrion)
{
	struct unix_sock *u = unix_sk(sk);
	struct sock *skpair;
	struct sk_buff *skb;
	struct path path;
	int state;

	unix_remove_socket(sock_net(sk), sk);
	unix_remove_bsd_socket(sk);

	/* Clear state */
	unix_state_lock(sk);
	sock_orphan(sk);
	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
	path = u->path;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	state = sk->sk_state;
	WRITE_ONCE(sk->sk_state, TCP_CLOSE);

	skpair =
unix_peer(sk);
	unix_peer(sk) = NULL;

	unix_state_unlock(sk);

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	if (u->oob_skb) {
		kfree_skb(u->oob_skb);
		u->oob_skb = NULL;
	}
#endif

	wake_up_interruptible_all(&u->peer_wait);

	if (skpair != NULL) {
		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
			unix_state_lock(skpair);
			/* No more writes */
			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
			if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion)
				WRITE_ONCE(skpair->sk_err, ECONNRESET);
			unix_state_unlock(skpair);
			skpair->sk_state_change(skpair);
			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
		}

		unix_dgram_peer_wake_disconnect(sk, skpair);
		sock_put(skpair); /* It may now die */
	}

	/* Try to flush out this socket. Throw out buffers at least */

	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		/* A listener's queued skbs carry embryo sockets; release
		 * each one recursively before freeing the skb.
		 */
		if (state == TCP_LISTEN)
			unix_release_sock(skb->sk, 1);

		/* passed fds are erased in the kfree_skb hook */
		kfree_skb(skb);
	}

	if (path.dentry)
		path_put(&path);

	sock_put(sk);

	/* ---- Socket is dead now and most probably destroyed ---- */

	/*
	 * Fixme: BSD difference: In BSD all sockets connected to us get
	 *	  ECONNRESET and we die on the spot. In Linux we behave
	 *	  like files and pipes do and wait for the last
	 *	  dereference.
	 *
	 * Can't we simply set sock->err?
	 *
	 * What the above comment does talk about? --ANK(980817)
	 */

	if (READ_ONCE(unix_tot_inflight))
		unix_gc();		/* Garbage collect fds */
}

/* Record the current task's pid and credentials as sk's peer identity. */
static void init_peercred(struct sock *sk)
{
	sk->sk_peer_pid = get_pid(task_tgid(current));
	sk->sk_peer_cred = get_current_cred();
}

/* Replace sk's peer credentials with the current task's, dropping the
 * old pid/cred references outside the peer lock.
 */
static void update_peercred(struct sock *sk)
{
	const struct cred *old_cred;
	struct pid *old_pid;

	spin_lock(&sk->sk_peer_lock);
	old_pid = sk->sk_peer_pid;
	old_cred = sk->sk_peer_cred;
	init_peercred(sk);
	spin_unlock(&sk->sk_peer_lock);

	put_pid(old_pid);
	put_cred(old_cred);
}

/* Copy peersk's peer credentials to sk; caller must hold peersk's unix
 * state lock (asserted below) so the source fields are stable.
 */
static void copy_peercred(struct sock *sk, struct sock *peersk)
{
	lockdep_assert_held(&unix_sk(peersk)->lock);

	spin_lock(&sk->sk_peer_lock);
	sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
	spin_unlock(&sk->sk_peer_lock);
}

/* listen(2) for stream/seqpacket sockets: requires a bound socket in
 * TCP_CLOSE or TCP_LISTEN state; raising the backlog wakes blocked
 * connecters so they can retry.
 */
static int unix_listen(struct socket *sock, int backlog)
{
	int err;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;	/* Only stream/seqpacket sockets accept */
	err = -EINVAL;
	if (!READ_ONCE(u->addr))
		goto out;	/* No listens on an unbound socket */
	unix_state_lock(sk);
	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (backlog > sk->sk_max_ack_backlog)
		wake_up_interruptible_all(&u->peer_wait);
	sk->sk_max_ack_backlog	= backlog;
	WRITE_ONCE(sk->sk_state, TCP_LISTEN);

	/* set credentials so connect can copy them */
	update_peercred(sk);
	err = 0;

out_unlock:
	unix_state_unlock(sk);
out:
	return err;
}

/* Forward declarations for the proto_ops tables below. */
static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
			       int addr_len, int flags);
static
int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, struct proto_accept_arg *arg);
static int unix_getname(struct socket *, struct sockaddr *, int);
static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
static __poll_t unix_dgram_poll(struct file *, struct socket *,
				poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
#ifdef CONFIG_COMPAT
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
#endif
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos,
				       struct pipe_inode_info *, size_t size,
				       unsigned int flags);
static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
			      int, int);
static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
				  int);

#ifdef CONFIG_PROC_FS
/* Sum the SCM-passed fd counts of every skb queued on a listener's
 * receive queue (each queued skb carries an embryo socket).
 */
static int unix_count_nr_fds(struct sock *sk)
{
	struct sk_buff *skb;
	struct unix_sock *u;
	int nr_fds = 0;

	spin_lock(&sk->sk_receive_queue.lock);
	skb = skb_peek(&sk->sk_receive_queue);
	while (skb) {
		u = unix_sk(skb->sk);
		nr_fds += atomic_read(&u->scm_stat.nr_fds);
		skb = skb_peek_next(skb, &sk->sk_receive_queue);
	}
	spin_unlock(&sk->sk_receive_queue.lock);

	return nr_fds;
}

/* /proc fdinfo hook: report how many fds are in flight on this socket. */
static
void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
{
	struct sock *sk = sock->sk;
	unsigned char s_state;
	struct unix_sock *u;
	int nr_fds = 0;

	if (sk) {
		s_state = READ_ONCE(sk->sk_state);
		u = unix_sk(sk);

		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
		 * SOCK_DGRAM is ordinary. So, no lock is needed.
		 */
		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
			nr_fds = atomic_read(&u->scm_stat.nr_fds);
		else if (s_state == TCP_LISTEN)
			nr_fds = unix_count_nr_fds(sk);

		seq_printf(m, "scm_fds: %u\n", nr_fds);
	}
}
#else
#define unix_show_fdinfo NULL
#endif

static const struct proto_ops unix_stream_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_stream_sendmsg,
	.recvmsg =	unix_stream_recvmsg,
	.read_skb =	unix_stream_read_skb,
	.mmap =		sock_no_mmap,
	.splice_read =	unix_stream_splice_read,
	.set_peek_off =	sk_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};

static const struct proto_ops unix_dgram_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_dgram_connect,
	.socketpair =	unix_socketpair,
	.accept =	sock_no_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	sock_no_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_dgram_sendmsg,
	.read_skb =
unix_read_skb,
	.recvmsg =	unix_dgram_recvmsg,
	.mmap =		sock_no_mmap,
	.set_peek_off =	sk_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};

static const struct proto_ops unix_seqpacket_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_seqpacket_sendmsg,
	.recvmsg =	unix_seqpacket_recvmsg,
	.mmap =		sock_no_mmap,
	.set_peek_off =	sk_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};

static void unix_close(struct sock *sk, long timeout)
{
	/* Nothing to do here, unix socket does not need a ->close().
	 * This is merely for sockmap.
	 */
}

static void unix_unhash(struct sock *sk)
{
	/* Nothing to do here, unix socket does not need a ->unhash().
	 * This is merely for sockmap.
	 */
}

/* Only SO_PEERPIDFD bypasses any attached BPF getsockopt hook; all
 * other options go through BPF as usual.
 */
static bool unix_bpf_bypass_getsockopt(int level, int optname)
{
	if (level == SOL_SOCKET) {
		switch (optname) {
		case SO_PEERPIDFD:
			return true;
		default:
			return false;
		}
	}

	return false;
}

struct proto unix_dgram_proto = {
	.name			= "UNIX",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
	.close			= unix_close,
	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
#endif
};

struct proto unix_stream_proto = {
	.name			= "UNIX-STREAM",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
	.close			= unix_close,
	.unhash			= unix_unhash,
	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
#endif
};

/* Allocate and initialize a unix sock of the given type, enforcing the
 * 2 * get_max_files() global limit, and insert it into the unbound hash
 * table.  Returns the new sock or an ERR_PTR.
 */
static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
{
	struct unix_sock *u;
	struct sock *sk;
	int err;

	atomic_long_inc(&unix_nr_socks);
	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
		err = -ENFILE;
		goto err;
	}

	if (type == SOCK_STREAM)
		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
	else /*dgram and seqpacket */
		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);

	if (!sk) {
		err = -ENOMEM;
		goto err;
	}

	sock_init_data(sock, sk);

	sk->sk_hash		= unix_unbound_hash(sk);
	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
	sk->sk_write_space	= unix_write_space;
	sk->sk_max_ack_backlog	= READ_ONCE(net->unx.sysctl_max_dgram_qlen);
	sk->sk_destruct		= unix_sock_destructor;
	lock_set_cmp_fn(&sk->sk_receive_queue.lock, unix_recvq_lock_cmp_fn, NULL);

	u = unix_sk(sk);
	u->listener = NULL;
	u->vertex = NULL;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	spin_lock_init(&u->lock);
	lock_set_cmp_fn(&u->lock, unix_state_lock_cmp_fn, NULL);
	mutex_init(&u->iolock); /* single task reading lock */
	mutex_init(&u->bindlock); /* single task binding lock */
	init_waitqueue_head(&u->peer_wait);
	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
	unix_insert_unbound_socket(net, sk);

	sock_prot_inuse_add(net, sk->sk_prot, 1);

	return sk;

err:
	atomic_long_dec(&unix_nr_socks);
	return ERR_PTR(err);
}

/* socket(2) entry for PF_UNIX: select the proto_ops for the requested
 * socket type and create the underlying sock.
 */
static int unix_create(struct net *net, struct socket *sock, int protocol,
		       int kern)
{
	struct sock *sk;

	if (protocol && protocol != PF_UNIX)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	switch (sock->type) {
	case SOCK_STREAM:
		sock->ops = &unix_stream_ops;
		break;
		/*
		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
		 *	nothing uses it.
1083 */ 1084 case SOCK_RAW: 1085 sock->type = SOCK_DGRAM; 1086 fallthrough; 1087 case SOCK_DGRAM: 1088 sock->ops = &unix_dgram_ops; 1089 break; 1090 case SOCK_SEQPACKET: 1091 sock->ops = &unix_seqpacket_ops; 1092 break; 1093 default: 1094 return -ESOCKTNOSUPPORT; 1095 } 1096 1097 sk = unix_create1(net, sock, kern, sock->type); 1098 if (IS_ERR(sk)) 1099 return PTR_ERR(sk); 1100 1101 return 0; 1102 } 1103 1104 static int unix_release(struct socket *sock) 1105 { 1106 struct sock *sk = sock->sk; 1107 1108 if (!sk) 1109 return 0; 1110 1111 sk->sk_prot->close(sk, 0); 1112 unix_release_sock(sk, 0); 1113 sock->sk = NULL; 1114 1115 return 0; 1116 } 1117 1118 static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len, 1119 int type) 1120 { 1121 struct inode *inode; 1122 struct path path; 1123 struct sock *sk; 1124 int err; 1125 1126 unix_mkname_bsd(sunaddr, addr_len); 1127 err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path); 1128 if (err) 1129 goto fail; 1130 1131 err = path_permission(&path, MAY_WRITE); 1132 if (err) 1133 goto path_put; 1134 1135 err = -ECONNREFUSED; 1136 inode = d_backing_inode(path.dentry); 1137 if (!S_ISSOCK(inode->i_mode)) 1138 goto path_put; 1139 1140 sk = unix_find_socket_byinode(inode); 1141 if (!sk) 1142 goto path_put; 1143 1144 err = -EPROTOTYPE; 1145 if (sk->sk_type == type) 1146 touch_atime(&path); 1147 else 1148 goto sock_put; 1149 1150 path_put(&path); 1151 1152 return sk; 1153 1154 sock_put: 1155 sock_put(sk); 1156 path_put: 1157 path_put(&path); 1158 fail: 1159 return ERR_PTR(err); 1160 } 1161 1162 static struct sock *unix_find_abstract(struct net *net, 1163 struct sockaddr_un *sunaddr, 1164 int addr_len, int type) 1165 { 1166 unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type); 1167 struct dentry *dentry; 1168 struct sock *sk; 1169 1170 sk = unix_find_socket_byname(net, sunaddr, addr_len, hash); 1171 if (!sk) 1172 return ERR_PTR(-ECONNREFUSED); 1173 1174 dentry = unix_sk(sk)->path.dentry; 1175 if (dentry) 
1176 touch_atime(&unix_sk(sk)->path); 1177 1178 return sk; 1179 } 1180 1181 static struct sock *unix_find_other(struct net *net, 1182 struct sockaddr_un *sunaddr, 1183 int addr_len, int type) 1184 { 1185 struct sock *sk; 1186 1187 if (sunaddr->sun_path[0]) 1188 sk = unix_find_bsd(sunaddr, addr_len, type); 1189 else 1190 sk = unix_find_abstract(net, sunaddr, addr_len, type); 1191 1192 return sk; 1193 } 1194 1195 static int unix_autobind(struct sock *sk) 1196 { 1197 struct unix_sock *u = unix_sk(sk); 1198 unsigned int new_hash, old_hash; 1199 struct net *net = sock_net(sk); 1200 struct unix_address *addr; 1201 u32 lastnum, ordernum; 1202 int err; 1203 1204 err = mutex_lock_interruptible(&u->bindlock); 1205 if (err) 1206 return err; 1207 1208 if (u->addr) 1209 goto out; 1210 1211 err = -ENOMEM; 1212 addr = kzalloc(sizeof(*addr) + 1213 offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL); 1214 if (!addr) 1215 goto out; 1216 1217 addr->len = offsetof(struct sockaddr_un, sun_path) + 6; 1218 addr->name->sun_family = AF_UNIX; 1219 refcount_set(&addr->refcnt, 1); 1220 1221 old_hash = sk->sk_hash; 1222 ordernum = get_random_u32(); 1223 lastnum = ordernum & 0xFFFFF; 1224 retry: 1225 ordernum = (ordernum + 1) & 0xFFFFF; 1226 sprintf(addr->name->sun_path + 1, "%05x", ordernum); 1227 1228 new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); 1229 unix_table_double_lock(net, old_hash, new_hash); 1230 1231 if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) { 1232 unix_table_double_unlock(net, old_hash, new_hash); 1233 1234 /* __unix_find_socket_byname() may take long time if many names 1235 * are already in use. 1236 */ 1237 cond_resched(); 1238 1239 if (ordernum == lastnum) { 1240 /* Give up if all names seems to be in use. 
*/ 1241 err = -ENOSPC; 1242 unix_release_addr(addr); 1243 goto out; 1244 } 1245 1246 goto retry; 1247 } 1248 1249 __unix_set_addr_hash(net, sk, addr, new_hash); 1250 unix_table_double_unlock(net, old_hash, new_hash); 1251 err = 0; 1252 1253 out: mutex_unlock(&u->bindlock); 1254 return err; 1255 } 1256 1257 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, 1258 int addr_len) 1259 { 1260 umode_t mode = S_IFSOCK | 1261 (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask()); 1262 struct unix_sock *u = unix_sk(sk); 1263 unsigned int new_hash, old_hash; 1264 struct net *net = sock_net(sk); 1265 struct mnt_idmap *idmap; 1266 struct unix_address *addr; 1267 struct dentry *dentry; 1268 struct path parent; 1269 int err; 1270 1271 addr_len = unix_mkname_bsd(sunaddr, addr_len); 1272 addr = unix_create_addr(sunaddr, addr_len); 1273 if (!addr) 1274 return -ENOMEM; 1275 1276 /* 1277 * Get the parent directory, calculate the hash for last 1278 * component. 1279 */ 1280 dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0); 1281 if (IS_ERR(dentry)) { 1282 err = PTR_ERR(dentry); 1283 goto out; 1284 } 1285 1286 /* 1287 * All right, let's create it. 
1288 */ 1289 idmap = mnt_idmap(parent.mnt); 1290 err = security_path_mknod(&parent, dentry, mode, 0); 1291 if (!err) 1292 err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0); 1293 if (err) 1294 goto out_path; 1295 err = mutex_lock_interruptible(&u->bindlock); 1296 if (err) 1297 goto out_unlink; 1298 if (u->addr) 1299 goto out_unlock; 1300 1301 old_hash = sk->sk_hash; 1302 new_hash = unix_bsd_hash(d_backing_inode(dentry)); 1303 unix_table_double_lock(net, old_hash, new_hash); 1304 u->path.mnt = mntget(parent.mnt); 1305 u->path.dentry = dget(dentry); 1306 __unix_set_addr_hash(net, sk, addr, new_hash); 1307 unix_table_double_unlock(net, old_hash, new_hash); 1308 unix_insert_bsd_socket(sk); 1309 mutex_unlock(&u->bindlock); 1310 done_path_create(&parent, dentry); 1311 return 0; 1312 1313 out_unlock: 1314 mutex_unlock(&u->bindlock); 1315 err = -EINVAL; 1316 out_unlink: 1317 /* failed after successful mknod? unlink what we'd created... */ 1318 vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL); 1319 out_path: 1320 done_path_create(&parent, dentry); 1321 out: 1322 unix_release_addr(addr); 1323 return err == -EEXIST ? 
-EADDRINUSE : err; 1324 } 1325 1326 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr, 1327 int addr_len) 1328 { 1329 struct unix_sock *u = unix_sk(sk); 1330 unsigned int new_hash, old_hash; 1331 struct net *net = sock_net(sk); 1332 struct unix_address *addr; 1333 int err; 1334 1335 addr = unix_create_addr(sunaddr, addr_len); 1336 if (!addr) 1337 return -ENOMEM; 1338 1339 err = mutex_lock_interruptible(&u->bindlock); 1340 if (err) 1341 goto out; 1342 1343 if (u->addr) { 1344 err = -EINVAL; 1345 goto out_mutex; 1346 } 1347 1348 old_hash = sk->sk_hash; 1349 new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); 1350 unix_table_double_lock(net, old_hash, new_hash); 1351 1352 if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) 1353 goto out_spin; 1354 1355 __unix_set_addr_hash(net, sk, addr, new_hash); 1356 unix_table_double_unlock(net, old_hash, new_hash); 1357 mutex_unlock(&u->bindlock); 1358 return 0; 1359 1360 out_spin: 1361 unix_table_double_unlock(net, old_hash, new_hash); 1362 err = -EADDRINUSE; 1363 out_mutex: 1364 mutex_unlock(&u->bindlock); 1365 out: 1366 unix_release_addr(addr); 1367 return err; 1368 } 1369 1370 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) 1371 { 1372 struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; 1373 struct sock *sk = sock->sk; 1374 int err; 1375 1376 if (addr_len == offsetof(struct sockaddr_un, sun_path) && 1377 sunaddr->sun_family == AF_UNIX) 1378 return unix_autobind(sk); 1379 1380 err = unix_validate_addr(sunaddr, addr_len); 1381 if (err) 1382 return err; 1383 1384 if (sunaddr->sun_path[0]) 1385 err = unix_bind_bsd(sk, sunaddr, addr_len); 1386 else 1387 err = unix_bind_abstract(sk, sunaddr, addr_len); 1388 1389 return err; 1390 } 1391 1392 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2) 1393 { 1394 if (unlikely(sk1 == sk2) || !sk2) { 1395 unix_state_lock(sk1); 1396 return; 1397 } 1398 1399 if (sk1 > sk2) 1400 
swap(sk1, sk2); 1401 1402 unix_state_lock(sk1); 1403 unix_state_lock(sk2); 1404 } 1405 1406 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2) 1407 { 1408 if (unlikely(sk1 == sk2) || !sk2) { 1409 unix_state_unlock(sk1); 1410 return; 1411 } 1412 unix_state_unlock(sk1); 1413 unix_state_unlock(sk2); 1414 } 1415 1416 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, 1417 int alen, int flags) 1418 { 1419 struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr; 1420 struct sock *sk = sock->sk; 1421 struct sock *other; 1422 int err; 1423 1424 err = -EINVAL; 1425 if (alen < offsetofend(struct sockaddr, sa_family)) 1426 goto out; 1427 1428 if (addr->sa_family != AF_UNSPEC) { 1429 err = unix_validate_addr(sunaddr, alen); 1430 if (err) 1431 goto out; 1432 1433 err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen); 1434 if (err) 1435 goto out; 1436 1437 if ((test_bit(SOCK_PASSCRED, &sock->flags) || 1438 test_bit(SOCK_PASSPIDFD, &sock->flags)) && 1439 !READ_ONCE(unix_sk(sk)->addr)) { 1440 err = unix_autobind(sk); 1441 if (err) 1442 goto out; 1443 } 1444 1445 restart: 1446 other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type); 1447 if (IS_ERR(other)) { 1448 err = PTR_ERR(other); 1449 goto out; 1450 } 1451 1452 unix_state_double_lock(sk, other); 1453 1454 /* Apparently VFS overslept socket death. Retry. 
*/ 1455 if (sock_flag(other, SOCK_DEAD)) { 1456 unix_state_double_unlock(sk, other); 1457 sock_put(other); 1458 goto restart; 1459 } 1460 1461 err = -EPERM; 1462 if (!unix_may_send(sk, other)) 1463 goto out_unlock; 1464 1465 err = security_unix_may_send(sk->sk_socket, other->sk_socket); 1466 if (err) 1467 goto out_unlock; 1468 1469 WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED); 1470 WRITE_ONCE(other->sk_state, TCP_ESTABLISHED); 1471 } else { 1472 /* 1473 * 1003.1g breaking connected state with AF_UNSPEC 1474 */ 1475 other = NULL; 1476 unix_state_double_lock(sk, other); 1477 } 1478 1479 /* 1480 * If it was connected, reconnect. 1481 */ 1482 if (unix_peer(sk)) { 1483 struct sock *old_peer = unix_peer(sk); 1484 1485 unix_peer(sk) = other; 1486 if (!other) 1487 WRITE_ONCE(sk->sk_state, TCP_CLOSE); 1488 unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer); 1489 1490 unix_state_double_unlock(sk, other); 1491 1492 if (other != old_peer) { 1493 unix_dgram_disconnected(sk, old_peer); 1494 1495 unix_state_lock(old_peer); 1496 if (!unix_peer(old_peer)) 1497 WRITE_ONCE(old_peer->sk_state, TCP_CLOSE); 1498 unix_state_unlock(old_peer); 1499 } 1500 1501 sock_put(old_peer); 1502 } else { 1503 unix_peer(sk) = other; 1504 unix_state_double_unlock(sk, other); 1505 } 1506 1507 return 0; 1508 1509 out_unlock: 1510 unix_state_double_unlock(sk, other); 1511 sock_put(other); 1512 out: 1513 return err; 1514 } 1515 1516 static long unix_wait_for_peer(struct sock *other, long timeo) 1517 __releases(&unix_sk(other)->lock) 1518 { 1519 struct unix_sock *u = unix_sk(other); 1520 int sched; 1521 DEFINE_WAIT(wait); 1522 1523 prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE); 1524 1525 sched = !sock_flag(other, SOCK_DEAD) && 1526 !(other->sk_shutdown & RCV_SHUTDOWN) && 1527 unix_recvq_full_lockless(other); 1528 1529 unix_state_unlock(other); 1530 1531 if (sched) 1532 timeo = schedule_timeout(timeo); 1533 1534 finish_wait(&u->peer_wait, &wait); 1535 return timeo; 1536 } 1537 1538 
/* connect(2) for SOCK_STREAM/SOCK_SEQPACKET.
 *
 * Allocates the embryo sock and the notification skb up front, then
 * finds the listener, waits for backlog space if needed, and finally
 * links the embryo to the listener's accept queue.  Self-connect and
 * simultaneous connect are impossible because a TCP_LISTEN sk is
 * rejected as a connect(2) caller.
 */
static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
	struct net *net = sock_net(sk);
	struct sk_buff *skb = NULL;
	unsigned char state;
	long timeo;
	int err;

	err = unix_validate_addr(sunaddr, addr_len);
	if (err)
		goto out;

	err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
	if (err)
		goto out;

	/* Passing credentials requires a name: autobind if unbound. */
	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
	    !READ_ONCE(u->addr)) {
		err = unix_autobind(sk);
		if (err)
			goto out;
	}

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* First of all allocate resources.
	   If we will make it after state is locked,
	   we will have to recheck all again in any case.
	 */

	/* create new sock for complete connection */
	newsk = unix_create1(net, NULL, 0, sock->type);
	if (IS_ERR(newsk)) {
		err = PTR_ERR(newsk);
		newsk = NULL;
		goto out;
	}

	err = -ENOMEM;

	/* Allocate skb for sending to listening sock */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (skb == NULL)
		goto out;

restart:
	/* Find listening sock. */
	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
	if (IS_ERR(other)) {
		err = PTR_ERR(other);
		other = NULL;
		goto out;
	}

	unix_state_lock(other);

	/* Apparently VFS overslept socket death. Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = -ECONNREFUSED;
	if (other->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (unix_recvq_full_lockless(other)) {
		err = -EAGAIN;
		if (!timeo)
			goto out_unlock;

		/* drops other's state lock */
		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;	/* other unlocked; ref dropped in out */
		sock_put(other);
		goto restart;
	}

	/* self connect and simultaneous connect are eliminated
	 * by rejecting TCP_LISTEN socket to avoid deadlock.
	 */
	state = READ_ONCE(sk->sk_state);
	if (unlikely(state != TCP_CLOSE)) {
		err = state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
		goto out_unlock;
	}

	unix_state_lock(sk);

	/* recheck under the lock: the lockless read above may have raced */
	if (unlikely(sk->sk_state != TCP_CLOSE)) {
		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
		unix_state_unlock(sk);
		goto out_unlock;
	}

	err = security_unix_stream_connect(sk, other, newsk);
	if (err) {
		unix_state_unlock(sk);
		goto out_unlock;
	}

	/* The way is open! Fastly set all the necessary fields... */

	sock_hold(sk);
	unix_peer(newsk) = sk;
	newsk->sk_state = TCP_ESTABLISHED;
	newsk->sk_type = sk->sk_type;
	init_peercred(newsk);
	newu = unix_sk(newsk);
	newu->listener = other;
	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
	otheru = unix_sk(other);

	/* copy address information from listening to new sock
	 *
	 * The contents of *(otheru->addr) and otheru->path
	 * are seen fully set up here, since we have found
	 * otheru in hash under its lock.  Insertion into the
	 * hash chain we'd found it in had been done in an
	 * earlier critical area protected by the chain's lock,
	 * the same one where we'd set *(otheru->addr) contents,
	 * as well as otheru->path and otheru->addr itself.
	 *
	 * Using smp_store_release() here to set newu->addr
	 * is enough to make those stores, as well as stores
	 * to newu->path visible to anyone who gets newu->addr
	 * by smp_load_acquire().  IOW, the same warranties
	 * as for unix_sock instances bound in unix_bind() or
	 * in unix_autobind().
	 */
	if (otheru->path.dentry) {
		path_get(&otheru->path);
		newu->path = otheru->path;
	}
	refcount_inc(&otheru->addr->refcnt);
	smp_store_release(&newu->addr, otheru->addr);

	/* Set credentials */
	copy_peercred(sk, other);

	sock->state = SS_CONNECTED;
	WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
	sock_hold(newsk);

	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk) = newsk;

	unix_state_unlock(sk);

	/* take ten and send info to listening sock */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	return 0;

out_unlock:
	if (other)
		unix_state_unlock(other);

out:
	kfree_skb(skb);
	if (newsk)
		unix_release_sock(newsk, 0);
	if (other)
		sock_put(other);
	return err;
}

/* socketpair(2): cross-link two freshly created socks and mark both
 * ends connected.  No address lookup or locking is needed since neither
 * sock is visible to anyone else yet.
 */
static int unix_socketpair(struct socket *socka, struct socket *sockb)
{
	struct sock *ska = socka->sk, *skb = sockb->sk;

	/* Join our sockets back to back */
	sock_hold(ska);
	sock_hold(skb);
	unix_peer(ska) = skb;
	unix_peer(skb) = ska;
	init_peercred(ska);
	init_peercred(skb);

	ska->sk_state = TCP_ESTABLISHED;
	skb->sk_state = TCP_ESTABLISHED;
	socka->state = SS_CONNECTED;
	sockb->state = SS_CONNECTED;
	return 0;
}

/* Propagate the credential/security passing flags from the listener's
 * socket to the newly accepted one.
 */
static void unix_sock_inherit_flags(const struct socket *old,
				    struct socket *new)
{
	if (test_bit(SOCK_PASSCRED, &old->flags))
		set_bit(SOCK_PASSCRED, &new->flags);
	if (test_bit(SOCK_PASSPIDFD, &old->flags))
		set_bit(SOCK_PASSPIDFD, &new->flags);
	if (test_bit(SOCK_PASSSEC, &old->flags))
		set_bit(SOCK_PASSSEC, &new->flags);
}

/* accept(2): dequeue one connection-notification skb from the listener
 * (each was queued by unix_stream_connect() with the embryo sock as its
 * owner) and graft the embryo onto @newsock.
 */
static int unix_accept(struct socket *sock, struct socket *newsock,
		       struct proto_accept_arg *arg)
{
	struct sock *sk = sock->sk;
	struct sk_buff *skb;
	struct sock *tsk;

	arg->err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;

	arg->err = -EINVAL;
	if (READ_ONCE(sk->sk_state) != TCP_LISTEN)
		goto out;

	/* If socket state is TCP_LISTEN it cannot change (for now...),
	 * so that no locks are necessary.
	 */

	skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
				&arg->err);
	if (!skb) {
		/* This means receive shutdown. */
		if (arg->err == 0)
			arg->err = -EINVAL;
		goto out;
	}

	tsk = skb->sk;
	skb_free_datagram(sk, skb);
	/* a connect() parked on a full backlog may now proceed */
	wake_up_interruptible(&unix_sk(sk)->peer_wait);

	/* attach accepted sock to socket */
	unix_state_lock(tsk);
	unix_update_edges(unix_sk(tsk));
	newsock->state = SS_CONNECTED;
	unix_sock_inherit_flags(sock, newsock);
	sock_graft(tsk, newsock);
	unix_state_unlock(tsk);
	return 0;

out:
	return arg->err;
}


/* getsockname(2)/getpeername(2).  Returns the address length (the
 * kernel convention for ->getname), or a negative error.  An unbound
 * sock reports a bare AF_UNIX family with empty path.
 */
static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
{
	struct sock *sk = sock->sk;
	struct unix_address *addr;
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
	int err = 0;

	if (peer) {
		sk = unix_peer_get(sk);

		err = -ENOTCONN;
		if (!sk)
			goto out;
		err = 0;
	} else {
		sock_hold(sk);
	}

	/* pairs with smp_store_release() in bind/connect paths */
	addr = smp_load_acquire(&unix_sk(sk)->addr);
	if (!addr) {
		sunaddr->sun_family = AF_UNIX;
		sunaddr->sun_path[0] = 0;
		err = offsetof(struct sockaddr_un, sun_path);
	} else {
		err = addr->len;
		memcpy(sunaddr, addr->name, addr->len);

		if (peer)
			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
					       CGROUP_UNIX_GETPEERNAME);
		else
			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
					       CGROUP_UNIX_GETSOCKNAME);
	}
	sock_put(sk);
out:
	return err;
}

/* The "user->unix_inflight" variable is protected by the garbage
 * collection lock, and we just read it locklessly here. If you go
 * over the limit, there might be a tiny race in actually noticing
 * it across threads. Tough.
 */
static inline bool too_many_unix_fds(struct task_struct *p)
{
	struct user_struct *user = current_user();

	if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
	return false;
}

/* Move the passed-fd list from the scm cookie into the skb's control
 * block and register it with the GC machinery.  On success ownership of
 * scm->fp transfers to the skb.
 */
static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	if (too_many_unix_fds(current))
		return -ETOOMANYREFS;

	UNIXCB(skb).fp = scm->fp;
	scm->fp = NULL;

	if (unix_prepare_fpl(UNIXCB(skb).fp))
		return -ENOMEM;

	return 0;
}

/* Reverse of unix_attach_fds(): hand the fd list back to the scm cookie
 * and drop it from GC tracking.
 */
static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->fp = UNIXCB(skb).fp;
	UNIXCB(skb).fp = NULL;

	unix_destroy_fpl(scm->fp);
}

/* MSG_PEEK: duplicate the fd list instead of consuming it, so a later
 * real read still receives the fds.
 */
static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->fp = scm_fp_dup(UNIXCB(skb).fp);
}

/* skb destructor for AF_UNIX data skbs: release the pid reference and
 * any attached fds, then account the write memory back to the owner.
 */
static void unix_destruct_scm(struct sk_buff *skb)
{
	struct scm_cookie scm;

	memset(&scm, 0, sizeof(scm));
	scm.pid = UNIXCB(skb).pid;
	if (UNIXCB(skb).fp)
		unix_detach_fds(&scm, skb);

	/* Alas, it calls VFS */
	/* So fscking what? fput() had been SMP-safe since the last Summer */
	scm_destroy(&scm);
	sock_wfree(skb);
}

/* Copy control-message state (pid, creds, security data, and — when
 * @send_fds — the fd list) from the scm cookie into the skb and install
 * the destructor that will release it.
 */
static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
{
	int err = 0;

	UNIXCB(skb).pid = get_pid(scm->pid);
	UNIXCB(skb).uid = scm->creds.uid;
	UNIXCB(skb).gid = scm->creds.gid;
	UNIXCB(skb).fp = NULL;
	unix_get_secdata(scm, skb);
	if (scm->fp && send_fds)
		err = unix_attach_fds(scm, skb);

	skb->destructor = unix_destruct_scm;
	return err;
}

/* True if credentials should be attached to outgoing skbs: either end
 * asked for them, or the receiver has no socket yet (embryo) so we
 * cannot tell and must include them.
 */
static bool unix_passcred_enabled(const struct socket *sock,
				  const struct sock *other)
{
	return test_bit(SOCK_PASSCRED, &sock->flags) ||
	       test_bit(SOCK_PASSPIDFD, &sock->flags) ||
	       !other->sk_socket ||
	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
	       test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
}

/*
 * Some apps rely on write() giving SCM_CREDENTIALS
 * We include credentials if source or destination socket
 * asserted SOCK_PASSCRED.
 */
static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
			    const struct sock *other)
{
	if (UNIXCB(skb).pid)
		return;		/* scm_send() already filled them in */
	if (unix_passcred_enabled(sock, other)) {
		UNIXCB(skb).pid = get_pid(task_tgid(current));
		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
	}
}

/* True if @skb carries the same pid/creds/security data as @scm — used
 * by stream recv to decide whether skbs can be merged into one read.
 */
static bool unix_skb_scm_eq(struct sk_buff *skb,
			    struct scm_cookie *scm)
{
	return UNIXCB(skb).pid == scm->pid &&
	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
	       unix_secdata_eq(scm, skb);
}

/* Account fds queued on @sk (for diagnostics via nr_fds) and add the
 * skb's fd list to the GC graph.
 */
static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
{
	struct scm_fp_list *fp = UNIXCB(skb).fp;
	struct unix_sock *u = unix_sk(sk);

	if (unlikely(fp && fp->count)) {
		atomic_add(fp->count, &u->scm_stat.nr_fds);
		unix_add_edges(fp, u);
	}
}

/* Undo scm_stat_add() when an skb leaves @sk's receive queue. */
static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
{
	struct scm_fp_list *fp = UNIXCB(skb).fp;
	struct unix_sock *u = unix_sk(sk);

	if (unlikely(fp && fp->count)) {
		atomic_sub(fp->count, &u->scm_stat.nr_fds);
		unix_del_edges(fp);
	}
}

/*
 *	Send AF_UNIX data.
 */

static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
			      size_t len)
{
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
	struct sock *sk = sock->sk, *other = NULL;
	struct unix_sock *u = unix_sk(sk);
	struct scm_cookie scm;
	struct sk_buff *skb;
	int data_len = 0;
	int sk_locked;
	long timeo;
	int err;

	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	/* throttle senders if the fd-passing GC is backlogged */
	wait_for_unix_gc(scm.fp);

	err = -EOPNOTSUPP;
	if (msg->msg_flags&MSG_OOB)
		goto out;

	if (msg->msg_namelen) {
		err = unix_validate_addr(sunaddr, msg->msg_namelen);
		if (err)
			goto out;

		err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
							    msg->msg_name,
							    &msg->msg_namelen,
							    NULL);
		if (err)
			goto out;
	} else {
		sunaddr = NULL;
		err = -ENOTCONN;
		other = unix_peer_get(sk);
		if (!other)
			goto out;
	}

	/* Credentials will be passed: autobind if we have no name yet. */
	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
	    !READ_ONCE(u->addr)) {
		err = unix_autobind(sk);
		if (err)
			goto out;
	}

	err = -EMSGSIZE;
	if (len > READ_ONCE(sk->sk_sndbuf) - 32)
		goto out;

	if (len > SKB_MAX_ALLOC) {
		/* put the overflow into page frags */
		data_len = min_t(size_t,
				 len - SKB_MAX_ALLOC,
				 MAX_SKB_FRAGS * PAGE_SIZE);
		data_len = PAGE_ALIGN(data_len);

		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
	}

	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
				   msg->msg_flags & MSG_DONTWAIT, &err,
				   PAGE_ALLOC_COSTLY_ORDER);
	if (skb == NULL)
		goto out;

	err = unix_scm_to_skb(&scm, skb, true);
	if (err < 0)
		goto out_free;

	skb_put(skb, len - data_len);
	skb->data_len = data_len;
	skb->len = len;
	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
	if (err)
		goto out_free;

	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

restart:
	if (!other) {
		err = -ECONNRESET;
		if (sunaddr == NULL)
			goto out_free;

		other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
					sk->sk_type);
		if (IS_ERR(other)) {
			err = PTR_ERR(other);
			other = NULL;
			goto out_free;
		}
	}

	if (sk_filter(other, skb) < 0) {
		/* Toss the packet but do not return any error to the sender */
		err = len;
		goto out_free;
	}

	sk_locked = 0;
	unix_state_lock(other);
restart_locked:
	err = -EPERM;
	if (!unix_may_send(sk, other))
		goto out_unlock;

	if (unlikely(sock_flag(other, SOCK_DEAD))) {
		/*
		 *	Check with 1003.1g - what should
		 *	datagram error
		 */
		unix_state_unlock(other);
		sock_put(other);

		if (!sk_locked)
			unix_state_lock(sk);

		err = 0;
		if (sk->sk_type == SOCK_SEQPACKET) {
			/* We are here only when racing with unix_release_sock()
			 * is clearing @other. Never change state to TCP_CLOSE
			 * unlike SOCK_DGRAM wants.
			 */
			unix_state_unlock(sk);
			err = -EPIPE;
		} else if (unix_peer(sk) == other) {
			unix_peer(sk) = NULL;
			unix_dgram_peer_wake_disconnect_wakeup(sk, other);

			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
			unix_state_unlock(sk);

			unix_dgram_disconnected(sk, other);
			sock_put(other);
			err = -ECONNREFUSED;
		} else {
			unix_state_unlock(sk);
		}

		other = NULL;
		if (err)
			goto out_free;
		goto restart;
	}

	err = -EPIPE;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (sk->sk_type != SOCK_SEQPACKET) {
		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;
	}

	/* other == sk && unix_peer(other) != sk if
	 * - unix_peer(sk) == NULL, destination address bound to sk
	 * - unix_peer(sk) == sk by time of get but disconnected before lock
	 */
	if (other != sk &&
	    unlikely(unix_peer(other) != sk &&
	    unix_recvq_full_lockless(other))) {
		if (timeo) {
			/* blocking send: wait for queue space */
			timeo = unix_wait_for_peer(other, timeo);

			err = sock_intr_errno(timeo);
			if (signal_pending(current))
				goto out_free;

			goto restart;
		}

		if (!sk_locked) {
			unix_state_unlock(other);
			unix_state_double_lock(sk, other);
		}

		if (unix_peer(sk) != other ||
		    unix_dgram_peer_wake_me(sk, other)) {
			err = -EAGAIN;
			sk_locked = 1;
			goto out_unlock;
		}

		if (!sk_locked) {
			sk_locked = 1;
			goto restart_locked;
		}
	}

	if (unlikely(sk_locked))
		unix_state_unlock(sk);

	if (sock_flag(other, SOCK_RCVTSTAMP))
		__net_timestamp(skb);
	maybe_add_creds(skb, sock, other);
	scm_stat_add(other, skb);
	skb_queue_tail(&other->sk_receive_queue, skb);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	scm_destroy(&scm);
	return len;

out_unlock:
	if (sk_locked)
		unix_state_unlock(sk);
	unix_state_unlock(other);
out_free:
	kfree_skb(skb);
out:
	if (other)
		sock_put(other);
	scm_destroy(&scm);
	return err;
}

/* We use paged skbs for stream sockets, and limit occupancy to 32768
 * bytes, and a minimum of a full page.
 */
#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
/* Queue a single out-of-band byte to @other.  The OOB skb is both
 * linked into the receive queue and pointed to by oob_skb (hence the
 * extra skb_get()); a newer OOB byte replaces the previous pointer.
 */
static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
		     struct scm_cookie *scm, bool fds_sent)
{
	struct unix_sock *ousk = unix_sk(other);
	struct sk_buff *skb;
	int err = 0;

	skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);

	if (!skb)
		return err;

	err = unix_scm_to_skb(scm, skb, !fds_sent);
	if (err < 0) {
		kfree_skb(skb);
		return err;
	}
	skb_put(skb, 1);
	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);

	if (err) {
		kfree_skb(skb);
		return err;
	}

	unix_state_lock(other);

	if (sock_flag(other, SOCK_DEAD) ||
	    (other->sk_shutdown & RCV_SHUTDOWN)) {
		unix_state_unlock(other);
		kfree_skb(skb);
		return -EPIPE;
	}

	maybe_add_creds(skb, sock, other);
	skb_get(skb);

	scm_stat_add(other, skb);

	spin_lock(&other->sk_receive_queue.lock);
	if (ousk->oob_skb)
		consume_skb(ousk->oob_skb);
	WRITE_ONCE(ousk->oob_skb, skb);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);

	sk_send_sigurg(other);
	unix_state_unlock(other);
	other->sk_data_ready(other);

	return err;
}
#endif

static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
			       size_t len)
{
	struct sock *sk = sock->sk;
	struct sock *other = NULL;
	int err, size;
	struct sk_buff *skb;
	int sent = 0;
struct scm_cookie scm; 2257 bool fds_sent = false; 2258 int data_len; 2259 2260 err = scm_send(sock, msg, &scm, false); 2261 if (err < 0) 2262 return err; 2263 2264 wait_for_unix_gc(scm.fp); 2265 2266 err = -EOPNOTSUPP; 2267 if (msg->msg_flags & MSG_OOB) { 2268 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2269 if (len) 2270 len--; 2271 else 2272 #endif 2273 goto out_err; 2274 } 2275 2276 if (msg->msg_namelen) { 2277 err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP; 2278 goto out_err; 2279 } else { 2280 err = -ENOTCONN; 2281 other = unix_peer(sk); 2282 if (!other) 2283 goto out_err; 2284 } 2285 2286 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) 2287 goto pipe_err; 2288 2289 while (sent < len) { 2290 size = len - sent; 2291 2292 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { 2293 skb = sock_alloc_send_pskb(sk, 0, 0, 2294 msg->msg_flags & MSG_DONTWAIT, 2295 &err, 0); 2296 } else { 2297 /* Keep two messages in the pipe so it schedules better */ 2298 size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64); 2299 2300 /* allow fallback to order-0 allocations */ 2301 size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ); 2302 2303 data_len = max_t(int, 0, size - SKB_MAX_HEAD(0)); 2304 2305 data_len = min_t(size_t, size, PAGE_ALIGN(data_len)); 2306 2307 skb = sock_alloc_send_pskb(sk, size - data_len, data_len, 2308 msg->msg_flags & MSG_DONTWAIT, &err, 2309 get_order(UNIX_SKB_FRAGS_SZ)); 2310 } 2311 if (!skb) 2312 goto out_err; 2313 2314 /* Only send the fds in the first buffer */ 2315 err = unix_scm_to_skb(&scm, skb, !fds_sent); 2316 if (err < 0) { 2317 kfree_skb(skb); 2318 goto out_err; 2319 } 2320 fds_sent = true; 2321 2322 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { 2323 err = skb_splice_from_iter(skb, &msg->msg_iter, size, 2324 sk->sk_allocation); 2325 if (err < 0) { 2326 kfree_skb(skb); 2327 goto out_err; 2328 } 2329 size = err; 2330 refcount_add(size, &sk->sk_wmem_alloc); 2331 } else { 2332 skb_put(skb, size - data_len); 2333 
skb->data_len = data_len; 2334 skb->len = size; 2335 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); 2336 if (err) { 2337 kfree_skb(skb); 2338 goto out_err; 2339 } 2340 } 2341 2342 unix_state_lock(other); 2343 2344 if (sock_flag(other, SOCK_DEAD) || 2345 (other->sk_shutdown & RCV_SHUTDOWN)) 2346 goto pipe_err_free; 2347 2348 maybe_add_creds(skb, sock, other); 2349 scm_stat_add(other, skb); 2350 skb_queue_tail(&other->sk_receive_queue, skb); 2351 unix_state_unlock(other); 2352 other->sk_data_ready(other); 2353 sent += size; 2354 } 2355 2356 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2357 if (msg->msg_flags & MSG_OOB) { 2358 err = queue_oob(sock, msg, other, &scm, fds_sent); 2359 if (err) 2360 goto out_err; 2361 sent++; 2362 } 2363 #endif 2364 2365 scm_destroy(&scm); 2366 2367 return sent; 2368 2369 pipe_err_free: 2370 unix_state_unlock(other); 2371 kfree_skb(skb); 2372 pipe_err: 2373 if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL)) 2374 send_sig(SIGPIPE, current, 0); 2375 err = -EPIPE; 2376 out_err: 2377 scm_destroy(&scm); 2378 return sent ? 
: err;
}

/* SOCK_SEQPACKET sendmsg: requires an established connection, ignores any
 * supplied destination address, and delegates to unix_dgram_sendmsg().
 */
static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
				  size_t len)
{
	int err;
	struct sock *sk = sock->sk;

	err = sock_error(sk);
	if (err)
		return err;

	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
		return -ENOTCONN;

	if (msg->msg_namelen)
		msg->msg_namelen = 0;

	return unix_dgram_sendmsg(sock, msg, len);
}

/* SOCK_SEQPACKET recvmsg: connected-only wrapper around the dgram path. */
static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
				  size_t size, int flags)
{
	struct sock *sk = sock->sk;

	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
		return -ENOTCONN;

	return unix_dgram_recvmsg(sock, msg, size, flags);
}

/* Copy the peer's bound address (if any) into msg->msg_name/namelen.
 * smp_load_acquire pairs with the store that publishes u->addr at bind.
 */
static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
{
	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);

	if (addr) {
		msg->msg_namelen = addr->len;
		memcpy(msg->msg_name, addr->name, addr->len);
	}
}

/* Core datagram receive: dequeue (or peek) one skb under u->iolock,
 * optionally waiting per sock_rcvtimeo, then copy data, sender address,
 * credentials and any attached fds out to userspace.  Excess bytes are
 * discarded with MSG_TRUNC set.  Returns bytes copied (skb length when
 * MSG_TRUNC is requested) or a negative errno.
 */
int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
			 int flags)
{
	struct scm_cookie scm;
	struct socket *sock = sk->sk_socket;
	struct unix_sock *u = unix_sk(sk);
	struct sk_buff *skb, *last;
	long timeo;
	int skip;
	int err;

	err = -EOPNOTSUPP;
	if (flags&MSG_OOB)
		goto out;

	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

	do {
		mutex_lock(&u->iolock);

		skip = sk_peek_offset(sk, flags);
		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
					      &skip, &err, &last);
		if (skb) {
			if (!(flags & MSG_PEEK))
				scm_stat_del(sk, skb);
			break;	/* iolock intentionally held past the loop */
		}

		mutex_unlock(&u->iolock);

		if (err != -EAGAIN)
			break;
	} while (timeo &&
		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
					      &err, &timeo, last));

	if (!skb) { /* implies iolock unlocked */
		unix_state_lock(sk);
		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
		    (sk->sk_shutdown & RCV_SHUTDOWN))
			err = 0;
		unix_state_unlock(sk);
		goto out;
	}

	/* A slot freed up in our queue — wake senders blocked on the peer */
	if (wq_has_sleeper(&u->peer_wait))
		wake_up_interruptible_sync_poll(&u->peer_wait,
						EPOLLOUT | EPOLLWRNORM |
						EPOLLWRBAND);

	/* Non-mainline CCS/TOMOYO hook — TODO confirm against the applied
	 * patchset.  NOTE(review): on denial this jumps past
	 * skb_free_datagram(), so the dequeued skb appears not to be
	 * released here — verify whether the CCS patch leaks it.
	 */
	if (ccs_socket_post_recvmsg_permission(sk, skb, flags)) {
		err = -EAGAIN; /* Hope less harmful than -EPERM. */
		goto out_unlock;
	}
	if (msg->msg_name) {
		unix_copy_addr(msg, skb->sk);

		BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
						      msg->msg_name,
						      &msg->msg_namelen);
	}

	if (size > skb->len - skip)
		size = skb->len - skip;
	else if (size < skb->len - skip)
		msg->msg_flags |= MSG_TRUNC;

	err = skb_copy_datagram_msg(skb, skip, msg, size);
	if (err)
		goto out_free;

	if (sock_flag(sk, SOCK_RCVTSTAMP))
		__sock_recv_timestamp(msg, sk, skb);

	memset(&scm, 0, sizeof(scm));

	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
	unix_set_secdata(&scm, skb);

	if (!(flags & MSG_PEEK)) {
		if (UNIXCB(skb).fp)
			unix_detach_fds(&scm, skb);

		sk_peek_offset_bwd(sk, skb->len);
	} else {
		/* It is questionable: on PEEK we could:
		   - do not return fds - good, but too simple 8)
		   - return fds, and do not return them on read (old strategy,
		     apparently wrong)
		   - clone fds (I chose it for now, it is the most universal
		     solution)

		   POSIX 1003.1g does not actually define this clearly
		   at all. POSIX 1003.1g doesn't define a lot of things
		   clearly however!

		*/

		sk_peek_offset_fwd(sk, size);

		if (UNIXCB(skb).fp)
			unix_peek_fds(&scm, skb);
	}
	err = (flags & MSG_TRUNC) ? skb->len - skip : size;

	scm_recv_unix(sock, msg, &scm, flags);

out_free:
	skb_free_datagram(sk, skb);
out_unlock:
	mutex_unlock(&u->iolock);
out:
	return err;
}

/* Datagram recvmsg entry point; diverts to a BPF-overridden proto if the
 * socket's sk_prot was replaced (sockmap), else the core implementation.
 */
static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
			      int flags)
{
	struct sock *sk = sock->sk;

#ifdef CONFIG_BPF_SYSCALL
	const struct proto *prot = READ_ONCE(sk->sk_prot);

	if (prot != &unix_dgram_proto)
		return prot->recvmsg(sk, msg, size, flags, NULL);
#endif
	return __unix_dgram_recvmsg(sk, msg, size, flags);
}

/* Non-blocking single-skb read used by sockmap verdict programs.
 * Ownership of the skb passes to @recv_actor.
 */
static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
{
	struct unix_sock *u = unix_sk(sk);
	struct sk_buff *skb;
	int err;

	mutex_lock(&u->iolock);
	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
	mutex_unlock(&u->iolock);
	if (!skb)
		return err;

	return recv_actor(sk, skb);
}

/*
 *	Sleep until more data has arrived. But check for races..
*/
/* Wait (interruptible, optionally freezable) for the receive queue tail to
 * change from (@last, @last_len), or for error/shutdown/signal/timeout.
 * Called without and returns without unix_state_lock held; the lock is
 * dropped around schedule_timeout().  Returns the remaining timeout.
 */
static long unix_stream_data_wait(struct sock *sk, long timeo,
				  struct sk_buff *last, unsigned int last_len,
				  bool freezable)
{
	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
	struct sk_buff *tail;
	DEFINE_WAIT(wait);

	unix_state_lock(sk);

	for (;;) {
		prepare_to_wait(sk_sleep(sk), &wait, state);

		tail = skb_peek_tail(&sk->sk_receive_queue);
		if (tail != last ||
		    (tail && tail->len != last_len) ||
		    sk->sk_err ||
		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
		    signal_pending(current) ||
		    !timeo)
			break;

		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
		unix_state_unlock(sk);
		timeo = schedule_timeout(timeo);
		unix_state_lock(sk);

		if (sock_flag(sk, SOCK_DEAD))
			break;

		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	}

	finish_wait(sk_sleep(sk), &wait);
	unix_state_unlock(sk);
	return timeo;
}

/* Unread bytes left in a stream skb (consumed tracks partial reads). */
static unsigned int unix_skb_len(const struct sk_buff *skb)
{
	return skb->len - UNIXCB(skb).consumed;
}

/* Shared state for the generic stream reader; recv_actor copies a chunk
 * either into a msghdr (recvmsg) or into a pipe (splice).
 */
struct unix_stream_read_state {
	int (*recv_actor)(struct sk_buff *, int, int,
			  struct unix_stream_read_state *);
	struct socket *socket;
	struct msghdr *msg;
	struct pipe_inode_info *pipe;
	size_t size;
	int flags;
	unsigned int splice_flags;
};

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
/* Read the pending out-of-band byte (MSG_OOB).  Fails with -EINVAL when
 * SO_OOBINLINE is set or no OOB byte is pending.  On a non-PEEK read the
 * oob_skb pointer is cleared; its queue reference is dropped later by the
 * normal reader via manage_oob().  Returns 1 on success.
 */
static int unix_stream_recv_urg(struct unix_stream_read_state *state)
{
	struct socket *sock = state->socket;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int chunk = 1;
	struct sk_buff *oob_skb;

	mutex_lock(&u->iolock);
	unix_state_lock(sk);
	spin_lock(&sk->sk_receive_queue.lock);

	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
		spin_unlock(&sk->sk_receive_queue.lock);
		unix_state_unlock(sk);
		mutex_unlock(&u->iolock);
		return -EINVAL;
	}

	oob_skb = u->oob_skb;

	if (!(state->flags & MSG_PEEK))
		WRITE_ONCE(u->oob_skb, NULL);
	else
		skb_get(oob_skb);

	spin_unlock(&sk->sk_receive_queue.lock);
	unix_state_unlock(sk);

	chunk = state->recv_actor(oob_skb, 0, chunk, state);

	if (!(state->flags & MSG_PEEK))
		UNIXCB(oob_skb).consumed += 1;

	consume_skb(oob_skb);

	mutex_unlock(&u->iolock);

	if (chunk < 0)
		return -EFAULT;

	state->msg->msg_flags |= MSG_OOB;
	return 1;
}

/* Decide how the stream reader should treat @skb relative to the pending
 * OOB byte: skip over it, stop before it, or unlink fully-consumed skbs.
 * Returns the skb to read next, or NULL to stop.  Called with
 * unix_state_lock held; takes the receive-queue spinlock internally.
 */
static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
				  int flags, int copied)
{
	struct unix_sock *u = unix_sk(sk);

	if (!unix_skb_len(skb)) {
		/* Fully-consumed skb at the head (its byte was the OOB byte) */
		struct sk_buff *unlinked_skb = NULL;

		spin_lock(&sk->sk_receive_queue.lock);

		if (copied && (!u->oob_skb || skb == u->oob_skb)) {
			/* Stop: don't read past a pending/just-read OOB mark */
			skb = NULL;
		} else if (flags & MSG_PEEK) {
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
		} else {
			unlinked_skb = skb;
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			__skb_unlink(unlinked_skb, &sk->sk_receive_queue);
		}

		spin_unlock(&sk->sk_receive_queue.lock);

		consume_skb(unlinked_skb);
	} else {
		struct sk_buff *unlinked_skb = NULL;

		spin_lock(&sk->sk_receive_queue.lock);

		if (skb == u->oob_skb) {
			if (copied) {
				/* Deliver data before the mark first */
				skb = NULL;
			} else if (!(flags & MSG_PEEK)) {
				if (sock_flag(sk, SOCK_URGINLINE)) {
					/* Inline: read it as ordinary data */
					WRITE_ONCE(u->oob_skb, NULL);
					consume_skb(skb);
				} else {
					__skb_unlink(skb, &sk->sk_receive_queue);
					WRITE_ONCE(u->oob_skb, NULL);
					unlinked_skb = skb;
					skb = skb_peek(&sk->sk_receive_queue);
				}
			} else if (!sock_flag(sk, SOCK_URGINLINE)) {
				skb = skb_peek_next(skb, &sk->sk_receive_queue);
			}
		}

		spin_unlock(&sk->sk_receive_queue.lock);

		if (unlinked_skb) {
			WARN_ON_ONCE(skb_unref(unlinked_skb));
			kfree_skb(unlinked_skb);
		}
	}
	return skb;
}
#endif

/* Stream single-skb read for sockmap; drops a dequeued skb that turns out
 * to be the unread OOB skb (returns -EAGAIN) instead of delivering it.
 */
static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
{
	struct unix_sock *u = unix_sk(sk);
	struct sk_buff *skb;
	int err;

	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED))
		return -ENOTCONN;

	mutex_lock(&u->iolock);
	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
	mutex_unlock(&u->iolock);
	if (!skb)
		return err;

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	if (unlikely(skb == READ_ONCE(u->oob_skb))) {
		bool drop = false;

		unix_state_lock(sk);

		if (sock_flag(sk, SOCK_DEAD)) {
			unix_state_unlock(sk);
			kfree_skb(skb);
			return -ECONNRESET;
		}

		spin_lock(&sk->sk_receive_queue.lock);
		if (likely(skb == u->oob_skb)) {
			WRITE_ONCE(u->oob_skb, NULL);
			drop = true;
		}
		spin_unlock(&sk->sk_receive_queue.lock);

		unix_state_unlock(sk);

		if (drop) {
			WARN_ON_ONCE(skb_unref(skb));
			kfree_skb(skb);
			return -EAGAIN;
		}
	}
#endif

	return recv_actor(sk, skb);
}

/* Generic stream receive shared by recvmsg and splice_read: walks the
 * receive queue under u->iolock, honouring SO_RCVLOWAT (@target), peek
 * offsets, OOB marks and credential-change boundaries.  Returns bytes
 * copied or a negative errno if nothing was copied.
 */
static int unix_stream_read_generic(struct unix_stream_read_state *state,
				    bool freezable)
{
	struct scm_cookie scm;
	struct socket *sock = state->socket;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int copied = 0;
	int flags = state->flags;
	int noblock = flags & MSG_DONTWAIT;
	bool check_creds = false;
	int target;
	int err = 0;
	long timeo;
	int skip;
	size_t size = state->size;
	unsigned int last_len;

	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) {
		err = -EINVAL;
		goto out;
	}

	if (unlikely(flags & MSG_OOB)) {
		err = -EOPNOTSUPP;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		err = unix_stream_recv_urg(state);
#endif
		goto out;
	}

	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
	timeo =
sock_rcvtimeo(sk, noblock);

	memset(&scm, 0, sizeof(scm));

	/* Lock the socket to prevent queue disordering
	 * while sleeps in memcpy_tomsg
	 */
	mutex_lock(&u->iolock);

	skip = max(sk_peek_offset(sk, flags), 0);

	do {
		struct sk_buff *skb, *last;
		int chunk;

redo:
		unix_state_lock(sk);
		if (sock_flag(sk, SOCK_DEAD)) {
			err = -ECONNRESET;
			goto unlock;
		}
		last = skb = skb_peek(&sk->sk_receive_queue);
		last_len = last ? last->len : 0;

again:
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		if (skb) {
			/* Stop at / step over the OOB mark as appropriate */
			skb = manage_oob(skb, sk, flags, copied);
			if (!skb && copied) {
				unix_state_unlock(sk);
				break;
			}
		}
#endif
		if (skb == NULL) {
			if (copied >= target)
				goto unlock;

			/*
			 *	POSIX 1003.1g mandates this order.
			 */

			err = sock_error(sk);
			if (err)
				goto unlock;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				goto unlock;

			unix_state_unlock(sk);
			if (!timeo) {
				err = -EAGAIN;
				break;
			}

			mutex_unlock(&u->iolock);

			timeo = unix_stream_data_wait(sk, timeo, last,
						      last_len, freezable);

			if (signal_pending(current)) {
				err = sock_intr_errno(timeo);
				scm_destroy(&scm);
				goto out;
			}

			mutex_lock(&u->iolock);
			goto redo;
unlock:
			unix_state_unlock(sk);
			break;
		}

		/* Advance past skbs fully covered by the peek offset */
		while (skip >= unix_skb_len(skb)) {
			skip -= unix_skb_len(skb);
			last = skb;
			last_len = skb->len;
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (!skb)
				goto again;
		}

		unix_state_unlock(sk);

		if (check_creds) {
			/* Never glue messages from different writers */
			if (!unix_skb_scm_eq(skb, &scm))
				break;
		} else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
			   test_bit(SOCK_PASSPIDFD, &sock->flags)) {
			/* Copy credentials */
			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
			unix_set_secdata(&scm, skb);
			check_creds = true;
		}

		/* Copy address just once */
		if (state->msg && state->msg->msg_name) {
			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
					 state->msg->msg_name);
			unix_copy_addr(state->msg, skb->sk);

			BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
							      state->msg->msg_name,
							      &state->msg->msg_namelen);

			sunaddr = NULL;
		}

		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
		chunk = state->recv_actor(skb, skip, chunk, state);
		if (chunk < 0) {
			if (copied == 0)
				copied = -EFAULT;
			break;
		}
		copied += chunk;
		size -= chunk;

		/* Mark read part of skb as used */
		if (!(flags & MSG_PEEK)) {
			UNIXCB(skb).consumed += chunk;

			sk_peek_offset_bwd(sk, chunk);

			if (UNIXCB(skb).fp) {
				scm_stat_del(sk, skb);
				unix_detach_fds(&scm, skb);
			}

			if (unix_skb_len(skb))
				break;

			skb_unlink(skb, &sk->sk_receive_queue);
			consume_skb(skb);

			/* fds must be delivered with exactly this message */
			if (scm.fp)
				break;
		} else {
			/* It is questionable, see note in unix_dgram_recvmsg.
			 */
			if (UNIXCB(skb).fp)
				unix_peek_fds(&scm, skb);

			sk_peek_offset_fwd(sk, chunk);

			if (UNIXCB(skb).fp)
				break;

			skip = 0;
			last = skb;
			last_len = skb->len;
			unix_state_lock(sk);
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (skb)
				goto again;
			unix_state_unlock(sk);
			break;
		}
	} while (size);

	mutex_unlock(&u->iolock);
	if (state->msg)
		scm_recv_unix(sock, state->msg, &scm, flags);
	else
		scm_destroy(&scm);
out:
	return copied ? : err;
}

/* recv_actor for recvmsg: copy @chunk bytes (past the consumed prefix and
 * @skip) into the user msghdr.  Returns chunk copied or a negative errno.
 */
static int unix_stream_read_actor(struct sk_buff *skb,
				  int skip, int chunk,
				  struct unix_stream_read_state *state)
{
	int ret;

	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
				    state->msg, chunk);
	return ret ?: chunk;
}

/* Stream recvmsg core used by the BPF/sockmap proto (freezable wait). */
int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
			  size_t size, int flags)
{
	struct unix_stream_read_state state = {
		.recv_actor = unix_stream_read_actor,
		.socket = sk->sk_socket,
		.msg = msg,
		.size = size,
		.flags = flags
	};

	return unix_stream_read_generic(&state, true);
}

/* Stream recvmsg entry point; diverts to a BPF-overridden proto if set. */
static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
			       size_t size, int flags)
{
	struct unix_stream_read_state state = {
		.recv_actor = unix_stream_read_actor,
		.socket = sock,
		.msg = msg,
		.size = size,
		.flags = flags
	};

#ifdef CONFIG_BPF_SYSCALL
	struct sock *sk = sock->sk;
	const struct proto *prot = READ_ONCE(sk->sk_prot);

	if (prot != &unix_stream_proto)
		return prot->recvmsg(sk, msg, size, flags, NULL);
#endif
	return unix_stream_read_generic(&state, true);
}

/* recv_actor for splice_read: feed @chunk bytes into the target pipe. */
static int unix_stream_splice_actor(struct sk_buff *skb,
				    int skip, int chunk,
				    struct unix_stream_read_state *state)
{
	return skb_splice_bits(skb, state->socket->sk,
			       UNIXCB(skb).consumed + skip,
			       state->pipe, chunk, state->splice_flags);
}

/* splice(2) from a stream socket into a pipe (non-freezable wait). */
static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos,
				       struct pipe_inode_info *pipe,
				       size_t size, unsigned int flags)
{
	struct unix_stream_read_state state = {
		.recv_actor = unix_stream_splice_actor,
		.socket = sock,
		.pipe = pipe,
		.size = size,
		.splice_flags = flags,
	};

	if (unlikely(*ppos))
		return -ESPIPE;

	if (sock->file->f_flags & O_NONBLOCK ||
	    flags &
SPLICE_F_NONBLOCK)
		state.flags = MSG_DONTWAIT;

	return unix_stream_read_generic(&state, false);
}

/* shutdown(2): record the shutdown mode on this socket and mirror the
 * complementary mode onto the connected peer for STREAM/SEQPACKET,
 * waking both ends.  @mode is SHUT_RD/SHUT_WR/SHUT_RDWR.
 */
static int unix_shutdown(struct socket *sock, int mode)
{
	struct sock *sk = sock->sk;
	struct sock *other;

	if (mode < SHUT_RD || mode > SHUT_RDWR)
		return -EINVAL;
	/* This maps:
	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
	 */
	++mode;

	unix_state_lock(sk);
	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
	other = unix_peer(sk);
	if (other)
		sock_hold(other);
	unix_state_unlock(sk);
	sk->sk_state_change(sk);

	if (other &&
		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {

		int peer_mode = 0;
		const struct proto *prot = READ_ONCE(other->sk_prot);

		/* Detach the peer from any BPF sockmap before signalling it */
		if (prot->unhash)
			prot->unhash(other);
		/* Our read shutdown is the peer's write shutdown and v.v. */
		if (mode&RCV_SHUTDOWN)
			peer_mode |= SEND_SHUTDOWN;
		if (mode&SEND_SHUTDOWN)
			peer_mode |= RCV_SHUTDOWN;
		unix_state_lock(other);
		WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
		unix_state_unlock(other);
		other->sk_state_change(other);
		if (peer_mode == SHUTDOWN_MASK)
			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
		else if (peer_mode & RCV_SHUTDOWN)
			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
	}
	if (other)
		sock_put(other);

	return 0;
}

/* Bytes queued for reading: unread remainder of every queued skb for
 * STREAM/SEQPACKET, or the size of the first datagram otherwise.
 * Returns -EINVAL on a listening socket (used by SIOCINQ).
 */
long unix_inq_len(struct sock *sk)
{
	struct sk_buff *skb;
	long amount = 0;

	if (READ_ONCE(sk->sk_state) == TCP_LISTEN)
		return -EINVAL;

	spin_lock(&sk->sk_receive_queue.lock);
	if (sk->sk_type == SOCK_STREAM ||
	    sk->sk_type == SOCK_SEQPACKET) {
		skb_queue_walk(&sk->sk_receive_queue, skb)
			amount += unix_skb_len(skb);
	} else {
		skb = skb_peek(&sk->sk_receive_queue);
		if (skb)
			amount = skb->len;
	}
	spin_unlock(&sk->sk_receive_queue.lock);

	return amount;
}
EXPORT_SYMBOL_GPL(unix_inq_len);

/* Bytes queued but not yet consumed by the receiver (SIOCOUTQ). */
long unix_outq_len(struct sock *sk)
{
	return sk_wmem_alloc_get(sk);
}
EXPORT_SYMBOL_GPL(unix_outq_len);

/* SIOCUNIXFILE: open the bound filesystem path of a unix socket as an
 * O_PATH fd.  Requires CAP_NET_ADMIN; returns the new fd or -errno.
 */
static int unix_open_file(struct sock *sk)
{
	struct path path;
	struct file *f;
	int fd;

	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	if (!smp_load_acquire(&unix_sk(sk)->addr))
		return -ENOENT;

	path = unix_sk(sk)->path;
	if (!path.dentry)
		return -ENOENT;	/* abstract-namespace socket: no path */

	path_get(&path);

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0)
		goto out;

	f = dentry_open(&path, O_PATH, current_cred());
	if (IS_ERR(f)) {
		put_unused_fd(fd);
		fd = PTR_ERR(f);
		goto out;
	}

	fd_install(fd, f);
out:
	path_put(&path);

	return fd;
}

/* ioctl(2) dispatcher: SIOCOUTQ/SIOCINQ queue sizes, SIOCUNIXFILE,
 * and SIOCATMARK (is the read pointer at the OOB mark?).
 */
static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	struct sock *sk = sock->sk;
	long amount = 0;
	int err;

	switch (cmd) {
	case SIOCOUTQ:
		amount = unix_outq_len(sk);
		err = put_user(amount, (int __user *)arg);
		break;
	case SIOCINQ:
		amount = unix_inq_len(sk);
		if (amount < 0)
			err = amount;
		else
			err = put_user(amount, (int __user *)arg);
		break;
	case SIOCUNIXFILE:
		err = unix_open_file(sk);
		break;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	case SIOCATMARK:
		{
			struct unix_sock *u = unix_sk(sk);
			struct sk_buff *skb;
			int answ = 0;

			mutex_lock(&u->iolock);

			skb = skb_peek(&sk->sk_receive_queue);
			if (skb) {
				struct sk_buff *oob_skb = READ_ONCE(u->oob_skb);

				/* At the mark if head IS the OOB skb, or the
				 * OOB byte was just consumed (empty head)
				 */
				if (skb == oob_skb ||
				    (!oob_skb && !unix_skb_len(skb)))
					answ = 1;
			}

			mutex_unlock(&u->iolock);

			err = put_user(answ, (int __user *)arg);
		}
		break;
#endif
	default:
		err = -ENOIOCTLCMD;
		break;
	}
	return err;
}

#ifdef CONFIG_COMPAT
/* 32-bit compat ioctl: same commands, pointer argument translated. */
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
}
#endif

/* poll(2) for stream/seqpacket sockets: lockless checks of error,
 * shutdown, receive queue, pending OOB byte and writability.
 */
static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	struct sock *sk = sock->sk;
	unsigned char state;
	__poll_t mask;
	u8 shutdown;

	sock_poll_wait(file, sock, wait);
	mask = 0;
	shutdown = READ_ONCE(sk->sk_shutdown);
	state = READ_ONCE(sk->sk_state);

	/* exceptional events? */
	if (READ_ONCE(sk->sk_err))
		mask |= EPOLLERR;
	if (shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;
	if (shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;

	/* readable? */
	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (sk_is_readable(sk))
		mask |= EPOLLIN | EPOLLRDNORM;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	if (READ_ONCE(unix_sk(sk)->oob_skb))
		mask |= EPOLLPRI;
#endif

	/* Connection-based need to check for termination and startup */
	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
	    state == TCP_CLOSE)
		mask |= EPOLLHUP;

	/*
	 * we set writable also when the other side has shut down the
	 * connection. This prevents stuck sockets.
	 */
	if (unix_writable(sk, state))
		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;

	return mask;
}

/* poll(2) for datagram sockets: like unix_poll but also consults the
 * peer's receive-queue occupancy before reporting writability, and
 * registers on the peer-wake mechanism when the peer's queue is full.
 */
static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
				poll_table *wait)
{
	struct sock *sk = sock->sk, *other;
	unsigned int writable;
	unsigned char state;
	__poll_t mask;
	u8 shutdown;

	sock_poll_wait(file, sock, wait);
	mask = 0;
	shutdown = READ_ONCE(sk->sk_shutdown);
	state = READ_ONCE(sk->sk_state);

	/* exceptional events? */
	if (READ_ONCE(sk->sk_err) ||
	    !skb_queue_empty_lockless(&sk->sk_error_queue))
		mask |= EPOLLERR |
			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);

	if (shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
	if (shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;

	/* readable? */
	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (sk_is_readable(sk))
		mask |= EPOLLIN | EPOLLRDNORM;

	/* Connection-based need to check for termination and startup */
	if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE)
		mask |= EPOLLHUP;

	/* No write status requested, avoid expensive OUT tests. */
	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
		return mask;

	writable = unix_writable(sk, state);
	if (writable) {
		unix_state_lock(sk);

		other = unix_peer(sk);
		if (other && unix_peer(other) != sk &&
		    unix_recvq_full_lockless(other) &&
		    unix_dgram_peer_wake_me(sk, other))
			writable = 0;

		unix_state_unlock(sk);
	}

	if (writable)
		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
	else
		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);

	return mask;
}

#ifdef CONFIG_PROC_FS

/* /proc/net/unix iteration: *pos packs (hash bucket, offset-in-bucket). */
#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

#define get_bucket(x) ((x) >> BUCKET_SPACE)
#define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))

/* Return the socket at 1-based @offset within *pos's bucket, or NULL.
 * Caller holds that bucket's table lock.
 */
static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
{
	unsigned long offset = get_offset(*pos);
	unsigned long bucket = get_bucket(*pos);
	unsigned long count = 0;
	struct sock *sk;

	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
	     sk; sk = sk_next(sk)) {
		if (++count == offset)
			break;
	}

	return sk;
}

/* Find the first socket at or after *pos, scanning forward through
 * buckets.  On success the matching bucket lock is left HELD for the
 * caller; released in unix_get_next()/unix_seq_stop().
 */
static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
{
	unsigned long bucket = get_bucket(*pos);
	struct net *net = seq_file_net(seq);
	struct sock *sk;

	while (bucket < UNIX_HASH_SIZE) {
		spin_lock(&net->unx.table.locks[bucket]);

		sk = unix_from_bucket(seq, pos);
		if (sk)
			return sk;

		spin_unlock(&net->unx.table.locks[bucket]);

		*pos = set_bucket_offset(++bucket, 1);
	}

	return NULL;
}

/* Advance to the next socket, crossing into the next bucket (dropping
 * and re-taking bucket locks) when the current chain is exhausted.
 */
static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
				  loff_t *pos)
{
	unsigned long bucket = get_bucket(*pos);

	sk = sk_next(sk);
	if (sk)
		return sk;

	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);

	*pos = set_bucket_offset(++bucket, 1);

	return unix_get_first(seq, pos);
}

/* seq_file start: emit a header token first, then the socket at *pos. */
static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (!*pos)
		return SEQ_START_TOKEN;

	return unix_get_first(seq, pos);
}

/* seq_file next: step to the following socket (header -> first socket). */
static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;

	if (v == SEQ_START_TOKEN)
		return unix_get_first(seq, pos);

	return unix_get_next(seq, v, pos);
}

/* seq_file stop: drop the bucket lock still held for the current socket. */
static void unix_seq_stop(struct seq_file *seq, void *v)
{
	struct sock *sk = v;

	if (sk)
		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
}

/* Render one /proc/net/unix row; abstract names are shown with '@' and
 * embedded NULs replaced by '@'.
 */
static int unix_seq_show(struct seq_file *seq, void *v)
{

	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
			 "Inode Path\n");
	else {
		struct sock *s = v;
		struct unix_sock *u = unix_sk(s);
		unix_state_lock(s);

		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
			s,
			refcount_read(&s->sk_refcnt),
			0,
			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
			s->sk_type,
			s->sk_socket ?
			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
			sock_i_ino(s));

		if (u->addr) {	// under a hash table lock here
			int i, len;
			seq_putc(seq, ' ');

			i = 0;
			len = u->addr->len -
				offsetof(struct sockaddr_un, sun_path);
			if (u->addr->name->sun_path[0]) {
				len--;
			} else {
				/* abstract name: leading NUL shown as '@' */
				seq_putc(seq, '@');
				i++;
			}
			for ( ; i < len; i++)
				seq_putc(seq, u->addr->name->sun_path[i] ?:
					 '@');
		}
		unix_state_unlock(s);
		seq_putc(seq, '\n');
	}

	return 0;
}

static const struct seq_operations unix_seq_ops = {
	.start  = unix_seq_start,
	.next   = unix_seq_next,
	.stop   = unix_seq_stop,
	.show   = unix_seq_show,
};

#ifdef CONFIG_BPF_SYSCALL
/* Per-iterator state for the BPF unix socket iterator: a refcounted
 * batch of sockets captured from one hash bucket at a time.
 */
struct bpf_unix_iter_state {
	struct seq_net_private p;
	unsigned int cur_sk;
	unsigned int end_sk;
	unsigned int max_sk;
	struct sock **batch;
	bool st_bucket_done;
};

struct bpf_iter__unix {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct unix_sock *, unix_sk);
	uid_t uid __aligned(8);
};

/* Invoke the attached BPF program for one socket. */
static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
			      struct unix_sock *unix_sk, uid_t uid)
{
	struct bpf_iter__unix ctx;

	meta->seq_num--;  /* skip SEQ_START_TOKEN */
	ctx.meta = meta;
	ctx.unix_sk = unix_sk;
	ctx.uid = uid;
	return bpf_iter_run_prog(prog, &ctx);
}

/* Grab references to up to max_sk sockets from @start_sk's bucket chain,
 * then release the bucket lock taken by unix_get_first().  Returns the
 * number of sockets actually on the chain (may exceed what was batched,
 * signalling the caller to grow the batch and retry).
 */
static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
{
	struct bpf_unix_iter_state *iter = seq->private;
	unsigned int expected = 1;
	struct sock *sk;

	sock_hold(start_sk);
	iter->batch[iter->end_sk++] = start_sk;

	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
		if (iter->end_sk < iter->max_sk) {
			sock_hold(sk);
			iter->batch[iter->end_sk++] = sk;
		}

		expected++;
	}

	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);

	return expected;
}

/* Drop the references on all not-yet-consumed batched sockets. */
static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
{
	while (iter->cur_sk < iter->end_sk)
		sock_put(iter->batch[iter->cur_sk++]);
}

/* Grow the batch array to @new_batch_sz, releasing any held sockets. */
static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
				       unsigned int new_batch_sz)
{
	struct sock **new_batch;

	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
			     GFP_USER | __GFP_NOWARN);
	if (!new_batch)
		return -ENOMEM;

	bpf_iter_unix_put_batch(iter);
	kvfree(iter->batch);
	iter->batch = new_batch;
	iter->max_sk = new_batch_sz;

	return 0;
}

/* Capture the next bucket's sockets into the batch, resizing once (to
 * 1.5x the observed chain length) if the initial batch was too small.
 */
static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
					loff_t *pos)
{
	struct bpf_unix_iter_state *iter = seq->private;
	unsigned int expected;
	bool resized = false;
	struct sock *sk;

	if (iter->st_bucket_done)
		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);

again:
	/* Get a new batch */
	iter->cur_sk = 0;
	iter->end_sk = 0;

	sk = unix_get_first(seq, pos);
	if (!sk)
		return NULL; /* Done */

	expected = bpf_iter_unix_hold_batch(seq, sk);

	if (iter->end_sk == expected) {
		iter->st_bucket_done = true;
		return sk;
	}

	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
		resized = true;
		goto again;
	}

	return sk;
}

static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (!*pos)
		return SEQ_START_TOKEN;

	/* bpf iter does not support lseek, so it always
	 * continue from where it was stop()-ped.
	 */
	return bpf_iter_unix_batch(seq, pos);
}

static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_unix_iter_state *iter = seq->private;
	struct sock *sk;

	/* Whenever seq_next() is called, the iter->cur_sk is
	 * done with seq_show(), so advance to the next sk in
	 * the batch.
	 */
	if (iter->cur_sk < iter->end_sk)
		sock_put(iter->batch[iter->cur_sk++]);

	++*pos;

	if (iter->cur_sk < iter->end_sk)
		sk = iter->batch[iter->cur_sk];
	else
		sk = bpf_iter_unix_batch(seq, pos);

	return sk;
}

/* Run the BPF program on one batched socket under lock_sock_fast;
 * sockets unhashed since batching are skipped.
 */
static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
{
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;
	struct sock *sk = v;
	uid_t uid;
	bool slow;
	int ret;

	if (v == SEQ_START_TOKEN)
		return 0;

	slow = lock_sock_fast(sk);

	if (unlikely(sk_unhashed(sk))) {
		ret = SEQ_SKIP;
		goto unlock;
	}

	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, false);
	ret = unix_prog_seq_show(prog, &meta, v, uid);
unlock:
	unlock_sock_fast(sk, slow);
	return ret;
}

/* Final stop: give the program a NULL-socket call, then release refs. */
static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_unix_iter_state *iter = seq->private;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;

	if (!v) {
		meta.seq = seq;
		prog = bpf_iter_get_info(&meta, true);
		if (prog)
			(void)unix_prog_seq_show(prog, &meta, v, 0);
	}

	if (iter->cur_sk < iter->end_sk)
		bpf_iter_unix_put_batch(iter);
}

static const struct seq_operations bpf_iter_unix_seq_ops = {
	.start	= bpf_iter_unix_seq_start,
	.next	= bpf_iter_unix_seq_next,
	.stop	= bpf_iter_unix_seq_stop,
	.show	= bpf_iter_unix_seq_show,
};
#endif
#endif

static
const struct net_proto_family unix_family_ops = { 3671 .family = PF_UNIX, 3672 .create = unix_create, 3673 .owner = THIS_MODULE, 3674 }; 3675 3676 3677 static int __net_init unix_net_init(struct net *net) 3678 { 3679 int i; 3680 3681 net->unx.sysctl_max_dgram_qlen = 10; 3682 if (unix_sysctl_register(net)) 3683 goto out; 3684 3685 #ifdef CONFIG_PROC_FS 3686 if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops, 3687 sizeof(struct seq_net_private))) 3688 goto err_sysctl; 3689 #endif 3690 3691 net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE, 3692 sizeof(spinlock_t), GFP_KERNEL); 3693 if (!net->unx.table.locks) 3694 goto err_proc; 3695 3696 net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE, 3697 sizeof(struct hlist_head), 3698 GFP_KERNEL); 3699 if (!net->unx.table.buckets) 3700 goto free_locks; 3701 3702 for (i = 0; i < UNIX_HASH_SIZE; i++) { 3703 spin_lock_init(&net->unx.table.locks[i]); 3704 lock_set_cmp_fn(&net->unx.table.locks[i], unix_table_lock_cmp_fn, NULL); 3705 INIT_HLIST_HEAD(&net->unx.table.buckets[i]); 3706 } 3707 3708 return 0; 3709 3710 free_locks: 3711 kvfree(net->unx.table.locks); 3712 err_proc: 3713 #ifdef CONFIG_PROC_FS 3714 remove_proc_entry("unix", net->proc_net); 3715 err_sysctl: 3716 #endif 3717 unix_sysctl_unregister(net); 3718 out: 3719 return -ENOMEM; 3720 } 3721 3722 static void __net_exit unix_net_exit(struct net *net) 3723 { 3724 kvfree(net->unx.table.buckets); 3725 kvfree(net->unx.table.locks); 3726 unix_sysctl_unregister(net); 3727 remove_proc_entry("unix", net->proc_net); 3728 } 3729 3730 static struct pernet_operations unix_net_ops = { 3731 .init = unix_net_init, 3732 .exit = unix_net_exit, 3733 }; 3734 3735 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3736 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta, 3737 struct unix_sock *unix_sk, uid_t uid) 3738 3739 #define INIT_BATCH_SZ 16 3740 3741 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux) 3742 { 3743 struct 
bpf_unix_iter_state *iter = priv_data; 3744 int err; 3745 3746 err = bpf_iter_init_seq_net(priv_data, aux); 3747 if (err) 3748 return err; 3749 3750 err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ); 3751 if (err) { 3752 bpf_iter_fini_seq_net(priv_data); 3753 return err; 3754 } 3755 3756 return 0; 3757 } 3758 3759 static void bpf_iter_fini_unix(void *priv_data) 3760 { 3761 struct bpf_unix_iter_state *iter = priv_data; 3762 3763 bpf_iter_fini_seq_net(priv_data); 3764 kvfree(iter->batch); 3765 } 3766 3767 static const struct bpf_iter_seq_info unix_seq_info = { 3768 .seq_ops = &bpf_iter_unix_seq_ops, 3769 .init_seq_private = bpf_iter_init_unix, 3770 .fini_seq_private = bpf_iter_fini_unix, 3771 .seq_priv_size = sizeof(struct bpf_unix_iter_state), 3772 }; 3773 3774 static const struct bpf_func_proto * 3775 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id, 3776 const struct bpf_prog *prog) 3777 { 3778 switch (func_id) { 3779 case BPF_FUNC_setsockopt: 3780 return &bpf_sk_setsockopt_proto; 3781 case BPF_FUNC_getsockopt: 3782 return &bpf_sk_getsockopt_proto; 3783 default: 3784 return NULL; 3785 } 3786 } 3787 3788 static struct bpf_iter_reg unix_reg_info = { 3789 .target = "unix", 3790 .ctx_arg_info_size = 1, 3791 .ctx_arg_info = { 3792 { offsetof(struct bpf_iter__unix, unix_sk), 3793 PTR_TO_BTF_ID_OR_NULL }, 3794 }, 3795 .get_func_proto = bpf_iter_unix_get_func_proto, 3796 .seq_info = &unix_seq_info, 3797 }; 3798 3799 static void __init bpf_iter_register(void) 3800 { 3801 unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX]; 3802 if (bpf_iter_reg_target(&unix_reg_info)) 3803 pr_warn("Warning: could not register bpf iterator unix\n"); 3804 } 3805 #endif 3806 3807 static int __init af_unix_init(void) 3808 { 3809 int i, rc = -1; 3810 3811 BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb)); 3812 3813 for (i = 0; i < UNIX_HASH_SIZE / 2; i++) { 3814 spin_lock_init(&bsd_socket_locks[i]); 3815 
INIT_HLIST_HEAD(&bsd_socket_buckets[i]); 3816 } 3817 3818 rc = proto_register(&unix_dgram_proto, 1); 3819 if (rc != 0) { 3820 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); 3821 goto out; 3822 } 3823 3824 rc = proto_register(&unix_stream_proto, 1); 3825 if (rc != 0) { 3826 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); 3827 proto_unregister(&unix_dgram_proto); 3828 goto out; 3829 } 3830 3831 sock_register(&unix_family_ops); 3832 register_pernet_subsys(&unix_net_ops); 3833 unix_bpf_build_proto(); 3834 3835 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3836 bpf_iter_register(); 3837 #endif 3838 3839 out: 3840 return rc; 3841 } 3842 3843 /* Later than subsys_initcall() because we depend on stuff initialised there */ 3844 fs_initcall(af_unix_init); 3845
Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.