TOMOYO Linux Cross Reference
Linux/net/unix/af_unix.c


  1 // SPDX-License-Identifier: GPL-2.0-or-later
  2 /*
  3  * NET4:        Implementation of BSD Unix domain sockets.
  4  *
  5  * Authors:     Alan Cox, <alan@lxorguk.ukuu.org.uk>
  6  *
  7  * Fixes:
  8  *              Linus Torvalds  :       Assorted bug cures.
  9  *              Niibe Yutaka    :       async I/O support.
 10  *              Carsten Paeth   :       PF_UNIX check, address fixes.
 11  *              Alan Cox        :       Limit size of allocated blocks.
 12  *              Alan Cox        :       Fixed the stupid socketpair bug.
 13  *              Alan Cox        :       BSD compatibility fine tuning.
 14  *              Alan Cox        :       Fixed a bug in connect when interrupted.
 15  *              Alan Cox        :       Sorted out a proper draft version of
 16  *                                      file descriptor passing hacked up from
 17  *                                      Mike Shaver's work.
 18  *              Marty Leisner   :       Fixes to fd passing
 19  *              Nick Nevin      :       recvmsg bugfix.
 20  *              Alan Cox        :       Started proper garbage collector
 21  *              Heiko Eißfeldt  :       Missing verify_area check
 22  *              Alan Cox        :       Started POSIXisms
 23  *              Andreas Schwab  :       Replace inode by dentry for proper
 24  *                                      reference counting
 25  *              Kirk Petersen   :       Made this a module
 26  *          Christoph Rohland   :       Elegant non-blocking accept/connect algorithm.
 27  *                                      Lots of bug fixes.
 28  *           Alexey Kuznetsov   :       Repaired (I hope) bugs introduced
 29  *                                      by the above two patches.
 30  *           Andrea Arcangeli   :       If possible we block in connect(2)
 31  *                                      if the max backlog of the listen socket
 32  *                                      has been reached. This won't break
 33  *                                      old apps and it will avoid a huge amount
 34  *                                      of socks hashed (this is for unix_gc()
 35  *                                      performance reasons).
 36  *                                      Security fix that limits the max
 37  *                                      number of socks to 2*max_files and
 38  *                                      the number of skb queueable in the
 39  *                                      dgram receiver.
 40  *              Artur Skawina   :       Hash function optimizations
 41  *           Alexey Kuznetsov   :       Full scale SMP. Lot of bugs are introduced 8)
 42  *            Malcolm Beattie   :       Set peercred for socketpair
 43  *           Michal Ostrowski   :       Module initialization cleanup.
 44  *           Arnaldo C. Melo    :       Remove MOD_{INC,DEC}_USE_COUNT,
 45  *                                      the core infrastructure is doing that
 46  *                                      for all net proto families now (2.5.69+)
 47  *
 48  * Known differences from reference BSD that was tested:
 49  *
 50  *      [TO FIX]
 51  *      ECONNREFUSED is not returned from one end of a connected() socket to the
 52  *              other the moment one end closes.
 53  *      fstat() doesn't return st_dev=0, and give the blksize as high water mark
 54  *              and a fake inode identifier (nor the BSD first socket fstat twice bug).
 55  *      [NOT TO FIX]
 56  *      accept() returns a path name even if the connecting socket has closed
 57  *              in the meantime (BSD loses the path and gives up).
 58  *      accept() returns 0 length path for an unbound connector. BSD returns 16
 59  *              and a null first byte in the path (but not for gethost/peername - BSD bug ??)
 60  *      socketpair(...SOCK_RAW..) doesn't panic the kernel.
 61  *      BSD af_unix apparently has connect forgetting to block properly.
 62  *              (need to check this with the POSIX spec in detail)
 63  *
 64  * Differences from 2.0.0-11-... (ANK)
 65  *      Bug fixes and improvements.
 66  *              - client shutdown killed server socket.
 67  *              - removed all useless cli/sti pairs.
 68  *
 69  *      Semantic changes/extensions.
 70  *              - generic control message passing.
 71  *              - SCM_CREDENTIALS control message.
 72  *              - "Abstract" (not FS based) socket bindings.
 73  *                Abstract names are sequences of bytes (not zero terminated)
 74  *                started by 0, so that this name space does not intersect
 75  *                with BSD names.
 76  */
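
/* Illustrative sketch (not part of the original sources): binding a socket to
 * an "abstract" address from userspace, as described above.  The leading NUL
 * byte in sun_path selects the abstract namespace; the name "example" and the
 * descriptor handling are hypothetical.
 *
 *   int fd = socket(AF_UNIX, SOCK_STREAM, 0);
 *   struct sockaddr_un sun = { .sun_family = AF_UNIX };
 *   sun.sun_path[0] = '\0';                    // abstract namespace marker
 *   memcpy(sun.sun_path + 1, "example", 7);    // name bytes, not NUL terminated
 *   bind(fd, (struct sockaddr *)&sun,
 *        offsetof(struct sockaddr_un, sun_path) + 1 + 7);
 */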
 77 
 78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 79 
 80 #include <linux/module.h>
 81 #include <linux/kernel.h>
 82 #include <linux/signal.h>
 83 #include <linux/sched/signal.h>
 84 #include <linux/errno.h>
 85 #include <linux/string.h>
 86 #include <linux/stat.h>
 87 #include <linux/dcache.h>
 88 #include <linux/namei.h>
 89 #include <linux/socket.h>
 90 #include <linux/un.h>
 91 #include <linux/fcntl.h>
 92 #include <linux/filter.h>
 93 #include <linux/termios.h>
 94 #include <linux/sockios.h>
 95 #include <linux/net.h>
 96 #include <linux/in.h>
 97 #include <linux/fs.h>
 98 #include <linux/slab.h>
 99 #include <linux/uaccess.h>
100 #include <linux/skbuff.h>
101 #include <linux/netdevice.h>
102 #include <net/net_namespace.h>
103 #include <net/sock.h>
104 #include <net/tcp_states.h>
105 #include <net/af_unix.h>
106 #include <linux/proc_fs.h>
107 #include <linux/seq_file.h>
108 #include <net/scm.h>
109 #include <linux/init.h>
110 #include <linux/poll.h>
111 #include <linux/rtnetlink.h>
112 #include <linux/mount.h>
113 #include <net/checksum.h>
114 #include <linux/security.h>
115 #include <linux/splice.h>
116 #include <linux/freezer.h>
117 #include <linux/file.h>
118 #include <linux/btf_ids.h>
119 #include <linux/bpf-cgroup.h>
120 
121 static atomic_long_t unix_nr_socks;
122 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
123 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
124 
125 /* SMP locking strategy:
126  *    hash table is protected with spinlock.
127  *    each socket state is protected by separate spinlock.
128  */
129 #ifdef CONFIG_PROVE_LOCKING
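/* Three-way pointer comparison: -1 if l < r, 0 if equal, 1 if l > r.
 * Used by the lock_cmp_fn helpers below to give lockdep a deterministic
 * ordering between two locks of the same class.
 */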
130 #define cmp_ptr(l, r)   (((l) > (r)) - ((l) < (r)))
131 
132 static int unix_table_lock_cmp_fn(const struct lockdep_map *a,
133                                   const struct lockdep_map *b)
134 {
135         return cmp_ptr(a, b);
136 }
137 
138 static int unix_state_lock_cmp_fn(const struct lockdep_map *_a,
139                                   const struct lockdep_map *_b)
140 {
141         const struct unix_sock *a, *b;
142 
143         a = container_of(_a, struct unix_sock, lock.dep_map);
144         b = container_of(_b, struct unix_sock, lock.dep_map);
145 
146         if (a->sk.sk_state == TCP_LISTEN) {
147                 /* unix_stream_connect(): Before the 2nd unix_state_lock(),
148                  *
149                  *   1. a is TCP_LISTEN.
150                  *   2. b is not a.
151                  *   3. concurrent connect(b -> a) must fail.
152                  *
153                  * Except for 2. & 3., the b's state can be any possible
154                  * value due to concurrent connect() or listen().
155                  *
156                  * 2. is detected in debug_spin_lock_before(), and 3. cannot
157                  * be expressed as lock_cmp_fn.
158                  */
159                 switch (b->sk.sk_state) {
160                 case TCP_CLOSE:
161                 case TCP_ESTABLISHED:
162                 case TCP_LISTEN:
163                         return -1;
164                 default:
165                         /* Invalid case. */
166                         return 0;
167                 }
168         }
169 
170         /* Should never happen.  Just to be symmetric. */
171         if (b->sk.sk_state == TCP_LISTEN) {
 172                 switch (a->sk.sk_state) {
173                 case TCP_CLOSE:
174                 case TCP_ESTABLISHED:
175                         return 1;
176                 default:
177                         return 0;
178                 }
179         }
180 
181         /* unix_state_double_lock(): ascending address order. */
182         return cmp_ptr(a, b);
183 }
184 
185 static int unix_recvq_lock_cmp_fn(const struct lockdep_map *_a,
186                                   const struct lockdep_map *_b)
187 {
188         const struct sock *a, *b;
189 
190         a = container_of(_a, struct sock, sk_receive_queue.lock.dep_map);
191         b = container_of(_b, struct sock, sk_receive_queue.lock.dep_map);
192 
193         /* unix_collect_skb(): listener -> embryo order. */
194         if (a->sk_state == TCP_LISTEN && unix_sk(b)->listener == a)
195                 return -1;
196 
197         /* Should never happen.  Just to be symmetric. */
198         if (b->sk_state == TCP_LISTEN && unix_sk(a)->listener == b)
199                 return 1;
200 
201         return 0;
202 }
203 #endif
204 
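/* The per-netns hash table (net->unx.table) is split in two halves: unbound
 * and pathname (BSD) sockets hash into [0, UNIX_HASH_MOD], abstract sockets
 * into the upper half (see unix_abstract_hash() below).  bsd_socket_buckets
 * above is a separate global table keyed by inode, used only for lookups by
 * inode in unix_find_socket_byinode().
 */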
205 static unsigned int unix_unbound_hash(struct sock *sk)
206 {
207         unsigned long hash = (unsigned long)sk;
208 
209         hash ^= hash >> 16;
210         hash ^= hash >> 8;
211         hash ^= sk->sk_type;
212 
213         return hash & UNIX_HASH_MOD;
214 }
215 
216 static unsigned int unix_bsd_hash(struct inode *i)
217 {
218         return i->i_ino & UNIX_HASH_MOD;
219 }
220 
221 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
222                                        int addr_len, int type)
223 {
224         __wsum csum = csum_partial(sunaddr, addr_len, 0);
225         unsigned int hash;
226 
227         hash = (__force unsigned int)csum_fold(csum);
228         hash ^= hash >> 8;
229         hash ^= type;
230 
231         return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
232 }
233 
234 static void unix_table_double_lock(struct net *net,
235                                    unsigned int hash1, unsigned int hash2)
236 {
237         if (hash1 == hash2) {
238                 spin_lock(&net->unx.table.locks[hash1]);
239                 return;
240         }
241 
242         if (hash1 > hash2)
243                 swap(hash1, hash2);
244 
245         spin_lock(&net->unx.table.locks[hash1]);
246         spin_lock(&net->unx.table.locks[hash2]);
247 }
248 
249 static void unix_table_double_unlock(struct net *net,
250                                      unsigned int hash1, unsigned int hash2)
251 {
252         if (hash1 == hash2) {
253                 spin_unlock(&net->unx.table.locks[hash1]);
254                 return;
255         }
256 
257         spin_unlock(&net->unx.table.locks[hash1]);
258         spin_unlock(&net->unx.table.locks[hash2]);
259 }
260 
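/* With CONFIG_SECURITY_NETWORK, the LSM security ID is copied between the
 * scm cookie and the skb control block (UNIXCB) so the receiving side can
 * restore and compare it; without it these helpers collapse to no-ops.
 */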
261 #ifdef CONFIG_SECURITY_NETWORK
262 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
263 {
264         UNIXCB(skb).secid = scm->secid;
265 }
266 
267 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
268 {
269         scm->secid = UNIXCB(skb).secid;
270 }
271 
272 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
273 {
274         return (scm->secid == UNIXCB(skb).secid);
275 }
276 #else
277 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
278 { }
279 
280 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
281 { }
282 
283 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
284 {
285         return true;
286 }
287 #endif /* CONFIG_SECURITY_NETWORK */
288 
289 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
290 {
291         return unix_peer(osk) == sk;
292 }
293 
294 static inline int unix_may_send(struct sock *sk, struct sock *osk)
295 {
296         return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
297 }
298 
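/* The receive queue counts as full once its length exceeds sk_max_ack_backlog:
 * the listen() backlog for stream/seqpacket listeners, or net.unix.max_dgram_qlen
 * for datagram sockets (see unix_create1()).
 */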
299 static inline int unix_recvq_full_lockless(const struct sock *sk)
300 {
301         return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
302 }
303 
304 struct sock *unix_peer_get(struct sock *s)
305 {
306         struct sock *peer;
307 
308         unix_state_lock(s);
309         peer = unix_peer(s);
310         if (peer)
311                 sock_hold(peer);
312         unix_state_unlock(s);
313         return peer;
314 }
315 EXPORT_SYMBOL_GPL(unix_peer_get);
316 
317 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
318                                              int addr_len)
319 {
320         struct unix_address *addr;
321 
322         addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
323         if (!addr)
324                 return NULL;
325 
326         refcount_set(&addr->refcnt, 1);
327         addr->len = addr_len;
328         memcpy(addr->name, sunaddr, addr_len);
329 
330         return addr;
331 }
332 
333 static inline void unix_release_addr(struct unix_address *addr)
334 {
335         if (refcount_dec_and_test(&addr->refcnt))
336                 kfree(addr);
337 }
338 
339 /*
340  *      Check unix socket name:
 341  *              - it should not be zero length.
 342  *              - if it starts with a non-zero byte, it is NUL terminated (FS object)
 343  *              - if it starts with a zero byte, it is an abstract name.
344  */
345 
346 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
347 {
348         if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
349             addr_len > sizeof(*sunaddr))
350                 return -EINVAL;
351 
352         if (sunaddr->sun_family != AF_UNIX)
353                 return -EINVAL;
354 
355         return 0;
356 }
357 
358 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
359 {
360         struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
361         short offset = offsetof(struct sockaddr_storage, __data);
362 
363         BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
364 
365         /* This may look like an off by one error but it is a bit more
366          * subtle.  108 is the longest valid AF_UNIX path for a binding.
367          * sun_path[108] doesn't as such exist.  However in kernel space
368          * we are guaranteed that it is a valid memory location in our
369          * kernel address buffer because syscall functions always pass
370          * a pointer of struct sockaddr_storage which has a bigger buffer
371          * than 108.  Also, we must terminate sun_path for strlen() in
372          * getname_kernel().
373          */
374         addr->__data[addr_len - offset] = 0;
375 
376         /* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
377          * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
378          * know the actual buffer.
379          */
380         return strlen(addr->__data) + offset + 1;
381 }
382 
383 static void __unix_remove_socket(struct sock *sk)
384 {
385         sk_del_node_init(sk);
386 }
387 
388 static void __unix_insert_socket(struct net *net, struct sock *sk)
389 {
390         DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
391         sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
392 }
393 
394 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
395                                  struct unix_address *addr, unsigned int hash)
396 {
397         __unix_remove_socket(sk);
398         smp_store_release(&unix_sk(sk)->addr, addr);
399 
400         sk->sk_hash = hash;
401         __unix_insert_socket(net, sk);
402 }
403 
404 static void unix_remove_socket(struct net *net, struct sock *sk)
405 {
406         spin_lock(&net->unx.table.locks[sk->sk_hash]);
407         __unix_remove_socket(sk);
408         spin_unlock(&net->unx.table.locks[sk->sk_hash]);
409 }
410 
411 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
412 {
413         spin_lock(&net->unx.table.locks[sk->sk_hash]);
414         __unix_insert_socket(net, sk);
415         spin_unlock(&net->unx.table.locks[sk->sk_hash]);
416 }
417 
418 static void unix_insert_bsd_socket(struct sock *sk)
419 {
420         spin_lock(&bsd_socket_locks[sk->sk_hash]);
421         sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
422         spin_unlock(&bsd_socket_locks[sk->sk_hash]);
423 }
424 
425 static void unix_remove_bsd_socket(struct sock *sk)
426 {
427         if (!hlist_unhashed(&sk->sk_bind_node)) {
428                 spin_lock(&bsd_socket_locks[sk->sk_hash]);
429                 __sk_del_bind_node(sk);
430                 spin_unlock(&bsd_socket_locks[sk->sk_hash]);
431 
432                 sk_node_init(&sk->sk_bind_node);
433         }
434 }
435 
436 static struct sock *__unix_find_socket_byname(struct net *net,
437                                               struct sockaddr_un *sunname,
438                                               int len, unsigned int hash)
439 {
440         struct sock *s;
441 
442         sk_for_each(s, &net->unx.table.buckets[hash]) {
443                 struct unix_sock *u = unix_sk(s);
444 
445                 if (u->addr->len == len &&
446                     !memcmp(u->addr->name, sunname, len))
447                         return s;
448         }
449         return NULL;
450 }
451 
452 static inline struct sock *unix_find_socket_byname(struct net *net,
453                                                    struct sockaddr_un *sunname,
454                                                    int len, unsigned int hash)
455 {
456         struct sock *s;
457 
458         spin_lock(&net->unx.table.locks[hash]);
459         s = __unix_find_socket_byname(net, sunname, len, hash);
460         if (s)
461                 sock_hold(s);
462         spin_unlock(&net->unx.table.locks[hash]);
463         return s;
464 }
465 
466 static struct sock *unix_find_socket_byinode(struct inode *i)
467 {
468         unsigned int hash = unix_bsd_hash(i);
469         struct sock *s;
470 
471         spin_lock(&bsd_socket_locks[hash]);
472         sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
473                 struct dentry *dentry = unix_sk(s)->path.dentry;
474 
475                 if (dentry && d_backing_inode(dentry) == i) {
476                         sock_hold(s);
477                         spin_unlock(&bsd_socket_locks[hash]);
478                         return s;
479                 }
480         }
481         spin_unlock(&bsd_socket_locks[hash]);
482         return NULL;
483 }
484 
485 /* Support code for asymmetrically connected dgram sockets
486  *
487  * If a datagram socket is connected to a socket not itself connected
488  * to the first socket (eg, /dev/log), clients may only enqueue more
489  * messages if the present receive queue of the server socket is not
490  * "too large". This means there's a second writeability condition
491  * poll and sendmsg need to test. The dgram recv code will do a wake
492  * up on the peer_wait wait queue of a socket upon reception of a
493  * datagram which needs to be propagated to sleeping would-be writers
494  * since these might not have sent anything so far. This can't be
495  * accomplished via poll_wait because the lifetime of the server
496  * socket might be less than that of its clients if these break their
497  * association with it or if the server socket is closed while clients
498  * are still connected to it and there's no way to inform "a polling
499  * implementation" that it should let go of a certain wait queue
500  *
501  * In order to propagate a wake up, a wait_queue_entry_t of the client
502  * socket is enqueued on the peer_wait queue of the server socket
503  * whose wake function does a wake_up on the ordinary client socket
504  * wait queue. This connection is established whenever a write (or
 505  * poll for write) hits the flow control condition and is broken when the
506  * association to the server socket is dissolved or after a wake up
507  * was relayed.
508  */
509 
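/* The helpers below implement the relay described above:
 * unix_dgram_peer_wake_connect() enqueues the client's wait entry on the
 * peer's peer_wait queue, unix_dgram_peer_wake_disconnect() removes it, and
 * unix_dgram_peer_wake_relay() is the wake function that forwards the wakeup
 * to the client's own wait queue.
 */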
510 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
511                                       void *key)
512 {
513         struct unix_sock *u;
514         wait_queue_head_t *u_sleep;
515 
516         u = container_of(q, struct unix_sock, peer_wake);
517 
518         __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
519                             q);
520         u->peer_wake.private = NULL;
521 
522         /* relaying can only happen while the wq still exists */
523         u_sleep = sk_sleep(&u->sk);
524         if (u_sleep)
525                 wake_up_interruptible_poll(u_sleep, key_to_poll(key));
526 
527         return 0;
528 }
529 
530 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
531 {
532         struct unix_sock *u, *u_other;
533         int rc;
534 
535         u = unix_sk(sk);
536         u_other = unix_sk(other);
537         rc = 0;
538         spin_lock(&u_other->peer_wait.lock);
539 
540         if (!u->peer_wake.private) {
541                 u->peer_wake.private = other;
542                 __add_wait_queue(&u_other->peer_wait, &u->peer_wake);
543 
544                 rc = 1;
545         }
546 
547         spin_unlock(&u_other->peer_wait.lock);
548         return rc;
549 }
550 
551 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
552                                             struct sock *other)
553 {
554         struct unix_sock *u, *u_other;
555 
556         u = unix_sk(sk);
557         u_other = unix_sk(other);
558         spin_lock(&u_other->peer_wait.lock);
559 
560         if (u->peer_wake.private == other) {
561                 __remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
562                 u->peer_wake.private = NULL;
563         }
564 
565         spin_unlock(&u_other->peer_wait.lock);
566 }
567 
568 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
569                                                    struct sock *other)
570 {
571         unix_dgram_peer_wake_disconnect(sk, other);
572         wake_up_interruptible_poll(sk_sleep(sk),
573                                    EPOLLOUT |
574                                    EPOLLWRNORM |
575                                    EPOLLWRBAND);
576 }
577 
578 /* preconditions:
579  *      - unix_peer(sk) == other
580  *      - association is stable
581  */
582 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
583 {
584         int connected;
585 
586         connected = unix_dgram_peer_wake_connect(sk, other);
587 
588         /* If other is SOCK_DEAD, we want to make sure we signal
589          * POLLOUT, such that a subsequent write() can get a
590          * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
 591  * to other and it's full, we will hang waiting for POLLOUT.
592          */
593         if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
594                 return 1;
595 
596         if (connected)
597                 unix_dgram_peer_wake_disconnect(sk, other);
598 
599         return 0;
600 }
601 
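/* A socket is writable while it is not listening and no more than a quarter
 * of its send buffer is consumed (wmem_alloc * 4 <= sndbuf).
 */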
602 static int unix_writable(const struct sock *sk, unsigned char state)
603 {
604         return state != TCP_LISTEN &&
605                 (refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf);
606 }
607 
608 static void unix_write_space(struct sock *sk)
609 {
610         struct socket_wq *wq;
611 
612         rcu_read_lock();
613         if (unix_writable(sk, READ_ONCE(sk->sk_state))) {
614                 wq = rcu_dereference(sk->sk_wq);
615                 if (skwq_has_sleeper(wq))
616                         wake_up_interruptible_sync_poll(&wq->wait,
617                                 EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
618                 sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
619         }
620         rcu_read_unlock();
621 }
622 
 623 /* When a dgram socket disconnects (or changes its peer), we clear its receive
 624  * queue of packets that arrived from the previous peer. First, this allows
 625  * flow control based only on wmem_alloc; second, an sk connected to a peer
 626  * may receive messages only from that peer. */
627 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
628 {
629         if (!skb_queue_empty(&sk->sk_receive_queue)) {
630                 skb_queue_purge(&sk->sk_receive_queue);
631                 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
632 
 633                 /* If one link of a bidirectional dgram pipe is disconnected,
 634                  * we signal an error. Messages are lost. Do not do this
 635                  * when the peer was not connected to us.
 636                  */
637                 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
638                         WRITE_ONCE(other->sk_err, ECONNRESET);
639                         sk_error_report(other);
640                 }
641         }
642 }
643 
644 static void unix_sock_destructor(struct sock *sk)
645 {
646         struct unix_sock *u = unix_sk(sk);
647 
648         skb_queue_purge(&sk->sk_receive_queue);
649 
650         DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
651         DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
652         DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
653         if (!sock_flag(sk, SOCK_DEAD)) {
654                 pr_info("Attempt to release alive unix socket: %p\n", sk);
655                 return;
656         }
657 
658         if (u->addr)
659                 unix_release_addr(u->addr);
660 
661         atomic_long_dec(&unix_nr_socks);
662         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
663 #ifdef UNIX_REFCNT_DEBUG
664         pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
665                 atomic_long_read(&unix_nr_socks));
666 #endif
667 }
668 
669 static void unix_release_sock(struct sock *sk, int embrion)
670 {
671         struct unix_sock *u = unix_sk(sk);
672         struct sock *skpair;
673         struct sk_buff *skb;
674         struct path path;
675         int state;
676 
677         unix_remove_socket(sock_net(sk), sk);
678         unix_remove_bsd_socket(sk);
679 
680         /* Clear state */
681         unix_state_lock(sk);
682         sock_orphan(sk);
683         WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
684         path         = u->path;
685         u->path.dentry = NULL;
686         u->path.mnt = NULL;
687         state = sk->sk_state;
688         WRITE_ONCE(sk->sk_state, TCP_CLOSE);
689 
690         skpair = unix_peer(sk);
691         unix_peer(sk) = NULL;
692 
693         unix_state_unlock(sk);
694 
695 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
696         if (u->oob_skb) {
697                 kfree_skb(u->oob_skb);
698                 u->oob_skb = NULL;
699         }
700 #endif
701 
702         wake_up_interruptible_all(&u->peer_wait);
703 
704         if (skpair != NULL) {
705                 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
706                         unix_state_lock(skpair);
707                         /* No more writes */
708                         WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
709                         if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion)
710                                 WRITE_ONCE(skpair->sk_err, ECONNRESET);
711                         unix_state_unlock(skpair);
712                         skpair->sk_state_change(skpair);
713                         sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
714                 }
715 
716                 unix_dgram_peer_wake_disconnect(sk, skpair);
717                 sock_put(skpair); /* It may now die */
718         }
719 
720         /* Try to flush out this socket. Throw out buffers at least */
721 
722         while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
723                 if (state == TCP_LISTEN)
724                         unix_release_sock(skb->sk, 1);
725 
726                 /* passed fds are erased in the kfree_skb hook        */
727                 kfree_skb(skb);
728         }
729 
730         if (path.dentry)
731                 path_put(&path);
732 
733         sock_put(sk);
734 
735         /* ---- Socket is dead now and most probably destroyed ---- */
736 
737         /*
738          * Fixme: BSD difference: In BSD all sockets connected to us get
739          *        ECONNRESET and we die on the spot. In Linux we behave
740          *        like files and pipes do and wait for the last
741          *        dereference.
742          *
743          * Can't we simply set sock->err?
744          *
745          *        What the above comment does talk about? --ANK(980817)
746          */
747 
748         if (READ_ONCE(unix_tot_inflight))
749                 unix_gc();              /* Garbage collect fds */
750 }
751 
752 static void init_peercred(struct sock *sk)
753 {
754         sk->sk_peer_pid = get_pid(task_tgid(current));
755         sk->sk_peer_cred = get_current_cred();
756 }
757 
758 static void update_peercred(struct sock *sk)
759 {
760         const struct cred *old_cred;
761         struct pid *old_pid;
762 
763         spin_lock(&sk->sk_peer_lock);
764         old_pid = sk->sk_peer_pid;
765         old_cred = sk->sk_peer_cred;
766         init_peercred(sk);
767         spin_unlock(&sk->sk_peer_lock);
768 
769         put_pid(old_pid);
770         put_cred(old_cred);
771 }
772 
773 static void copy_peercred(struct sock *sk, struct sock *peersk)
774 {
775         lockdep_assert_held(&unix_sk(peersk)->lock);
776 
777         spin_lock(&sk->sk_peer_lock);
778         sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
779         sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
780         spin_unlock(&sk->sk_peer_lock);
781 }
782 
783 static int unix_listen(struct socket *sock, int backlog)
784 {
785         int err;
786         struct sock *sk = sock->sk;
787         struct unix_sock *u = unix_sk(sk);
788 
789         err = -EOPNOTSUPP;
790         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
791                 goto out;       /* Only stream/seqpacket sockets accept */
792         err = -EINVAL;
793         if (!READ_ONCE(u->addr))
794                 goto out;       /* No listens on an unbound socket */
795         unix_state_lock(sk);
796         if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
797                 goto out_unlock;
798         if (backlog > sk->sk_max_ack_backlog)
799                 wake_up_interruptible_all(&u->peer_wait);
800         sk->sk_max_ack_backlog  = backlog;
801         WRITE_ONCE(sk->sk_state, TCP_LISTEN);
802 
803         /* set credentials so connect can copy them */
804         update_peercred(sk);
805         err = 0;
806 
807 out_unlock:
808         unix_state_unlock(sk);
809 out:
810         return err;
811 }
812 
813 static int unix_release(struct socket *);
814 static int unix_bind(struct socket *, struct sockaddr *, int);
815 static int unix_stream_connect(struct socket *, struct sockaddr *,
816                                int addr_len, int flags);
817 static int unix_socketpair(struct socket *, struct socket *);
818 static int unix_accept(struct socket *, struct socket *, struct proto_accept_arg *arg);
819 static int unix_getname(struct socket *, struct sockaddr *, int);
820 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
821 static __poll_t unix_dgram_poll(struct file *, struct socket *,
822                                     poll_table *);
823 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
824 #ifdef CONFIG_COMPAT
825 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
826 #endif
827 static int unix_shutdown(struct socket *, int);
828 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
829 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
830 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
831                                        struct pipe_inode_info *, size_t size,
832                                        unsigned int flags);
833 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
834 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
835 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
836 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
837 static int unix_dgram_connect(struct socket *, struct sockaddr *,
838                               int, int);
839 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
840 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
841                                   int);
842 
843 #ifdef CONFIG_PROC_FS
844 static int unix_count_nr_fds(struct sock *sk)
845 {
846         struct sk_buff *skb;
847         struct unix_sock *u;
848         int nr_fds = 0;
849 
850         spin_lock(&sk->sk_receive_queue.lock);
851         skb = skb_peek(&sk->sk_receive_queue);
852         while (skb) {
853                 u = unix_sk(skb->sk);
854                 nr_fds += atomic_read(&u->scm_stat.nr_fds);
855                 skb = skb_peek_next(skb, &sk->sk_receive_queue);
856         }
857         spin_unlock(&sk->sk_receive_queue.lock);
858 
859         return nr_fds;
860 }
861 
862 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
863 {
864         struct sock *sk = sock->sk;
865         unsigned char s_state;
866         struct unix_sock *u;
867         int nr_fds = 0;
868 
869         if (sk) {
870                 s_state = READ_ONCE(sk->sk_state);
871                 u = unix_sk(sk);
872 
873                 /* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
874                  * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
875                  * SOCK_DGRAM is ordinary. So, no lock is needed.
876                  */
877                 if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
878                         nr_fds = atomic_read(&u->scm_stat.nr_fds);
879                 else if (s_state == TCP_LISTEN)
880                         nr_fds = unix_count_nr_fds(sk);
881 
882                 seq_printf(m, "scm_fds: %u\n", nr_fds);
883         }
884 }
885 #else
886 #define unix_show_fdinfo NULL
887 #endif
888 
889 static const struct proto_ops unix_stream_ops = {
890         .family =       PF_UNIX,
891         .owner =        THIS_MODULE,
892         .release =      unix_release,
893         .bind =         unix_bind,
894         .connect =      unix_stream_connect,
895         .socketpair =   unix_socketpair,
896         .accept =       unix_accept,
897         .getname =      unix_getname,
898         .poll =         unix_poll,
899         .ioctl =        unix_ioctl,
900 #ifdef CONFIG_COMPAT
901         .compat_ioctl = unix_compat_ioctl,
902 #endif
903         .listen =       unix_listen,
904         .shutdown =     unix_shutdown,
905         .sendmsg =      unix_stream_sendmsg,
906         .recvmsg =      unix_stream_recvmsg,
907         .read_skb =     unix_stream_read_skb,
908         .mmap =         sock_no_mmap,
909         .splice_read =  unix_stream_splice_read,
910         .set_peek_off = sk_set_peek_off,
911         .show_fdinfo =  unix_show_fdinfo,
912 };
913 
914 static const struct proto_ops unix_dgram_ops = {
915         .family =       PF_UNIX,
916         .owner =        THIS_MODULE,
917         .release =      unix_release,
918         .bind =         unix_bind,
919         .connect =      unix_dgram_connect,
920         .socketpair =   unix_socketpair,
921         .accept =       sock_no_accept,
922         .getname =      unix_getname,
923         .poll =         unix_dgram_poll,
924         .ioctl =        unix_ioctl,
925 #ifdef CONFIG_COMPAT
926         .compat_ioctl = unix_compat_ioctl,
927 #endif
928         .listen =       sock_no_listen,
929         .shutdown =     unix_shutdown,
930         .sendmsg =      unix_dgram_sendmsg,
931         .read_skb =     unix_read_skb,
932         .recvmsg =      unix_dgram_recvmsg,
933         .mmap =         sock_no_mmap,
934         .set_peek_off = sk_set_peek_off,
935         .show_fdinfo =  unix_show_fdinfo,
936 };
937 
938 static const struct proto_ops unix_seqpacket_ops = {
939         .family =       PF_UNIX,
940         .owner =        THIS_MODULE,
941         .release =      unix_release,
942         .bind =         unix_bind,
943         .connect =      unix_stream_connect,
944         .socketpair =   unix_socketpair,
945         .accept =       unix_accept,
946         .getname =      unix_getname,
947         .poll =         unix_dgram_poll,
948         .ioctl =        unix_ioctl,
949 #ifdef CONFIG_COMPAT
950         .compat_ioctl = unix_compat_ioctl,
951 #endif
952         .listen =       unix_listen,
953         .shutdown =     unix_shutdown,
954         .sendmsg =      unix_seqpacket_sendmsg,
955         .recvmsg =      unix_seqpacket_recvmsg,
956         .mmap =         sock_no_mmap,
957         .set_peek_off = sk_set_peek_off,
958         .show_fdinfo =  unix_show_fdinfo,
959 };
960 
961 static void unix_close(struct sock *sk, long timeout)
962 {
963         /* Nothing to do here, unix socket does not need a ->close().
964          * This is merely for sockmap.
965          */
966 }
967 
968 static void unix_unhash(struct sock *sk)
969 {
970         /* Nothing to do here, unix socket does not need a ->unhash().
971          * This is merely for sockmap.
972          */
973 }
974 
975 static bool unix_bpf_bypass_getsockopt(int level, int optname)
976 {
977         if (level == SOL_SOCKET) {
978                 switch (optname) {
979                 case SO_PEERPIDFD:
980                         return true;
981                 default:
982                         return false;
983                 }
984         }
985 
986         return false;
987 }
988 
989 struct proto unix_dgram_proto = {
990         .name                   = "UNIX",
991         .owner                  = THIS_MODULE,
992         .obj_size               = sizeof(struct unix_sock),
993         .close                  = unix_close,
994         .bpf_bypass_getsockopt  = unix_bpf_bypass_getsockopt,
995 #ifdef CONFIG_BPF_SYSCALL
996         .psock_update_sk_prot   = unix_dgram_bpf_update_proto,
997 #endif
998 };
999 
1000 struct proto unix_stream_proto = {
1001         .name                   = "UNIX-STREAM",
1002         .owner                  = THIS_MODULE,
1003         .obj_size               = sizeof(struct unix_sock),
1004         .close                  = unix_close,
1005         .unhash                 = unix_unhash,
1006         .bpf_bypass_getsockopt  = unix_bpf_bypass_getsockopt,
1007 #ifdef CONFIG_BPF_SYSCALL
1008         .psock_update_sk_prot   = unix_stream_bpf_update_proto,
1009 #endif
1010 };
1011 
1012 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
1013 {
1014         struct unix_sock *u;
1015         struct sock *sk;
1016         int err;
1017 
1018         atomic_long_inc(&unix_nr_socks);
1019         if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
1020                 err = -ENFILE;
1021                 goto err;
1022         }
1023 
1024         if (type == SOCK_STREAM)
1025                 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
1026         else /*dgram and  seqpacket */
1027                 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
1028 
1029         if (!sk) {
1030                 err = -ENOMEM;
1031                 goto err;
1032         }
1033 
1034         sock_init_data(sock, sk);
1035 
1036         sk->sk_hash             = unix_unbound_hash(sk);
1037         sk->sk_allocation       = GFP_KERNEL_ACCOUNT;
1038         sk->sk_write_space      = unix_write_space;
1039         sk->sk_max_ack_backlog  = READ_ONCE(net->unx.sysctl_max_dgram_qlen);
1040         sk->sk_destruct         = unix_sock_destructor;
1041         lock_set_cmp_fn(&sk->sk_receive_queue.lock, unix_recvq_lock_cmp_fn, NULL);
1042 
1043         u = unix_sk(sk);
1044         u->listener = NULL;
1045         u->vertex = NULL;
1046         u->path.dentry = NULL;
1047         u->path.mnt = NULL;
1048         spin_lock_init(&u->lock);
1049         lock_set_cmp_fn(&u->lock, unix_state_lock_cmp_fn, NULL);
1050         mutex_init(&u->iolock); /* single task reading lock */
1051         mutex_init(&u->bindlock); /* single task binding lock */
1052         init_waitqueue_head(&u->peer_wait);
1053         init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
1054         memset(&u->scm_stat, 0, sizeof(struct scm_stat));
1055         unix_insert_unbound_socket(net, sk);
1056 
1057         sock_prot_inuse_add(net, sk->sk_prot, 1);
1058 
1059         return sk;
1060 
1061 err:
1062         atomic_long_dec(&unix_nr_socks);
1063         return ERR_PTR(err);
1064 }
1065 
1066 static int unix_create(struct net *net, struct socket *sock, int protocol,
1067                        int kern)
1068 {
1069         struct sock *sk;
1070 
1071         if (protocol && protocol != PF_UNIX)
1072                 return -EPROTONOSUPPORT;
1073 
1074         sock->state = SS_UNCONNECTED;
1075 
1076         switch (sock->type) {
1077         case SOCK_STREAM:
1078                 sock->ops = &unix_stream_ops;
1079                 break;
1080                 /*
1081                  *      Believe it or not BSD has AF_UNIX, SOCK_RAW though
1082                  *      nothing uses it.
1083                  */
1084         case SOCK_RAW:
1085                 sock->type = SOCK_DGRAM;
1086                 fallthrough;
1087         case SOCK_DGRAM:
1088                 sock->ops = &unix_dgram_ops;
1089                 break;
1090         case SOCK_SEQPACKET:
1091                 sock->ops = &unix_seqpacket_ops;
1092                 break;
1093         default:
1094                 return -ESOCKTNOSUPPORT;
1095         }
1096 
1097         sk = unix_create1(net, sock, kern, sock->type);
1098         if (IS_ERR(sk))
1099                 return PTR_ERR(sk);
1100 
1101         return 0;
1102 }
1103 
1104 static int unix_release(struct socket *sock)
1105 {
1106         struct sock *sk = sock->sk;
1107 
1108         if (!sk)
1109                 return 0;
1110 
1111         sk->sk_prot->close(sk, 0);
1112         unix_release_sock(sk, 0);
1113         sock->sk = NULL;
1114 
1115         return 0;
1116 }
1117 
1118 static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
1119                                   int type)
1120 {
1121         struct inode *inode;
1122         struct path path;
1123         struct sock *sk;
1124         int err;
1125 
1126         unix_mkname_bsd(sunaddr, addr_len);
1127         err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
1128         if (err)
1129                 goto fail;
1130 
1131         err = path_permission(&path, MAY_WRITE);
1132         if (err)
1133                 goto path_put;
1134 
1135         err = -ECONNREFUSED;
1136         inode = d_backing_inode(path.dentry);
1137         if (!S_ISSOCK(inode->i_mode))
1138                 goto path_put;
1139 
1140         sk = unix_find_socket_byinode(inode);
1141         if (!sk)
1142                 goto path_put;
1143 
1144         err = -EPROTOTYPE;
1145         if (sk->sk_type == type)
1146                 touch_atime(&path);
1147         else
1148                 goto sock_put;
1149 
1150         path_put(&path);
1151 
1152         return sk;
1153 
1154 sock_put:
1155         sock_put(sk);
1156 path_put:
1157         path_put(&path);
1158 fail:
1159         return ERR_PTR(err);
1160 }
1161 
1162 static struct sock *unix_find_abstract(struct net *net,
1163                                        struct sockaddr_un *sunaddr,
1164                                        int addr_len, int type)
1165 {
1166         unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
1167         struct dentry *dentry;
1168         struct sock *sk;
1169 
1170         sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
1171         if (!sk)
1172                 return ERR_PTR(-ECONNREFUSED);
1173 
1174         dentry = unix_sk(sk)->path.dentry;
1175         if (dentry)
1176                 touch_atime(&unix_sk(sk)->path);
1177 
1178         return sk;
1179 }
1180 
1181 static struct sock *unix_find_other(struct net *net,
1182                                     struct sockaddr_un *sunaddr,
1183                                     int addr_len, int type)
1184 {
1185         struct sock *sk;
1186 
1187         if (sunaddr->sun_path[0])
1188                 sk = unix_find_bsd(sunaddr, addr_len, type);
1189         else
1190                 sk = unix_find_abstract(net, sunaddr, addr_len, type);
1191 
1192         return sk;
1193 }
1194 
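/* unix_autobind() binds an otherwise unbound socket to an abstract name of
 * the form "\0XXXXX" (five hex digits), retrying until an unused name is
 * found or all 2^20 candidates have been tried (-ENOSPC).
 */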
1195 static int unix_autobind(struct sock *sk)
1196 {
1197         struct unix_sock *u = unix_sk(sk);
1198         unsigned int new_hash, old_hash;
1199         struct net *net = sock_net(sk);
1200         struct unix_address *addr;
1201         u32 lastnum, ordernum;
1202         int err;
1203 
1204         err = mutex_lock_interruptible(&u->bindlock);
1205         if (err)
1206                 return err;
1207 
1208         if (u->addr)
1209                 goto out;
1210 
1211         err = -ENOMEM;
1212         addr = kzalloc(sizeof(*addr) +
1213                        offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1214         if (!addr)
1215                 goto out;
1216 
1217         addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1218         addr->name->sun_family = AF_UNIX;
1219         refcount_set(&addr->refcnt, 1);
1220 
1221         old_hash = sk->sk_hash;
1222         ordernum = get_random_u32();
1223         lastnum = ordernum & 0xFFFFF;
1224 retry:
1225         ordernum = (ordernum + 1) & 0xFFFFF;
1226         sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1227 
1228         new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1229         unix_table_double_lock(net, old_hash, new_hash);
1230 
1231         if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1232                 unix_table_double_unlock(net, old_hash, new_hash);
1233 
1234                 /* __unix_find_socket_byname() may take a long time if many names
1235                  * are already in use.
1236                  */
1237                 cond_resched();
1238 
1239                 if (ordernum == lastnum) {
1240                         /* Give up if all names seem to be in use. */
1241                         err = -ENOSPC;
1242                         unix_release_addr(addr);
1243                         goto out;
1244                 }
1245 
1246                 goto retry;
1247         }
1248 
1249         __unix_set_addr_hash(net, sk, addr, new_hash);
1250         unix_table_double_unlock(net, old_hash, new_hash);
1251         err = 0;
1252 
1253 out:    mutex_unlock(&u->bindlock);
1254         return err;
1255 }
1256 
1257 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1258                          int addr_len)
1259 {
1260         umode_t mode = S_IFSOCK |
1261                (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1262         struct unix_sock *u = unix_sk(sk);
1263         unsigned int new_hash, old_hash;
1264         struct net *net = sock_net(sk);
1265         struct mnt_idmap *idmap;
1266         struct unix_address *addr;
1267         struct dentry *dentry;
1268         struct path parent;
1269         int err;
1270 
1271         addr_len = unix_mkname_bsd(sunaddr, addr_len);
1272         addr = unix_create_addr(sunaddr, addr_len);
1273         if (!addr)
1274                 return -ENOMEM;
1275 
1276         /*
1277          * Get the parent directory, calculate the hash for last
1278          * component.
1279          */
1280         dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1281         if (IS_ERR(dentry)) {
1282                 err = PTR_ERR(dentry);
1283                 goto out;
1284         }
1285 
1286         /*
1287          * All right, let's create it.
1288          */
1289         idmap = mnt_idmap(parent.mnt);
1290         err = security_path_mknod(&parent, dentry, mode, 0);
1291         if (!err)
1292                 err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
1293         if (err)
1294                 goto out_path;
1295         err = mutex_lock_interruptible(&u->bindlock);
1296         if (err)
1297                 goto out_unlink;
1298         if (u->addr)
1299                 goto out_unlock;
1300 
1301         old_hash = sk->sk_hash;
1302         new_hash = unix_bsd_hash(d_backing_inode(dentry));
1303         unix_table_double_lock(net, old_hash, new_hash);
1304         u->path.mnt = mntget(parent.mnt);
1305         u->path.dentry = dget(dentry);
1306         __unix_set_addr_hash(net, sk, addr, new_hash);
1307         unix_table_double_unlock(net, old_hash, new_hash);
1308         unix_insert_bsd_socket(sk);
1309         mutex_unlock(&u->bindlock);
1310         done_path_create(&parent, dentry);
1311         return 0;
1312 
1313 out_unlock:
1314         mutex_unlock(&u->bindlock);
1315         err = -EINVAL;
1316 out_unlink:
1317         /* failed after successful mknod?  unlink what we'd created... */
1318         vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
1319 out_path:
1320         done_path_create(&parent, dentry);
1321 out:
1322         unix_release_addr(addr);
1323         return err == -EEXIST ? -EADDRINUSE : err;
1324 }
1325 
1326 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1327                               int addr_len)
1328 {
1329         struct unix_sock *u = unix_sk(sk);
1330         unsigned int new_hash, old_hash;
1331         struct net *net = sock_net(sk);
1332         struct unix_address *addr;
1333         int err;
1334 
1335         addr = unix_create_addr(sunaddr, addr_len);
1336         if (!addr)
1337                 return -ENOMEM;
1338 
1339         err = mutex_lock_interruptible(&u->bindlock);
1340         if (err)
1341                 goto out;
1342 
1343         if (u->addr) {
1344                 err = -EINVAL;
1345                 goto out_mutex;
1346         }
1347 
1348         old_hash = sk->sk_hash;
1349         new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1350         unix_table_double_lock(net, old_hash, new_hash);
1351 
1352         if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1353                 goto out_spin;
1354 
1355         __unix_set_addr_hash(net, sk, addr, new_hash);
1356         unix_table_double_unlock(net, old_hash, new_hash);
1357         mutex_unlock(&u->bindlock);
1358         return 0;
1359 
1360 out_spin:
1361         unix_table_double_unlock(net, old_hash, new_hash);
1362         err = -EADDRINUSE;
1363 out_mutex:
1364         mutex_unlock(&u->bindlock);
1365 out:
1366         unix_release_addr(addr);
1367         return err;
1368 }
1369 
1370 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1371 {
1372         struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1373         struct sock *sk = sock->sk;
1374         int err;
1375 
1376         if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1377             sunaddr->sun_family == AF_UNIX)
1378                 return unix_autobind(sk);
1379 
1380         err = unix_validate_addr(sunaddr, addr_len);
1381         if (err)
1382                 return err;
1383 
1384         if (sunaddr->sun_path[0])
1385                 err = unix_bind_bsd(sk, sunaddr, addr_len);
1386         else
1387                 err = unix_bind_abstract(sk, sunaddr, addr_len);
1388 
1389         return err;
1390 }
1391 
1392 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1393 {
1394         if (unlikely(sk1 == sk2) || !sk2) {
1395                 unix_state_lock(sk1);
1396                 return;
1397         }
1398 
1399         if (sk1 > sk2)
1400                 swap(sk1, sk2);
1401 
1402         unix_state_lock(sk1);
1403         unix_state_lock(sk2);
1404 }
1405 
1406 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1407 {
1408         if (unlikely(sk1 == sk2) || !sk2) {
1409                 unix_state_unlock(sk1);
1410                 return;
1411         }
1412         unix_state_unlock(sk1);
1413         unix_state_unlock(sk2);
1414 }
1415 
1416 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1417                               int alen, int flags)
1418 {
1419         struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1420         struct sock *sk = sock->sk;
1421         struct sock *other;
1422         int err;
1423 
1424         err = -EINVAL;
1425         if (alen < offsetofend(struct sockaddr, sa_family))
1426                 goto out;
1427 
1428         if (addr->sa_family != AF_UNSPEC) {
1429                 err = unix_validate_addr(sunaddr, alen);
1430                 if (err)
1431                         goto out;
1432 
1433                 err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
1434                 if (err)
1435                         goto out;
1436 
1437                 if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1438                      test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1439                     !READ_ONCE(unix_sk(sk)->addr)) {
1440                         err = unix_autobind(sk);
1441                         if (err)
1442                                 goto out;
1443                 }
1444 
1445 restart:
1446                 other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1447                 if (IS_ERR(other)) {
1448                         err = PTR_ERR(other);
1449                         goto out;
1450                 }
1451 
1452                 unix_state_double_lock(sk, other);
1453 
1454                 /* Apparently VFS overslept socket death. Retry. */
1455                 if (sock_flag(other, SOCK_DEAD)) {
1456                         unix_state_double_unlock(sk, other);
1457                         sock_put(other);
1458                         goto restart;
1459                 }
1460 
1461                 err = -EPERM;
1462                 if (!unix_may_send(sk, other))
1463                         goto out_unlock;
1464 
1465                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1466                 if (err)
1467                         goto out_unlock;
1468 
1469                 WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1470                 WRITE_ONCE(other->sk_state, TCP_ESTABLISHED);
1471         } else {
1472                 /*
1473                  *      1003.1g breaking connected state with AF_UNSPEC
1474                  */
1475                 other = NULL;
1476                 unix_state_double_lock(sk, other);
1477         }
1478 
1479         /*
1480          * If it was connected, reconnect.
1481          */
1482         if (unix_peer(sk)) {
1483                 struct sock *old_peer = unix_peer(sk);
1484 
1485                 unix_peer(sk) = other;
1486                 if (!other)
1487                         WRITE_ONCE(sk->sk_state, TCP_CLOSE);
1488                 unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1489 
1490                 unix_state_double_unlock(sk, other);
1491 
1492                 if (other != old_peer) {
1493                         unix_dgram_disconnected(sk, old_peer);
1494 
1495                         unix_state_lock(old_peer);
1496                         if (!unix_peer(old_peer))
1497                                 WRITE_ONCE(old_peer->sk_state, TCP_CLOSE);
1498                         unix_state_unlock(old_peer);
1499                 }
1500 
1501                 sock_put(old_peer);
1502         } else {
1503                 unix_peer(sk) = other;
1504                 unix_state_double_unlock(sk, other);
1505         }
1506 
1507         return 0;
1508 
1509 out_unlock:
1510         unix_state_double_unlock(sk, other);
1511         sock_put(other);
1512 out:
1513         return err;
1514 }
1515 
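/*
 * Illustration (not part of af_unix.c): unix_dgram_connect() above honours
 * the 1003.1g rule that connect(2) with AF_UNSPEC dissolves a datagram
 * socket's peer association.  A hedged user-space sketch; the socket path
 * is arbitrary.
 */
#include <sys/socket.h>
#include <sys/un.h>
#include <string.h>

static int dgram_connect_then_disconnect(int fd)
{
	struct sockaddr_un peer = { .sun_family = AF_UNIX };
	struct sockaddr unspec = { .sa_family = AF_UNSPEC };

	strncpy(peer.sun_path, "/tmp/peer.sock", sizeof(peer.sun_path) - 1);

	/* Set the default destination; later send(2) calls need no address. */
	if (connect(fd, (struct sockaddr *)&peer, sizeof(peer)))
		return -1;

	/* Break the association again (the AF_UNSPEC branch above). */
	return connect(fd, &unspec, sizeof(unspec));
}
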
1516 static long unix_wait_for_peer(struct sock *other, long timeo)
1517         __releases(&unix_sk(other)->lock)
1518 {
1519         struct unix_sock *u = unix_sk(other);
1520         int sched;
1521         DEFINE_WAIT(wait);
1522 
1523         prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1524 
1525         sched = !sock_flag(other, SOCK_DEAD) &&
1526                 !(other->sk_shutdown & RCV_SHUTDOWN) &&
1527                 unix_recvq_full_lockless(other);
1528 
1529         unix_state_unlock(other);
1530 
1531         if (sched)
1532                 timeo = schedule_timeout(timeo);
1533 
1534         finish_wait(&u->peer_wait, &wait);
1535         return timeo;
1536 }
1537 
1538 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1539                                int addr_len, int flags)
1540 {
1541         struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1542         struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
1543         struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1544         struct net *net = sock_net(sk);
1545         struct sk_buff *skb = NULL;
1546         unsigned char state;
1547         long timeo;
1548         int err;
1549 
1550         err = unix_validate_addr(sunaddr, addr_len);
1551         if (err)
1552                 goto out;
1553 
1554         err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
1555         if (err)
1556                 goto out;
1557 
1558         if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1559              test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1560             !READ_ONCE(u->addr)) {
1561                 err = unix_autobind(sk);
1562                 if (err)
1563                         goto out;
1564         }
1565 
1566         timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1567 
1568         /* First of all allocate resources.
1569            If we do this after the state is locked,
1570            we will have to recheck everything again in any case.
1571          */
1572 
1573         /* create new sock for complete connection */
1574         newsk = unix_create1(net, NULL, 0, sock->type);
1575         if (IS_ERR(newsk)) {
1576                 err = PTR_ERR(newsk);
1577                 newsk = NULL;
1578                 goto out;
1579         }
1580 
1581         err = -ENOMEM;
1582 
1583         /* Allocate skb for sending to listening sock */
1584         skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1585         if (skb == NULL)
1586                 goto out;
1587 
1588 restart:
1589         /*  Find listening sock. */
1590         other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
1591         if (IS_ERR(other)) {
1592                 err = PTR_ERR(other);
1593                 other = NULL;
1594                 goto out;
1595         }
1596 
1597         unix_state_lock(other);
1598 
1599         /* Apparently VFS overslept socket death. Retry. */
1600         if (sock_flag(other, SOCK_DEAD)) {
1601                 unix_state_unlock(other);
1602                 sock_put(other);
1603                 goto restart;
1604         }
1605 
1606         err = -ECONNREFUSED;
1607         if (other->sk_state != TCP_LISTEN)
1608                 goto out_unlock;
1609         if (other->sk_shutdown & RCV_SHUTDOWN)
1610                 goto out_unlock;
1611 
1612         if (unix_recvq_full_lockless(other)) {
1613                 err = -EAGAIN;
1614                 if (!timeo)
1615                         goto out_unlock;
1616 
1617                 timeo = unix_wait_for_peer(other, timeo);
1618 
1619                 err = sock_intr_errno(timeo);
1620                 if (signal_pending(current))
1621                         goto out;
1622                 sock_put(other);
1623                 goto restart;
1624         }
1625 
1626         /* self connect and simultaneous connect are eliminated
1627          * by rejecting a TCP_LISTEN socket to avoid deadlock.
1628          */
1629         state = READ_ONCE(sk->sk_state);
1630         if (unlikely(state != TCP_CLOSE)) {
1631                 err = state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
1632                 goto out_unlock;
1633         }
1634 
1635         unix_state_lock(sk);
1636 
1637         if (unlikely(sk->sk_state != TCP_CLOSE)) {
1638                 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
1639                 unix_state_unlock(sk);
1640                 goto out_unlock;
1641         }
1642 
1643         err = security_unix_stream_connect(sk, other, newsk);
1644         if (err) {
1645                 unix_state_unlock(sk);
1646                 goto out_unlock;
1647         }
1648 
1649         /* The way is open! Quickly set all the necessary fields... */
1650 
1651         sock_hold(sk);
1652         unix_peer(newsk)        = sk;
1653         newsk->sk_state         = TCP_ESTABLISHED;
1654         newsk->sk_type          = sk->sk_type;
1655         init_peercred(newsk);
1656         newu = unix_sk(newsk);
1657         newu->listener = other;
1658         RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1659         otheru = unix_sk(other);
1660 
1661         /* copy address information from listening to new sock
1662          *
1663          * The contents of *(otheru->addr) and otheru->path
1664          * are seen fully set up here, since we have found
1665          * otheru in hash under its lock.  Insertion into the
1666          * hash chain we'd found it in had been done in an
1667          * earlier critical area protected by the chain's lock,
1668          * the same one where we'd set *(otheru->addr) contents,
1669          * as well as otheru->path and otheru->addr itself.
1670          *
1671          * Using smp_store_release() here to set newu->addr
1672          * is enough to make those stores, as well as stores
1673          * to newu->path visible to anyone who gets newu->addr
1674          * by smp_load_acquire().  IOW, the same guarantees
1675          * as for unix_sock instances bound in unix_bind() or
1676          * in unix_autobind().
1677          */
1678         if (otheru->path.dentry) {
1679                 path_get(&otheru->path);
1680                 newu->path = otheru->path;
1681         }
1682         refcount_inc(&otheru->addr->refcnt);
1683         smp_store_release(&newu->addr, otheru->addr);
1684 
1685         /* Set credentials */
1686         copy_peercred(sk, other);
1687 
1688         sock->state     = SS_CONNECTED;
1689         WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1690         sock_hold(newsk);
1691 
1692         smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */
1693         unix_peer(sk)   = newsk;
1694 
1695         unix_state_unlock(sk);
1696 
1697         /* queue the skb and notify the listening sock */
1698         spin_lock(&other->sk_receive_queue.lock);
1699         __skb_queue_tail(&other->sk_receive_queue, skb);
1700         spin_unlock(&other->sk_receive_queue.lock);
1701         unix_state_unlock(other);
1702         other->sk_data_ready(other);
1703         sock_put(other);
1704         return 0;
1705 
1706 out_unlock:
1707         if (other)
1708                 unix_state_unlock(other);
1709 
1710 out:
1711         kfree_skb(skb);
1712         if (newsk)
1713                 unix_release_sock(newsk, 0);
1714         if (other)
1715                 sock_put(other);
1716         return err;
1717 }
1718 
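/*
 * Illustration (not part of af_unix.c): the long comment in
 * unix_stream_connect() above relies on the release/acquire publication
 * pattern - everything written before the smp_store_release() of newu->addr
 * is visible to any reader that obtains the pointer via smp_load_acquire().
 * A minimal C11 sketch of the same idea; all names are made up.
 */
#include <stdatomic.h>

struct payload {
	int data;
};

static struct payload slot;
static _Atomic(struct payload *) published;

static void publisher(void)
{
	slot.data = 42;					/* plain stores first ... */
	atomic_store_explicit(&published, &slot,
			      memory_order_release);	/* ... pointer published last */
}

static int consumer(void)
{
	struct payload *p = atomic_load_explicit(&published,
						 memory_order_acquire);

	return p ? p->data : -1;	/* sees data == 42 whenever p is non-NULL */
}
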
1719 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1720 {
1721         struct sock *ska = socka->sk, *skb = sockb->sk;
1722 
1723         /* Join our sockets back to back */
1724         sock_hold(ska);
1725         sock_hold(skb);
1726         unix_peer(ska) = skb;
1727         unix_peer(skb) = ska;
1728         init_peercred(ska);
1729         init_peercred(skb);
1730 
1731         ska->sk_state = TCP_ESTABLISHED;
1732         skb->sk_state = TCP_ESTABLISHED;
1733         socka->state  = SS_CONNECTED;
1734         sockb->state  = SS_CONNECTED;
1735         return 0;
1736 }
1737 
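/*
 * Illustration (not part of af_unix.c): unix_socketpair() above backs
 * socketpair(2) for AF_UNIX - both ends come back already connected, with
 * no bind/listen/accept step.  Minimal user-space sketch:
 */
#include <sys/socket.h>
#include <unistd.h>

static int make_connected_pair(void)
{
	int fds[2];

	if (socketpair(AF_UNIX, SOCK_STREAM, 0, fds) < 0)
		return -1;

	/* fds[0] and fds[1] are peers: data written to one end is read from
	 * the other.  Typically one end is handed to a child process.
	 */
	(void)write(fds[0], "hi", 2);
	return fds[1];		/* caller also owns fds[0] */
}
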
1738 static void unix_sock_inherit_flags(const struct socket *old,
1739                                     struct socket *new)
1740 {
1741         if (test_bit(SOCK_PASSCRED, &old->flags))
1742                 set_bit(SOCK_PASSCRED, &new->flags);
1743         if (test_bit(SOCK_PASSPIDFD, &old->flags))
1744                 set_bit(SOCK_PASSPIDFD, &new->flags);
1745         if (test_bit(SOCK_PASSSEC, &old->flags))
1746                 set_bit(SOCK_PASSSEC, &new->flags);
1747 }
1748 
1749 static int unix_accept(struct socket *sock, struct socket *newsock,
1750                        struct proto_accept_arg *arg)
1751 {
1752         struct sock *sk = sock->sk;
1753         struct sk_buff *skb;
1754         struct sock *tsk;
1755 
1756         arg->err = -EOPNOTSUPP;
1757         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1758                 goto out;
1759 
1760         arg->err = -EINVAL;
1761         if (READ_ONCE(sk->sk_state) != TCP_LISTEN)
1762                 goto out;
1763 
1764         /* If socket state is TCP_LISTEN it cannot change (for now...),
1765          * so that no locks are necessary.
1766          */
1767 
1768         skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1769                                 &arg->err);
1770         if (!skb) {
1771                 /* This means receive shutdown. */
1772                 if (arg->err == 0)
1773                         arg->err = -EINVAL;
1774                 goto out;
1775         }
1776 
1777         tsk = skb->sk;
1778         skb_free_datagram(sk, skb);
1779         wake_up_interruptible(&unix_sk(sk)->peer_wait);
1780 
1781         /* attach accepted sock to socket */
1782         unix_state_lock(tsk);
1783         unix_update_edges(unix_sk(tsk));
1784         newsock->state = SS_CONNECTED;
1785         unix_sock_inherit_flags(sock, newsock);
1786         sock_graft(tsk, newsock);
1787         unix_state_unlock(tsk);
1788         return 0;
1789 
1790 out:
1791         return arg->err;
1792 }
1793 
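/*
 * Illustration (not part of af_unix.c): unix_accept() above dequeues the
 * sock that unix_stream_connect() queued on the listener and grafts it onto
 * the new socket.  A hedged user-space sketch of the matching server-side
 * calls; the path is arbitrary and error cleanup is trimmed.
 */
#include <sys/socket.h>
#include <sys/un.h>
#include <string.h>

static int serve_one_client(void)
{
	struct sockaddr_un addr = { .sun_family = AF_UNIX };
	int lfd;

	lfd = socket(AF_UNIX, SOCK_STREAM, 0);
	if (lfd < 0)
		return -1;

	strncpy(addr.sun_path, "/tmp/server.sock", sizeof(addr.sun_path) - 1);
	if (bind(lfd, (struct sockaddr *)&addr, sizeof(addr)) ||
	    listen(lfd, 16))
		return -1;

	/* Blocks until a connect(2) lands on the listener's queue. */
	return accept(lfd, NULL, NULL);
}
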
1794 
1795 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1796 {
1797         struct sock *sk = sock->sk;
1798         struct unix_address *addr;
1799         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1800         int err = 0;
1801 
1802         if (peer) {
1803                 sk = unix_peer_get(sk);
1804 
1805                 err = -ENOTCONN;
1806                 if (!sk)
1807                         goto out;
1808                 err = 0;
1809         } else {
1810                 sock_hold(sk);
1811         }
1812 
1813         addr = smp_load_acquire(&unix_sk(sk)->addr);
1814         if (!addr) {
1815                 sunaddr->sun_family = AF_UNIX;
1816                 sunaddr->sun_path[0] = 0;
1817                 err = offsetof(struct sockaddr_un, sun_path);
1818         } else {
1819                 err = addr->len;
1820                 memcpy(sunaddr, addr->name, addr->len);
1821 
1822                 if (peer)
1823                         BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1824                                                CGROUP_UNIX_GETPEERNAME);
1825                 else
1826                         BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1827                                                CGROUP_UNIX_GETSOCKNAME);
1828         }
1829         sock_put(sk);
1830 out:
1831         return err;
1832 }
1833 
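/*
 * Illustration (not part of af_unix.c): per unix_getname() above, an
 * unbound AF_UNIX socket reports only sun_family, with a returned length of
 * offsetof(struct sockaddr_un, sun_path).  Hedged user-space sketch:
 */
#include <sys/socket.h>
#include <sys/un.h>
#include <stddef.h>

static int is_unbound_unix_socket(int fd)
{
	struct sockaddr_un addr;
	socklen_t len = sizeof(addr);

	if (getsockname(fd, (struct sockaddr *)&addr, &len))
		return -1;

	/* No name yet: the kernel returned just the family field. */
	return len == offsetof(struct sockaddr_un, sun_path);
}
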
1834 /* The "user->unix_inflight" variable is protected by the garbage
1835  * collection lock, and we just read it locklessly here. If you go
1836  * over the limit, there might be a tiny race in actually noticing
1837  * it across threads. Tough.
1838  */
1839 static inline bool too_many_unix_fds(struct task_struct *p)
1840 {
1841         struct user_struct *user = current_user();
1842 
1843         if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
1844                 return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
1845         return false;
1846 }
1847 
1848 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1849 {
1850         if (too_many_unix_fds(current))
1851                 return -ETOOMANYREFS;
1852 
1853         UNIXCB(skb).fp = scm->fp;
1854         scm->fp = NULL;
1855 
1856         if (unix_prepare_fpl(UNIXCB(skb).fp))
1857                 return -ENOMEM;
1858 
1859         return 0;
1860 }
1861 
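/*
 * Illustration (not part of af_unix.c): unix_attach_fds() above is the
 * kernel side of SCM_RIGHTS on the send path - the sender's file
 * descriptors arrive in a control message and are attached to the skb.
 * A hedged user-space sketch of sending one descriptor; error handling
 * is trimmed.
 */
#include <sys/socket.h>
#include <sys/uio.h>
#include <string.h>

static int send_one_fd(int sock, int fd_to_pass)
{
	char data = 'x';
	struct iovec iov = { .iov_base = &data, .iov_len = 1 };
	union {
		char buf[CMSG_SPACE(sizeof(int))];
		struct cmsghdr align;
	} u;
	struct msghdr msg = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = u.buf,
		.msg_controllen = sizeof(u.buf),
	};
	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &fd_to_pass, sizeof(int));

	return sendmsg(sock, &msg, 0) == 1 ? 0 : -1;
}
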
1862 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1863 {
1864         scm->fp = UNIXCB(skb).fp;
1865         UNIXCB(skb).fp = NULL;
1866 
1867         unix_destroy_fpl(scm->fp);
1868 }
1869 
1870 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1871 {
1872         scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1873 }
1874 
1875 static void unix_destruct_scm(struct sk_buff *skb)
1876 {
1877         struct scm_cookie scm;
1878 
1879         memset(&scm, 0, sizeof(scm));
1880         scm.pid  = UNIXCB(skb).pid;
1881         if (UNIXCB(skb).fp)
1882                 unix_detach_fds(&scm, skb);
1883 
1884         /* Alas, it calls VFS */
1885         /* So fscking what? fput() has been SMP-safe since last summer */
1886         scm_destroy(&scm);
1887         sock_wfree(skb);
1888 }
1889 
1890 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1891 {
1892         int err = 0;
1893 
1894         UNIXCB(skb).pid  = get_pid(scm->pid);
1895         UNIXCB(skb).uid = scm->creds.uid;
1896         UNIXCB(skb).gid = scm->creds.gid;
1897         UNIXCB(skb).fp = NULL;
1898         unix_get_secdata(scm, skb);
1899         if (scm->fp && send_fds)
1900                 err = unix_attach_fds(scm, skb);
1901 
1902         skb->destructor = unix_destruct_scm;
1903         return err;
1904 }
1905 
1906 static bool unix_passcred_enabled(const struct socket *sock,
1907                                   const struct sock *other)
1908 {
1909         return test_bit(SOCK_PASSCRED, &sock->flags) ||
1910                test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1911                !other->sk_socket ||
1912                test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1913                test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1914 }
1915 
1916 /*
1917  * Some apps rely on write() giving SCM_CREDENTIALS.
1918  * We include credentials if the source or destination socket
1919  * asserted SOCK_PASSCRED.
1920  */
1921 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1922                             const struct sock *other)
1923 {
1924         if (UNIXCB(skb).pid)
1925                 return;
1926         if (unix_passcred_enabled(sock, other)) {
1927                 UNIXCB(skb).pid  = get_pid(task_tgid(current));
1928                 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1929         }
1930 }
1931 
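/*
 * Illustration (not part of af_unix.c): maybe_add_creds() above fills in
 * pid/uid/gid when either end asserted SOCK_PASSCRED, so a receiver that
 * enabled SO_PASSCRED gets an SCM_CREDENTIALS control message.  Hedged
 * user-space sketch of the receive side; error handling is trimmed.
 */
#define _GNU_SOURCE		/* struct ucred, SCM_CREDENTIALS */
#include <sys/socket.h>
#include <sys/uio.h>

static int recv_peer_pid(int sock)
{
	char data;
	struct iovec iov = { .iov_base = &data, .iov_len = 1 };
	union {
		char buf[CMSG_SPACE(sizeof(struct ucred))];
		struct cmsghdr align;
	} u;
	struct msghdr msg = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = u.buf,
		.msg_controllen = sizeof(u.buf),
	};
	struct cmsghdr *cmsg;
	int one = 1;

	setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &one, sizeof(one));

	if (recvmsg(sock, &msg, 0) < 0)
		return -1;

	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg))
		if (cmsg->cmsg_level == SOL_SOCKET &&
		    cmsg->cmsg_type == SCM_CREDENTIALS)
			return ((struct ucred *)CMSG_DATA(cmsg))->pid;

	return -1;
}
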
1932 static bool unix_skb_scm_eq(struct sk_buff *skb,
1933                             struct scm_cookie *scm)
1934 {
1935         return UNIXCB(skb).pid == scm->pid &&
1936                uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1937                gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1938                unix_secdata_eq(scm, skb);
1939 }
1940 
1941 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1942 {
1943         struct scm_fp_list *fp = UNIXCB(skb).fp;
1944         struct unix_sock *u = unix_sk(sk);
1945 
1946         if (unlikely(fp && fp->count)) {
1947                 atomic_add(fp->count, &u->scm_stat.nr_fds);
1948                 unix_add_edges(fp, u);
1949         }
1950 }
1951 
1952 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1953 {
1954         struct scm_fp_list *fp = UNIXCB(skb).fp;
1955         struct unix_sock *u = unix_sk(sk);
1956 
1957         if (unlikely(fp && fp->count)) {
1958                 atomic_sub(fp->count, &u->scm_stat.nr_fds);
1959                 unix_del_edges(fp);
1960         }
1961 }
1962 
1963 /*
1964  *      Send AF_UNIX data.
1965  */
1966 
1967 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1968                               size_t len)
1969 {
1970         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1971         struct sock *sk = sock->sk, *other = NULL;
1972         struct unix_sock *u = unix_sk(sk);
1973         struct scm_cookie scm;
1974         struct sk_buff *skb;
1975         int data_len = 0;
1976         int sk_locked;
1977         long timeo;
1978         int err;
1979 
1980         err = scm_send(sock, msg, &scm, false);
1981         if (err < 0)
1982                 return err;
1983 
1984         wait_for_unix_gc(scm.fp);
1985 
1986         err = -EOPNOTSUPP;
1987         if (msg->msg_flags&MSG_OOB)
1988                 goto out;
1989 
1990         if (msg->msg_namelen) {
1991                 err = unix_validate_addr(sunaddr, msg->msg_namelen);
1992                 if (err)
1993                         goto out;
1994 
1995                 err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
1996                                                             msg->msg_name,
1997                                                             &msg->msg_namelen,
1998                                                             NULL);
1999                 if (err)
2000                         goto out;
2001         } else {
2002                 sunaddr = NULL;
2003                 err = -ENOTCONN;
2004                 other = unix_peer_get(sk);
2005                 if (!other)
2006                         goto out;
2007         }
2008 
2009         if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
2010              test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
2011             !READ_ONCE(u->addr)) {
2012                 err = unix_autobind(sk);
2013                 if (err)
2014                         goto out;
2015         }
2016 
2017         err = -EMSGSIZE;
2018         if (len > READ_ONCE(sk->sk_sndbuf) - 32)
2019                 goto out;
2020 
2021         if (len > SKB_MAX_ALLOC) {
2022                 data_len = min_t(size_t,
2023                                  len - SKB_MAX_ALLOC,
2024                                  MAX_SKB_FRAGS * PAGE_SIZE);
2025                 data_len = PAGE_ALIGN(data_len);
2026 
2027                 BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
2028         }
2029 
2030         skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
2031                                    msg->msg_flags & MSG_DONTWAIT, &err,
2032                                    PAGE_ALLOC_COSTLY_ORDER);
2033         if (skb == NULL)
2034                 goto out;
2035 
2036         err = unix_scm_to_skb(&scm, skb, true);
2037         if (err < 0)
2038                 goto out_free;
2039 
2040         skb_put(skb, len - data_len);
2041         skb->data_len = data_len;
2042         skb->len = len;
2043         err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
2044         if (err)
2045                 goto out_free;
2046 
2047         timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
2048 
2049 restart:
2050         if (!other) {
2051                 err = -ECONNRESET;
2052                 if (sunaddr == NULL)
2053                         goto out_free;
2054 
2055                 other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
2056                                         sk->sk_type);
2057                 if (IS_ERR(other)) {
2058                         err = PTR_ERR(other);
2059                         other = NULL;
2060                         goto out_free;
2061                 }
2062         }
2063 
2064         if (sk_filter(other, skb) < 0) {
2065                 /* Toss the packet but do not return any error to the sender */
2066                 err = len;
2067                 goto out_free;
2068         }
2069 
2070         sk_locked = 0;
2071         unix_state_lock(other);
2072 restart_locked:
2073         err = -EPERM;
2074         if (!unix_may_send(sk, other))
2075                 goto out_unlock;
2076 
2077         if (unlikely(sock_flag(other, SOCK_DEAD))) {
2078                 /*
2079                  *      Check with 1003.1g - what should a
2080                  *      datagram error be here?
2081                  */
2082                 unix_state_unlock(other);
2083                 sock_put(other);
2084 
2085                 if (!sk_locked)
2086                         unix_state_lock(sk);
2087 
2088                 err = 0;
2089                 if (sk->sk_type == SOCK_SEQPACKET) {
2090                         /* We are here only when racing with unix_release_sock()
2091                          * clearing @other. Never change the state to TCP_CLOSE,
2092                          * unlike what SOCK_DGRAM does.
2093                          */
2094                         unix_state_unlock(sk);
2095                         err = -EPIPE;
2096                 } else if (unix_peer(sk) == other) {
2097                         unix_peer(sk) = NULL;
2098                         unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2099 
2100                         WRITE_ONCE(sk->sk_state, TCP_CLOSE);
2101                         unix_state_unlock(sk);
2102 
2103                         unix_dgram_disconnected(sk, other);
2104                         sock_put(other);
2105                         err = -ECONNREFUSED;
2106                 } else {
2107                         unix_state_unlock(sk);
2108                 }
2109 
2110                 other = NULL;
2111                 if (err)
2112                         goto out_free;
2113                 goto restart;
2114         }
2115 
2116         err = -EPIPE;
2117         if (other->sk_shutdown & RCV_SHUTDOWN)
2118                 goto out_unlock;
2119 
2120         if (sk->sk_type != SOCK_SEQPACKET) {
2121                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2122                 if (err)
2123                         goto out_unlock;
2124         }
2125 
2126         /* other == sk && unix_peer(other) != sk if
2127          * - unix_peer(sk) == NULL, destination address bound to sk
2128          * - unix_peer(sk) == sk by time of get but disconnected before lock
2129          */
2130         if (other != sk &&
2131             unlikely(unix_peer(other) != sk &&
2132             unix_recvq_full_lockless(other))) {
2133                 if (timeo) {
2134                         timeo = unix_wait_for_peer(other, timeo);
2135 
2136                         err = sock_intr_errno(timeo);
2137                         if (signal_pending(current))
2138                                 goto out_free;
2139 
2140                         goto restart;
2141                 }
2142 
2143                 if (!sk_locked) {
2144                         unix_state_unlock(other);
2145                         unix_state_double_lock(sk, other);
2146                 }
2147 
2148                 if (unix_peer(sk) != other ||
2149                     unix_dgram_peer_wake_me(sk, other)) {
2150                         err = -EAGAIN;
2151                         sk_locked = 1;
2152                         goto out_unlock;
2153                 }
2154 
2155                 if (!sk_locked) {
2156                         sk_locked = 1;
2157                         goto restart_locked;
2158                 }
2159         }
2160 
2161         if (unlikely(sk_locked))
2162                 unix_state_unlock(sk);
2163 
2164         if (sock_flag(other, SOCK_RCVTSTAMP))
2165                 __net_timestamp(skb);
2166         maybe_add_creds(skb, sock, other);
2167         scm_stat_add(other, skb);
2168         skb_queue_tail(&other->sk_receive_queue, skb);
2169         unix_state_unlock(other);
2170         other->sk_data_ready(other);
2171         sock_put(other);
2172         scm_destroy(&scm);
2173         return len;
2174 
2175 out_unlock:
2176         if (sk_locked)
2177                 unix_state_unlock(sk);
2178         unix_state_unlock(other);
2179 out_free:
2180         kfree_skb(skb);
2181 out:
2182         if (other)
2183                 sock_put(other);
2184         scm_destroy(&scm);
2185         return err;
2186 }
2187 
2188 /* We use paged skbs for stream sockets, and limit occupancy to 32768
2189  * bytes, with a minimum of a full page.
2190  */
2191 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
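/* Note: with 4 KiB pages, get_order(32768) is 3, so UNIX_SKB_FRAGS_SZ
 * evaluates to 32 KiB; with larger page sizes get_order(32768) is 0 and the
 * limit is a single page, matching the "minimum of a full page" above.
 */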
2192 
2193 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2194 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
2195                      struct scm_cookie *scm, bool fds_sent)
2196 {
2197         struct unix_sock *ousk = unix_sk(other);
2198         struct sk_buff *skb;
2199         int err = 0;
2200 
2201         skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2202 
2203         if (!skb)
2204                 return err;
2205 
2206         err = unix_scm_to_skb(scm, skb, !fds_sent);
2207         if (err < 0) {
2208                 kfree_skb(skb);
2209                 return err;
2210         }
2211         skb_put(skb, 1);
2212         err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2213 
2214         if (err) {
2215                 kfree_skb(skb);
2216                 return err;
2217         }
2218 
2219         unix_state_lock(other);
2220 
2221         if (sock_flag(other, SOCK_DEAD) ||
2222             (other->sk_shutdown & RCV_SHUTDOWN)) {
2223                 unix_state_unlock(other);
2224                 kfree_skb(skb);
2225                 return -EPIPE;
2226         }
2227 
2228         maybe_add_creds(skb, sock, other);
2229         skb_get(skb);
2230 
2231         scm_stat_add(other, skb);
2232 
2233         spin_lock(&other->sk_receive_queue.lock);
2234         if (ousk->oob_skb)
2235                 consume_skb(ousk->oob_skb);
2236         WRITE_ONCE(ousk->oob_skb, skb);
2237         __skb_queue_tail(&other->sk_receive_queue, skb);
2238         spin_unlock(&other->sk_receive_queue.lock);
2239 
2240         sk_send_sigurg(other);
2241         unix_state_unlock(other);
2242         other->sk_data_ready(other);
2243 
2244         return err;
2245 }
2246 #endif
2247 
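/*
 * Illustration (not part of af_unix.c): with CONFIG_AF_UNIX_OOB, queue_oob()
 * above lets an AF_UNIX stream socket carry a single out-of-band byte,
 * mirroring TCP urgent data.  Hedged user-space sketch; both fds are assumed
 * to come from socketpair(2).
 */
#include <sys/socket.h>

static int oob_round_trip(int sender, int receiver)
{
	char c;

	/* The last byte of an MSG_OOB send becomes the OOB byte. */
	if (send(sender, "x", 1, MSG_OOB) != 1)
		return -1;

	/* Read it back out of band (SIGURG is raised for the receiver). */
	if (recv(receiver, &c, 1, MSG_OOB) != 1)
		return -1;

	return c == 'x' ? 0 : -1;
}
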
2248 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2249                                size_t len)
2250 {
2251         struct sock *sk = sock->sk;
2252         struct sock *other = NULL;
2253         int err, size;
2254         struct sk_buff *skb;
2255         int sent = 0;
2256         struct scm_cookie scm;
2257         bool fds_sent = false;
2258         int data_len;
2259 
2260         err = scm_send(sock, msg, &scm, false);
2261         if (err < 0)
2262                 return err;
2263 
2264         wait_for_unix_gc(scm.fp);
2265 
2266         err = -EOPNOTSUPP;
2267         if (msg->msg_flags & MSG_OOB) {
2268 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2269                 if (len)
2270                         len--;
2271                 else
2272 #endif
2273                         goto out_err;
2274         }
2275 
2276         if (msg->msg_namelen) {
2277                 err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2278                 goto out_err;
2279         } else {
2280                 err = -ENOTCONN;
2281                 other = unix_peer(sk);
2282                 if (!other)
2283                         goto out_err;
2284         }
2285 
2286         if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2287                 goto pipe_err;
2288 
2289         while (sent < len) {
2290                 size = len - sent;
2291 
2292                 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2293                         skb = sock_alloc_send_pskb(sk, 0, 0,
2294                                                    msg->msg_flags & MSG_DONTWAIT,
2295                                                    &err, 0);
2296                 } else {
2297                         /* Keep two messages in the pipe so it schedules better */
2298                         size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64);
2299 
2300                         /* allow fallback to order-0 allocations */
2301                         size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2302 
2303                         data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2304 
2305                         data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2306 
2307                         skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2308                                                    msg->msg_flags & MSG_DONTWAIT, &err,
2309                                                    get_order(UNIX_SKB_FRAGS_SZ));
2310                 }
2311                 if (!skb)
2312                         goto out_err;
2313 
2314                 /* Only send the fds in the first buffer */
2315                 err = unix_scm_to_skb(&scm, skb, !fds_sent);
2316                 if (err < 0) {
2317                         kfree_skb(skb);
2318                         goto out_err;
2319                 }
2320                 fds_sent = true;
2321 
2322                 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2323                         err = skb_splice_from_iter(skb, &msg->msg_iter, size,
2324                                                    sk->sk_allocation);
2325                         if (err < 0) {
2326                                 kfree_skb(skb);
2327                                 goto out_err;
2328                         }
2329                         size = err;
2330                         refcount_add(size, &sk->sk_wmem_alloc);
2331                 } else {
2332                         skb_put(skb, size - data_len);
2333                         skb->data_len = data_len;
2334                         skb->len = size;
2335                         err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2336                         if (err) {
2337                                 kfree_skb(skb);
2338                                 goto out_err;
2339                         }
2340                 }
2341 
2342                 unix_state_lock(other);
2343 
2344                 if (sock_flag(other, SOCK_DEAD) ||
2345                     (other->sk_shutdown & RCV_SHUTDOWN))
2346                         goto pipe_err_free;
2347 
2348                 maybe_add_creds(skb, sock, other);
2349                 scm_stat_add(other, skb);
2350                 skb_queue_tail(&other->sk_receive_queue, skb);
2351                 unix_state_unlock(other);
2352                 other->sk_data_ready(other);
2353                 sent += size;
2354         }
2355 
2356 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2357         if (msg->msg_flags & MSG_OOB) {
2358                 err = queue_oob(sock, msg, other, &scm, fds_sent);
2359                 if (err)
2360                         goto out_err;
2361                 sent++;
2362         }
2363 #endif
2364 
2365         scm_destroy(&scm);
2366 
2367         return sent;
2368 
2369 pipe_err_free:
2370         unix_state_unlock(other);
2371         kfree_skb(skb);
2372 pipe_err:
2373         if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
2374                 send_sig(SIGPIPE, current, 0);
2375         err = -EPIPE;
2376 out_err:
2377         scm_destroy(&scm);
2378         return sent ? : err;
2379 }
2380 
2381 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2382                                   size_t len)
2383 {
2384         int err;
2385         struct sock *sk = sock->sk;
2386 
2387         err = sock_error(sk);
2388         if (err)
2389                 return err;
2390 
2391         if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2392                 return -ENOTCONN;
2393 
2394         if (msg->msg_namelen)
2395                 msg->msg_namelen = 0;
2396 
2397         return unix_dgram_sendmsg(sock, msg, len);
2398 }
2399 
2400 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2401                                   size_t size, int flags)
2402 {
2403         struct sock *sk = sock->sk;
2404 
2405         if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2406                 return -ENOTCONN;
2407 
2408         return unix_dgram_recvmsg(sock, msg, size, flags);
2409 }
2410 
2411 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2412 {
2413         struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2414 
2415         if (addr) {
2416                 msg->msg_namelen = addr->len;
2417                 memcpy(msg->msg_name, addr->name, addr->len);
2418         }
2419 }
2420 
2421 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2422                          int flags)
2423 {
2424         struct scm_cookie scm;
2425         struct socket *sock = sk->sk_socket;
2426         struct unix_sock *u = unix_sk(sk);
2427         struct sk_buff *skb, *last;
2428         long timeo;
2429         int skip;
2430         int err;
2431 
2432         err = -EOPNOTSUPP;
2433         if (flags&MSG_OOB)
2434                 goto out;
2435 
2436         timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2437 
2438         do {
2439                 mutex_lock(&u->iolock);
2440 
2441                 skip = sk_peek_offset(sk, flags);
2442                 skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2443                                               &skip, &err, &last);
2444                 if (skb) {
2445                         if (!(flags & MSG_PEEK))
2446                                 scm_stat_del(sk, skb);
2447                         break;
2448                 }
2449 
2450                 mutex_unlock(&u->iolock);
2451 
2452                 if (err != -EAGAIN)
2453                         break;
2454         } while (timeo &&
2455                  !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2456                                               &err, &timeo, last));
2457 
2458         if (!skb) { /* implies iolock unlocked */
2459                 unix_state_lock(sk);
2460                 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2461                 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2462                     (sk->sk_shutdown & RCV_SHUTDOWN))
2463                         err = 0;
2464                 unix_state_unlock(sk);
2465                 goto out;
2466         }
2467 
2468         if (wq_has_sleeper(&u->peer_wait))
2469                 wake_up_interruptible_sync_poll(&u->peer_wait,
2470                                                 EPOLLOUT | EPOLLWRNORM |
2471                                                 EPOLLWRBAND);
2472 
2473         if (ccs_socket_post_recvmsg_permission(sk, skb, flags)) {
2474                 err = -EAGAIN; /* Hopefully less harmful than -EPERM. */
2475                 goto out_unlock;
2476         }
2477         if (msg->msg_name) {
2478                 unix_copy_addr(msg, skb->sk);
2479 
2480                 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2481                                                       msg->msg_name,
2482                                                       &msg->msg_namelen);
2483         }
2484 
2485         if (size > skb->len - skip)
2486                 size = skb->len - skip;
2487         else if (size < skb->len - skip)
2488                 msg->msg_flags |= MSG_TRUNC;
2489 
2490         err = skb_copy_datagram_msg(skb, skip, msg, size);
2491         if (err)
2492                 goto out_free;
2493 
2494         if (sock_flag(sk, SOCK_RCVTSTAMP))
2495                 __sock_recv_timestamp(msg, sk, skb);
2496 
2497         memset(&scm, 0, sizeof(scm));
2498 
2499         scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2500         unix_set_secdata(&scm, skb);
2501 
2502         if (!(flags & MSG_PEEK)) {
2503                 if (UNIXCB(skb).fp)
2504                         unix_detach_fds(&scm, skb);
2505 
2506                 sk_peek_offset_bwd(sk, skb->len);
2507         } else {
2508                 /* It is questionable: on PEEK we could:
2509                    - not return fds - good, but too simple 8)
2510                    - return fds, and not return them on read (old strategy,
2511                      apparently wrong)
2512                    - clone fds (I chose this for now; it is the most universal
2513                      solution)
2514 
2515                    POSIX 1003.1g does not actually define this clearly
2516                    at all. POSIX 1003.1g doesn't define a lot of things
2517                    clearly, however!
2518 
2519                 */
2520 
2521                 sk_peek_offset_fwd(sk, size);
2522 
2523                 if (UNIXCB(skb).fp)
2524                         unix_peek_fds(&scm, skb);
2525         }
2526         err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2527 
2528         scm_recv_unix(sock, msg, &scm, flags);
2529 
2530 out_free:
2531         skb_free_datagram(sk, skb);
2532 out_unlock:
2533         mutex_unlock(&u->iolock);
2534 out:
2535         return err;
2536 }
2537 
2538 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2539                               int flags)
2540 {
2541         struct sock *sk = sock->sk;
2542 
2543 #ifdef CONFIG_BPF_SYSCALL
2544         const struct proto *prot = READ_ONCE(sk->sk_prot);
2545 
2546         if (prot != &unix_dgram_proto)
2547                 return prot->recvmsg(sk, msg, size, flags, NULL);
2548 #endif
2549         return __unix_dgram_recvmsg(sk, msg, size, flags);
2550 }
2551 
2552 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2553 {
2554         struct unix_sock *u = unix_sk(sk);
2555         struct sk_buff *skb;
2556         int err;
2557 
2558         mutex_lock(&u->iolock);
2559         skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2560         mutex_unlock(&u->iolock);
2561         if (!skb)
2562                 return err;
2563 
2564         return recv_actor(sk, skb);
2565 }
2566 
2567 /*
2568  *      Sleep until more data has arrived. But check for races..
2569  */
2570 static long unix_stream_data_wait(struct sock *sk, long timeo,
2571                                   struct sk_buff *last, unsigned int last_len,
2572                                   bool freezable)
2573 {
2574         unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
2575         struct sk_buff *tail;
2576         DEFINE_WAIT(wait);
2577 
2578         unix_state_lock(sk);
2579 
2580         for (;;) {
2581                 prepare_to_wait(sk_sleep(sk), &wait, state);
2582 
2583                 tail = skb_peek_tail(&sk->sk_receive_queue);
2584                 if (tail != last ||
2585                     (tail && tail->len != last_len) ||
2586                     sk->sk_err ||
2587                     (sk->sk_shutdown & RCV_SHUTDOWN) ||
2588                     signal_pending(current) ||
2589                     !timeo)
2590                         break;
2591 
2592                 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2593                 unix_state_unlock(sk);
2594                 timeo = schedule_timeout(timeo);
2595                 unix_state_lock(sk);
2596 
2597                 if (sock_flag(sk, SOCK_DEAD))
2598                         break;
2599 
2600                 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2601         }
2602 
2603         finish_wait(sk_sleep(sk), &wait);
2604         unix_state_unlock(sk);
2605         return timeo;
2606 }
2607 
2608 static unsigned int unix_skb_len(const struct sk_buff *skb)
2609 {
2610         return skb->len - UNIXCB(skb).consumed;
2611 }
2612 
2613 struct unix_stream_read_state {
2614         int (*recv_actor)(struct sk_buff *, int, int,
2615                           struct unix_stream_read_state *);
2616         struct socket *socket;
2617         struct msghdr *msg;
2618         struct pipe_inode_info *pipe;
2619         size_t size;
2620         int flags;
2621         unsigned int splice_flags;
2622 };
2623 
2624 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2625 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2626 {
2627         struct socket *sock = state->socket;
2628         struct sock *sk = sock->sk;
2629         struct unix_sock *u = unix_sk(sk);
2630         int chunk = 1;
2631         struct sk_buff *oob_skb;
2632 
2633         mutex_lock(&u->iolock);
2634         unix_state_lock(sk);
2635         spin_lock(&sk->sk_receive_queue.lock);
2636 
2637         if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2638                 spin_unlock(&sk->sk_receive_queue.lock);
2639                 unix_state_unlock(sk);
2640                 mutex_unlock(&u->iolock);
2641                 return -EINVAL;
2642         }
2643 
2644         oob_skb = u->oob_skb;
2645 
2646         if (!(state->flags & MSG_PEEK))
2647                 WRITE_ONCE(u->oob_skb, NULL);
2648         else
2649                 skb_get(oob_skb);
2650 
2651         spin_unlock(&sk->sk_receive_queue.lock);
2652         unix_state_unlock(sk);
2653 
2654         chunk = state->recv_actor(oob_skb, 0, chunk, state);
2655 
2656         if (!(state->flags & MSG_PEEK))
2657                 UNIXCB(oob_skb).consumed += 1;
2658 
2659         consume_skb(oob_skb);
2660 
2661         mutex_unlock(&u->iolock);
2662 
2663         if (chunk < 0)
2664                 return -EFAULT;
2665 
2666         state->msg->msg_flags |= MSG_OOB;
2667         return 1;
2668 }
2669 
2670 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2671                                   int flags, int copied)
2672 {
2673         struct unix_sock *u = unix_sk(sk);
2674 
2675         if (!unix_skb_len(skb)) {
2676                 struct sk_buff *unlinked_skb = NULL;
2677 
2678                 spin_lock(&sk->sk_receive_queue.lock);
2679 
2680                 if (copied && (!u->oob_skb || skb == u->oob_skb)) {
2681                         skb = NULL;
2682                 } else if (flags & MSG_PEEK) {
2683                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
2684                 } else {
2685                         unlinked_skb = skb;
2686                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
2687                         __skb_unlink(unlinked_skb, &sk->sk_receive_queue);
2688                 }
2689 
2690                 spin_unlock(&sk->sk_receive_queue.lock);
2691 
2692                 consume_skb(unlinked_skb);
2693         } else {
2694                 struct sk_buff *unlinked_skb = NULL;
2695 
2696                 spin_lock(&sk->sk_receive_queue.lock);
2697 
2698                 if (skb == u->oob_skb) {
2699                         if (copied) {
2700                                 skb = NULL;
2701                         } else if (!(flags & MSG_PEEK)) {
2702                                 if (sock_flag(sk, SOCK_URGINLINE)) {
2703                                         WRITE_ONCE(u->oob_skb, NULL);
2704                                         consume_skb(skb);
2705                                 } else {
2706                                         __skb_unlink(skb, &sk->sk_receive_queue);
2707                                         WRITE_ONCE(u->oob_skb, NULL);
2708                                         unlinked_skb = skb;
2709                                         skb = skb_peek(&sk->sk_receive_queue);
2710                                 }
2711                         } else if (!sock_flag(sk, SOCK_URGINLINE)) {
2712                                 skb = skb_peek_next(skb, &sk->sk_receive_queue);
2713                         }
2714                 }
2715 
2716                 spin_unlock(&sk->sk_receive_queue.lock);
2717 
2718                 if (unlinked_skb) {
2719                         WARN_ON_ONCE(skb_unref(unlinked_skb));
2720                         kfree_skb(unlinked_skb);
2721                 }
2722         }
2723         return skb;
2724 }
2725 #endif
2726 
2727 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2728 {
2729         struct unix_sock *u = unix_sk(sk);
2730         struct sk_buff *skb;
2731         int err;
2732 
2733         if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED))
2734                 return -ENOTCONN;
2735 
2736         mutex_lock(&u->iolock);
2737         skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2738         mutex_unlock(&u->iolock);
2739         if (!skb)
2740                 return err;
2741 
2742 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2743         if (unlikely(skb == READ_ONCE(u->oob_skb))) {
2744                 bool drop = false;
2745 
2746                 unix_state_lock(sk);
2747 
2748                 if (sock_flag(sk, SOCK_DEAD)) {
2749                         unix_state_unlock(sk);
2750                         kfree_skb(skb);
2751                         return -ECONNRESET;
2752                 }
2753 
2754                 spin_lock(&sk->sk_receive_queue.lock);
2755                 if (likely(skb == u->oob_skb)) {
2756                         WRITE_ONCE(u->oob_skb, NULL);
2757                         drop = true;
2758                 }
2759                 spin_unlock(&sk->sk_receive_queue.lock);
2760 
2761                 unix_state_unlock(sk);
2762 
2763                 if (drop) {
2764                         WARN_ON_ONCE(skb_unref(skb));
2765                         kfree_skb(skb);
2766                         return -EAGAIN;
2767                 }
2768         }
2769 #endif
2770 
2771         return recv_actor(sk, skb);
2772 }
2773 
2774 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2775                                     bool freezable)
2776 {
2777         struct scm_cookie scm;
2778         struct socket *sock = state->socket;
2779         struct sock *sk = sock->sk;
2780         struct unix_sock *u = unix_sk(sk);
2781         int copied = 0;
2782         int flags = state->flags;
2783         int noblock = flags & MSG_DONTWAIT;
2784         bool check_creds = false;
2785         int target;
2786         int err = 0;
2787         long timeo;
2788         int skip;
2789         size_t size = state->size;
2790         unsigned int last_len;
2791 
2792         if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) {
2793                 err = -EINVAL;
2794                 goto out;
2795         }
2796 
2797         if (unlikely(flags & MSG_OOB)) {
2798                 err = -EOPNOTSUPP;
2799 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2800                 err = unix_stream_recv_urg(state);
2801 #endif
2802                 goto out;
2803         }
2804 
2805         target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2806         timeo = sock_rcvtimeo(sk, noblock);
2807 
2808         memset(&scm, 0, sizeof(scm));
2809 
2810         /* Lock the socket to prevent queue disordering
2811          * while we sleep copying data out to user space
2812          */
2813         mutex_lock(&u->iolock);
2814 
2815         skip = max(sk_peek_offset(sk, flags), 0);
2816 
2817         do {
2818                 struct sk_buff *skb, *last;
2819                 int chunk;
2820 
2821 redo:
2822                 unix_state_lock(sk);
2823                 if (sock_flag(sk, SOCK_DEAD)) {
2824                         err = -ECONNRESET;
2825                         goto unlock;
2826                 }
2827                 last = skb = skb_peek(&sk->sk_receive_queue);
2828                 last_len = last ? last->len : 0;
2829 
2830 again:
2831 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2832                 if (skb) {
2833                         skb = manage_oob(skb, sk, flags, copied);
2834                         if (!skb && copied) {
2835                                 unix_state_unlock(sk);
2836                                 break;
2837                         }
2838                 }
2839 #endif
2840                 if (skb == NULL) {
2841                         if (copied >= target)
2842                                 goto unlock;
2843 
2844                         /*
2845                          *      POSIX 1003.1g mandates this order.
2846                          */
2847 
2848                         err = sock_error(sk);
2849                         if (err)
2850                                 goto unlock;
2851                         if (sk->sk_shutdown & RCV_SHUTDOWN)
2852                                 goto unlock;
2853 
2854                         unix_state_unlock(sk);
2855                         if (!timeo) {
2856                                 err = -EAGAIN;
2857                                 break;
2858                         }
2859 
2860                         mutex_unlock(&u->iolock);
2861 
2862                         timeo = unix_stream_data_wait(sk, timeo, last,
2863                                                       last_len, freezable);
2864 
2865                         if (signal_pending(current)) {
2866                                 err = sock_intr_errno(timeo);
2867                                 scm_destroy(&scm);
2868                                 goto out;
2869                         }
2870 
2871                         mutex_lock(&u->iolock);
2872                         goto redo;
2873 unlock:
2874                         unix_state_unlock(sk);
2875                         break;
2876                 }
2877 
2878                 while (skip >= unix_skb_len(skb)) {
2879                         skip -= unix_skb_len(skb);
2880                         last = skb;
2881                         last_len = skb->len;
2882                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
2883                         if (!skb)
2884                                 goto again;
2885                 }
2886 
2887                 unix_state_unlock(sk);
2888 
2889                 if (check_creds) {
2890                         /* Never glue messages from different writers */
2891                         if (!unix_skb_scm_eq(skb, &scm))
2892                                 break;
2893                 } else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
2894                            test_bit(SOCK_PASSPIDFD, &sock->flags)) {
2895                         /* Copy credentials */
2896                         scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2897                         unix_set_secdata(&scm, skb);
2898                         check_creds = true;
2899                 }
2900 
2901                 /* Copy address just once */
2902                 if (state->msg && state->msg->msg_name) {
2903                         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2904                                          state->msg->msg_name);
2905                         unix_copy_addr(state->msg, skb->sk);
2906 
2907                         BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2908                                                               state->msg->msg_name,
2909                                                               &state->msg->msg_namelen);
2910 
2911                         sunaddr = NULL;
2912                 }
2913 
2914                 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2915                 chunk = state->recv_actor(skb, skip, chunk, state);
2916                 if (chunk < 0) {
2917                         if (copied == 0)
2918                                 copied = -EFAULT;
2919                         break;
2920                 }
2921                 copied += chunk;
2922                 size -= chunk;
2923 
2924                 /* Mark read part of skb as used */
2925                 if (!(flags & MSG_PEEK)) {
2926                         UNIXCB(skb).consumed += chunk;
2927 
2928                         sk_peek_offset_bwd(sk, chunk);
2929 
2930                         if (UNIXCB(skb).fp) {
2931                                 scm_stat_del(sk, skb);
2932                                 unix_detach_fds(&scm, skb);
2933                         }
2934 
2935                         if (unix_skb_len(skb))
2936                                 break;
2937 
2938                         skb_unlink(skb, &sk->sk_receive_queue);
2939                         consume_skb(skb);
2940 
2941                         if (scm.fp)
2942                                 break;
2943                 } else {
2944                         /* It is questionable, see note in unix_dgram_recvmsg.
2945                          */
2946                         if (UNIXCB(skb).fp)
2947                                 unix_peek_fds(&scm, skb);
2948 
2949                         sk_peek_offset_fwd(sk, chunk);
2950 
2951                         if (UNIXCB(skb).fp)
2952                                 break;
2953 
2954                         skip = 0;
2955                         last = skb;
2956                         last_len = skb->len;
2957                         unix_state_lock(sk);
2958                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
2959                         if (skb)
2960                                 goto again;
2961                         unix_state_unlock(sk);
2962                         break;
2963                 }
2964         } while (size);
2965 
2966         mutex_unlock(&u->iolock);
2967         if (state->msg)
2968                 scm_recv_unix(sock, state->msg, &scm, flags);
2969         else
2970                 scm_destroy(&scm);
2971 out:
2972         return copied ? : err;
2973 }
2974 
2975 static int unix_stream_read_actor(struct sk_buff *skb,
2976                                   int skip, int chunk,
2977                                   struct unix_stream_read_state *state)
2978 {
2979         int ret;
2980 
2981         ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2982                                     state->msg, chunk);
2983         return ret ?: chunk;
2984 }
2985 
2986 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2987                           size_t size, int flags)
2988 {
2989         struct unix_stream_read_state state = {
2990                 .recv_actor = unix_stream_read_actor,
2991                 .socket = sk->sk_socket,
2992                 .msg = msg,
2993                 .size = size,
2994                 .flags = flags
2995         };
2996 
2997         return unix_stream_read_generic(&state, true);
2998 }
2999 
3000 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
3001                                size_t size, int flags)
3002 {
3003         struct unix_stream_read_state state = {
3004                 .recv_actor = unix_stream_read_actor,
3005                 .socket = sock,
3006                 .msg = msg,
3007                 .size = size,
3008                 .flags = flags
3009         };
3010 
3011 #ifdef CONFIG_BPF_SYSCALL
3012         struct sock *sk = sock->sk;
3013         const struct proto *prot = READ_ONCE(sk->sk_prot);
3014 
3015         if (prot != &unix_stream_proto)
3016                 return prot->recvmsg(sk, msg, size, flags, NULL);
3017 #endif
3018         return unix_stream_read_generic(&state, true);
3019 }
3020 
3021 static int unix_stream_splice_actor(struct sk_buff *skb,
3022                                     int skip, int chunk,
3023                                     struct unix_stream_read_state *state)
3024 {
3025         return skb_splice_bits(skb, state->socket->sk,
3026                                UNIXCB(skb).consumed + skip,
3027                                state->pipe, chunk, state->splice_flags);
3028 }
3029 
3030 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
3031                                        struct pipe_inode_info *pipe,
3032                                        size_t size, unsigned int flags)
3033 {
3034         struct unix_stream_read_state state = {
3035                 .recv_actor = unix_stream_splice_actor,
3036                 .socket = sock,
3037                 .pipe = pipe,
3038                 .size = size,
3039                 .splice_flags = flags,
3040         };
3041 
3042         if (unlikely(*ppos))
3043                 return -ESPIPE;
3044 
3045         if (sock->file->f_flags & O_NONBLOCK ||
3046             flags & SPLICE_F_NONBLOCK)
3047                 state.flags = MSG_DONTWAIT;
3048 
3049         return unix_stream_read_generic(&state, false);
3050 }
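
/*
 * Illustrative user-space sketch (not part of af_unix.c): splice(2) from a
 * SOCK_STREAM AF_UNIX socket into a pipe is served by
 * unix_stream_splice_read() above; the offset arguments must be NULL
 * (see the -ESPIPE check), and SPLICE_F_NONBLOCK maps to MSG_DONTWAIT.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static void splice_socket_to_pipe(int sockfd, int pipe_wr)
{
        ssize_t n = splice(sockfd, NULL, pipe_wr, NULL, 4096,
                           SPLICE_F_NONBLOCK);

        if (n < 0)
                perror("splice");
        else
                printf("spliced %zd bytes\n", n);
}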
3051 
3052 static int unix_shutdown(struct socket *sock, int mode)
3053 {
3054         struct sock *sk = sock->sk;
3055         struct sock *other;
3056 
3057         if (mode < SHUT_RD || mode > SHUT_RDWR)
3058                 return -EINVAL;
3059         /* This maps:
3060          * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
3061          * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
3062          * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
3063          */
3064         ++mode;
3065 
3066         unix_state_lock(sk);
3067         WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
3068         other = unix_peer(sk);
3069         if (other)
3070                 sock_hold(other);
3071         unix_state_unlock(sk);
3072         sk->sk_state_change(sk);
3073 
3074         if (other &&
3075                 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
3076 
3077                 int peer_mode = 0;
3078                 const struct proto *prot = READ_ONCE(other->sk_prot);
3079 
3080                 if (prot->unhash)
3081                         prot->unhash(other);
3082                 if (mode&RCV_SHUTDOWN)
3083                         peer_mode |= SEND_SHUTDOWN;
3084                 if (mode&SEND_SHUTDOWN)
3085                         peer_mode |= RCV_SHUTDOWN;
3086                 unix_state_lock(other);
3087                 WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
3088                 unix_state_unlock(other);
3089                 other->sk_state_change(other);
3090                 if (peer_mode == SHUTDOWN_MASK)
3091                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
3092                 else if (peer_mode & RCV_SHUTDOWN)
3093                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
3094         }
3095         if (other)
3096                 sock_put(other);
3097 
3098         return 0;
3099 }
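
/*
 * Illustrative user-space sketch (not part of af_unix.c): per the mapping
 * above, SHUT_WR on one end of a stream pair becomes RCV_SHUTDOWN on the
 * peer, so the peer's read() returns 0 (EOF) instead of blocking.
 */
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        int sv[2];
        char c;

        if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) < 0)
                return 1;

        shutdown(sv[0], SHUT_WR);       /* peer gets RCV_SHUTDOWN */

        printf("read on peer: %zd\n", read(sv[1], &c, 1));      /* prints 0 */

        close(sv[0]);
        close(sv[1]);
        return 0;
}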
3100 
3101 long unix_inq_len(struct sock *sk)
3102 {
3103         struct sk_buff *skb;
3104         long amount = 0;
3105 
3106         if (READ_ONCE(sk->sk_state) == TCP_LISTEN)
3107                 return -EINVAL;
3108 
3109         spin_lock(&sk->sk_receive_queue.lock);
3110         if (sk->sk_type == SOCK_STREAM ||
3111             sk->sk_type == SOCK_SEQPACKET) {
3112                 skb_queue_walk(&sk->sk_receive_queue, skb)
3113                         amount += unix_skb_len(skb);
3114         } else {
3115                 skb = skb_peek(&sk->sk_receive_queue);
3116                 if (skb)
3117                         amount = skb->len;
3118         }
3119         spin_unlock(&sk->sk_receive_queue.lock);
3120 
3121         return amount;
3122 }
3123 EXPORT_SYMBOL_GPL(unix_inq_len);
3124 
3125 long unix_outq_len(struct sock *sk)
3126 {
3127         return sk_wmem_alloc_get(sk);
3128 }
3129 EXPORT_SYMBOL_GPL(unix_outq_len);
3130 
3131 static int unix_open_file(struct sock *sk)
3132 {
3133         struct path path;
3134         struct file *f;
3135         int fd;
3136 
3137         if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3138                 return -EPERM;
3139 
3140         if (!smp_load_acquire(&unix_sk(sk)->addr))
3141                 return -ENOENT;
3142 
3143         path = unix_sk(sk)->path;
3144         if (!path.dentry)
3145                 return -ENOENT;
3146 
3147         path_get(&path);
3148 
3149         fd = get_unused_fd_flags(O_CLOEXEC);
3150         if (fd < 0)
3151                 goto out;
3152 
3153         f = dentry_open(&path, O_PATH, current_cred());
3154         if (IS_ERR(f)) {
3155                 put_unused_fd(fd);
3156                 fd = PTR_ERR(f);
3157                 goto out;
3158         }
3159 
3160         fd_install(fd, f);
3161 out:
3162         path_put(&path);
3163 
3164         return fd;
3165 }
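
/*
 * Illustrative user-space sketch (not part of af_unix.c): SIOCUNIXFILE
 * (handled below in unix_ioctl()) returns a new O_PATH, O_CLOEXEC
 * descriptor for the socket's bound filesystem path; it requires
 * CAP_NET_ADMIN and fails with -ENOENT for unbound or abstract sockets.
 */
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/sockios.h>

static int unix_path_fd(int sockfd)
{
        int pathfd = ioctl(sockfd, SIOCUNIXFILE, 0);

        if (pathfd < 0)
                perror("SIOCUNIXFILE");
        return pathfd;  /* usable with fstatat(), readlinkat(), ... */
}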
3166 
3167 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3168 {
3169         struct sock *sk = sock->sk;
3170         long amount = 0;
3171         int err;
3172 
3173         switch (cmd) {
3174         case SIOCOUTQ:
3175                 amount = unix_outq_len(sk);
3176                 err = put_user(amount, (int __user *)arg);
3177                 break;
3178         case SIOCINQ:
3179                 amount = unix_inq_len(sk);
3180                 if (amount < 0)
3181                         err = amount;
3182                 else
3183                         err = put_user(amount, (int __user *)arg);
3184                 break;
3185         case SIOCUNIXFILE:
3186                 err = unix_open_file(sk);
3187                 break;
3188 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3189         case SIOCATMARK:
3190                 {
3191                         struct unix_sock *u = unix_sk(sk);
3192                         struct sk_buff *skb;
3193                         int answ = 0;
3194 
3195                         mutex_lock(&u->iolock);
3196 
3197                         skb = skb_peek(&sk->sk_receive_queue);
3198                         if (skb) {
3199                                 struct sk_buff *oob_skb = READ_ONCE(u->oob_skb);
3200 
3201                                 if (skb == oob_skb ||
3202                                     (!oob_skb && !unix_skb_len(skb)))
3203                                         answ = 1;
3204                         }
3205 
3206                         mutex_unlock(&u->iolock);
3207 
3208                         err = put_user(answ, (int __user *)arg);
3209                 }
3210                 break;
3211 #endif
3212         default:
3213                 err = -ENOIOCTLCMD;
3214                 break;
3215         }
3216         return err;
3217 }
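
/*
 * Illustrative user-space sketch (not part of af_unix.c): SIOCINQ reports
 * the unread bytes computed by unix_inq_len() above and SIOCOUTQ the bytes
 * still charged to the send buffer (unix_outq_len()); both store an int
 * through the argument pointer.
 */
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/sockios.h>

static void print_queue_lengths(int sockfd)
{
        int inq = 0, outq = 0;

        if (ioctl(sockfd, SIOCINQ, &inq) == 0 &&
            ioctl(sockfd, SIOCOUTQ, &outq) == 0)
                printf("inq=%d outq=%d\n", inq, outq);
}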
3218 
3219 #ifdef CONFIG_COMPAT
3220 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3221 {
3222         return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
3223 }
3224 #endif
3225 
3226 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3227 {
3228         struct sock *sk = sock->sk;
3229         unsigned char state;
3230         __poll_t mask;
3231         u8 shutdown;
3232 
3233         sock_poll_wait(file, sock, wait);
3234         mask = 0;
3235         shutdown = READ_ONCE(sk->sk_shutdown);
3236         state = READ_ONCE(sk->sk_state);
3237 
3238         /* exceptional events? */
3239         if (READ_ONCE(sk->sk_err))
3240                 mask |= EPOLLERR;
3241         if (shutdown == SHUTDOWN_MASK)
3242                 mask |= EPOLLHUP;
3243         if (shutdown & RCV_SHUTDOWN)
3244                 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3245 
3246         /* readable? */
3247         if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3248                 mask |= EPOLLIN | EPOLLRDNORM;
3249         if (sk_is_readable(sk))
3250                 mask |= EPOLLIN | EPOLLRDNORM;
3251 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3252         if (READ_ONCE(unix_sk(sk)->oob_skb))
3253                 mask |= EPOLLPRI;
3254 #endif
3255 
3256         /* Connection-based sockets need to check for termination and startup */
3257         if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3258             state == TCP_CLOSE)
3259                 mask |= EPOLLHUP;
3260 
3261         /*
3262          * We also set writable when the other side has shut down the
3263          * connection. This prevents stuck sockets.
3264          */
3265         if (unix_writable(sk, state))
3266                 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3267 
3268         return mask;
3269 }
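
/*
 * Illustrative user-space sketch (not part of af_unix.c): the mask built
 * above is what poll(2)/epoll_wait(2) return for a stream socket; after the
 * peer does shutdown(SHUT_WR), this end sees POLLRDHUP|POLLIN even though
 * only EOF is queued.
 */
#define _GNU_SOURCE             /* for POLLRDHUP */
#include <poll.h>
#include <stdio.h>

static void report_events(int sockfd)
{
        struct pollfd pfd = { .fd = sockfd, .events = POLLIN | POLLRDHUP };

        if (poll(&pfd, 1, -1) > 0)
                printf("IN=%d RDHUP=%d HUP=%d\n",
                       !!(pfd.revents & POLLIN),
                       !!(pfd.revents & POLLRDHUP),
                       !!(pfd.revents & POLLHUP));
}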
3270 
3271 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3272                                     poll_table *wait)
3273 {
3274         struct sock *sk = sock->sk, *other;
3275         unsigned int writable;
3276         unsigned char state;
3277         __poll_t mask;
3278         u8 shutdown;
3279 
3280         sock_poll_wait(file, sock, wait);
3281         mask = 0;
3282         shutdown = READ_ONCE(sk->sk_shutdown);
3283         state = READ_ONCE(sk->sk_state);
3284 
3285         /* exceptional events? */
3286         if (READ_ONCE(sk->sk_err) ||
3287             !skb_queue_empty_lockless(&sk->sk_error_queue))
3288                 mask |= EPOLLERR |
3289                         (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3290 
3291         if (shutdown & RCV_SHUTDOWN)
3292                 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3293         if (shutdown == SHUTDOWN_MASK)
3294                 mask |= EPOLLHUP;
3295 
3296         /* readable? */
3297         if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3298                 mask |= EPOLLIN | EPOLLRDNORM;
3299         if (sk_is_readable(sk))
3300                 mask |= EPOLLIN | EPOLLRDNORM;
3301 
3302         /* Connection-based sockets need to check for termination and startup */
3303         if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE)
3304                 mask |= EPOLLHUP;
3305 
3306         /* No write status requested, avoid expensive OUT tests. */
3307         if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3308                 return mask;
3309 
3310         writable = unix_writable(sk, state);
3311         if (writable) {
3312                 unix_state_lock(sk);
3313 
3314                 other = unix_peer(sk);
3315                 if (other && unix_peer(other) != sk &&
3316                     unix_recvq_full_lockless(other) &&
3317                     unix_dgram_peer_wake_me(sk, other))
3318                         writable = 0;
3319 
3320                 unix_state_unlock(sk);
3321         }
3322 
3323         if (writable)
3324                 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3325         else
3326                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3327 
3328         return mask;
3329 }
3330 
3331 #ifdef CONFIG_PROC_FS
3332 
3333 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
3334 
3335 #define get_bucket(x) ((x) >> BUCKET_SPACE)
3336 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
3337 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
3338 
3339 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3340 {
3341         unsigned long offset = get_offset(*pos);
3342         unsigned long bucket = get_bucket(*pos);
3343         unsigned long count = 0;
3344         struct sock *sk;
3345 
3346         for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3347              sk; sk = sk_next(sk)) {
3348                 if (++count == offset)
3349                         break;
3350         }
3351 
3352         return sk;
3353 }
3354 
3355 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3356 {
3357         unsigned long bucket = get_bucket(*pos);
3358         struct net *net = seq_file_net(seq);
3359         struct sock *sk;
3360 
3361         while (bucket < UNIX_HASH_SIZE) {
3362                 spin_lock(&net->unx.table.locks[bucket]);
3363 
3364                 sk = unix_from_bucket(seq, pos);
3365                 if (sk)
3366                         return sk;
3367 
3368                 spin_unlock(&net->unx.table.locks[bucket]);
3369 
3370                 *pos = set_bucket_offset(++bucket, 1);
3371         }
3372 
3373         return NULL;
3374 }
3375 
3376 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3377                                   loff_t *pos)
3378 {
3379         unsigned long bucket = get_bucket(*pos);
3380 
3381         sk = sk_next(sk);
3382         if (sk)
3383                 return sk;
3384 
3385 
3386         spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3387 
3388         *pos = set_bucket_offset(++bucket, 1);
3389 
3390         return unix_get_first(seq, pos);
3391 }
3392 
3393 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3394 {
3395         if (!*pos)
3396                 return SEQ_START_TOKEN;
3397 
3398         return unix_get_first(seq, pos);
3399 }
3400 
3401 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3402 {
3403         ++*pos;
3404 
3405         if (v == SEQ_START_TOKEN)
3406                 return unix_get_first(seq, pos);
3407 
3408         return unix_get_next(seq, v, pos);
3409 }
3410 
3411 static void unix_seq_stop(struct seq_file *seq, void *v)
3412 {
3413         struct sock *sk = v;
3414 
3415         if (sk)
3416                 spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3417 }
3418 
3419 static int unix_seq_show(struct seq_file *seq, void *v)
3420 {
3421 
3422         if (v == SEQ_START_TOKEN)
3423                 seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3424                          "Inode Path\n");
3425         else {
3426                 struct sock *s = v;
3427                 struct unix_sock *u = unix_sk(s);
3428                 unix_state_lock(s);
3429 
3430                 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3431                         s,
3432                         refcount_read(&s->sk_refcnt),
3433                         0,
3434                         s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3435                         s->sk_type,
3436                         s->sk_socket ?
3437                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3438                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3439                         sock_i_ino(s));
3440 
3441                 if (u->addr) {  // under a hash table lock here
3442                         int i, len;
3443                         seq_putc(seq, ' ');
3444 
3445                         i = 0;
3446                         len = u->addr->len -
3447                                 offsetof(struct sockaddr_un, sun_path);
3448                         if (u->addr->name->sun_path[0]) {
3449                                 len--;
3450                         } else {
3451                                 seq_putc(seq, '@');
3452                                 i++;
3453                         }
3454                         for ( ; i < len; i++)
3455                                 seq_putc(seq, u->addr->name->sun_path[i] ?:
3456                                          '@');
3457                 }
3458                 unix_state_unlock(s);
3459                 seq_putc(seq, '\n');
3460         }
3461 
3462         return 0;
3463 }
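
/*
 * Illustrative user-space sketch (not part of af_unix.c): every line emitted
 * by unix_seq_show() above appears in /proc/net/unix, one socket per line,
 * using the "Num RefCount Protocol Flags Type St Inode Path" columns from
 * the header.
 */
#include <stdio.h>

static void dump_proc_net_unix(void)
{
        char line[512];
        FILE *fp = fopen("/proc/net/unix", "r");

        if (!fp)
                return;
        while (fgets(line, sizeof(line), fp))
                fputs(line, stdout);
        fclose(fp);
}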
3464 
3465 static const struct seq_operations unix_seq_ops = {
3466         .start  = unix_seq_start,
3467         .next   = unix_seq_next,
3468         .stop   = unix_seq_stop,
3469         .show   = unix_seq_show,
3470 };
3471 
3472 #ifdef CONFIG_BPF_SYSCALL
3473 struct bpf_unix_iter_state {
3474         struct seq_net_private p;
3475         unsigned int cur_sk;
3476         unsigned int end_sk;
3477         unsigned int max_sk;
3478         struct sock **batch;
3479         bool st_bucket_done;
3480 };
3481 
3482 struct bpf_iter__unix {
3483         __bpf_md_ptr(struct bpf_iter_meta *, meta);
3484         __bpf_md_ptr(struct unix_sock *, unix_sk);
3485         uid_t uid __aligned(8);
3486 };
3487 
3488 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3489                               struct unix_sock *unix_sk, uid_t uid)
3490 {
3491         struct bpf_iter__unix ctx;
3492 
3493         meta->seq_num--;  /* skip SEQ_START_TOKEN */
3494         ctx.meta = meta;
3495         ctx.unix_sk = unix_sk;
3496         ctx.uid = uid;
3497         return bpf_iter_run_prog(prog, &ctx);
3498 }
3499 
3500 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3501 
3502 {
3503         struct bpf_unix_iter_state *iter = seq->private;
3504         unsigned int expected = 1;
3505         struct sock *sk;
3506 
3507         sock_hold(start_sk);
3508         iter->batch[iter->end_sk++] = start_sk;
3509 
3510         for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3511                 if (iter->end_sk < iter->max_sk) {
3512                         sock_hold(sk);
3513                         iter->batch[iter->end_sk++] = sk;
3514                 }
3515 
3516                 expected++;
3517         }
3518 
3519         spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3520 
3521         return expected;
3522 }
3523 
3524 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3525 {
3526         while (iter->cur_sk < iter->end_sk)
3527                 sock_put(iter->batch[iter->cur_sk++]);
3528 }
3529 
3530 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3531                                        unsigned int new_batch_sz)
3532 {
3533         struct sock **new_batch;
3534 
3535         new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3536                              GFP_USER | __GFP_NOWARN);
3537         if (!new_batch)
3538                 return -ENOMEM;
3539 
3540         bpf_iter_unix_put_batch(iter);
3541         kvfree(iter->batch);
3542         iter->batch = new_batch;
3543         iter->max_sk = new_batch_sz;
3544 
3545         return 0;
3546 }
3547 
3548 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3549                                         loff_t *pos)
3550 {
3551         struct bpf_unix_iter_state *iter = seq->private;
3552         unsigned int expected;
3553         bool resized = false;
3554         struct sock *sk;
3555 
3556         if (iter->st_bucket_done)
3557                 *pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3558 
3559 again:
3560         /* Get a new batch */
3561         iter->cur_sk = 0;
3562         iter->end_sk = 0;
3563 
3564         sk = unix_get_first(seq, pos);
3565         if (!sk)
3566                 return NULL; /* Done */
3567 
3568         expected = bpf_iter_unix_hold_batch(seq, sk);
3569 
3570         if (iter->end_sk == expected) {
3571                 iter->st_bucket_done = true;
3572                 return sk;
3573         }
3574 
3575         if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3576                 resized = true;
3577                 goto again;
3578         }
3579 
3580         return sk;
3581 }
3582 
3583 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3584 {
3585         if (!*pos)
3586                 return SEQ_START_TOKEN;
3587 
3588         /* bpf iter does not support lseek, so it always
3589          * continues from where it was stop()-ped.
3590          */
3591         return bpf_iter_unix_batch(seq, pos);
3592 }
3593 
3594 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3595 {
3596         struct bpf_unix_iter_state *iter = seq->private;
3597         struct sock *sk;
3598 
3599         /* Whenever seq_next() is called, the socket at iter->cur_sk
3600          * has already been shown by seq_show(), so advance to the
3601          * next sk in the batch.
3602          */
3603         if (iter->cur_sk < iter->end_sk)
3604                 sock_put(iter->batch[iter->cur_sk++]);
3605 
3606         ++*pos;
3607 
3608         if (iter->cur_sk < iter->end_sk)
3609                 sk = iter->batch[iter->cur_sk];
3610         else
3611                 sk = bpf_iter_unix_batch(seq, pos);
3612 
3613         return sk;
3614 }
3615 
3616 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3617 {
3618         struct bpf_iter_meta meta;
3619         struct bpf_prog *prog;
3620         struct sock *sk = v;
3621         uid_t uid;
3622         bool slow;
3623         int ret;
3624 
3625         if (v == SEQ_START_TOKEN)
3626                 return 0;
3627 
3628         slow = lock_sock_fast(sk);
3629 
3630         if (unlikely(sk_unhashed(sk))) {
3631                 ret = SEQ_SKIP;
3632                 goto unlock;
3633         }
3634 
3635         uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3636         meta.seq = seq;
3637         prog = bpf_iter_get_info(&meta, false);
3638         ret = unix_prog_seq_show(prog, &meta, v, uid);
3639 unlock:
3640         unlock_sock_fast(sk, slow);
3641         return ret;
3642 }
3643 
3644 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3645 {
3646         struct bpf_unix_iter_state *iter = seq->private;
3647         struct bpf_iter_meta meta;
3648         struct bpf_prog *prog;
3649 
3650         if (!v) {
3651                 meta.seq = seq;
3652                 prog = bpf_iter_get_info(&meta, true);
3653                 if (prog)
3654                         (void)unix_prog_seq_show(prog, &meta, v, 0);
3655         }
3656 
3657         if (iter->cur_sk < iter->end_sk)
3658                 bpf_iter_unix_put_batch(iter);
3659 }
3660 
3661 static const struct seq_operations bpf_iter_unix_seq_ops = {
3662         .start  = bpf_iter_unix_seq_start,
3663         .next   = bpf_iter_unix_seq_next,
3664         .stop   = bpf_iter_unix_seq_stop,
3665         .show   = bpf_iter_unix_seq_show,
3666 };
3667 #endif
3668 #endif
3669 
3670 static const struct net_proto_family unix_family_ops = {
3671         .family = PF_UNIX,
3672         .create = unix_create,
3673         .owner  = THIS_MODULE,
3674 };
3675 
3676 
3677 static int __net_init unix_net_init(struct net *net)
3678 {
3679         int i;
3680 
3681         net->unx.sysctl_max_dgram_qlen = 10;
3682         if (unix_sysctl_register(net))
3683                 goto out;
3684 
3685 #ifdef CONFIG_PROC_FS
3686         if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3687                              sizeof(struct seq_net_private)))
3688                 goto err_sysctl;
3689 #endif
3690 
3691         net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3692                                               sizeof(spinlock_t), GFP_KERNEL);
3693         if (!net->unx.table.locks)
3694                 goto err_proc;
3695 
3696         net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3697                                                 sizeof(struct hlist_head),
3698                                                 GFP_KERNEL);
3699         if (!net->unx.table.buckets)
3700                 goto free_locks;
3701 
3702         for (i = 0; i < UNIX_HASH_SIZE; i++) {
3703                 spin_lock_init(&net->unx.table.locks[i]);
3704                 lock_set_cmp_fn(&net->unx.table.locks[i], unix_table_lock_cmp_fn, NULL);
3705                 INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3706         }
3707 
3708         return 0;
3709 
3710 free_locks:
3711         kvfree(net->unx.table.locks);
3712 err_proc:
3713 #ifdef CONFIG_PROC_FS
3714         remove_proc_entry("unix", net->proc_net);
3715 err_sysctl:
3716 #endif
3717         unix_sysctl_unregister(net);
3718 out:
3719         return -ENOMEM;
3720 }
3721 
3722 static void __net_exit unix_net_exit(struct net *net)
3723 {
3724         kvfree(net->unx.table.buckets);
3725         kvfree(net->unx.table.locks);
3726         unix_sysctl_unregister(net);
3727         remove_proc_entry("unix", net->proc_net);
3728 }
3729 
3730 static struct pernet_operations unix_net_ops = {
3731         .init = unix_net_init,
3732         .exit = unix_net_exit,
3733 };
3734 
3735 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3736 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3737                      struct unix_sock *unix_sk, uid_t uid)
3738 
3739 #define INIT_BATCH_SZ 16
3740 
3741 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3742 {
3743         struct bpf_unix_iter_state *iter = priv_data;
3744         int err;
3745 
3746         err = bpf_iter_init_seq_net(priv_data, aux);
3747         if (err)
3748                 return err;
3749 
3750         err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3751         if (err) {
3752                 bpf_iter_fini_seq_net(priv_data);
3753                 return err;
3754         }
3755 
3756         return 0;
3757 }
3758 
3759 static void bpf_iter_fini_unix(void *priv_data)
3760 {
3761         struct bpf_unix_iter_state *iter = priv_data;
3762 
3763         bpf_iter_fini_seq_net(priv_data);
3764         kvfree(iter->batch);
3765 }
3766 
3767 static const struct bpf_iter_seq_info unix_seq_info = {
3768         .seq_ops                = &bpf_iter_unix_seq_ops,
3769         .init_seq_private       = bpf_iter_init_unix,
3770         .fini_seq_private       = bpf_iter_fini_unix,
3771         .seq_priv_size          = sizeof(struct bpf_unix_iter_state),
3772 };
3773 
3774 static const struct bpf_func_proto *
3775 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3776                              const struct bpf_prog *prog)
3777 {
3778         switch (func_id) {
3779         case BPF_FUNC_setsockopt:
3780                 return &bpf_sk_setsockopt_proto;
3781         case BPF_FUNC_getsockopt:
3782                 return &bpf_sk_getsockopt_proto;
3783         default:
3784                 return NULL;
3785         }
3786 }
3787 
3788 static struct bpf_iter_reg unix_reg_info = {
3789         .target                 = "unix",
3790         .ctx_arg_info_size      = 1,
3791         .ctx_arg_info           = {
3792                 { offsetof(struct bpf_iter__unix, unix_sk),
3793                   PTR_TO_BTF_ID_OR_NULL },
3794         },
3795         .get_func_proto         = bpf_iter_unix_get_func_proto,
3796         .seq_info               = &unix_seq_info,
3797 };
3798 
3799 static void __init bpf_iter_register(void)
3800 {
3801         unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3802         if (bpf_iter_reg_target(&unix_reg_info))
3803                 pr_warn("Warning: could not register bpf iterator unix\n");
3804 }
3805 #endif
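
/*
 * Illustrative BPF-side sketch (not part of af_unix.c): a program attached
 * to the "unix" iterator registered above is run once per batched socket
 * with the bpf_iter__unix context (meta, unix_sk, uid), plus a final call
 * with unix_sk == NULL from bpf_iter_unix_seq_stop().
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char LICENSE[] SEC("license") = "GPL";

SEC("iter/unix")
int dump_unix(struct bpf_iter__unix *ctx)
{
        struct unix_sock *unix_sk = ctx->unix_sk;
        struct seq_file *seq = ctx->meta->seq;

        if (!unix_sk)           /* end-of-iteration call */
                return 0;

        BPF_SEQ_PRINTF(seq, "uid=%u state=%u\n",
                       ctx->uid, unix_sk->sk.sk_state);
        return 0;
}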
3806 
3807 static int __init af_unix_init(void)
3808 {
3809         int i, rc = -1;
3810 
3811         BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3812 
3813         for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3814                 spin_lock_init(&bsd_socket_locks[i]);
3815                 INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3816         }
3817 
3818         rc = proto_register(&unix_dgram_proto, 1);
3819         if (rc != 0) {
3820                 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3821                 goto out;
3822         }
3823 
3824         rc = proto_register(&unix_stream_proto, 1);
3825         if (rc != 0) {
3826                 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3827                 proto_unregister(&unix_dgram_proto);
3828                 goto out;
3829         }
3830 
3831         sock_register(&unix_family_ops);
3832         register_pernet_subsys(&unix_net_ops);
3833         unix_bpf_build_proto();
3834 
3835 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3836         bpf_iter_register();
3837 #endif
3838 
3839 out:
3840         return rc;
3841 }
3842 
3843 /* Later than subsys_initcall() because we depend on stuff initialised there */
3844 fs_initcall(af_unix_init);
3845 
