TOMOYO Linux Cross Reference
Linux/net/ipv4/tcp_ipv4.c

  1 // SPDX-License-Identifier: GPL-2.0-or-later
  2 /*
  3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
  4  *              operating system.  INET is implemented using the  BSD Socket
  5  *              interface as the means of communication with the user level.
  6  *
  7  *              Implementation of the Transmission Control Protocol(TCP).
  8  *
  9  *              IPv4 specific functions
 10  *
 11  *              code split from:
 12  *              linux/ipv4/tcp.c
 13  *              linux/ipv4/tcp_input.c
 14  *              linux/ipv4/tcp_output.c
 15  *
 16  *              See tcp.c for author information
 17  */
 18 
 19 /*
 20  * Changes:
 21  *              David S. Miller :       New socket lookup architecture.
 22  *                                      This code is dedicated to John Dyson.
 23  *              David S. Miller :       Change semantics of established hash,
 24  *                                      half is devoted to TIME_WAIT sockets
 25  *                                      and the rest go in the other half.
 26  *              Andi Kleen :            Add support for syncookies and fixed
 27  *                                      some bugs: ip options weren't passed to
 28  *                                      the TCP layer, missed a check for an
 29  *                                      ACK bit.
 30  *              Andi Kleen :            Implemented fast path mtu discovery.
 31  *                                      Fixed many serious bugs in the
 32  *                                      request_sock handling and moved
 33  *                                      most of it into the af independent code.
 34  *                                      Added tail drop and some other bugfixes.
 35  *                                      Added new listen semantics.
 36  *              Mike McLagan    :       Routing by source
 37  *      Juan Jose Ciarlante:            ip_dynaddr bits
 38  *              Andi Kleen:             various fixes.
 39  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
 40  *                                      coma.
 41  *      Andi Kleen              :       Fix new listen.
 42  *      Andi Kleen              :       Fix accept error reporting.
 43  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
 44  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
 45  *                                      a single port at the same time.
 46  */
 47 
 48 #define pr_fmt(fmt) "TCP: " fmt
 49 
 50 #include <linux/bottom_half.h>
 51 #include <linux/types.h>
 52 #include <linux/fcntl.h>
 53 #include <linux/module.h>
 54 #include <linux/random.h>
 55 #include <linux/cache.h>
 56 #include <linux/jhash.h>
 57 #include <linux/init.h>
 58 #include <linux/times.h>
 59 #include <linux/slab.h>
 60 #include <linux/sched.h>
 61 
 62 #include <net/net_namespace.h>
 63 #include <net/icmp.h>
 64 #include <net/inet_hashtables.h>
 65 #include <net/tcp.h>
 66 #include <net/transp_v6.h>
 67 #include <net/ipv6.h>
 68 #include <net/inet_common.h>
 69 #include <net/timewait_sock.h>
 70 #include <net/xfrm.h>
 71 #include <net/secure_seq.h>
 72 #include <net/busy_poll.h>
 73 #include <net/rstreason.h>
 74 
 75 #include <linux/inet.h>
 76 #include <linux/ipv6.h>
 77 #include <linux/stddef.h>
 78 #include <linux/proc_fs.h>
 79 #include <linux/seq_file.h>
 80 #include <linux/inetdevice.h>
 81 #include <linux/btf_ids.h>
 82 
 83 #include <crypto/hash.h>
 84 #include <linux/scatterlist.h>
 85 
 86 #include <trace/events/tcp.h>
 87 
 88 #ifdef CONFIG_TCP_MD5SIG
 89 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
 90                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
 91 #endif
 92 
 93 struct inet_hashinfo tcp_hashinfo;
 94 EXPORT_SYMBOL(tcp_hashinfo);
 95 
 96 static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
 97         .bh_lock = INIT_LOCAL_LOCK(bh_lock),
 98 };
 99 
100 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
101 {
102         return secure_tcp_seq(ip_hdr(skb)->daddr,
103                               ip_hdr(skb)->saddr,
104                               tcp_hdr(skb)->dest,
105                               tcp_hdr(skb)->source);
106 }
107 
108 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
109 {
110         return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
111 }
112 
113 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
114 {
115         int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
116         const struct inet_timewait_sock *tw = inet_twsk(sktw);
117         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
118         struct tcp_sock *tp = tcp_sk(sk);
119         int ts_recent_stamp;
120 
121         if (reuse == 2) {
122                 /* Still does not detect *everything* that goes through
123                  * lo, since we require a loopback src or dst address
124                  * or direct binding to 'lo' interface.
125                  */
126                 bool loopback = false;
127                 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
128                         loopback = true;
129 #if IS_ENABLED(CONFIG_IPV6)
130                 if (tw->tw_family == AF_INET6) {
131                         if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
132                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
133                             ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
134                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
135                                 loopback = true;
136                 } else
137 #endif
138                 {
139                         if (ipv4_is_loopback(tw->tw_daddr) ||
140                             ipv4_is_loopback(tw->tw_rcv_saddr))
141                                 loopback = true;
142                 }
143                 if (!loopback)
144                         reuse = 0;
145         }
146 
147         /* With PAWS, it is safe from the viewpoint
148            of data integrity. Even without PAWS it is safe provided sequence
149            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
150 
151            Actually, the idea is close to VJ's one, only timestamp cache is
152            held not per host, but per port pair and TW bucket is used as state
153            holder.
154 
 155            If the TW bucket has already been destroyed we fall back to VJ's scheme
156            and use initial timestamp retrieved from peer table.
157          */
158         ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
159         if (ts_recent_stamp &&
160             (!twp || (reuse && time_after32(ktime_get_seconds(),
161                                             ts_recent_stamp)))) {
162                 /* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
163                  * and releasing the bucket lock.
164                  */
165                 if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
166                         return 0;
167 
168                 /* In case of repair and re-using TIME-WAIT sockets we still
169                  * want to be sure that it is safe as above but honor the
170                  * sequence numbers and time stamps set as part of the repair
171                  * process.
172                  *
173                  * Without this check re-using a TIME-WAIT socket with TCP
174                  * repair would accumulate a -1 on the repair assigned
175                  * sequence number. The first time it is reused the sequence
176                  * is -1, the second time -2, etc. This fixes that issue
177                  * without appearing to create any others.
178                  */
179                 if (likely(!tp->repair)) {
180                         u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
181 
182                         if (!seq)
183                                 seq = 1;
184                         WRITE_ONCE(tp->write_seq, seq);
185                         tp->rx_opt.ts_recent       = READ_ONCE(tcptw->tw_ts_recent);
186                         tp->rx_opt.ts_recent_stamp = ts_recent_stamp;
187                 }
188 
189                 return 1;
190         }
191 
192         return 0;
193 }
194 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
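
/* Illustration: a minimal stand-alone sketch of the sequence-number bump done
 * above when a TIME-WAIT socket is reused.  The new write_seq is placed
 * 65535 + 2 bytes past the old tw_snd_nxt so it lands beyond anything the
 * previous incarnation could still have in flight, and a result of 0 is
 * avoided because 0 doubles as the "not yet chosen" sentinel checked in
 * tcp_v4_connect().  Plain ISO C, u32 arithmetic only; values are arbitrary.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t next_write_seq(uint32_t tw_snd_nxt)
{
        uint32_t seq = tw_snd_nxt + 65535 + 2;  /* may wrap modulo 2^32 */

        if (!seq)               /* keep 0 free as the "unset" sentinel */
                seq = 1;
        return seq;
}

int main(void)
{
        /* an ordinary case, and a case that wraps to exactly 0 */
        printf("%u\n", next_write_seq(1000u));
        printf("%u\n", next_write_seq(0xFFFFFFFFu - 65536u));
        return 0;
}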
195 
196 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
197                               int addr_len)
198 {
199         /* This check is replicated from tcp_v4_connect() and intended to
200          * prevent BPF program called below from accessing bytes that are out
201          * of the bound specified by user in addr_len.
202          */
203         if (addr_len < sizeof(struct sockaddr_in))
204                 return -EINVAL;
205 
206         sock_owned_by_me(sk);
207 
208         return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
209 }
210 
211 /* This will initiate an outgoing connection. */
212 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
213 {
214         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
215         struct inet_timewait_death_row *tcp_death_row;
216         struct inet_sock *inet = inet_sk(sk);
217         struct tcp_sock *tp = tcp_sk(sk);
218         struct ip_options_rcu *inet_opt;
219         struct net *net = sock_net(sk);
220         __be16 orig_sport, orig_dport;
221         __be32 daddr, nexthop;
222         struct flowi4 *fl4;
223         struct rtable *rt;
224         int err;
225 
226         if (addr_len < sizeof(struct sockaddr_in))
227                 return -EINVAL;
228 
229         if (usin->sin_family != AF_INET)
230                 return -EAFNOSUPPORT;
231 
232         nexthop = daddr = usin->sin_addr.s_addr;
233         inet_opt = rcu_dereference_protected(inet->inet_opt,
234                                              lockdep_sock_is_held(sk));
235         if (inet_opt && inet_opt->opt.srr) {
236                 if (!daddr)
237                         return -EINVAL;
238                 nexthop = inet_opt->opt.faddr;
239         }
240 
241         orig_sport = inet->inet_sport;
242         orig_dport = usin->sin_port;
243         fl4 = &inet->cork.fl.u.ip4;
244         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
245                               sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
246                               orig_dport, sk);
247         if (IS_ERR(rt)) {
248                 err = PTR_ERR(rt);
249                 if (err == -ENETUNREACH)
250                         IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
251                 return err;
252         }
253 
254         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
255                 ip_rt_put(rt);
256                 return -ENETUNREACH;
257         }
258 
259         if (!inet_opt || !inet_opt->opt.srr)
260                 daddr = fl4->daddr;
261 
262         tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
263 
264         if (!inet->inet_saddr) {
265                 err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
266                 if (err) {
267                         ip_rt_put(rt);
268                         return err;
269                 }
270         } else {
271                 sk_rcv_saddr_set(sk, inet->inet_saddr);
272         }
273 
274         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
275                 /* Reset inherited state */
276                 tp->rx_opt.ts_recent       = 0;
277                 tp->rx_opt.ts_recent_stamp = 0;
278                 if (likely(!tp->repair))
279                         WRITE_ONCE(tp->write_seq, 0);
280         }
281 
282         inet->inet_dport = usin->sin_port;
283         sk_daddr_set(sk, daddr);
284 
285         inet_csk(sk)->icsk_ext_hdr_len = 0;
286         if (inet_opt)
287                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
288 
289         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
290 
291         /* Socket identity is still unknown (sport may be zero).
 292          * However, we set the state to SYN-SENT and, without releasing the
 293          * socket lock, select a source port, enter ourselves into the hash
 294          * tables and complete initialization after this.
295          */
296         tcp_set_state(sk, TCP_SYN_SENT);
297         err = inet_hash_connect(tcp_death_row, sk);
298         if (err)
299                 goto failure;
300 
301         sk_set_txhash(sk);
302 
303         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
304                                inet->inet_sport, inet->inet_dport, sk);
305         if (IS_ERR(rt)) {
306                 err = PTR_ERR(rt);
307                 rt = NULL;
308                 goto failure;
309         }
310         tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
311         /* OK, now commit destination to socket.  */
312         sk->sk_gso_type = SKB_GSO_TCPV4;
313         sk_setup_caps(sk, &rt->dst);
314         rt = NULL;
315 
316         if (likely(!tp->repair)) {
317                 if (!tp->write_seq)
318                         WRITE_ONCE(tp->write_seq,
319                                    secure_tcp_seq(inet->inet_saddr,
320                                                   inet->inet_daddr,
321                                                   inet->inet_sport,
322                                                   usin->sin_port));
323                 WRITE_ONCE(tp->tsoffset,
324                            secure_tcp_ts_off(net, inet->inet_saddr,
325                                              inet->inet_daddr));
326         }
327 
328         atomic_set(&inet->inet_id, get_random_u16());
329 
330         if (tcp_fastopen_defer_connect(sk, &err))
331                 return err;
332         if (err)
333                 goto failure;
334 
335         err = tcp_connect(sk);
336 
337         if (err)
338                 goto failure;
339 
340         return 0;
341 
342 failure:
343         /*
344          * This unhashes the socket and releases the local port,
345          * if necessary.
346          */
347         tcp_set_state(sk, TCP_CLOSE);
348         inet_bhash2_reset_saddr(sk);
349         ip_rt_put(rt);
350         sk->sk_route_caps = 0;
351         inet->inet_dport = 0;
352         return err;
353 }
354 EXPORT_SYMBOL(tcp_v4_connect);
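
/* Illustration: a minimal user-space sketch of the connect() call that the
 * AF_INET stream-socket path above services.  The destination address and
 * port are arbitrary examples (192.0.2.1:80, from the TEST-NET-1 range).
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        struct sockaddr_in dst;
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        if (fd < 0) {
                perror("socket");
                return 1;
        }

        memset(&dst, 0, sizeof(dst));
        dst.sin_family = AF_INET;
        dst.sin_port = htons(80);
        inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);

        /* addr_len must cover a full sockaddr_in, as checked above */
        if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0)
                perror("connect");

        close(fd);
        return 0;
}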
355 
356 /*
357  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
358  * It can be called through tcp_release_cb() if socket was owned by user
359  * at the time tcp_v4_err() was called to handle ICMP message.
360  */
361 void tcp_v4_mtu_reduced(struct sock *sk)
362 {
363         struct inet_sock *inet = inet_sk(sk);
364         struct dst_entry *dst;
365         u32 mtu;
366 
367         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
368                 return;
369         mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
370         dst = inet_csk_update_pmtu(sk, mtu);
371         if (!dst)
372                 return;
373 
 374         /* Something is about to go wrong... Remember the soft error
 375          * in case this connection is unable to recover.
376          */
377         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
378                 WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
379 
380         mtu = dst_mtu(dst);
381 
382         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
383             ip_sk_accept_pmtu(sk) &&
384             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
385                 tcp_sync_mss(sk, mtu);
386 
387                 /* Resend the TCP packet because it's
388                  * clear that the old packet has been
389                  * dropped. This is the new "fast" path mtu
390                  * discovery.
391                  */
392                 tcp_simple_retransmit(sk);
393         } /* else let the usual retransmit timer handle it */
394 }
395 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
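
/* Illustration: a minimal user-space sketch of opting a TCP socket into Path
 * MTU discovery and reading the path MTU the kernel has learned (the value
 * tcp_v4_mtu_reduced() above reacts to).  Assumes glibc's <netinet/in.h>
 * exposing IP_MTU_DISCOVER / IP_PMTUDISC_DO / IP_MTU; the destination
 * 192.0.2.1:80 is an arbitrary example.
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        int fd = socket(AF_INET, SOCK_STREAM, 0);
        int pmtudisc = IP_PMTUDISC_DO;  /* set DF, let ICMP_FRAG_NEEDED drive the MSS */
        struct sockaddr_in dst = { .sin_family = AF_INET, .sin_port = htons(80) };
        int mtu;
        socklen_t len = sizeof(mtu);

        if (fd < 0) {
                perror("socket");
                return 1;
        }
        inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);

        setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &pmtudisc, sizeof(pmtudisc));

        if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) == 0 &&
            getsockopt(fd, IPPROTO_IP, IP_MTU, &mtu, &len) == 0)
                printf("path MTU: %d\n", mtu);

        close(fd);
        return 0;
}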
396 
397 static void do_redirect(struct sk_buff *skb, struct sock *sk)
398 {
399         struct dst_entry *dst = __sk_dst_check(sk, 0);
400 
401         if (dst)
402                 dst->ops->redirect(dst, sk, skb);
403 }
404 
405 
406 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
407 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
408 {
409         struct request_sock *req = inet_reqsk(sk);
410         struct net *net = sock_net(sk);
411 
412         /* ICMPs are not backlogged, hence we cannot get
413          * an established socket here.
414          */
415         if (seq != tcp_rsk(req)->snt_isn) {
416                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
417         } else if (abort) {
418                 /*
419                  * Still in SYN_RECV, just remove it silently.
420                  * There is no good way to pass the error to the newly
421                  * created socket, and POSIX does not want network
422                  * errors returned from accept().
423                  */
424                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
425                 tcp_listendrop(req->rsk_listener);
426         }
427         reqsk_put(req);
428 }
429 EXPORT_SYMBOL(tcp_req_err);
430 
431 /* TCP-LD (RFC 6069) logic */
432 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
433 {
434         struct inet_connection_sock *icsk = inet_csk(sk);
435         struct tcp_sock *tp = tcp_sk(sk);
436         struct sk_buff *skb;
437         s32 remaining;
438         u32 delta_us;
439 
440         if (sock_owned_by_user(sk))
441                 return;
442 
443         if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
444             !icsk->icsk_backoff)
445                 return;
446 
447         skb = tcp_rtx_queue_head(sk);
448         if (WARN_ON_ONCE(!skb))
449                 return;
450 
451         icsk->icsk_backoff--;
452         icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
453         icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
454 
455         tcp_mstamp_refresh(tp);
456         delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
457         remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
458 
459         if (remaining > 0) {
460                 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
461                                           remaining, TCP_RTO_MAX);
462         } else {
463                 /* RTO revert clocked out retransmission.
464                  * Will retransmit now.
465                  */
466                 tcp_retransmit_timer(sk);
467         }
468 }
469 EXPORT_SYMBOL(tcp_ld_RTO_revert);
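
/* Illustration: a stand-alone sketch of the backoff arithmetic reverted above
 * (RFC 6069).  inet_csk_rto_backoff() amounts to min(base << backoff, max),
 * so decrementing the backoff halves the pending RTO, and the retransmit
 * timer is re-armed with whatever part of that smaller RTO has not already
 * elapsed.  Values are in milliseconds and purely illustrative; the cap
 * mirrors TCP_RTO_MAX (120 s).
 */
#include <stdint.h>
#include <stdio.h>

#define EX_RTO_MAX_MS  120000u

static uint32_t rto_backoff_ms(uint32_t base_rto_ms, unsigned int backoff)
{
        uint64_t rto = (uint64_t)base_rto_ms << backoff;

        return rto > EX_RTO_MAX_MS ? EX_RTO_MAX_MS : (uint32_t)rto;
}

int main(void)
{
        uint32_t base = 200;            /* srtt-derived RTO of 200 ms */
        unsigned int backoff = 4;       /* four unanswered retransmits so far */
        uint32_t elapsed = 900;         /* ms since the head of the rtx queue was sent */

        uint32_t before = rto_backoff_ms(base, backoff);
        uint32_t after  = rto_backoff_ms(base, backoff - 1);
        int32_t remaining = (int32_t)(after - elapsed);

        printf("rto before revert: %u ms, after: %u ms\n", before, after);
        if (remaining > 0)
                printf("re-arm retransmit timer in %d ms\n", remaining);
        else
                printf("revert clocked out the retransmission: retransmit now\n");
        return 0;
}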
470 
471 /*
472  * This routine is called by the ICMP module when it gets some
473  * sort of error condition.  If err < 0 then the socket should
474  * be closed and the error returned to the user.  If err > 0
475  * it's just the icmp type << 8 | icmp code.  After adjustment
476  * header points to the first 8 bytes of the tcp header.  We need
477  * to find the appropriate port.
478  *
479  * The locking strategy used here is very "optimistic". When
480  * someone else accesses the socket the ICMP is just dropped
481  * and for some paths there is no check at all.
482  * A more general error queue to queue errors for later handling
483  * is probably better.
484  *
485  */
486 
487 int tcp_v4_err(struct sk_buff *skb, u32 info)
488 {
489         const struct iphdr *iph = (const struct iphdr *)skb->data;
490         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
491         struct tcp_sock *tp;
492         const int type = icmp_hdr(skb)->type;
493         const int code = icmp_hdr(skb)->code;
494         struct sock *sk;
495         struct request_sock *fastopen;
496         u32 seq, snd_una;
497         int err;
498         struct net *net = dev_net(skb->dev);
499 
500         sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
501                                        iph->daddr, th->dest, iph->saddr,
502                                        ntohs(th->source), inet_iif(skb), 0);
503         if (!sk) {
504                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
505                 return -ENOENT;
506         }
507         if (sk->sk_state == TCP_TIME_WAIT) {
508                 /* To increase the counter of ignored icmps for TCP-AO */
509                 tcp_ao_ignore_icmp(sk, AF_INET, type, code);
510                 inet_twsk_put(inet_twsk(sk));
511                 return 0;
512         }
513         seq = ntohl(th->seq);
514         if (sk->sk_state == TCP_NEW_SYN_RECV) {
515                 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
516                                      type == ICMP_TIME_EXCEEDED ||
517                                      (type == ICMP_DEST_UNREACH &&
518                                       (code == ICMP_NET_UNREACH ||
519                                        code == ICMP_HOST_UNREACH)));
520                 return 0;
521         }
522 
523         if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
524                 sock_put(sk);
525                 return 0;
526         }
527 
528         bh_lock_sock(sk);
529         /* If too many ICMPs get dropped on busy
530          * servers this needs to be solved differently.
531          * We do take care of PMTU discovery (RFC1191) special case :
532          * we can receive locally generated ICMP messages while socket is held.
533          */
534         if (sock_owned_by_user(sk)) {
535                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
536                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
537         }
538         if (sk->sk_state == TCP_CLOSE)
539                 goto out;
540 
541         if (static_branch_unlikely(&ip4_min_ttl)) {
542                 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
543                 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
544                         __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
545                         goto out;
546                 }
547         }
548 
549         tp = tcp_sk(sk);
550         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
551         fastopen = rcu_dereference(tp->fastopen_rsk);
552         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
553         if (sk->sk_state != TCP_LISTEN &&
554             !between(seq, snd_una, tp->snd_nxt)) {
555                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
556                 goto out;
557         }
558 
559         switch (type) {
560         case ICMP_REDIRECT:
561                 if (!sock_owned_by_user(sk))
562                         do_redirect(skb, sk);
563                 goto out;
564         case ICMP_SOURCE_QUENCH:
565                 /* Just silently ignore these. */
566                 goto out;
567         case ICMP_PARAMETERPROB:
568                 err = EPROTO;
569                 break;
570         case ICMP_DEST_UNREACH:
571                 if (code > NR_ICMP_UNREACH)
572                         goto out;
573 
574                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
575                         /* We are not interested in TCP_LISTEN and open_requests
 576                          * (SYN-ACKs sent out by Linux are always < 576 bytes, so
577                          * they should go through unfragmented).
578                          */
579                         if (sk->sk_state == TCP_LISTEN)
580                                 goto out;
581 
582                         WRITE_ONCE(tp->mtu_info, info);
583                         if (!sock_owned_by_user(sk)) {
584                                 tcp_v4_mtu_reduced(sk);
585                         } else {
586                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
587                                         sock_hold(sk);
588                         }
589                         goto out;
590                 }
591 
592                 err = icmp_err_convert[code].errno;
593                 /* check if this ICMP message allows revert of backoff.
594                  * (see RFC 6069)
595                  */
596                 if (!fastopen &&
597                     (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
598                         tcp_ld_RTO_revert(sk, seq);
599                 break;
600         case ICMP_TIME_EXCEEDED:
601                 err = EHOSTUNREACH;
602                 break;
603         default:
604                 goto out;
605         }
606 
607         switch (sk->sk_state) {
608         case TCP_SYN_SENT:
609         case TCP_SYN_RECV:
610                 /* Only in fast or simultaneous open. If a fast open socket is
611                  * already accepted it is treated as a connected one below.
612                  */
613                 if (fastopen && !fastopen->sk)
614                         break;
615 
616                 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
617 
618                 if (!sock_owned_by_user(sk))
619                         tcp_done_with_error(sk, err);
620                 else
621                         WRITE_ONCE(sk->sk_err_soft, err);
622                 goto out;
623         }
624 
625         /* If we've already connected we will keep trying
626          * until we time out, or the user gives up.
627          *
628          * rfc1122 4.2.3.9 allows to consider as hard errors
629          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
630          * but it is obsoleted by pmtu discovery).
631          *
632          * Note, that in modern internet, where routing is unreliable
633          * and in each dark corner broken firewalls sit, sending random
 634          * errors ordered by their masters even these two messages finally lose
635          * their original sense (even Linux sends invalid PORT_UNREACHs)
636          *
637          * Now we are in compliance with RFCs.
638          *                                                      --ANK (980905)
639          */
640 
641         if (!sock_owned_by_user(sk) &&
642             inet_test_bit(RECVERR, sk)) {
643                 WRITE_ONCE(sk->sk_err, err);
644                 sk_error_report(sk);
645         } else  { /* Only an error on timeout */
646                 WRITE_ONCE(sk->sk_err_soft, err);
647         }
648 
649 out:
650         bh_unlock_sock(sk);
651         sock_put(sk);
652         return 0;
653 }
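
/* Illustration: a minimal user-space sketch of collecting the errors queued
 * above.  For an already-established connection the ICMP-derived error only
 * becomes a hard error right away when IP_RECVERR is set (otherwise it stays
 * a soft error until the connection times out); a pending socket error is
 * read with getsockopt(SO_ERROR) once poll() flags the socket.  Destination
 * 192.0.2.1:9 is an arbitrary, normally unreachable example.
 */
#include <arpa/inet.h>
#include <fcntl.h>
#include <netinet/in.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        int fd = socket(AF_INET, SOCK_STREAM, 0);
        int on = 1, err = 0;
        socklen_t len = sizeof(err);
        struct sockaddr_in dst = { .sin_family = AF_INET, .sin_port = htons(9) };
        struct pollfd pfd = { .fd = fd, .events = POLLOUT };

        if (fd < 0)
                return 1;
        inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
        setsockopt(fd, IPPROTO_IP, IP_RECVERR, &on, sizeof(on));
        fcntl(fd, F_SETFL, O_NONBLOCK);

        connect(fd, (struct sockaddr *)&dst, sizeof(dst));  /* returns -1/EINPROGRESS */
        poll(&pfd, 1, 5000);                                /* wait for success or error */

        if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len) == 0 && err)
                fprintf(stderr, "connect failed: %s\n", strerror(err));

        close(fd);
        return 0;
}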
654 
655 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
656 {
657         struct tcphdr *th = tcp_hdr(skb);
658 
659         th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
660         skb->csum_start = skb_transport_header(skb) - skb->head;
661         skb->csum_offset = offsetof(struct tcphdr, check);
662 }
663 
664 /* This routine computes an IPv4 TCP checksum. */
665 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
666 {
667         const struct inet_sock *inet = inet_sk(sk);
668 
669         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
670 }
671 EXPORT_SYMBOL(tcp_v4_send_check);
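
/* Illustration: a stand-alone sketch of the IPv4 pseudo-header sum that
 * __tcp_v4_send_check() above seeds into th->check before pointing
 * csum_start/csum_offset at the real header; the device (or the software
 * fallback) finishes the checksum over the TCP header and payload.  This is
 * just the RFC 1071 arithmetic, not the kernel helpers.
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdint.h>
#include <stdio.h>

/* ones'-complement sum of saddr, daddr, protocol and TCP length (host order) */
static uint16_t pseudo_hdr_sum(uint32_t saddr, uint32_t daddr, uint16_t tcp_len)
{
        uint64_t sum = 0;

        sum += saddr >> 16;
        sum += saddr & 0xffff;
        sum += daddr >> 16;
        sum += daddr & 0xffff;
        sum += IPPROTO_TCP;     /* zero byte followed by the protocol number */
        sum += tcp_len;

        while (sum >> 16)       /* fold the carries back in */
                sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)sum;
}

int main(void)
{
        uint32_t saddr, daddr;

        inet_pton(AF_INET, "192.0.2.1", &saddr);
        inet_pton(AF_INET, "198.51.100.2", &daddr);

        /* 20 == a bare TCP header; the real code uses skb->len */
        printf("pseudo-header seed: 0x%04x\n",
               pseudo_hdr_sum(ntohl(saddr), ntohl(daddr), 20));
        return 0;
}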
672 
673 #define REPLY_OPTIONS_LEN      (MAX_TCP_OPTION_SPACE / sizeof(__be32))
674 
675 static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
676                                  const struct tcp_ao_hdr *aoh,
677                                  struct ip_reply_arg *arg, struct tcphdr *reply,
678                                  __be32 reply_options[REPLY_OPTIONS_LEN])
679 {
680 #ifdef CONFIG_TCP_AO
681         int sdif = tcp_v4_sdif(skb);
682         int dif = inet_iif(skb);
683         int l3index = sdif ? dif : 0;
684         bool allocated_traffic_key;
685         struct tcp_ao_key *key;
686         char *traffic_key;
687         bool drop = true;
688         u32 ao_sne = 0;
689         u8 keyid;
690 
691         rcu_read_lock();
692         if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
693                                  &key, &traffic_key, &allocated_traffic_key,
694                                  &keyid, &ao_sne))
695                 goto out;
696 
697         reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
698                                  (aoh->rnext_keyid << 8) | keyid);
699         arg->iov[0].iov_len += tcp_ao_len_aligned(key);
700         reply->doff = arg->iov[0].iov_len / 4;
701 
702         if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
703                             key, traffic_key,
704                             (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
705                             (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
706                             reply, ao_sne))
707                 goto out;
708         drop = false;
709 out:
710         rcu_read_unlock();
711         if (allocated_traffic_key)
712                 kfree(traffic_key);
713         return drop;
714 #else
715         return true;
716 #endif
717 }
718 
719 /*
720  *      This routine will send an RST to the other tcp.
721  *
 722  *      Someone asks: why do we NEVER use socket parameters (TOS, TTL etc.)
 723  *                    for the reset?
 724  *      Answer: if a packet caused an RST, it is not for a socket
 725  *              existing in our system; if it is matched to a socket,
 726  *              it is just a duplicate segment or a bug in the other side's TCP.
 727  *              So we build the reply based only on the parameters
 728  *              that arrived with the segment.
729  *      Exception: precedence violation. We do not implement it in any case.
730  */
731 
732 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
733                               enum sk_rst_reason reason)
734 {
735         const struct tcphdr *th = tcp_hdr(skb);
736         struct {
737                 struct tcphdr th;
738                 __be32 opt[REPLY_OPTIONS_LEN];
739         } rep;
740         const __u8 *md5_hash_location = NULL;
741         const struct tcp_ao_hdr *aoh;
742         struct ip_reply_arg arg;
743 #ifdef CONFIG_TCP_MD5SIG
744         struct tcp_md5sig_key *key = NULL;
745         unsigned char newhash[16];
746         struct sock *sk1 = NULL;
747         int genhash;
748 #endif
749         u64 transmit_time = 0;
750         struct sock *ctl_sk;
751         struct net *net;
752         u32 txhash = 0;
753 
754         /* Never send a reset in response to a reset. */
755         if (th->rst)
756                 return;
757 
758         /* If sk not NULL, it means we did a successful lookup and incoming
759          * route had to be correct. prequeue might have dropped our dst.
760          */
761         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
762                 return;
763 
764         /* Swap the send and the receive. */
765         memset(&rep, 0, sizeof(rep));
766         rep.th.dest   = th->source;
767         rep.th.source = th->dest;
768         rep.th.doff   = sizeof(struct tcphdr) / 4;
769         rep.th.rst    = 1;
770 
771         if (th->ack) {
772                 rep.th.seq = th->ack_seq;
773         } else {
774                 rep.th.ack = 1;
775                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
776                                        skb->len - (th->doff << 2));
777         }
778 
779         memset(&arg, 0, sizeof(arg));
780         arg.iov[0].iov_base = (unsigned char *)&rep;
781         arg.iov[0].iov_len  = sizeof(rep.th);
782 
783         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
784 
785         /* Invalid TCP option size or twice included auth */
786         if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
787                 return;
788 
789         if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
790                 return;
791 
792 #ifdef CONFIG_TCP_MD5SIG
793         rcu_read_lock();
794         if (sk && sk_fullsock(sk)) {
795                 const union tcp_md5_addr *addr;
796                 int l3index;
797 
798                 /* sdif set, means packet ingressed via a device
799                  * in an L3 domain and inet_iif is set to it.
800                  */
801                 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
802                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
803                 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
804         } else if (md5_hash_location) {
805                 const union tcp_md5_addr *addr;
806                 int sdif = tcp_v4_sdif(skb);
807                 int dif = inet_iif(skb);
808                 int l3index;
809 
810                 /*
 811                  * active side is lost. Try to find the listening socket through
 812                  * the source port, and then find the md5 key through that socket.
 813                  * We are not loosening security here:
 814                  * the incoming packet is checked against the md5 hash computed
 815                  * with the found key; no RST is generated if the hash doesn't match.
816                  */
817                 sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
818                                              NULL, 0, ip_hdr(skb)->saddr,
819                                              th->source, ip_hdr(skb)->daddr,
820                                              ntohs(th->source), dif, sdif);
821                 /* don't send rst if it can't find key */
822                 if (!sk1)
823                         goto out;
824 
825                 /* sdif set, means packet ingressed via a device
826                  * in an L3 domain and dif is set to it.
827                  */
828                 l3index = sdif ? dif : 0;
829                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
830                 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
831                 if (!key)
832                         goto out;
833 
834 
835                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
836                 if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
837                         goto out;
838 
839         }
840 
841         if (key) {
842                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
843                                    (TCPOPT_NOP << 16) |
844                                    (TCPOPT_MD5SIG << 8) |
845                                    TCPOLEN_MD5SIG);
846                 /* Update length and the length the header thinks exists */
847                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
848                 rep.th.doff = arg.iov[0].iov_len / 4;
849 
850                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
851                                      key, ip_hdr(skb)->saddr,
852                                      ip_hdr(skb)->daddr, &rep.th);
853         }
854 #endif
855         /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
856         if (rep.opt[0] == 0) {
857                 __be32 mrst = mptcp_reset_option(skb);
858 
859                 if (mrst) {
860                         rep.opt[0] = mrst;
861                         arg.iov[0].iov_len += sizeof(mrst);
862                         rep.th.doff = arg.iov[0].iov_len / 4;
863                 }
864         }
865 
866         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
867                                       ip_hdr(skb)->saddr, /* XXX */
868                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
869         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
870         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
871 
 872         /* When the socket is gone, all binding information is lost.
 873          * Routing might fail in this case. No choice here: if we choose to force
 874          * the input interface, we will misroute in the case of an asymmetric route.
875          */
876         if (sk)
877                 arg.bound_dev_if = sk->sk_bound_dev_if;
878 
879         trace_tcp_send_reset(sk, skb, reason);
880 
881         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
882                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
883 
884         arg.tos = ip_hdr(skb)->tos;
885         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
886         local_bh_disable();
887         local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
888         ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
889 
890         sock_net_set(ctl_sk, net);
891         if (sk) {
892                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
893                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
894                 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
895                                    inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
896                 transmit_time = tcp_transmit_time(sk);
897                 xfrm_sk_clone_policy(ctl_sk, sk);
898                 txhash = (sk->sk_state == TCP_TIME_WAIT) ?
899                          inet_twsk(sk)->tw_txhash : sk->sk_txhash;
900         } else {
901                 ctl_sk->sk_mark = 0;
902                 ctl_sk->sk_priority = 0;
903         }
904         ip_send_unicast_reply(ctl_sk,
905                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
906                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
907                               &arg, arg.iov[0].iov_len,
908                               transmit_time, txhash);
909 
910         xfrm_sk_free_policy(ctl_sk);
911         sock_net_set(ctl_sk, &init_net);
912         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
913         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
914         local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
915         local_bh_enable();
916 
917 #ifdef CONFIG_TCP_MD5SIG
918 out:
919         rcu_read_unlock();
920 #endif
921 }
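
/* Illustration: a stand-alone sketch of the rep.th.ack_seq arithmetic used
 * above when the offending segment carried no ACK: the RST acknowledges
 * exactly the sequence space that segment consumed (SYN and FIN each count
 * for one, plus the payload length).  Field values are arbitrary examples.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t rst_ack_seq(uint32_t seg_seq, int syn, int fin,
                            uint32_t skb_len, unsigned int doff_words)
{
        uint32_t payload = skb_len - (doff_words << 2);   /* skb->len minus TCP header */

        return seg_seq + syn + fin + payload;
}

int main(void)
{
        /* a 100-byte segment: 20-byte TCP header, 80 bytes of data, no SYN/FIN */
        printf("ack_seq = %u\n", rst_ack_seq(5000u, 0, 0, 100u, 5));
        /* a bare SYN: consumes one sequence number */
        printf("ack_seq = %u\n", rst_ack_seq(7000u, 1, 0, 20u, 5));
        return 0;
}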
922 
 923 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
 924    outside of socket context, is certainly ugly. What can I do?
925  */
926 
927 static void tcp_v4_send_ack(const struct sock *sk,
928                             struct sk_buff *skb, u32 seq, u32 ack,
929                             u32 win, u32 tsval, u32 tsecr, int oif,
930                             struct tcp_key *key,
931                             int reply_flags, u8 tos, u32 txhash)
932 {
933         const struct tcphdr *th = tcp_hdr(skb);
934         struct {
935                 struct tcphdr th;
936                 __be32 opt[(MAX_TCP_OPTION_SPACE  >> 2)];
937         } rep;
938         struct net *net = sock_net(sk);
939         struct ip_reply_arg arg;
940         struct sock *ctl_sk;
941         u64 transmit_time;
942 
943         memset(&rep.th, 0, sizeof(struct tcphdr));
944         memset(&arg, 0, sizeof(arg));
945 
946         arg.iov[0].iov_base = (unsigned char *)&rep;
947         arg.iov[0].iov_len  = sizeof(rep.th);
948         if (tsecr) {
949                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
950                                    (TCPOPT_TIMESTAMP << 8) |
951                                    TCPOLEN_TIMESTAMP);
952                 rep.opt[1] = htonl(tsval);
953                 rep.opt[2] = htonl(tsecr);
954                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
955         }
956 
957         /* Swap the send and the receive. */
958         rep.th.dest    = th->source;
959         rep.th.source  = th->dest;
960         rep.th.doff    = arg.iov[0].iov_len / 4;
961         rep.th.seq     = htonl(seq);
962         rep.th.ack_seq = htonl(ack);
963         rep.th.ack     = 1;
964         rep.th.window  = htons(win);
965 
966 #ifdef CONFIG_TCP_MD5SIG
967         if (tcp_key_is_md5(key)) {
968                 int offset = (tsecr) ? 3 : 0;
969 
970                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
971                                           (TCPOPT_NOP << 16) |
972                                           (TCPOPT_MD5SIG << 8) |
973                                           TCPOLEN_MD5SIG);
974                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
975                 rep.th.doff = arg.iov[0].iov_len/4;
976 
977                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
978                                     key->md5_key, ip_hdr(skb)->saddr,
979                                     ip_hdr(skb)->daddr, &rep.th);
980         }
981 #endif
982 #ifdef CONFIG_TCP_AO
983         if (tcp_key_is_ao(key)) {
984                 int offset = (tsecr) ? 3 : 0;
985 
986                 rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
987                                           (tcp_ao_len(key->ao_key) << 16) |
988                                           (key->ao_key->sndid << 8) |
989                                           key->rcv_next);
990                 arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
991                 rep.th.doff = arg.iov[0].iov_len / 4;
992 
993                 tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
994                                 key->ao_key, key->traffic_key,
995                                 (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
996                                 (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
997                                 &rep.th, key->sne);
998         }
999 #endif
1000         arg.flags = reply_flags;
1001         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
1002                                       ip_hdr(skb)->saddr, /* XXX */
1003                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
1004         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1005         if (oif)
1006                 arg.bound_dev_if = oif;
1007         arg.tos = tos;
1008         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
1009         local_bh_disable();
1010         local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
1011         ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
1012         sock_net_set(ctl_sk, net);
1013         ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
1014                            inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
1015         ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
1016                            inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
1017         transmit_time = tcp_transmit_time(sk);
1018         ip_send_unicast_reply(ctl_sk,
1019                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
1020                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
1021                               &arg, arg.iov[0].iov_len,
1022                               transmit_time, txhash);
1023 
1024         sock_net_set(ctl_sk, &init_net);
1025         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
1026         local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
1027         local_bh_enable();
1028 }
1029 
1030 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1031 {
1032         struct inet_timewait_sock *tw = inet_twsk(sk);
1033         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
1034         struct tcp_key key = {};
1035 #ifdef CONFIG_TCP_AO
1036         struct tcp_ao_info *ao_info;
1037 
1038         if (static_branch_unlikely(&tcp_ao_needed.key)) {
1039                 /* FIXME: the segment to-be-acked is not verified yet */
1040                 ao_info = rcu_dereference(tcptw->ao_info);
1041                 if (ao_info) {
1042                         const struct tcp_ao_hdr *aoh;
1043 
1044                         if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
1045                                 inet_twsk_put(tw);
1046                                 return;
1047                         }
1048 
1049                         if (aoh)
1050                                 key.ao_key = tcp_ao_established_key(ao_info, aoh->rnext_keyid, -1);
1051                 }
1052         }
1053         if (key.ao_key) {
1054                 struct tcp_ao_key *rnext_key;
1055 
1056                 key.traffic_key = snd_other_key(key.ao_key);
1057                 key.sne = READ_ONCE(ao_info->snd_sne);
1058                 rnext_key = READ_ONCE(ao_info->rnext_key);
1059                 key.rcv_next = rnext_key->rcvid;
1060                 key.type = TCP_KEY_AO;
1061 #else
1062         if (0) {
1063 #endif
1064         } else if (static_branch_tcp_md5()) {
1065                 key.md5_key = tcp_twsk_md5_key(tcptw);
1066                 if (key.md5_key)
1067                         key.type = TCP_KEY_MD5;
1068         }
1069 
1070         tcp_v4_send_ack(sk, skb,
1071                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
1072                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
1073                         tcp_tw_tsval(tcptw),
1074                         READ_ONCE(tcptw->tw_ts_recent),
1075                         tw->tw_bound_dev_if, &key,
1076                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
1077                         tw->tw_tos,
1078                         tw->tw_txhash);
1079 
1080         inet_twsk_put(tw);
1081 }
1082 
1083 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
1084                                   struct request_sock *req)
1085 {
1086         struct tcp_key key = {};
1087 
1088         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
1089          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
1090          */
1091         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
1092                                              tcp_sk(sk)->snd_nxt;
1093 
1094 #ifdef CONFIG_TCP_AO
1095         if (static_branch_unlikely(&tcp_ao_needed.key) &&
1096             tcp_rsk_used_ao(req)) {
1097                 const union tcp_md5_addr *addr;
1098                 const struct tcp_ao_hdr *aoh;
1099                 int l3index;
1100 
1101                 /* Invalid TCP option size or twice included auth */
1102                 if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
1103                         return;
1104                 if (!aoh)
1105                         return;
1106 
1107                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1108                 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1109                 key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
1110                                               aoh->rnext_keyid, -1);
1111                 if (unlikely(!key.ao_key)) {
1112                         /* Send ACK with any matching MKT for the peer */
1113                         key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
1114                         /* Matching key disappeared (user removed the key?)
1115                          * let the handshake time out.
1116                          */
1117                         if (!key.ao_key) {
1118                                 net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
1119                                                      addr,
1120                                                      ntohs(tcp_hdr(skb)->source),
1121                                                      &ip_hdr(skb)->daddr,
1122                                                      ntohs(tcp_hdr(skb)->dest));
1123                                 return;
1124                         }
1125                 }
1126                 key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
1127                 if (!key.traffic_key)
1128                         return;
1129 
1130                 key.type = TCP_KEY_AO;
1131                 key.rcv_next = aoh->keyid;
1132                 tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
1133 #else
1134         if (0) {
1135 #endif
1136         } else if (static_branch_tcp_md5()) {
1137                 const union tcp_md5_addr *addr;
1138                 int l3index;
1139 
1140                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1141                 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1142                 key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1143                 if (key.md5_key)
1144                         key.type = TCP_KEY_MD5;
1145         }
1146 
1147         tcp_v4_send_ack(sk, skb, seq,
1148                         tcp_rsk(req)->rcv_nxt,
1149                         tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
1150                         tcp_rsk_tsval(tcp_rsk(req)),
1151                         READ_ONCE(req->ts_recent),
1152                         0, &key,
1153                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
1154                         ip_hdr(skb)->tos,
1155                         READ_ONCE(tcp_rsk(req)->txhash));
1156         if (tcp_key_is_ao(&key))
1157                 kfree(key.traffic_key);
1158 }
1159 
1160 /*
1161  *      Send a SYN-ACK after having received a SYN.
1162  *      This still operates on a request_sock only, not on a big
1163  *      socket.
1164  */
1165 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1166                               struct flowi *fl,
1167                               struct request_sock *req,
1168                               struct tcp_fastopen_cookie *foc,
1169                               enum tcp_synack_type synack_type,
1170                               struct sk_buff *syn_skb)
1171 {
1172         const struct inet_request_sock *ireq = inet_rsk(req);
1173         struct flowi4 fl4;
1174         int err = -1;
1175         struct sk_buff *skb;
1176         u8 tos;
1177 
1178         /* First, grab a route. */
1179         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1180                 return -1;
1181 
1182         skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1183 
1184         if (skb) {
1185                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1186 
1187                 tos = READ_ONCE(inet_sk(sk)->tos);
1188 
1189                 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1190                         tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1191                               (tos & INET_ECN_MASK);
1192 
1193                 if (!INET_ECN_is_capable(tos) &&
1194                     tcp_bpf_ca_needs_ecn((struct sock *)req))
1195                         tos |= INET_ECN_ECT_0;
1196 
1197                 rcu_read_lock();
1198                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1199                                             ireq->ir_rmt_addr,
1200                                             rcu_dereference(ireq->ireq_opt),
1201                                             tos);
1202                 rcu_read_unlock();
1203                 err = net_xmit_eval(err);
1204         }
1205 
1206         return err;
1207 }
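
/* Illustration: a stand-alone sketch of the tos mixing done above when
 * sysctl_tcp_reflect_tos is set: the DSCP bits are taken from the incoming
 * SYN while the two ECN bits stay under the listener's control (the ECN mask
 * is the low two bits of the tos byte).  Example values are arbitrary.
 */
#include <stdint.h>
#include <stdio.h>

#define EX_ECN_MASK 0x03u   /* low two bits of the tos/dsfield byte */

static uint8_t synack_tos(uint8_t syn_tos, uint8_t listener_tos)
{
        return (syn_tos & ~EX_ECN_MASK) | (listener_tos & EX_ECN_MASK);
}

int main(void)
{
        uint8_t syn_tos = 0xb8;        /* EF DSCP (46 << 2), ECN bits 00 */
        uint8_t listener_tos = 0x02;   /* default DSCP, ECT(0) */

        printf("SYN-ACK tos = 0x%02x\n", synack_tos(syn_tos, listener_tos));
        return 0;
}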
1208 
1209 /*
1210  *      IPv4 request_sock destructor.
1211  */
1212 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1213 {
1214         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1215 }
1216 
1217 #ifdef CONFIG_TCP_MD5SIG
1218 /*
1219  * RFC2385 MD5 checksumming requires a mapping of
1220  * IP address->MD5 Key.
1221  * We need to maintain these in the sk structure.
1222  */
1223 
1224 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
1225 EXPORT_SYMBOL(tcp_md5_needed);
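
/* Illustration: a minimal user-space sketch of installing an RFC 2385 key for
 * one peer with the TCP_MD5SIG socket option, which populates the per-socket
 * address->key mapping consulted below (requires CONFIG_TCP_MD5SIG).  Assumes
 * a glibc <netinet/tcp.h> that exposes struct tcp_md5sig (otherwise the
 * definition lives in <linux/tcp.h>); the peer address and key are arbitrary
 * examples and the peer must be configured with the same secret.
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        int fd = socket(AF_INET, SOCK_STREAM, 0);
        struct tcp_md5sig md5;
        struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
        static const char secret[] = "example-shared-secret";

        if (fd < 0)
                return 1;

        memset(&md5, 0, sizeof(md5));
        peer->sin_family = AF_INET;
        inet_pton(AF_INET, "192.0.2.1", &peer->sin_addr);
        md5.tcpm_keylen = sizeof(secret) - 1;
        memcpy(md5.tcpm_key, secret, md5.tcpm_keylen);

        if (setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5)) < 0)
                perror("setsockopt(TCP_MD5SIG)");

        /* segments exchanged with 192.0.2.1 are now signed/verified with the key */
        close(fd);
        return 0;
}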
1226 
1227 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1228 {
1229         if (!old)
1230                 return true;
1231 
1232         /* l3index always overrides non-l3index */
1233         if (old->l3index && new->l3index == 0)
1234                 return false;
1235         if (old->l3index == 0 && new->l3index)
1236                 return true;
1237 
1238         return old->prefixlen < new->prefixlen;
1239 }
1240 
1241 /* Find the Key structure for an address.  */
1242 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1243                                            const union tcp_md5_addr *addr,
1244                                            int family, bool any_l3index)
1245 {
1246         const struct tcp_sock *tp = tcp_sk(sk);
1247         struct tcp_md5sig_key *key;
1248         const struct tcp_md5sig_info *md5sig;
1249         __be32 mask;
1250         struct tcp_md5sig_key *best_match = NULL;
1251         bool match;
1252 
1253         /* caller either holds rcu_read_lock() or socket lock */
1254         md5sig = rcu_dereference_check(tp->md5sig_info,
1255                                        lockdep_sock_is_held(sk));
1256         if (!md5sig)
1257                 return NULL;
1258 
1259         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1260                                  lockdep_sock_is_held(sk)) {
1261                 if (key->family != family)
1262                         continue;
1263                 if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
1264                     key->l3index != l3index)
1265                         continue;
1266                 if (family == AF_INET) {
1267                         mask = inet_make_mask(key->prefixlen);
1268                         match = (key->addr.a4.s_addr & mask) ==
1269                                 (addr->a4.s_addr & mask);
1270 #if IS_ENABLED(CONFIG_IPV6)
1271                 } else if (family == AF_INET6) {
1272                         match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1273                                                   key->prefixlen);
1274 #endif
1275                 } else {
1276                         match = false;
1277                 }
1278 
1279                 if (match && better_md5_match(best_match, key))
1280                         best_match = key;
1281         }
1282         return best_match;
1283 }
1284 EXPORT_SYMBOL(__tcp_md5_do_lookup);
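
/* Illustration: a stand-alone sketch of the IPv4 prefix match used in the
 * lookup above; inet_make_mask(prefixlen) builds a network-order mask of
 * prefixlen leading one bits, and an address matches the key when the two
 * agree under that mask.  Addresses and prefix lengths are arbitrary.
 */
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t make_mask(int prefixlen)   /* network byte order, like inet_make_mask() */
{
        return prefixlen ? htonl(~0u << (32 - prefixlen)) : 0;
}

static int prefix_match(uint32_t key_addr, uint32_t addr, int prefixlen)
{
        uint32_t mask = make_mask(prefixlen);

        return (key_addr & mask) == (addr & mask);
}

int main(void)
{
        uint32_t key_addr, addr;

        inet_pton(AF_INET, "192.0.2.0", &key_addr);   /* key covering 192.0.2.0/24 */
        inet_pton(AF_INET, "192.0.2.77", &addr);

        printf("match /24: %d\n", prefix_match(key_addr, addr, 24));
        printf("match /32: %d\n", prefix_match(key_addr, addr, 32));
        return 0;
}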
1285 
1286 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1287                                                       const union tcp_md5_addr *addr,
1288                                                       int family, u8 prefixlen,
1289                                                       int l3index, u8 flags)
1290 {
1291         const struct tcp_sock *tp = tcp_sk(sk);
1292         struct tcp_md5sig_key *key;
1293         unsigned int size = sizeof(struct in_addr);
1294         const struct tcp_md5sig_info *md5sig;
1295 
1296         /* caller either holds rcu_read_lock() or socket lock */
1297         md5sig = rcu_dereference_check(tp->md5sig_info,
1298                                        lockdep_sock_is_held(sk));
1299         if (!md5sig)
1300                 return NULL;
1301 #if IS_ENABLED(CONFIG_IPV6)
1302         if (family == AF_INET6)
1303                 size = sizeof(struct in6_addr);
1304 #endif
1305         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1306                                  lockdep_sock_is_held(sk)) {
1307                 if (key->family != family)
1308                         continue;
1309                 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1310                         continue;
1311                 if (key->l3index != l3index)
1312                         continue;
1313                 if (!memcmp(&key->addr, addr, size) &&
1314                     key->prefixlen == prefixlen)
1315                         return key;
1316         }
1317         return NULL;
1318 }
1319 
1320 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1321                                          const struct sock *addr_sk)
1322 {
1323         const union tcp_md5_addr *addr;
1324         int l3index;
1325 
1326         l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1327                                                  addr_sk->sk_bound_dev_if);
1328         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1329         return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1330 }
1331 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1332 
1333 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1334 {
1335         struct tcp_sock *tp = tcp_sk(sk);
1336         struct tcp_md5sig_info *md5sig;
1337 
1338         md5sig = kmalloc(sizeof(*md5sig), gfp);
1339         if (!md5sig)
1340                 return -ENOMEM;
1341 
1342         sk_gso_disable(sk);
1343         INIT_HLIST_HEAD(&md5sig->head);
1344         rcu_assign_pointer(tp->md5sig_info, md5sig);
1345         return 0;
1346 }
1347 
1348 /* This can be called on a newly created socket, from other files */
1349 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1350                             int family, u8 prefixlen, int l3index, u8 flags,
1351                             const u8 *newkey, u8 newkeylen, gfp_t gfp)
1352 {
1353         /* Add Key to the list */
1354         struct tcp_md5sig_key *key;
1355         struct tcp_sock *tp = tcp_sk(sk);
1356         struct tcp_md5sig_info *md5sig;
1357 
1358         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1359         if (key) {
1360                 /* Pre-existing entry - just update that one.
1361                  * Note that the key might be used concurrently.
1362                  * data_race() tells KCSAN that we do not care about
1363                  * key mismatches, since changing the MD5 key on live
1364                  * flows can lead to packet drops.
1365                  */
1366                 data_race(memcpy(key->key, newkey, newkeylen));
1367 
1368                 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1369                  * Also note that a reader could observe the new key->keylen
1370                  * value but the old key->key[]; this is why we use __GFP_ZERO
1371                  * at sock_kmalloc() time below.
1372                  */
1373                 WRITE_ONCE(key->keylen, newkeylen);
1374 
1375                 return 0;
1376         }
1377 
1378         md5sig = rcu_dereference_protected(tp->md5sig_info,
1379                                            lockdep_sock_is_held(sk));
1380 
1381         key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1382         if (!key)
1383                 return -ENOMEM;
1384 
1385         memcpy(key->key, newkey, newkeylen);
1386         key->keylen = newkeylen;
1387         key->family = family;
1388         key->prefixlen = prefixlen;
1389         key->l3index = l3index;
1390         key->flags = flags;
1391         memcpy(&key->addr, addr,
1392                (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1393                                                                  sizeof(struct in_addr));
1394         hlist_add_head_rcu(&key->node, &md5sig->head);
1395         return 0;
1396 }
1397 
1398 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1399                    int family, u8 prefixlen, int l3index, u8 flags,
1400                    const u8 *newkey, u8 newkeylen)
1401 {
1402         struct tcp_sock *tp = tcp_sk(sk);
1403 
1404         if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1405                 if (tcp_md5_alloc_sigpool())
1406                         return -ENOMEM;
1407 
1408                 if (tcp_md5sig_info_add(sk, GFP_KERNEL)) {
1409                         tcp_md5_release_sigpool();
1410                         return -ENOMEM;
1411                 }
1412 
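                     /* static_branch_inc() fails only when the tcp_md5_needed
                      * jump label refcount cannot be raised any further
                      * (i.e. too many TCP-MD5 users); undo the allocation
                      * and report -EUSERS.
                      */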
1413                 if (!static_branch_inc(&tcp_md5_needed.key)) {
1414                         struct tcp_md5sig_info *md5sig;
1415 
1416                         md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1417                         rcu_assign_pointer(tp->md5sig_info, NULL);
1418                         kfree_rcu(md5sig, rcu);
1419                         tcp_md5_release_sigpool();
1420                         return -EUSERS;
1421                 }
1422         }
1423 
1424         return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1425                                 newkey, newkeylen, GFP_KERNEL);
1426 }
1427 EXPORT_SYMBOL(tcp_md5_do_add);
1428 
1429 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1430                      int family, u8 prefixlen, int l3index,
1431                      struct tcp_md5sig_key *key)
1432 {
1433         struct tcp_sock *tp = tcp_sk(sk);
1434 
1435         if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1436                 tcp_md5_add_sigpool();
1437 
1438                 if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) {
1439                         tcp_md5_release_sigpool();
1440                         return -ENOMEM;
1441                 }
1442 
1443                 if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1444                         struct tcp_md5sig_info *md5sig;
1445 
1446                         md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1447                         net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1448                         rcu_assign_pointer(tp->md5sig_info, NULL);
1449                         kfree_rcu(md5sig, rcu);
1450                         tcp_md5_release_sigpool();
1451                         return -EUSERS;
1452                 }
1453         }
1454 
1455         return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1456                                 key->flags, key->key, key->keylen,
1457                                 sk_gfp_mask(sk, GFP_ATOMIC));
1458 }
1459 EXPORT_SYMBOL(tcp_md5_key_copy);
1460 
1461 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1462                    u8 prefixlen, int l3index, u8 flags)
1463 {
1464         struct tcp_md5sig_key *key;
1465 
1466         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1467         if (!key)
1468                 return -ENOENT;
1469         hlist_del_rcu(&key->node);
1470         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1471         kfree_rcu(key, rcu);
1472         return 0;
1473 }
1474 EXPORT_SYMBOL(tcp_md5_do_del);
1475 
1476 void tcp_clear_md5_list(struct sock *sk)
1477 {
1478         struct tcp_sock *tp = tcp_sk(sk);
1479         struct tcp_md5sig_key *key;
1480         struct hlist_node *n;
1481         struct tcp_md5sig_info *md5sig;
1482 
1483         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1484 
1485         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1486                 hlist_del_rcu(&key->node);
1487                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1488                 kfree_rcu(key, rcu);
1489         }
1490 }
1491 
1492 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1493                                  sockptr_t optval, int optlen)
1494 {
1495         struct tcp_md5sig cmd;
1496         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1497         const union tcp_md5_addr *addr;
1498         u8 prefixlen = 32;
1499         int l3index = 0;
1500         bool l3flag;
1501         u8 flags;
1502 
1503         if (optlen < sizeof(cmd))
1504                 return -EINVAL;
1505 
1506         if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1507                 return -EFAULT;
1508 
1509         if (sin->sin_family != AF_INET)
1510                 return -EINVAL;
1511 
1512         flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1513         l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1514 
1515         if (optname == TCP_MD5SIG_EXT &&
1516             cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1517                 prefixlen = cmd.tcpm_prefixlen;
1518                 if (prefixlen > 32)
1519                         return -EINVAL;
1520         }
1521 
1522         if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1523             cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1524                 struct net_device *dev;
1525 
1526                 rcu_read_lock();
1527                 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1528                 if (dev && netif_is_l3_master(dev))
1529                         l3index = dev->ifindex;
1530 
1531                 rcu_read_unlock();
1532 
1533                 /* It is OK to check whether dev/l3index were set outside of
1534                  * the RCU section; right now the device MUST be an L3 master.
1535                  */
1536                 if (!dev || !l3index)
1537                         return -EINVAL;
1538         }
1539 
1540         addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1541 
1542         if (!cmd.tcpm_keylen)
1543                 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1544 
1545         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1546                 return -EINVAL;
1547 
1548         /* Don't allow keys for peers that have a matching TCP-AO key.
1549          * See the comment in tcp_ao_add_cmd()
1550          */
1551         if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
1552                 return -EKEYREJECTED;
1553 
1554         return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1555                               cmd.tcpm_key, cmd.tcpm_keylen);
1556 }
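     /* Editor's note: the block below is an illustrative userspace sketch
      * (guarded by #if 0 so it is not built), showing how the tcp_md5sig
      * layout parsed above is typically filled in.  It assumes a libc that
      * exposes TCP_MD5SIG_EXT and the extension flags in <netinet/tcp.h>;
      * the helper name and the /24 prefix are invented for the example.
      */
     #if 0
     #include <string.h>
     #include <netinet/in.h>
     #include <netinet/tcp.h>       /* struct tcp_md5sig, TCP_MD5SIG_EXT */
     #include <sys/socket.h>

     static int set_md5_key(int fd, const struct sockaddr_in *peer,
                            const void *key, unsigned int keylen)
     {
             struct tcp_md5sig md5 = {
                     .tcpm_flags     = TCP_MD5SIG_FLAG_PREFIX,
                     .tcpm_prefixlen = 24,           /* match the peer's /24 */
                     .tcpm_keylen    = keylen,       /* <= TCP_MD5SIG_MAXKEYLEN */
             };

             memcpy(&md5.tcpm_addr, peer, sizeof(*peer));
             memcpy(md5.tcpm_key, key, keylen);
             /* TCP_MD5SIG_EXT is required for the prefix/ifindex extensions */
             return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG_EXT, &md5, sizeof(md5));
     }
     #endif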
1557 
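     /* RFC 2385 MD5 digest coverage, in this order: the IPv4 pseudo-header,
      * the TCP header with its checksum zeroed, the TCP segment payload and
      * finally the key itself.  The helper below feeds the first two pieces
      * into the sigpool request; callers hash the payload (if any) and the
      * key afterwards.
      */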
1558 static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp,
1559                                    __be32 daddr, __be32 saddr,
1560                                    const struct tcphdr *th, int nbytes)
1561 {
1562         struct tcp4_pseudohdr *bp;
1563         struct scatterlist sg;
1564         struct tcphdr *_th;
1565 
1566         bp = hp->scratch;
1567         bp->saddr = saddr;
1568         bp->daddr = daddr;
1569         bp->pad = 0;
1570         bp->protocol = IPPROTO_TCP;
1571         bp->len = cpu_to_be16(nbytes);
1572 
1573         _th = (struct tcphdr *)(bp + 1);
1574         memcpy(_th, th, sizeof(*th));
1575         _th->check = 0;
1576 
1577         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1578         ahash_request_set_crypt(hp->req, &sg, NULL,
1579                                 sizeof(*bp) + sizeof(*th));
1580         return crypto_ahash_update(hp->req);
1581 }
1582 
1583 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1584                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1585 {
1586         struct tcp_sigpool hp;
1587 
1588         if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1589                 goto clear_hash_nostart;
1590 
1591         if (crypto_ahash_init(hp.req))
1592                 goto clear_hash;
1593         if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2))
1594                 goto clear_hash;
1595         if (tcp_md5_hash_key(&hp, key))
1596                 goto clear_hash;
1597         ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1598         if (crypto_ahash_final(hp.req))
1599                 goto clear_hash;
1600 
1601         tcp_sigpool_end(&hp);
1602         return 0;
1603 
1604 clear_hash:
1605         tcp_sigpool_end(&hp);
1606 clear_hash_nostart:
1607         memset(md5_hash, 0, 16);
1608         return 1;
1609 }
1610 
1611 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1612                         const struct sock *sk,
1613                         const struct sk_buff *skb)
1614 {
1615         const struct tcphdr *th = tcp_hdr(skb);
1616         struct tcp_sigpool hp;
1617         __be32 saddr, daddr;
1618 
1619         if (sk) { /* valid for establish/request sockets */
1620                 saddr = sk->sk_rcv_saddr;
1621                 daddr = sk->sk_daddr;
1622         } else {
1623                 const struct iphdr *iph = ip_hdr(skb);
1624                 saddr = iph->saddr;
1625                 daddr = iph->daddr;
1626         }
1627 
1628         if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1629                 goto clear_hash_nostart;
1630 
1631         if (crypto_ahash_init(hp.req))
1632                 goto clear_hash;
1633 
1634         if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len))
1635                 goto clear_hash;
1636         if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2))
1637                 goto clear_hash;
1638         if (tcp_md5_hash_key(&hp, key))
1639                 goto clear_hash;
1640         ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1641         if (crypto_ahash_final(hp.req))
1642                 goto clear_hash;
1643 
1644         tcp_sigpool_end(&hp);
1645         return 0;
1646 
1647 clear_hash:
1648         tcp_sigpool_end(&hp);
1649 clear_hash_nostart:
1650         memset(md5_hash, 0, 16);
1651         return 1;
1652 }
1653 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1654 
1655 #endif
1656 
1657 static void tcp_v4_init_req(struct request_sock *req,
1658                             const struct sock *sk_listener,
1659                             struct sk_buff *skb)
1660 {
1661         struct inet_request_sock *ireq = inet_rsk(req);
1662         struct net *net = sock_net(sk_listener);
1663 
1664         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1665         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1666         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1667 }
1668 
1669 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1670                                           struct sk_buff *skb,
1671                                           struct flowi *fl,
1672                                           struct request_sock *req,
1673                                           u32 tw_isn)
1674 {
1675         tcp_v4_init_req(req, sk, skb);
1676 
1677         if (security_inet_conn_request(sk, skb, req))
1678                 return NULL;
1679 
1680         return inet_csk_route_req(sk, &fl->u.ip4, req);
1681 }
1682 
1683 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1684         .family         =       PF_INET,
1685         .obj_size       =       sizeof(struct tcp_request_sock),
1686         .rtx_syn_ack    =       tcp_rtx_synack,
1687         .send_ack       =       tcp_v4_reqsk_send_ack,
1688         .destructor     =       tcp_v4_reqsk_destructor,
1689         .send_reset     =       tcp_v4_send_reset,
1690         .syn_ack_timeout =      tcp_syn_ack_timeout,
1691 };
1692 
1693 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1694         .mss_clamp      =       TCP_MSS_DEFAULT,
1695 #ifdef CONFIG_TCP_MD5SIG
1696         .req_md5_lookup =       tcp_v4_md5_lookup,
1697         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1698 #endif
1699 #ifdef CONFIG_TCP_AO
1700         .ao_lookup      =       tcp_v4_ao_lookup_rsk,
1701         .ao_calc_key    =       tcp_v4_ao_calc_key_rsk,
1702         .ao_synack_hash =       tcp_v4_ao_synack_hash,
1703 #endif
1704 #ifdef CONFIG_SYN_COOKIES
1705         .cookie_init_seq =      cookie_v4_init_sequence,
1706 #endif
1707         .route_req      =       tcp_v4_route_req,
1708         .init_seq       =       tcp_v4_init_seq,
1709         .init_ts_off    =       tcp_v4_init_ts_off,
1710         .send_synack    =       tcp_v4_send_synack,
1711 };
1712 
1713 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1714 {
1715         /* Never answer SYNs sent to broadcast or multicast addresses */
1716         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1717                 goto drop;
1718 
1719         return tcp_conn_request(&tcp_request_sock_ops,
1720                                 &tcp_request_sock_ipv4_ops, sk, skb);
1721 
1722 drop:
1723         tcp_listendrop(sk);
1724         return 0;
1725 }
1726 EXPORT_SYMBOL(tcp_v4_conn_request);
1727 
1728 
1729 /*
1730  * The three-way handshake has completed - we got a valid final ACK -
1731  * now create the new socket.
1732  */
1733 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1734                                   struct request_sock *req,
1735                                   struct dst_entry *dst,
1736                                   struct request_sock *req_unhash,
1737                                   bool *own_req)
1738 {
1739         struct inet_request_sock *ireq;
1740         bool found_dup_sk = false;
1741         struct inet_sock *newinet;
1742         struct tcp_sock *newtp;
1743         struct sock *newsk;
1744 #ifdef CONFIG_TCP_MD5SIG
1745         const union tcp_md5_addr *addr;
1746         struct tcp_md5sig_key *key;
1747         int l3index;
1748 #endif
1749         struct ip_options_rcu *inet_opt;
1750 
1751         if (sk_acceptq_is_full(sk))
1752                 goto exit_overflow;
1753 
1754         newsk = tcp_create_openreq_child(sk, req, skb);
1755         if (!newsk)
1756                 goto exit_nonewsk;
1757 
1758         newsk->sk_gso_type = SKB_GSO_TCPV4;
1759         inet_sk_rx_dst_set(newsk, skb);
1760 
1761         newtp                 = tcp_sk(newsk);
1762         newinet               = inet_sk(newsk);
1763         ireq                  = inet_rsk(req);
1764         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1765         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1766         newsk->sk_bound_dev_if = ireq->ir_iif;
1767         newinet->inet_saddr   = ireq->ir_loc_addr;
1768         inet_opt              = rcu_dereference(ireq->ireq_opt);
1769         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1770         newinet->mc_index     = inet_iif(skb);
1771         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1772         newinet->rcv_tos      = ip_hdr(skb)->tos;
1773         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1774         if (inet_opt)
1775                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1776         atomic_set(&newinet->inet_id, get_random_u16());
1777 
1778         /* Set ToS of the new socket based upon the value of incoming SYN.
1779          * ECT bits are set later in tcp_init_transfer().
1780          */
1781         if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1782                 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1783 
1784         if (!dst) {
1785                 dst = inet_csk_route_child_sock(sk, newsk, req);
1786                 if (!dst)
1787                         goto put_and_exit;
1788         } else {
1789                 /* syncookie case: see end of cookie_v4_check() */
1790         }
1791         sk_setup_caps(newsk, dst);
1792 
1793         tcp_ca_openreq_child(newsk, dst);
1794 
1795         tcp_sync_mss(newsk, dst_mtu(dst));
1796         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1797 
1798         tcp_initialize_rcv_mss(newsk);
1799 
1800 #ifdef CONFIG_TCP_MD5SIG
1801         l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1802         /* Copy over the MD5 key from the original socket */
1803         addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1804         key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1805         if (key && !tcp_rsk_used_ao(req)) {
1806                 if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
1807                         goto put_and_exit;
1808                 sk_gso_disable(newsk);
1809         }
1810 #endif
1811 #ifdef CONFIG_TCP_AO
1812         if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
1813                 goto put_and_exit; /* OOM, release back memory */
1814 #endif
1815 
1816         if (__inet_inherit_port(sk, newsk) < 0)
1817                 goto put_and_exit;
1818         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1819                                        &found_dup_sk);
1820         if (likely(*own_req)) {
1821                 tcp_move_syn(newtp, req);
1822                 ireq->ireq_opt = NULL;
1823         } else {
1824                 newinet->inet_opt = NULL;
1825 
1826                 if (!req_unhash && found_dup_sk) {
1827                         /* This code path should only be executed in the
1828                          * syncookie case.
1829                          */
1830                         bh_unlock_sock(newsk);
1831                         sock_put(newsk);
1832                         newsk = NULL;
1833                 }
1834         }
1835         return newsk;
1836 
1837 exit_overflow:
1838         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1839 exit_nonewsk:
1840         dst_release(dst);
1841 exit:
1842         tcp_listendrop(sk);
1843         return NULL;
1844 put_and_exit:
1845         newinet->inet_opt = NULL;
1846         inet_csk_prepare_forced_close(newsk);
1847         tcp_done(newsk);
1848         goto exit;
1849 }
1850 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1851 
1852 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1853 {
1854 #ifdef CONFIG_SYN_COOKIES
1855         const struct tcphdr *th = tcp_hdr(skb);
1856 
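             /* Syncookies are validated on the returning ACK, not on the SYN:
              * a bare ACK reaching a listener may carry an encoded cookie, so
              * only !syn segments are checked here.
              */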
1857         if (!th->syn)
1858                 sk = cookie_v4_check(sk, skb);
1859 #endif
1860         return sk;
1861 }
1862 
1863 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1864                          struct tcphdr *th, u32 *cookie)
1865 {
1866         u16 mss = 0;
1867 #ifdef CONFIG_SYN_COOKIES
1868         mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1869                                     &tcp_request_sock_ipv4_ops, sk, th);
1870         if (mss) {
1871                 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1872                 tcp_synq_overflow(sk);
1873         }
1874 #endif
1875         return mss;
1876 }
1877 
1878 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1879                                                            u32));
1880 /* The socket must have its spinlock held when we get
1881  * here, unless it is a TCP_LISTEN socket.
1882  *
1883  * We have a potential double-lock case here, so even when
1884  * doing backlog processing we use the BH locking scheme.
1885  * This is because we cannot sleep with the original spinlock
1886  * held.
1887  */
1888 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1889 {
1890         enum skb_drop_reason reason;
1891         struct sock *rsk;
1892 
1893         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1894                 struct dst_entry *dst;
1895 
1896                 dst = rcu_dereference_protected(sk->sk_rx_dst,
1897                                                 lockdep_sock_is_held(sk));
1898 
1899                 sock_rps_save_rxhash(sk, skb);
1900                 sk_mark_napi_id(sk, skb);
1901                 if (dst) {
1902                         if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1903                             !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1904                                              dst, 0)) {
1905                                 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1906                                 dst_release(dst);
1907                         }
1908                 }
1909                 tcp_rcv_established(sk, skb);
1910                 return 0;
1911         }
1912 
1913         if (tcp_checksum_complete(skb))
1914                 goto csum_err;
1915 
1916         if (sk->sk_state == TCP_LISTEN) {
1917                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1918 
1919                 if (!nsk)
1920                         return 0;
1921                 if (nsk != sk) {
1922                         reason = tcp_child_process(sk, nsk, skb);
1923                         if (reason) {
1924                                 rsk = nsk;
1925                                 goto reset;
1926                         }
1927                         return 0;
1928                 }
1929         } else
1930                 sock_rps_save_rxhash(sk, skb);
1931 
1932         reason = tcp_rcv_state_process(sk, skb);
1933         if (reason) {
1934                 rsk = sk;
1935                 goto reset;
1936         }
1937         return 0;
1938 
1939 reset:
1940         tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
1941 discard:
1942         sk_skb_reason_drop(sk, skb, reason);
1943         /* Be careful here. If this function gets more complicated and
1944          * gcc suffers from register pressure on the x86, sk (in %ebx)
1945          * might be destroyed here. This current version compiles correctly,
1946          * but you have been warned.
1947          */
1948         return 0;
1949 
1950 csum_err:
1951         reason = SKB_DROP_REASON_TCP_CSUM;
1952         trace_tcp_bad_csum(skb);
1953         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1954         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1955         goto discard;
1956 }
1957 EXPORT_SYMBOL(tcp_v4_do_rcv);
1958 
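     /* Early demux: look the segment up in the established hash before the
      * route lookup, so that an established socket's cached dst
      * (sk->sk_rx_dst) can be reused and the full routing decision skipped.
      */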
1959 int tcp_v4_early_demux(struct sk_buff *skb)
1960 {
1961         struct net *net = dev_net(skb->dev);
1962         const struct iphdr *iph;
1963         const struct tcphdr *th;
1964         struct sock *sk;
1965 
1966         if (skb->pkt_type != PACKET_HOST)
1967                 return 0;
1968 
1969         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1970                 return 0;
1971 
1972         iph = ip_hdr(skb);
1973         th = tcp_hdr(skb);
1974 
1975         if (th->doff < sizeof(struct tcphdr) / 4)
1976                 return 0;
1977 
1978         sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1979                                        iph->saddr, th->source,
1980                                        iph->daddr, ntohs(th->dest),
1981                                        skb->skb_iif, inet_sdif(skb));
1982         if (sk) {
1983                 skb->sk = sk;
1984                 skb->destructor = sock_edemux;
1985                 if (sk_fullsock(sk)) {
1986                         struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1987 
1988                         if (dst)
1989                                 dst = dst_check(dst, 0);
1990                         if (dst &&
1991                             sk->sk_rx_dst_ifindex == skb->skb_iif)
1992                                 skb_dst_set_noref(skb, dst);
1993                 }
1994         }
1995         return 0;
1996 }
1997 
1998 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1999                      enum skb_drop_reason *reason)
2000 {
2001         u32 tail_gso_size, tail_gso_segs;
2002         struct skb_shared_info *shinfo;
2003         const struct tcphdr *th;
2004         struct tcphdr *thtail;
2005         struct sk_buff *tail;
2006         unsigned int hdrlen;
2007         bool fragstolen;
2008         u32 gso_segs;
2009         u32 gso_size;
2010         u64 limit;
2011         int delta;
2012 
2013         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
2014          * we can fix skb->truesize to its real value to avoid future drops.
2015          * This is valid because skb is not yet charged to the socket.
2016          * It has been noticed that pure SACK packets were sometimes dropped
2017          * (when produced by drivers without the copybreak feature).
2018          */
2019         skb_condense(skb);
2020 
2021         skb_dst_drop(skb);
2022 
2023         if (unlikely(tcp_checksum_complete(skb))) {
2024                 bh_unlock_sock(sk);
2025                 trace_tcp_bad_csum(skb);
2026                 *reason = SKB_DROP_REASON_TCP_CSUM;
2027                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
2028                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
2029                 return true;
2030         }
2031 
2032         /* Attempt to coalesce with the last skb in the backlog, even if
2033          * we are above the limits.
2034          * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
2035          */
2036         th = (const struct tcphdr *)skb->data;
2037         hdrlen = th->doff * 4;
2038 
2039         tail = sk->sk_backlog.tail;
2040         if (!tail)
2041                 goto no_coalesce;
2042         thtail = (struct tcphdr *)tail->data;
2043 
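             /* Coalescing requirements: the new segment must start exactly at
              * tail's end_seq, carry the same DSCP, contain no SYN/RST/URG,
              * have ACK set on both segments, agree on ECE/CWR, be collapsible
              * with tail, and use a TCP header of the same length with
              * identical options.
              */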
2044         if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
2045             TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
2046             ((TCP_SKB_CB(tail)->tcp_flags |
2047               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
2048             !((TCP_SKB_CB(tail)->tcp_flags &
2049               TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
2050             ((TCP_SKB_CB(tail)->tcp_flags ^
2051               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
2052             !tcp_skb_can_collapse_rx(tail, skb) ||
2053             thtail->doff != th->doff ||
2054             memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
2055                 goto no_coalesce;
2056 
2057         __skb_pull(skb, hdrlen);
2058 
2059         shinfo = skb_shinfo(skb);
2060         gso_size = shinfo->gso_size ?: skb->len;
2061         gso_segs = shinfo->gso_segs ?: 1;
2062 
2063         shinfo = skb_shinfo(tail);
2064         tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
2065         tail_gso_segs = shinfo->gso_segs ?: 1;
2066 
2067         if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
2068                 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
2069 
2070                 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
2071                         TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
2072                         thtail->window = th->window;
2073                 }
2074 
2075                 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
2076                  * thtail->fin, so that the fast path in tcp_rcv_established()
2077                  * is not entered if we append a packet with a FIN.
2078                  * SYN, RST, URG are not present.
2079                  * ACK is set on both packets.
2080                  * PSH: the TCP stack does not really care,
2081                  *      at least for 'GRO' packets.
2082                  */
2083                 thtail->fin |= th->fin;
2084                 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
2085 
2086                 if (TCP_SKB_CB(skb)->has_rxtstamp) {
2087                         TCP_SKB_CB(tail)->has_rxtstamp = true;
2088                         tail->tstamp = skb->tstamp;
2089                         skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
2090                 }
2091 
2092                 /* Not as strict as GRO. We only need to carry the max mss value */
2093                 shinfo->gso_size = max(gso_size, tail_gso_size);
2094                 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
2095 
2096                 sk->sk_backlog.len += delta;
2097                 __NET_INC_STATS(sock_net(sk),
2098                                 LINUX_MIB_TCPBACKLOGCOALESCE);
2099                 kfree_skb_partial(skb, fragstolen);
2100                 return false;
2101         }
2102         __skb_push(skb, hdrlen);
2103 
2104 no_coalesce:
2105         /* sk->sk_backlog.len is reset only at the end of __release_sock().
2106          * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
2107          * sk_rcvbuf in normal conditions.
2108          */
2109         limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;
2110 
2111         limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;
2112 
2113         /* Only the socket owner can try to collapse/prune rx queues
2114          * to reduce memory overhead, so add a little headroom here.
2115          * Only a few socket backlogs are likely to be non-empty concurrently.
2116          */
2117         limit += 64 * 1024;
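             /* For example, with sk_rcvbuf = sk_sndbuf = 4 MiB the backlog may
              * grow to 2 * 4 MiB + 4 MiB / 2 + 64 KiB (a bit over 10 MiB)
              * before segments start being dropped.
              */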
2118 
2119         limit = min_t(u64, limit, UINT_MAX);
2120 
2121         if (unlikely(sk_add_backlog(sk, skb, limit))) {
2122                 bh_unlock_sock(sk);
2123                 *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
2124                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
2125                 return true;
2126         }
2127         return false;
2128 }
2129 EXPORT_SYMBOL(tcp_add_backlog);
2130 
2131 int tcp_filter(struct sock *sk, struct sk_buff *skb)
2132 {
2133         struct tcphdr *th = (struct tcphdr *)skb->data;
2134 
2135         return sk_filter_trim_cap(sk, skb, th->doff * 4);
2136 }
2137 EXPORT_SYMBOL(tcp_filter);
2138 
2139 static void tcp_v4_restore_cb(struct sk_buff *skb)
2140 {
2141         memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
2142                 sizeof(struct inet_skb_parm));
2143 }
2144 
2145 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
2146                            const struct tcphdr *th)
2147 {
2148         /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB();
2149          * barrier() makes sure the compiler won't play fool^Waliasing games.
2150          */
2151         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
2152                 sizeof(struct inet_skb_parm));
2153         barrier();
2154 
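             /* SYN and FIN each occupy one unit of sequence space, hence the
              * "+ th->syn + th->fin" term in the end_seq computation below.
              */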
2155         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
2156         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
2157                                     skb->len - th->doff * 4);
2158         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
2159         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
2160         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
2161         TCP_SKB_CB(skb)->sacked  = 0;
2162         TCP_SKB_CB(skb)->has_rxtstamp =
2163                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
2164 }
2165 
2166 /*
2167  *      From tcp_input.c
2168  */
2169 
2170 int tcp_v4_rcv(struct sk_buff *skb)
2171 {
2172         struct net *net = dev_net(skb->dev);
2173         enum skb_drop_reason drop_reason;
2174         int sdif = inet_sdif(skb);
2175         int dif = inet_iif(skb);
2176         const struct iphdr *iph;
2177         const struct tcphdr *th;
2178         struct sock *sk = NULL;
2179         bool refcounted;
2180         int ret;
2181         u32 isn;
2182 
2183         drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
2184         if (skb->pkt_type != PACKET_HOST)
2185                 goto discard_it;
2186 
2187         /* Count it even if it's bad */
2188         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
2189 
2190         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
2191                 goto discard_it;
2192 
2193         th = (const struct tcphdr *)skb->data;
2194 
2195         if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2196                 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2197                 goto bad_packet;
2198         }
2199         if (!pskb_may_pull(skb, th->doff * 4))
2200                 goto discard_it;
2201 
2202         /* An explanation is required here, I think.
2203          * Packet length and doff are validated by header prediction,
2204          * provided the case of th->doff == 0 is eliminated.
2205          * So, we defer the checks. */
2206 
2207         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2208                 goto csum_error;
2209 
2210         th = (const struct tcphdr *)skb->data;
2211         iph = ip_hdr(skb);
2212 lookup:
2213         sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
2214                                skb, __tcp_hdrlen(th), th->source,
2215                                th->dest, sdif, &refcounted);
2216         if (!sk)
2217                 goto no_tcp_socket;
2218 
2219         if (sk->sk_state == TCP_TIME_WAIT)
2220                 goto do_time_wait;
2221 
2222         if (sk->sk_state == TCP_NEW_SYN_RECV) {
2223                 struct request_sock *req = inet_reqsk(sk);
2224                 bool req_stolen = false;
2225                 struct sock *nsk;
2226 
2227                 sk = req->rsk_listener;
2228                 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2229                         drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2230                 else
2231                         drop_reason = tcp_inbound_hash(sk, req, skb,
2232                                                        &iph->saddr, &iph->daddr,
2233                                                        AF_INET, dif, sdif);
2234                 if (unlikely(drop_reason)) {
2235                         sk_drops_add(sk, skb);
2236                         reqsk_put(req);
2237                         goto discard_it;
2238                 }
2239                 if (tcp_checksum_complete(skb)) {
2240                         reqsk_put(req);
2241                         goto csum_error;
2242                 }
2243                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
2244                         nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2245                         if (!nsk) {
2246                                 inet_csk_reqsk_queue_drop_and_put(sk, req);
2247                                 goto lookup;
2248                         }
2249                         sk = nsk;
2250                         /* reuseport_migrate_sock() has already taken an sk_refcnt
2251                          * reference on the returned socket.
2252                          */
2253                 } else {
2254                         /* We own a reference on the listener, increase it again
2255                          * as we might lose it too soon.
2256                          */
2257                         sock_hold(sk);
2258                 }
2259                 refcounted = true;
2260                 nsk = NULL;
2261                 if (!tcp_filter(sk, skb)) {
2262                         th = (const struct tcphdr *)skb->data;
2263                         iph = ip_hdr(skb);
2264                         tcp_v4_fill_cb(skb, iph, th);
2265                         nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2266                 } else {
2267                         drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2268                 }
2269                 if (!nsk) {
2270                         reqsk_put(req);
2271                         if (req_stolen) {
2272                                 /* Another cpu got exclusive access to req
2273                                  * and created a full blown socket.
2274                                  * Try to feed this packet to this socket
2275                                  * instead of discarding it.
2276                                  */
2277                                 tcp_v4_restore_cb(skb);
2278                                 sock_put(sk);
2279                                 goto lookup;
2280                         }
2281                         goto discard_and_relse;
2282                 }
2283                 nf_reset_ct(skb);
2284                 if (nsk == sk) {
2285                         reqsk_put(req);
2286                         tcp_v4_restore_cb(skb);
2287                 } else {
2288                         drop_reason = tcp_child_process(sk, nsk, skb);
2289                         if (drop_reason) {
2290                                 enum sk_rst_reason rst_reason;
2291 
2292                                 rst_reason = sk_rst_convert_drop_reason(drop_reason);
2293                                 tcp_v4_send_reset(nsk, skb, rst_reason);
2294                                 goto discard_and_relse;
2295                         }
2296                         sock_put(sk);
2297                         return 0;
2298                 }
2299         }
2300 
2301 process:
2302         if (static_branch_unlikely(&ip4_min_ttl)) {
2303                 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
2304                 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2305                         __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2306                         drop_reason = SKB_DROP_REASON_TCP_MINTTL;
2307                         goto discard_and_relse;
2308                 }
2309         }
2310 
2311         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2312                 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2313                 goto discard_and_relse;
2314         }
2315 
2316         drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr,
2317                                        AF_INET, dif, sdif);
2318         if (drop_reason)
2319                 goto discard_and_relse;
2320 
2321         nf_reset_ct(skb);
2322 
2323         if (tcp_filter(sk, skb)) {
2324                 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2325                 goto discard_and_relse;
2326         }
2327         th = (const struct tcphdr *)skb->data;
2328         iph = ip_hdr(skb);
2329         tcp_v4_fill_cb(skb, iph, th);
2330 
2331         skb->dev = NULL;
2332 
2333         if (sk->sk_state == TCP_LISTEN) {
2334                 ret = tcp_v4_do_rcv(sk, skb);
2335                 goto put_and_return;
2336         }
2337 
2338         sk_incoming_cpu_update(sk);
2339 
2340         bh_lock_sock_nested(sk);
2341         tcp_segs_in(tcp_sk(sk), skb);
2342         ret = 0;
2343         if (!sock_owned_by_user(sk)) {
2344                 ret = tcp_v4_do_rcv(sk, skb);
2345         } else {
2346                 if (tcp_add_backlog(sk, skb, &drop_reason))
2347                         goto discard_and_relse;
2348         }
2349         bh_unlock_sock(sk);
2350 
2351 put_and_return:
2352         if (refcounted)
2353                 sock_put(sk);
2354 
2355         return ret;
2356 
2357 no_tcp_socket:
2358         drop_reason = SKB_DROP_REASON_NO_SOCKET;
2359         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2360                 goto discard_it;
2361 
2362         tcp_v4_fill_cb(skb, iph, th);
2363 
2364         if (tcp_checksum_complete(skb)) {
2365 csum_error:
2366                 drop_reason = SKB_DROP_REASON_TCP_CSUM;
2367                 trace_tcp_bad_csum(skb);
2368                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2369 bad_packet:
2370                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2371         } else {
2372                 tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason));
2373         }
2374 
2375 discard_it:
2376         SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2377         /* Discard frame. */
2378         sk_skb_reason_drop(sk, skb, drop_reason);
2379         return 0;
2380 
2381 discard_and_relse:
2382         sk_drops_add(sk, skb);
2383         if (refcounted)
2384                 sock_put(sk);
2385         goto discard_it;
2386 
2387 do_time_wait:
2388         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2389                 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2390                 inet_twsk_put(inet_twsk(sk));
2391                 goto discard_it;
2392         }
2393 
2394         tcp_v4_fill_cb(skb, iph, th);
2395 
2396         if (tcp_checksum_complete(skb)) {
2397                 inet_twsk_put(inet_twsk(sk));
2398                 goto csum_error;
2399         }
2400         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) {
2401         case TCP_TW_SYN: {
2402                 struct sock *sk2 = inet_lookup_listener(net,
2403                                                         net->ipv4.tcp_death_row.hashinfo,
2404                                                         skb, __tcp_hdrlen(th),
2405                                                         iph->saddr, th->source,
2406                                                         iph->daddr, th->dest,
2407                                                         inet_iif(skb),
2408                                                         sdif);
2409                 if (sk2) {
2410                         inet_twsk_deschedule_put(inet_twsk(sk));
2411                         sk = sk2;
2412                         tcp_v4_restore_cb(skb);
2413                         refcounted = false;
2414                         __this_cpu_write(tcp_tw_isn, isn);
2415                         goto process;
2416                 }
2417         }
2418                 /* to ACK */
2419                 fallthrough;
2420         case TCP_TW_ACK:
2421                 tcp_v4_timewait_ack(sk, skb);
2422                 break;
2423         case TCP_TW_RST:
2424                 tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET);
2425                 inet_twsk_deschedule_put(inet_twsk(sk));
2426                 goto discard_it;
2427         case TCP_TW_SUCCESS:;
2428         }
2429         goto discard_it;
2430 }
2431 
2432 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2433         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2434         .twsk_destructor= tcp_twsk_destructor,
2435 };
2436 
2437 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2438 {
2439         struct dst_entry *dst = skb_dst(skb);
2440 
2441         if (dst && dst_hold_safe(dst)) {
2442                 rcu_assign_pointer(sk->sk_rx_dst, dst);
2443                 sk->sk_rx_dst_ifindex = skb->skb_iif;
2444         }
2445 }
2446 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2447 
2448 const struct inet_connection_sock_af_ops ipv4_specific = {
2449         .queue_xmit        = ip_queue_xmit,
2450         .send_check        = tcp_v4_send_check,
2451         .rebuild_header    = inet_sk_rebuild_header,
2452         .sk_rx_dst_set     = inet_sk_rx_dst_set,
2453         .conn_request      = tcp_v4_conn_request,
2454         .syn_recv_sock     = tcp_v4_syn_recv_sock,
2455         .net_header_len    = sizeof(struct iphdr),
2456         .setsockopt        = ip_setsockopt,
2457         .getsockopt        = ip_getsockopt,
2458         .addr2sockaddr     = inet_csk_addr2sockaddr,
2459         .sockaddr_len      = sizeof(struct sockaddr_in),
2460         .mtu_reduced       = tcp_v4_mtu_reduced,
2461 };
2462 EXPORT_SYMBOL(ipv4_specific);
2463 
2464 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2465 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2466 #ifdef CONFIG_TCP_MD5SIG
2467         .md5_lookup             = tcp_v4_md5_lookup,
2468         .calc_md5_hash          = tcp_v4_md5_hash_skb,
2469         .md5_parse              = tcp_v4_parse_md5_keys,
2470 #endif
2471 #ifdef CONFIG_TCP_AO
2472         .ao_lookup              = tcp_v4_ao_lookup,
2473         .calc_ao_hash           = tcp_v4_ao_hash_skb,
2474         .ao_parse               = tcp_v4_parse_ao,
2475         .ao_calc_key_sk         = tcp_v4_ao_calc_key_sk,
2476 #endif
2477 };
2478 #endif
2479 
2480 /* NOTE: A lot of things are set to zero explicitly by the call to
2481  *       sk_alloc(), so they need not be done here.
2482  */
2483 static int tcp_v4_init_sock(struct sock *sk)
2484 {
2485         struct inet_connection_sock *icsk = inet_csk(sk);
2486 
2487         tcp_init_sock(sk);
2488 
2489         icsk->icsk_af_ops = &ipv4_specific;
2490 
2491 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2492         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2493 #endif
2494 
2495         return 0;
2496 }
2497 
2498 #ifdef CONFIG_TCP_MD5SIG
2499 static void tcp_md5sig_info_free_rcu(struct rcu_head *head)
2500 {
2501         struct tcp_md5sig_info *md5sig;
2502 
2503         md5sig = container_of(head, struct tcp_md5sig_info, rcu);
2504         kfree(md5sig);
2505         static_branch_slow_dec_deferred(&tcp_md5_needed);
2506         tcp_md5_release_sigpool();
2507 }
2508 #endif
2509 
2510 void tcp_v4_destroy_sock(struct sock *sk)
2511 {
2512         struct tcp_sock *tp = tcp_sk(sk);
2513 
2514         trace_tcp_destroy_sock(sk);
2515 
2516         tcp_clear_xmit_timers(sk);
2517 
2518         tcp_cleanup_congestion_control(sk);
2519 
2520         tcp_cleanup_ulp(sk);
2521 
2522         /* Clean up the write buffer. */
2523         tcp_write_queue_purge(sk);
2524 
2525         /* Check if we want to disable active TFO */
2526         tcp_fastopen_active_disable_ofo_check(sk);
2527 
2528         /* Cleans up our, hopefully empty, out_of_order_queue. */
2529         skb_rbtree_purge(&tp->out_of_order_queue);
2530 
2531 #ifdef CONFIG_TCP_MD5SIG
2532         /* Clean up the MD5 key list, if any */
2533         if (tp->md5sig_info) {
2534                 struct tcp_md5sig_info *md5sig;
2535 
2536                 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
2537                 tcp_clear_md5_list(sk);
2538                 call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu);
2539                 rcu_assign_pointer(tp->md5sig_info, NULL);
2540         }
2541 #endif
2542         tcp_ao_destroy_sock(sk, false);
2543 
2544         /* Clean up a referenced TCP bind bucket. */
2545         if (inet_csk(sk)->icsk_bind_hash)
2546                 inet_put_port(sk);
2547 
2548         BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2549 
2550         /* If socket is aborted during connect operation */
2551         tcp_free_fastopen_req(tp);
2552         tcp_fastopen_destroy_cipher(sk);
2553         tcp_saved_syn_free(tp);
2554 
2555         sk_sockets_allocated_dec(sk);
2556 }
2557 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2558 
2559 #ifdef CONFIG_PROC_FS
2560 /* Proc filesystem TCP sock list dumping. */
2561 
2562 static unsigned short seq_file_family(const struct seq_file *seq);
2563 
2564 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2565 {
2566         unsigned short family = seq_file_family(seq);
2567 
2568         /* AF_UNSPEC is used as a match all */
2569         return ((family == AF_UNSPEC || family == sk->sk_family) &&
2570                 net_eq(sock_net(sk), seq_file_net(seq)));
2571 }
2572 
2573 /* Find a non-empty bucket (starting from st->bucket)
2574  * and return the first sk from it.
2575  */
2576 static void *listening_get_first(struct seq_file *seq)
2577 {
2578         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2579         struct tcp_iter_state *st = seq->private;
2580 
2581         st->offset = 0;
2582         for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2583                 struct inet_listen_hashbucket *ilb2;
2584                 struct hlist_nulls_node *node;
2585                 struct sock *sk;
2586 
2587                 ilb2 = &hinfo->lhash2[st->bucket];
2588                 if (hlist_nulls_empty(&ilb2->nulls_head))
2589                         continue;
2590 
2591                 spin_lock(&ilb2->lock);
2592                 sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2593                         if (seq_sk_match(seq, sk))
2594                                 return sk;
2595                 }
2596                 spin_unlock(&ilb2->lock);
2597         }
2598 
2599         return NULL;
2600 }
2601 
2602 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2603  * If "cur" is the last one in st->bucket,
2604  * call listening_get_first() to return the first sk of the next
2605  * non-empty bucket.
2606  */
2607 static void *listening_get_next(struct seq_file *seq, void *cur)
2608 {
2609         struct tcp_iter_state *st = seq->private;
2610         struct inet_listen_hashbucket *ilb2;
2611         struct hlist_nulls_node *node;
2612         struct inet_hashinfo *hinfo;
2613         struct sock *sk = cur;
2614 
2615         ++st->num;
2616         ++st->offset;
2617 
2618         sk = sk_nulls_next(sk);
2619         sk_nulls_for_each_from(sk, node) {
2620                 if (seq_sk_match(seq, sk))
2621                         return sk;
2622         }
2623 
2624         hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2625         ilb2 = &hinfo->lhash2[st->bucket];
2626         spin_unlock(&ilb2->lock);
2627         ++st->bucket;
2628         return listening_get_first(seq);
2629 }
2630 
2631 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2632 {
2633         struct tcp_iter_state *st = seq->private;
2634         void *rc;
2635 
2636         st->bucket = 0;
2637         st->offset = 0;
2638         rc = listening_get_first(seq);
2639 
2640         while (rc && *pos) {
2641                 rc = listening_get_next(seq, rc);
2642                 --*pos;
2643         }
2644         return rc;
2645 }
2646 
2647 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2648                                 const struct tcp_iter_state *st)
2649 {
2650         return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2651 }
2652 
2653 /*
2654  * Get first established socket starting from bucket given in st->bucket.
2655  * If st->bucket is zero, the very first socket in the hash is returned.
2656  */
2657 static void *established_get_first(struct seq_file *seq)
2658 {
2659         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2660         struct tcp_iter_state *st = seq->private;
2661 
2662         st->offset = 0;
2663         for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2664                 struct sock *sk;
2665                 struct hlist_nulls_node *node;
2666                 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2667 
2668                 cond_resched();
2669 
2670                 /* Lockless fast path for the common case of empty buckets */
2671                 if (empty_bucket(hinfo, st))
2672                         continue;
2673 
2674                 spin_lock_bh(lock);
2675                 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2676                         if (seq_sk_match(seq, sk))
2677                                 return sk;
2678                 }
2679                 spin_unlock_bh(lock);
2680         }
2681 
2682         return NULL;
2683 }
2684 
2685 static void *established_get_next(struct seq_file *seq, void *cur)
2686 {
2687         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2688         struct tcp_iter_state *st = seq->private;
2689         struct hlist_nulls_node *node;
2690         struct sock *sk = cur;
2691 
2692         ++st->num;
2693         ++st->offset;
2694 
2695         sk = sk_nulls_next(sk);
2696 
2697         sk_nulls_for_each_from(sk, node) {
2698                 if (seq_sk_match(seq, sk))
2699                         return sk;
2700         }
2701 
2702         spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2703         ++st->bucket;
2704         return established_get_first(seq);
2705 }
2706 
2707 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2708 {
2709         struct tcp_iter_state *st = seq->private;
2710         void *rc;
2711 
2712         st->bucket = 0;
2713         rc = established_get_first(seq);
2714 
2715         while (rc && pos) {
2716                 rc = established_get_next(seq, rc);
2717                 --pos;
2718         }
2719         return rc;
2720 }
2721 
2722 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2723 {
2724         void *rc;
2725         struct tcp_iter_state *st = seq->private;
2726 
2727         st->state = TCP_SEQ_STATE_LISTENING;
2728         rc        = listening_get_idx(seq, &pos);
2729 
2730         if (!rc) {
2731                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2732                 rc        = established_get_idx(seq, pos);
2733         }
2734 
2735         return rc;
2736 }
2737 
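     /* Resume the walk at the (bucket, offset) remembered from the previous
      * read cycle, so that sequential reads of /proc/net/tcp do not rescan
      * all earlier hash buckets from scratch.
      */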
2738 static void *tcp_seek_last_pos(struct seq_file *seq)
2739 {
2740         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2741         struct tcp_iter_state *st = seq->private;
2742         int bucket = st->bucket;
2743         int offset = st->offset;
2744         int orig_num = st->num;
2745         void *rc = NULL;
2746 
2747         switch (st->state) {
2748         case TCP_SEQ_STATE_LISTENING:
2749                 if (st->bucket > hinfo->lhash2_mask)
2750                         break;
2751                 rc = listening_get_first(seq);
2752                 while (offset-- && rc && bucket == st->bucket)
2753                         rc = listening_get_next(seq, rc);
2754                 if (rc)
2755                         break;
2756                 st->bucket = 0;
2757                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2758                 fallthrough;
2759         case TCP_SEQ_STATE_ESTABLISHED:
2760                 if (st->bucket > hinfo->ehash_mask)
2761                         break;
2762                 rc = established_get_first(seq);
2763                 while (offset-- && rc && bucket == st->bucket)
2764                         rc = established_get_next(seq, rc);
2765         }
2766 
2767         st->num = orig_num;
2768 
2769         return rc;
2770 }
2771 
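     /* seq_file ->start(): reuse the position saved in st->last_pos when
      * the read continues where it left off, otherwise walk to *pos from
      * the beginning of the listening hash.
      */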
2772 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2773 {
2774         struct tcp_iter_state *st = seq->private;
2775         void *rc;
2776 
2777         if (*pos && *pos == st->last_pos) {
2778                 rc = tcp_seek_last_pos(seq);
2779                 if (rc)
2780                         goto out;
2781         }
2782 
2783         st->state = TCP_SEQ_STATE_LISTENING;
2784         st->num = 0;
2785         st->bucket = 0;
2786         st->offset = 0;
2787         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2788 
2789 out:
2790         st->last_pos = *pos;
2791         return rc;
2792 }
2793 EXPORT_SYMBOL(tcp_seq_start);
2794 
2795 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2796 {
2797         struct tcp_iter_state *st = seq->private;
2798         void *rc = NULL;
2799 
2800         if (v == SEQ_START_TOKEN) {
2801                 rc = tcp_get_idx(seq, 0);
2802                 goto out;
2803         }
2804 
2805         switch (st->state) {
2806         case TCP_SEQ_STATE_LISTENING:
2807                 rc = listening_get_next(seq, v);
2808                 if (!rc) {
2809                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2810                         st->bucket = 0;
2811                         st->offset = 0;
2812                         rc        = established_get_first(seq);
2813                 }
2814                 break;
2815         case TCP_SEQ_STATE_ESTABLISHED:
2816                 rc = established_get_next(seq, v);
2817                 break;
2818         }
2819 out:
2820         ++*pos;
2821         st->last_pos = *pos;
2822         return rc;
2823 }
2824 EXPORT_SYMBOL(tcp_seq_next);
2825 
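     /* seq_file ->stop(): release whichever bucket lock the interrupted
      * walk still holds.
      */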
2826 void tcp_seq_stop(struct seq_file *seq, void *v)
2827 {
2828         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2829         struct tcp_iter_state *st = seq->private;
2830 
2831         switch (st->state) {
2832         case TCP_SEQ_STATE_LISTENING:
2833                 if (v != SEQ_START_TOKEN)
2834                         spin_unlock(&hinfo->lhash2[st->bucket].lock);
2835                 break;
2836         case TCP_SEQ_STATE_ESTABLISHED:
2837                 if (v)
2838                         spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2839                 break;
2840         }
2841 }
2842 EXPORT_SYMBOL(tcp_seq_stop);
2843 
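     /* Print one SYN_RECV request socket in /proc/net/tcp format; the
      * queue and inode columns are fixed because a request_sock has
      * neither.
      */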
2844 static void get_openreq4(const struct request_sock *req,
2845                          struct seq_file *f, int i)
2846 {
2847         const struct inet_request_sock *ireq = inet_rsk(req);
2848         long delta = req->rsk_timer.expires - jiffies;
2849 
2850         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2851                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2852                 i,
2853                 ireq->ir_loc_addr,
2854                 ireq->ir_num,
2855                 ireq->ir_rmt_addr,
2856                 ntohs(ireq->ir_rmt_port),
2857                 TCP_SYN_RECV,
2858                 0, 0, /* could print option size, but that is af dependent. */
2859                 1,    /* timers active (only the expire timer) */
2860                 jiffies_delta_to_clock_t(delta),
2861                 req->num_timeout,
2862                 from_kuid_munged(seq_user_ns(f),
2863                                  sock_i_uid(req->rsk_listener)),
2864                 0,  /* non standard timer */
2865                 0, /* open_requests have no inode */
2866                 0,
2867                 req);
2868 }
2869 
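     /* Print one full TCP socket in /proc/net/tcp format.  The timer and
      * queue columns are sampled without the socket lock, hence the
      * READ_ONCE() and clamping below.
      */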
2870 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2871 {
2872         int timer_active;
2873         unsigned long timer_expires;
2874         const struct tcp_sock *tp = tcp_sk(sk);
2875         const struct inet_connection_sock *icsk = inet_csk(sk);
2876         const struct inet_sock *inet = inet_sk(sk);
2877         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2878         __be32 dest = inet->inet_daddr;
2879         __be32 src = inet->inet_rcv_saddr;
2880         __u16 destp = ntohs(inet->inet_dport);
2881         __u16 srcp = ntohs(inet->inet_sport);
2882         int rx_queue;
2883         int state;
2884 
2885         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2886             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2887             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2888                 timer_active    = 1;
2889                 timer_expires   = icsk->icsk_timeout;
2890         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2891                 timer_active    = 4;
2892                 timer_expires   = icsk->icsk_timeout;
2893         } else if (timer_pending(&sk->sk_timer)) {
2894                 timer_active    = 2;
2895                 timer_expires   = sk->sk_timer.expires;
2896         } else {
2897                 timer_active    = 0;
2898                 timer_expires = jiffies;
2899         }
2900 
2901         state = inet_sk_state_load(sk);
2902         if (state == TCP_LISTEN)
2903                 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2904         else
2905                 /* Because we don't lock the socket,
2906                  * we might find a transient negative value.
2907                  */
2908                 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2909                                       READ_ONCE(tp->copied_seq), 0);
2910 
2911         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2912                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2913                 i, src, srcp, dest, destp, state,
2914                 READ_ONCE(tp->write_seq) - tp->snd_una,
2915                 rx_queue,
2916                 timer_active,
2917                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2918                 icsk->icsk_retransmits,
2919                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2920                 icsk->icsk_probes_out,
2921                 sock_i_ino(sk),
2922                 refcount_read(&sk->sk_refcnt), sk,
2923                 jiffies_to_clock_t(icsk->icsk_rto),
2924                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2925                 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2926                 tcp_snd_cwnd(tp),
2927                 state == TCP_LISTEN ?
2928                     fastopenq->max_qlen :
2929                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2930 }
2931 
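     /* Print one TIME_WAIT socket in /proc/net/tcp format. */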
2932 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2933                                struct seq_file *f, int i)
2934 {
2935         long delta = tw->tw_timer.expires - jiffies;
2936         __be32 dest, src;
2937         __u16 destp, srcp;
2938 
2939         dest  = tw->tw_daddr;
2940         src   = tw->tw_rcv_saddr;
2941         destp = ntohs(tw->tw_dport);
2942         srcp  = ntohs(tw->tw_sport);
2943 
2944         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2945                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2946                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2947                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2948                 refcount_read(&tw->tw_refcnt), tw);
2949 }
2950 
2951 #define TMPSZ 150
2952 
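     /* seq_file ->show(): emit the header or one socket entry, padded to
      * TMPSZ - 1 characters so every line has the same width.
      */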
2953 static int tcp4_seq_show(struct seq_file *seq, void *v)
2954 {
2955         struct tcp_iter_state *st;
2956         struct sock *sk = v;
2957 
2958         seq_setwidth(seq, TMPSZ - 1);
2959         if (v == SEQ_START_TOKEN) {
2960                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2961                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2962                            "inode");
2963                 goto out;
2964         }
2965         st = seq->private;
2966 
2967         if (sk->sk_state == TCP_TIME_WAIT)
2968                 get_timewait4_sock(v, seq, st->num);
2969         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2970                 get_openreq4(v, seq, st->num);
2971         else
2972                 get_tcp4_sock(v, seq, st->num);
2973 out:
2974         seq_pad(seq, '\n');
2975         return 0;
2976 }
2977 
2978 #ifdef CONFIG_BPF_SYSCALL
2979 struct bpf_tcp_iter_state {
2980         struct tcp_iter_state state;
2981         unsigned int cur_sk;
2982         unsigned int end_sk;
2983         unsigned int max_sk;
2984         struct sock **batch;
2985         bool st_bucket_done;
2986 };
2987 
2988 struct bpf_iter__tcp {
2989         __bpf_md_ptr(struct bpf_iter_meta *, meta);
2990         __bpf_md_ptr(struct sock_common *, sk_common);
2991         uid_t uid __aligned(8);
2992 };
2993 
2994 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2995                              struct sock_common *sk_common, uid_t uid)
2996 {
2997         struct bpf_iter__tcp ctx;
2998 
2999         meta->seq_num--;  /* skip SEQ_START_TOKEN */
3000         ctx.meta = meta;
3001         ctx.sk_common = sk_common;
3002         ctx.uid = uid;
3003         return bpf_iter_run_prog(prog, &ctx);
3004 }
3005 
3006 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
3007 {
3008         while (iter->cur_sk < iter->end_sk)
3009                 sock_gen_put(iter->batch[iter->cur_sk++]);
3010 }
3011 
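     /* Grow the batch array; sockets still referenced in the old batch
      * are released before it is freed.
      */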
3012 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
3013                                       unsigned int new_batch_sz)
3014 {
3015         struct sock **new_batch;
3016 
3017         new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3018                              GFP_USER | __GFP_NOWARN);
3019         if (!new_batch)
3020                 return -ENOMEM;
3021 
3022         bpf_iter_tcp_put_batch(iter);
3023         kvfree(iter->batch);
3024         iter->batch = new_batch;
3025         iter->max_sk = new_batch_sz;
3026 
3027         return 0;
3028 }
3029 
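     /* Take a reference on every matching socket remaining in the current
      * listening bucket (up to iter->max_sk), then drop the bucket lock.
      * Returns the number of matches seen, which may exceed what fit in
      * the batch.
      */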
3030 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
3031                                                  struct sock *start_sk)
3032 {
3033         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3034         struct bpf_tcp_iter_state *iter = seq->private;
3035         struct tcp_iter_state *st = &iter->state;
3036         struct hlist_nulls_node *node;
3037         unsigned int expected = 1;
3038         struct sock *sk;
3039 
3040         sock_hold(start_sk);
3041         iter->batch[iter->end_sk++] = start_sk;
3042 
3043         sk = sk_nulls_next(start_sk);
3044         sk_nulls_for_each_from(sk, node) {
3045                 if (seq_sk_match(seq, sk)) {
3046                         if (iter->end_sk < iter->max_sk) {
3047                                 sock_hold(sk);
3048                                 iter->batch[iter->end_sk++] = sk;
3049                         }
3050                         expected++;
3051                 }
3052         }
3053         spin_unlock(&hinfo->lhash2[st->bucket].lock);
3054 
3055         return expected;
3056 }
3057 
3058 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
3059                                                    struct sock *start_sk)
3060 {
3061         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3062         struct bpf_tcp_iter_state *iter = seq->private;
3063         struct tcp_iter_state *st = &iter->state;
3064         struct hlist_nulls_node *node;
3065         unsigned int expected = 1;
3066         struct sock *sk;
3067 
3068         sock_hold(start_sk);
3069         iter->batch[iter->end_sk++] = start_sk;
3070 
3071         sk = sk_nulls_next(start_sk);
3072         sk_nulls_for_each_from(sk, node) {
3073                 if (seq_sk_match(seq, sk)) {
3074                         if (iter->end_sk < iter->max_sk) {
3075                                 sock_hold(sk);
3076                                 iter->batch[iter->end_sk++] = sk;
3077                         }
3078                         expected++;
3079                 }
3080         }
3081         spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3082 
3083         return expected;
3084 }
3085 
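     /* Fill iter->batch with references to all matching sockets in the
      * next non-empty bucket so the BPF program can run on them without
      * the bucket lock held.  If the batch array turns out to be too
      * small, it is grown once and the bucket is re-read.
      */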
3086 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
3087 {
3088         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3089         struct bpf_tcp_iter_state *iter = seq->private;
3090         struct tcp_iter_state *st = &iter->state;
3091         unsigned int expected;
3092         bool resized = false;
3093         struct sock *sk;
3094 
3095         /* The st->bucket is done.  Directly advance to the next
3096          * bucket instead of having tcp_seek_last_pos() skip sockets
3097          * one by one in the current bucket, only to find out that
3098          * it has to advance to the next bucket anyway.
3099          */
3100         if (iter->st_bucket_done) {
3101                 st->offset = 0;
3102                 st->bucket++;
3103                 if (st->state == TCP_SEQ_STATE_LISTENING &&
3104                     st->bucket > hinfo->lhash2_mask) {
3105                         st->state = TCP_SEQ_STATE_ESTABLISHED;
3106                         st->bucket = 0;
3107                 }
3108         }
3109 
3110 again:
3111         /* Get a new batch */
3112         iter->cur_sk = 0;
3113         iter->end_sk = 0;
3114         iter->st_bucket_done = false;
3115 
3116         sk = tcp_seek_last_pos(seq);
3117         if (!sk)
3118                 return NULL; /* Done */
3119 
3120         if (st->state == TCP_SEQ_STATE_LISTENING)
3121                 expected = bpf_iter_tcp_listening_batch(seq, sk);
3122         else
3123                 expected = bpf_iter_tcp_established_batch(seq, sk);
3124 
3125         if (iter->end_sk == expected) {
3126                 iter->st_bucket_done = true;
3127                 return sk;
3128         }
3129 
3130         if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
3131                 resized = true;
3132                 goto again;
3133         }
3134 
3135         return sk;
3136 }
3137 
3138 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
3139 {
3140         /* bpf iter does not support lseek, so it always
3141          * continues from where it was stop()-ped.
3142          */
3143         if (*pos)
3144                 return bpf_iter_tcp_batch(seq);
3145 
3146         return SEQ_START_TOKEN;
3147 }
3148 
3149 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3150 {
3151         struct bpf_tcp_iter_state *iter = seq->private;
3152         struct tcp_iter_state *st = &iter->state;
3153         struct sock *sk;
3154 
3155         /* Whenever seq_next() is called, the sk at iter->cur_sk is
3156          * done with seq_show(), so advance to the next sk in
3157          * the batch.
3158          */
3159         if (iter->cur_sk < iter->end_sk) {
3160                 /* Keeping st->num consistent in tcp_iter_state.
3161                  * bpf_iter_tcp does not use st->num.
3162                  * meta.seq_num is used instead.
3163                  */
3164                 st->num++;
3165                 /* Move st->offset to the next sk in the bucket such that
3166                  * the future start() will resume at st->offset in
3167                  * st->bucket.  See tcp_seek_last_pos().
3168                  */
3169                 st->offset++;
3170                 sock_gen_put(iter->batch[iter->cur_sk++]);
3171         }
3172 
3173         if (iter->cur_sk < iter->end_sk)
3174                 sk = iter->batch[iter->cur_sk];
3175         else
3176                 sk = bpf_iter_tcp_batch(seq);
3177 
3178         ++*pos;
3179         /* Keeping st->last_pos consistent in tcp_iter_state.
3180          * bpf iter does not do lseek, so st->last_pos always equals *pos.
3181          */
3182         st->last_pos = *pos;
3183         return sk;
3184 }
3185 
3186 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
3187 {
3188         struct bpf_iter_meta meta;
3189         struct bpf_prog *prog;
3190         struct sock *sk = v;
3191         uid_t uid;
3192         int ret;
3193 
3194         if (v == SEQ_START_TOKEN)
3195                 return 0;
3196 
3197         if (sk_fullsock(sk))
3198                 lock_sock(sk);
3199 
3200         if (unlikely(sk_unhashed(sk))) {
3201                 ret = SEQ_SKIP;
3202                 goto unlock;
3203         }
3204 
3205         if (sk->sk_state == TCP_TIME_WAIT) {
3206                 uid = 0;
3207         } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
3208                 const struct request_sock *req = v;
3209 
3210                 uid = from_kuid_munged(seq_user_ns(seq),
3211                                        sock_i_uid(req->rsk_listener));
3212         } else {
3213                 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3214         }
3215 
3216         meta.seq = seq;
3217         prog = bpf_iter_get_info(&meta, false);
3218         ret = tcp_prog_seq_show(prog, &meta, v, uid);
3219 
3220 unlock:
3221         if (sk_fullsock(sk))
3222                 release_sock(sk);
3223         return ret;
3224 
3225 }
3226 
3227 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3228 {
3229         struct bpf_tcp_iter_state *iter = seq->private;
3230         struct bpf_iter_meta meta;
3231         struct bpf_prog *prog;
3232 
3233         if (!v) {
3234                 meta.seq = seq;
3235                 prog = bpf_iter_get_info(&meta, true);
3236                 if (prog)
3237                         (void)tcp_prog_seq_show(prog, &meta, v, 0);
3238         }
3239 
3240         if (iter->cur_sk < iter->end_sk) {
3241                 bpf_iter_tcp_put_batch(iter);
3242                 iter->st_bucket_done = false;
3243         }
3244 }
3245 
3246 static const struct seq_operations bpf_iter_tcp_seq_ops = {
3247         .show           = bpf_iter_tcp_seq_show,
3248         .start          = bpf_iter_tcp_seq_start,
3249         .next           = bpf_iter_tcp_seq_next,
3250         .stop           = bpf_iter_tcp_seq_stop,
3251 };
3252 #endif
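
     /* Return the address family this seq file filters on: AF_UNSPEC for
      * the bpf_iter case (the BPF program does the filtering), otherwise
      * the family recorded in the procfs afinfo.
      */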
3253 static unsigned short seq_file_family(const struct seq_file *seq)
3254 {
3255         const struct tcp_seq_afinfo *afinfo;
3256 
3257 #ifdef CONFIG_BPF_SYSCALL
3258         /* Iterated from bpf_iter.  Let the bpf prog filter instead. */
3259         if (seq->op == &bpf_iter_tcp_seq_ops)
3260                 return AF_UNSPEC;
3261 #endif
3262 
3263         /* Iterated from proc fs */
3264         afinfo = pde_data(file_inode(seq->file));
3265         return afinfo->family;
3266 }
3267 
3268 static const struct seq_operations tcp4_seq_ops = {
3269         .show           = tcp4_seq_show,
3270         .start          = tcp_seq_start,
3271         .next           = tcp_seq_next,
3272         .stop           = tcp_seq_stop,
3273 };
3274 
3275 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3276         .family         = AF_INET,
3277 };
3278 
3279 static int __net_init tcp4_proc_init_net(struct net *net)
3280 {
3281         if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3282                         sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3283                 return -ENOMEM;
3284         return 0;
3285 }
3286 
3287 static void __net_exit tcp4_proc_exit_net(struct net *net)
3288 {
3289         remove_proc_entry("tcp", net->proc_net);
3290 }
3291 
3292 static struct pernet_operations tcp4_net_ops = {
3293         .init = tcp4_proc_init_net,
3294         .exit = tcp4_proc_exit_net,
3295 };
3296 
3297 int __init tcp4_proc_init(void)
3298 {
3299         return register_pernet_subsys(&tcp4_net_ops);
3300 }
3301 
3302 void tcp4_proc_exit(void)
3303 {
3304         unregister_pernet_subsys(&tcp4_net_ops);
3305 }
3306 #endif /* CONFIG_PROC_FS */
3307 
3308 /* @wake is one when sk_stream_write_space() calls us.
3309  * This sends EPOLLOUT only if notsent_bytes is below half the limit.
3310  * This mimics the strategy used in sock_def_write_space().
3311  */
3312 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3313 {
3314         const struct tcp_sock *tp = tcp_sk(sk);
3315         u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3316                             READ_ONCE(tp->snd_nxt);
3317 
3318         return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3319 }
3320 EXPORT_SYMBOL(tcp_stream_memory_free);
3321 
3322 struct proto tcp_prot = {
3323         .name                   = "TCP",
3324         .owner                  = THIS_MODULE,
3325         .close                  = tcp_close,
3326         .pre_connect            = tcp_v4_pre_connect,
3327         .connect                = tcp_v4_connect,
3328         .disconnect             = tcp_disconnect,
3329         .accept                 = inet_csk_accept,
3330         .ioctl                  = tcp_ioctl,
3331         .init                   = tcp_v4_init_sock,
3332         .destroy                = tcp_v4_destroy_sock,
3333         .shutdown               = tcp_shutdown,
3334         .setsockopt             = tcp_setsockopt,
3335         .getsockopt             = tcp_getsockopt,
3336         .bpf_bypass_getsockopt  = tcp_bpf_bypass_getsockopt,
3337         .keepalive              = tcp_set_keepalive,
3338         .recvmsg                = tcp_recvmsg,
3339         .sendmsg                = tcp_sendmsg,
3340         .splice_eof             = tcp_splice_eof,
3341         .backlog_rcv            = tcp_v4_do_rcv,
3342         .release_cb             = tcp_release_cb,
3343         .hash                   = inet_hash,
3344         .unhash                 = inet_unhash,
3345         .get_port               = inet_csk_get_port,
3346         .put_port               = inet_put_port,
3347 #ifdef CONFIG_BPF_SYSCALL
3348         .psock_update_sk_prot   = tcp_bpf_update_proto,
3349 #endif
3350         .enter_memory_pressure  = tcp_enter_memory_pressure,
3351         .leave_memory_pressure  = tcp_leave_memory_pressure,
3352         .stream_memory_free     = tcp_stream_memory_free,
3353         .sockets_allocated      = &tcp_sockets_allocated,
3354         .orphan_count           = &tcp_orphan_count,
3355 
3356         .memory_allocated       = &tcp_memory_allocated,
3357         .per_cpu_fw_alloc       = &tcp_memory_per_cpu_fw_alloc,
3358 
3359         .memory_pressure        = &tcp_memory_pressure,
3360         .sysctl_mem             = sysctl_tcp_mem,
3361         .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3362         .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3363         .max_header             = MAX_TCP_HEADER,
3364         .obj_size               = sizeof(struct tcp_sock),
3365         .slab_flags             = SLAB_TYPESAFE_BY_RCU,
3366         .twsk_prot              = &tcp_timewait_sock_ops,
3367         .rsk_prot               = &tcp_request_sock_ops,
3368         .h.hashinfo             = NULL,
3369         .no_autobind            = true,
3370         .diag_destroy           = tcp_abort,
3371 };
3372 EXPORT_SYMBOL(tcp_prot);
3373 
3374 static void __net_exit tcp_sk_exit(struct net *net)
3375 {
3376         if (net->ipv4.tcp_congestion_control)
3377                 bpf_module_put(net->ipv4.tcp_congestion_control,
3378                                net->ipv4.tcp_congestion_control->owner);
3379 }
3380 
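     /* Choose the ehash table for a new namespace: allocate a private one
      * when the creating namespace set tcp_child_ehash_entries, otherwise
      * share the global tcp_hashinfo.  The TIME_WAIT and SYN backlog
      * limits are scaled from the resulting table size.
      */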
3381 static void __net_init tcp_set_hashinfo(struct net *net)
3382 {
3383         struct inet_hashinfo *hinfo;
3384         unsigned int ehash_entries;
3385         struct net *old_net;
3386 
3387         if (net_eq(net, &init_net))
3388                 goto fallback;
3389 
3390         old_net = current->nsproxy->net_ns;
3391         ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3392         if (!ehash_entries)
3393                 goto fallback;
3394 
3395         ehash_entries = roundup_pow_of_two(ehash_entries);
3396         hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3397         if (!hinfo) {
3398                 pr_warn("Failed to allocate TCP ehash (entries: %u) "
3399                         "for a netns, fallback to the global one\n",
3400                         ehash_entries);
3401 fallback:
3402                 hinfo = &tcp_hashinfo;
3403                 ehash_entries = tcp_hashinfo.ehash_mask + 1;
3404         }
3405 
3406         net->ipv4.tcp_death_row.hashinfo = hinfo;
3407         net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3408         net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3409 }
3410 
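     /* Set the per-namespace TCP defaults: sysctls, the ehash table and
      * the congestion control module (inherited from init_net when its
      * module can be taken, otherwise Reno).
      */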
3411 static int __net_init tcp_sk_init(struct net *net)
3412 {
3413         net->ipv4.sysctl_tcp_ecn = 2;
3414         net->ipv4.sysctl_tcp_ecn_fallback = 1;
3415 
3416         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3417         net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3418         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3419         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3420         net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3421 
3422         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3423         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3424         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3425 
3426         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3427         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3428         net->ipv4.sysctl_tcp_syncookies = 1;
3429         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3430         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3431         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3432         net->ipv4.sysctl_tcp_orphan_retries = 0;
3433         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3434         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3435         net->ipv4.sysctl_tcp_tw_reuse = 2;
3436         net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3437 
3438         refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3439         tcp_set_hashinfo(net);
3440 
3441         net->ipv4.sysctl_tcp_sack = 1;
3442         net->ipv4.sysctl_tcp_window_scaling = 1;
3443         net->ipv4.sysctl_tcp_timestamps = 1;
3444         net->ipv4.sysctl_tcp_early_retrans = 3;
3445         net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3446         net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3447         net->ipv4.sysctl_tcp_retrans_collapse = 1;
3448         net->ipv4.sysctl_tcp_max_reordering = 300;
3449         net->ipv4.sysctl_tcp_dsack = 1;
3450         net->ipv4.sysctl_tcp_app_win = 31;
3451         net->ipv4.sysctl_tcp_adv_win_scale = 1;
3452         net->ipv4.sysctl_tcp_frto = 2;
3453         net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3454         /* This limits the percentage of the congestion window which we
3455          * will allow a single TSO frame to consume.  Building TSO frames
3456          * which are too large can cause TCP streams to be bursty.
3457          */
3458         net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3459         /* Default TSQ limit of 16 TSO segments */
3460         net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3461 
3462         /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3463         net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3464 
3465         net->ipv4.sysctl_tcp_min_tso_segs = 2;
3466         net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3467         net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3468         net->ipv4.sysctl_tcp_autocorking = 1;
3469         net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3470         net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3471         net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3472         if (net != &init_net) {
3473                 memcpy(net->ipv4.sysctl_tcp_rmem,
3474                        init_net.ipv4.sysctl_tcp_rmem,
3475                        sizeof(init_net.ipv4.sysctl_tcp_rmem));
3476                 memcpy(net->ipv4.sysctl_tcp_wmem,
3477                        init_net.ipv4.sysctl_tcp_wmem,
3478                        sizeof(init_net.ipv4.sysctl_tcp_wmem));
3479         }
3480         net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3481         net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3482         net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3483         net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
3484         net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3485         net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3486         atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3487 
3488         /* Set default values for PLB */
3489         net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3490         net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3491         net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3492         net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3493         /* Default congestion threshold for PLB to mark a round is 50% */
3494         net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3495 
3496         /* Reno is always built in */
3497         if (!net_eq(net, &init_net) &&
3498             bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3499                                init_net.ipv4.tcp_congestion_control->owner))
3500                 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3501         else
3502                 net->ipv4.tcp_congestion_control = &tcp_reno;
3503 
3504         net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3505         net->ipv4.sysctl_tcp_shrink_window = 0;
3506 
3507         net->ipv4.sysctl_tcp_pingpong_thresh = 1;
3508         net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN);
3509 
3510         return 0;
3511 }
3512 
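     /* Batched namespace teardown: purge TIME_WAIT sockets, then free any
      * per-netns ehash table and the TCP fastopen context.
      */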
3513 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3514 {
3515         struct net *net;
3516 
3517         tcp_twsk_purge(net_exit_list);
3518 
3519         list_for_each_entry(net, net_exit_list, exit_list) {
3520                 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3521                 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3522                 tcp_fastopen_ctx_destroy(net);
3523         }
3524 }
3525 
3526 static struct pernet_operations __net_initdata tcp_sk_ops = {
3527        .init       = tcp_sk_init,
3528        .exit       = tcp_sk_exit,
3529        .exit_batch = tcp_sk_exit_batch,
3530 };
3531 
3532 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3533 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3534                      struct sock_common *sk_common, uid_t uid)
3535 
3536 #define INIT_BATCH_SZ 16
3537 
3538 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3539 {
3540         struct bpf_tcp_iter_state *iter = priv_data;
3541         int err;
3542 
3543         err = bpf_iter_init_seq_net(priv_data, aux);
3544         if (err)
3545                 return err;
3546 
3547         err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3548         if (err) {
3549                 bpf_iter_fini_seq_net(priv_data);
3550                 return err;
3551         }
3552 
3553         return 0;
3554 }
3555 
3556 static void bpf_iter_fini_tcp(void *priv_data)
3557 {
3558         struct bpf_tcp_iter_state *iter = priv_data;
3559 
3560         bpf_iter_fini_seq_net(priv_data);
3561         kvfree(iter->batch);
3562 }
3563 
3564 static const struct bpf_iter_seq_info tcp_seq_info = {
3565         .seq_ops                = &bpf_iter_tcp_seq_ops,
3566         .init_seq_private       = bpf_iter_init_tcp,
3567         .fini_seq_private       = bpf_iter_fini_tcp,
3568         .seq_priv_size          = sizeof(struct bpf_tcp_iter_state),
3569 };
3570 
3571 static const struct bpf_func_proto *
3572 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3573                             const struct bpf_prog *prog)
3574 {
3575         switch (func_id) {
3576         case BPF_FUNC_setsockopt:
3577                 return &bpf_sk_setsockopt_proto;
3578         case BPF_FUNC_getsockopt:
3579                 return &bpf_sk_getsockopt_proto;
3580         default:
3581                 return NULL;
3582         }
3583 }
3584 
3585 static struct bpf_iter_reg tcp_reg_info = {
3586         .target                 = "tcp",
3587         .ctx_arg_info_size      = 1,
3588         .ctx_arg_info           = {
3589                 { offsetof(struct bpf_iter__tcp, sk_common),
3590                   PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
3591         },
3592         .get_func_proto         = bpf_iter_tcp_get_func_proto,
3593         .seq_info               = &tcp_seq_info,
3594 };
3595 
3596 static void __init bpf_iter_register(void)
3597 {
3598         tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3599         if (bpf_iter_reg_target(&tcp_reg_info))
3600                 pr_warn("Warning: could not register bpf iterator tcp\n");
3601 }
3602 
3603 #endif
3604 
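     /* Boot-time init: create one control socket per possible CPU (used to
      * send RSTs and ACKs on behalf of SYN_RECV/TIME_WAIT sockets), then
      * register the per-netns operations and, if configured, the BPF tcp
      * iterator.
      */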
3605 void __init tcp_v4_init(void)
3606 {
3607         int cpu, res;
3608 
3609         for_each_possible_cpu(cpu) {
3610                 struct sock *sk;
3611 
3612                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3613                                            IPPROTO_TCP, &init_net);
3614                 if (res)
3615                         panic("Failed to create the TCP control socket.\n");
3616                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3617 
3618                 /* Please enforce IP_DF and IPID==0 for RST and
3619                  * ACK sent in SYN-RECV and TIME-WAIT state.
3620                  */
3621                 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3622 
3623                 sk->sk_clockid = CLOCK_MONOTONIC;
3624 
3625                 per_cpu(ipv4_tcp_sk.sock, cpu) = sk;
3626         }
3627         if (register_pernet_subsys(&tcp_sk_ops))
3628                 panic("Failed to create the TCP control socket.\n");
3629 
3630 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3631         bpf_iter_register();
3632 #endif
3633 }
3634 
