~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

TOMOYO Linux Cross Reference
Linux/net/ipv4/inet_timewait_sock.c

Version: ~ [ linux-6.11.5 ] ~ [ linux-6.10.14 ] ~ [ linux-6.9.12 ] ~ [ linux-6.8.12 ] ~ [ linux-6.7.12 ] ~ [ linux-6.6.58 ] ~ [ linux-6.5.13 ] ~ [ linux-6.4.16 ] ~ [ linux-6.3.13 ] ~ [ linux-6.2.16 ] ~ [ linux-6.1.114 ] ~ [ linux-6.0.19 ] ~ [ linux-5.19.17 ] ~ [ linux-5.18.19 ] ~ [ linux-5.17.15 ] ~ [ linux-5.16.20 ] ~ [ linux-5.15.169 ] ~ [ linux-5.14.21 ] ~ [ linux-5.13.19 ] ~ [ linux-5.12.19 ] ~ [ linux-5.11.22 ] ~ [ linux-5.10.228 ] ~ [ linux-5.9.16 ] ~ [ linux-5.8.18 ] ~ [ linux-5.7.19 ] ~ [ linux-5.6.19 ] ~ [ linux-5.5.19 ] ~ [ linux-5.4.284 ] ~ [ linux-5.3.18 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.322 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.336 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.337 ] ~ [ linux-4.4.302 ] ~ [ linux-3.10.108 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.9 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 // SPDX-License-Identifier: GPL-2.0-only
  2 /*
  3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
  4  *              operating system.  INET is implemented using the  BSD Socket
  5  *              interface as the means of communication with the user level.
  6  *
  7  *              Generic TIME_WAIT sockets functions
  8  *
  9  *              From code originally in TCP
 10  */
 11 
 12 #include <linux/kernel.h>
 13 #include <linux/slab.h>
 14 #include <linux/module.h>
 15 #include <net/inet_hashtables.h>
 16 #include <net/inet_timewait_sock.h>
 17 #include <net/ip.h>
 18 
 19 
 20 /**
 21  *      inet_twsk_bind_unhash - unhash a timewait socket from bind hash
 22  *      @tw: timewait socket
 23  *      @hashinfo: hashinfo pointer
 24  *
 25  *      unhash a timewait socket from bind hash, if hashed.
 26  *      bind hash lock must be held by caller.
 27  *      Returns nothing; a reference on @tw is dropped here via __sock_put().
 28  */
 29 void inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
 30                           struct inet_hashinfo *hashinfo)
 31 {
            /* Snapshot both bucket pointers: the tw fields are cleared below,
             * before the buckets are handed to their destroy helpers.
             */
 32         struct inet_bind2_bucket *tb2 = tw->tw_tb2;
 33         struct inet_bind_bucket *tb = tw->tw_tb;
 34 
            /* No bind bucket attached: tw is not in the bind hash. */
 35         if (!tb)
 36                 return;
 37 
            /* Unlink tw from the bucket's owner list, then forget both buckets. */
 38         __sk_del_bind_node((struct sock *)tw);
 39         tw->tw_tb = NULL;
 40         tw->tw_tb2 = NULL;
 41         inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2);
 42         inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
 43 
            /* Presumably the "bhash chain" reference counted in
             * inet_twsk_hashdance_schedule(); dropped without the
             * free-on-zero check that inet_twsk_put() performs.
             */
 44         __sock_put((struct sock *)tw);
 45 }
 46 
 47 /* Must be called with locally disabled BHs.
     *
     * Unlinks @tw from its ehash chain and from the bind hash, decrements the
     * death row's tw_refcount and finally drops one reference on @tw.
     */
 48 static void inet_twsk_kill(struct inet_timewait_sock *tw)
 49 {
 50         struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo;
 51         spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
 52         struct inet_bind_hashbucket *bhead, *bhead2;
 53 
            /* Remove tw from the ehash chain under the ehash lock selected by
             * tw_hash.  NOTE(review): the helper presumably also drops the
             * ehash-chain reference -- confirm in its definition.
             */
 54         spin_lock(lock);
 55         sk_nulls_del_node_init_rcu((struct sock *)tw);
 56         spin_unlock(lock);
 57 
 58         /* Disassociate with bind bucket. */
 59         bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num,
 60                         hashinfo->bhash_size)];
 61         bhead2 = inet_bhashfn_portaddr(hashinfo, (struct sock *)tw,
 62                                        twsk_net(tw), tw->tw_num);
 63 
            /* bhash lock first, then bhash2, released in reverse: the same
             * order inet_twsk_hashdance_schedule() uses.
             */
 64         spin_lock(&bhead->lock);
 65         spin_lock(&bhead2->lock);
 66         inet_twsk_bind_unhash(tw, hashinfo);
 67         spin_unlock(&bhead2->lock);
 68         spin_unlock(&bhead->lock);
 69 
            /* Pairs with the refcount_inc() done by __inet_twsk_schedule()
             * when the timer was first armed.
             */
 70         refcount_dec(&tw->tw_dr->tw_refcount);
            /* Drop one more reference; tw is freed if this was the last. */
 71         inet_twsk_put(tw);
 72 }
 73 
 74 void inet_twsk_free(struct inet_timewait_sock *tw)
 75 {
            /* Destroy @tw: run twsk_destructor(), return the object to the
             * protocol's twsk_slab cache and release the module reference
             * taken by __module_get() in inet_twsk_alloc().
             *
             * The owner pointer is read up front because tw is no longer
             * ours to dereference once kmem_cache_free() has run.
             */
 76         struct module *owner = tw->tw_prot->owner;
 77         twsk_destructor((struct sock *)tw);
 78         kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw);
 79         module_put(owner);
 80 }
 81 
 82 void inet_twsk_put(struct inet_timewait_sock *tw)
 83 {
            /* Drop one reference on @tw; when refcount_dec_and_test() reports
             * that tw_refcnt reached zero, release the socket through
             * inet_twsk_free().
             */
 84         if (refcount_dec_and_test(&tw->tw_refcnt))
 85                 inet_twsk_free(tw);
 86 }
 87 EXPORT_SYMBOL_GPL(inet_twsk_put);
 88 
 89 static void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw,
 90                                    struct hlist_nulls_head *list)
 91 {
            /* Link tw's tw_node at the head of @list with the RCU variant of
             * the nulls-list insert helper.
             */
 92         hlist_nulls_add_head_rcu(&tw->tw_node, list);
 93 }
 94 
 95 static void inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo)
 96 {
            /* First-time arming: rearm == false selects the branch of
             * __inet_twsk_schedule() that updates the MIB counter, calls
             * mod_timer() and increments the death row's tw_refcount.
             */
 97         __inet_twsk_schedule(tw, timeo, false);
 98 }
 99 
100 /*
101  * Enter the time wait state.
102  * Essentially we whip up a timewait bucket, copy the relevant info into it
103  * from the SK, and mess with hash chains and list linkage.
104  *
     * @tw:       timewait socket to insert into the bind and ehash tables
     * @sk:       full socket that @tw replaces in the ehash chain
     * @hashinfo: hash tables holding the bind and ehash chains
     * @timeo:    timewait timeout, added to jiffies when the timer is armed
     *
105  * The caller must not access @tw anymore after this function returns.
106  */
107 void inet_twsk_hashdance_schedule(struct inet_timewait_sock *tw,
108                                   struct sock *sk,
109                                   struct inet_hashinfo *hashinfo,
110                                   int timeo)
111 {
112         const struct inet_sock *inet = inet_sk(sk);
113         const struct inet_connection_sock *icsk = inet_csk(sk);
114         struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash);
115         spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
116         struct inet_bind_hashbucket *bhead, *bhead2;
117 
118         /* Step 1: Put TW into bind hash. Original socket stays there too.
119            Note, that any socket with inet->inet_num != 0 MUST be bound in
120            binding cache, even if it is closed.
121          */
122         bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num,
123                         hashinfo->bhash_size)];
124         bhead2 = inet_bhashfn_portaddr(hashinfo, sk, twsk_net(tw), inet->inet_num);
125 
            /* BHs stay disabled until the very end of the function.  The
             * bhash lock is taken before the bhash2 lock and released in
             * reverse order.
             */
126         local_bh_disable();
127         spin_lock(&bhead->lock);
128         spin_lock(&bhead2->lock);
129 
            /* tw shares sk's bind buckets.  A missing bucket only triggers a
             * WARN; tw->tw_tb2 is still used by sk_add_bind_node() below.
             */
130         tw->tw_tb = icsk->icsk_bind_hash;
131         WARN_ON(!icsk->icsk_bind_hash);
132 
133         tw->tw_tb2 = icsk->icsk_bind2_hash;
134         WARN_ON(!icsk->icsk_bind2_hash);
135         sk_add_bind_node((struct sock *)tw, &tw->tw_tb2->owners);
136 
137         spin_unlock(&bhead2->lock);
138         spin_unlock(&bhead->lock);
139 
            /* Steps 2 and 3, the refcount publication and the timer arming
             * all happen under the ehash lock.
             */
140         spin_lock(lock);
141 
142         /* Step 2: Hash TW into tcp ehash chain */
143         inet_twsk_add_node_rcu(tw, &ehead->chain);
144 
145         /* Step 3: Remove SK from hash chain */
146         if (__sk_nulls_del_node_init_rcu(sk))
147                 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
148 
149 
150         /* Ensure above writes are committed into memory before updating the
151          * refcount.
152          * Provides ordering vs later refcount_inc().
153          */
154         smp_wmb();
155         /* tw_refcnt is set to 3 because we have :
156          * - one reference for bhash chain.
157          * - one reference for ehash chain.
158          * - one reference for timer.
159          * Also note that after this point, we lost our implicit reference
160          * so we are not allowed to use tw anymore.
161          */
162         refcount_set(&tw->tw_refcnt, 3);
163 
            /* The timer is armed before the ehash lock is dropped:
             * inet_twsk_deschedule_put() takes and releases that lock
             * precisely to wait for this call to have completed.
             */
164         inet_twsk_schedule(tw, timeo);
165 
166         spin_unlock(lock);
167         local_bh_enable();
168 }
169 EXPORT_SYMBOL_GPL(inet_twsk_hashdance_schedule);
170 
171 static void tw_timer_handler(struct timer_list *t)
172 {
            /* Timer callback registered by timer_setup() in inet_twsk_alloc():
             * recover the timewait socket that embeds @t as its tw_timer
             * member and tear it down.
             */
173         struct inet_timewait_sock *tw = from_timer(tw, t, tw_timer);
174 
175         inet_twsk_kill(tw);
176 }
177 
178 struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
179                                            struct inet_timewait_death_row *dr,
180                                            const int state)
181 {
            /* Allocate a timewait socket for @sk on death row @dr and copy
             * sk's identity into it.  @state is stored as tw_substate; tw_state
             * itself is always TCP_TIME_WAIT.
             *
             * Returns the new socket with tw_refcnt == 0 and its timer set up
             * but not armed, or NULL when the death row limit is reached or
             * the allocation fails.
             */
182         struct inet_timewait_sock *tw;
183 
            /* Enforce sysctl_max_tw_buckets.  NOTE(review): the "- 1"
             * presumably discounts a base reference held in dr->tw_refcount
             * -- confirm against the death row initialisation.
             */
184         if (refcount_read(&dr->tw_refcount) - 1 >=
185             READ_ONCE(dr->sysctl_max_tw_buckets))
186                 return NULL;
187 
            /* The slab cache comes from the protocol that created sk. */
188         tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab,
189                               GFP_ATOMIC);
190         if (tw) {
191                 const struct inet_sock *inet = inet_sk(sk);
192 
193                 tw->tw_dr           = dr;
194                 /* Give us an identity. */
195                 tw->tw_daddr        = inet->inet_daddr;
196                 tw->tw_rcv_saddr    = inet->inet_rcv_saddr;
197                 tw->tw_bound_dev_if = sk->sk_bound_dev_if;
198                 tw->tw_tos          = inet->tos;
199                 tw->tw_num          = inet->inet_num;
200                 tw->tw_state        = TCP_TIME_WAIT;
201                 tw->tw_substate     = state;
202                 tw->tw_sport        = inet->inet_sport;
203                 tw->tw_dport        = inet->inet_dport;
204                 tw->tw_family       = sk->sk_family;
205                 tw->tw_reuse        = sk->sk_reuse;
206                 tw->tw_reuseport    = sk->sk_reuseport;
207                 tw->tw_hash         = sk->sk_hash;
208                 tw->tw_ipv6only     = 0;
209                 tw->tw_transparent  = inet_test_bit(TRANSPARENT, sk);
210                 tw->tw_prot         = sk->sk_prot_creator;
211                 atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie));
212                 twsk_net_set(tw, sock_net(sk));
                    /* Prepare, but do not arm, the expiry timer. */
213                 timer_setup(&tw->tw_timer, tw_timer_handler, 0);
214                 /*
215                  * Because we use RCU lookups, we should not set tw_refcnt
216                  * to a non null value before everything is setup for this
217                  * timewait socket.
218                  */
219                 refcount_set(&tw->tw_refcnt, 0);
220 
                    /* Released by module_put() in inet_twsk_free(). */
221                 __module_get(tw->tw_prot->owner);
222         }
223 
224         return tw;
225 }
226 EXPORT_SYMBOL_GPL(inet_twsk_alloc);
227 
228 /* These are always called from BH context.  See callers in
229  * tcp_input.c to verify this.
230  */
231 
232 /* This is for handling early-kills of TIME_WAIT sockets.
233  * Warning: consumes the caller's reference on @tw.
234  * Caller should not access tw anymore.
235  */
236 void inet_twsk_deschedule_put(struct inet_timewait_sock *tw)
237 {
238         struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo;
239         spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
240 
241         /* inet_twsk_purge() walks over all sockets, including tw ones,
242          * and removes them via inet_twsk_deschedule_put() after a
243          * refcount_inc_not_zero().
244          *
245          * inet_twsk_hashdance_schedule() must (re)init the refcount before
246          * arming the timer, i.e. inet_twsk_purge can obtain a reference to
247          * a twsk that did not yet schedule the timer.
248          *
249          * The ehash lock synchronizes these two:
250          * After acquiring the lock, the timer is always scheduled (else
251          * timer_shutdown returns false), because hashdance_schedule releases
252          * the ehash lock only after completing the timer initialization.
253          *
254          * Without grabbing the ehash lock, we get:
255          * 1) cpu x sets twsk refcount to 3
256          * 2) cpu y bumps refcount to 4
257          * 3) cpu y calls inet_twsk_deschedule_put() and shuts timer down
258          * 4) cpu x tries to start timer, but mod_timer is a noop post-shutdown
259          * -> timer refcount is never decremented.
260          */
261         spin_lock(lock);
262         /*  Makes sure hashdance_schedule() has completed */
263         spin_unlock(lock);
264 
            /* A true return presumably means the timer was still scheduled and
             * has now been stopped, so tw_timer_handler() will not run the
             * kill; do it here instead.
             */
265         if (timer_shutdown_sync(&tw->tw_timer))
266                 inet_twsk_kill(tw);
            /* Drop the reference handed in by the caller. */
267         inet_twsk_put(tw);
268 }
269 EXPORT_SYMBOL(inet_twsk_deschedule_put);
270 
271 void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm)
272 {
            /* Arm (rearm == false) or re-time (rearm == true) the timewait
             * timer of @tw so that it expires at jiffies + @timeo.
             */
273         /* timeout := RTO * 3.5
274          *
275          * 3.5 = 1+2+0.5 to wait for two retransmits.
276          *
277          * RATIONALE: if FIN arrived and we entered TIME-WAIT state,
278          * our ACK acking that FIN can be lost. If N subsequent retransmitted
279          * FINs (or previous segments) are lost (probability of such event
280          * is p^(N+1), where p is probability to lose single packet and
281          * time to detect the loss is about RTO*(2^N - 1) with exponential
282          * backoff). Normal timewait length is calculated so, that we
283          * waited at least for one retransmitted FIN (maximal RTO is 120sec).
284          * [ BTW Linux, following BSD, violates this requirement waiting
285          *   only for 60sec, we should wait at least for 240 secs.
286          *   Well, 240 consumes too much of resources 8)
287          * ]
288          * This interval is not reduced to catch old duplicate and
289          * responses to our wandering segments living for two MSLs.
290          * However, if we use PAWS to detect
291          * old duplicates, we can reduce the interval to bounds required
292          * by RTO, rather than MSL. So, if peer understands PAWS, we
293          * kill tw bucket after 3.5*RTO (it is important that this number
294          * is greater than TS tick!) and detect old duplicates with help
295          * of PAWS.
296          */
297 
298         if (!rearm) {
                    /* Timeouts of at most 4*HZ are accounted as TIMEWAITKILLED,
                     * longer ones as TIMEWAITED.
                     */
299                 bool kill = timeo <= 4*HZ;
300 
301                 __NET_INC_STATS(twsk_net(tw), kill ? LINUX_MIB_TIMEWAITKILLED :
302                                                      LINUX_MIB_TIMEWAITED);
                    /* A non-zero mod_timer() result is fatal here; presumably
                     * it means the timer was already pending, which must not
                     * happen on first arming.
                     */
303                 BUG_ON(mod_timer(&tw->tw_timer, jiffies + timeo));
                    /* Undone by the refcount_dec() in inet_twsk_kill(). */
304                 refcount_inc(&tw->tw_dr->tw_refcount);
305         } else {
                    /* Re-time only; no statistics or death row accounting.
                     * Judging by its name, mod_timer_pending() leaves a timer
                     * that is no longer pending untouched -- confirm in the
                     * timer API documentation.
                     */
306                 mod_timer_pending(&tw->tw_timer, jiffies + timeo);
307         }
308 }
309 EXPORT_SYMBOL_GPL(__inet_twsk_schedule);
310 
311 /* Remove all non full sockets (TIME_WAIT and NEW_SYN_RECV) for dead netns.
     *
     * Walks every ehash slot of @hashinfo under RCU.  Each matching socket is
     * pinned with a reference, torn down outside the RCU read section with
     * BHs disabled, and the slot is then rescanned from its head.
     */
312 void inet_twsk_purge(struct inet_hashinfo *hashinfo)
313 {
314         struct inet_ehash_bucket *head = &hashinfo->ehash[0];
315         unsigned int ehash_mask = hashinfo->ehash_mask;
316         struct hlist_nulls_node *node;
317         unsigned int slot;
318         struct sock *sk;
319 
320         for (slot = 0; slot <= ehash_mask; slot++, head++) {
321                 if (hlist_nulls_empty(&head->chain))
322                         continue;
323 
                    /* restart_rcu: re-entered after a socket was removed; gives
                     * the scheduler a chance to run before a new RCU section.
                     */
324 restart_rcu:
325                 cond_resched();
326                 rcu_read_lock();
                    /* restart: rescan the same chain inside the current RCU
                     * read section.
                     */
327 restart:
328                 sk_nulls_for_each_rcu(sk, node, &head->chain) {
329                         int state = inet_sk_state_load(sk);
330 
                            /* Only TIME_WAIT and NEW_SYN_RECV entries qualify. */
331                         if ((1 << state) & ~(TCPF_TIME_WAIT |
332                                              TCPF_NEW_SYN_RECV))
333                                 continue;
334 
                            /* A non-zero ns.count means the netns is still
                             * in use: leave its sockets alone.
                             */
335                         if (refcount_read(&sock_net(sk)->ns.count))
336                                 continue;
337 
                            /* Refcount already zero: skip this socket rather
                             * than take a reference on it.
                             */
338                         if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
339                                 continue;
340 
                            /* Re-check the netns now that the socket is pinned.
                             * If it no longer qualifies, give the reference
                             * back and rescan the chain.
                             */
341                         if (refcount_read(&sock_net(sk)->ns.count)) {
342                                 sock_gen_put(sk);
343                                 goto restart;
344                         }
345 
                            /* Leave the RCU section; the teardown below runs
                             * with BHs disabled.  inet_twsk_deschedule_put()
                             * consumes our reference; the reqsk helper
                             * presumably does too, judging by its name.
                             */
346                         rcu_read_unlock();
347                         local_bh_disable();
348                         if (state == TCP_TIME_WAIT) {
349                                 inet_twsk_deschedule_put(inet_twsk(sk));
350                         } else {
351                                 struct request_sock *req = inet_reqsk(sk);
352 
353                                 inet_csk_reqsk_queue_drop_and_put(req->rsk_listener,
354                                                                   req);
355                         }
356                         local_bh_enable();
357                         goto restart_rcu;
358                 }
359                 /* If the nulls value we got at the end of this lookup is
360                  * not the expected one, we must restart lookup.
361                  * We probably met an item that was moved to another chain.
362                  */
363                 if (get_nulls_value(node) != slot)
364                         goto restart;
365                 rcu_read_unlock();
366         }
367 }
368 EXPORT_SYMBOL_GPL(inet_twsk_purge);
369 

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

kernel.org | git.kernel.org | LWN.net | Project Home | SVN repository | Mail admin

Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.

sflogo.php