~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

TOMOYO Linux Cross Reference
Linux/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c

Version: ~ [ linux-6.11.5 ] ~ [ linux-6.10.14 ] ~ [ linux-6.9.12 ] ~ [ linux-6.8.12 ] ~ [ linux-6.7.12 ] ~ [ linux-6.6.58 ] ~ [ linux-6.5.13 ] ~ [ linux-6.4.16 ] ~ [ linux-6.3.13 ] ~ [ linux-6.2.16 ] ~ [ linux-6.1.114 ] ~ [ linux-6.0.19 ] ~ [ linux-5.19.17 ] ~ [ linux-5.18.19 ] ~ [ linux-5.17.15 ] ~ [ linux-5.16.20 ] ~ [ linux-5.15.169 ] ~ [ linux-5.14.21 ] ~ [ linux-5.13.19 ] ~ [ linux-5.12.19 ] ~ [ linux-5.11.22 ] ~ [ linux-5.10.228 ] ~ [ linux-5.9.16 ] ~ [ linux-5.8.18 ] ~ [ linux-5.7.19 ] ~ [ linux-5.6.19 ] ~ [ linux-5.5.19 ] ~ [ linux-5.4.284 ] ~ [ linux-5.3.18 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.322 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.336 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.337 ] ~ [ linux-4.4.302 ] ~ [ linux-3.10.108 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.9 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 // SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause
  2 /* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
  3 
  4 #define BPF_NO_KFUNC_PROTOTYPES
  5 #include "vmlinux.h"
  6 
  7 #include <bpf/bpf_helpers.h>
  8 #include <bpf/bpf_endian.h>
  9 #include <asm/errno.h>
 10 
 11 #include "bpf_compiler.h"
 12 
 13 #define TC_ACT_OK 0
 14 #define TC_ACT_SHOT 2
 15 
 16 #define NSEC_PER_SEC 1000000000L
 17 
 18 #define ETH_ALEN 6
 19 #define ETH_P_IP 0x0800
 20 #define ETH_P_IPV6 0x86DD
 21 
 22 #define tcp_flag_word(tp) (((union tcp_word_hdr *)(tp))->words[3])
 23 
 24 #define IP_DF 0x4000
 25 #define IP_MF 0x2000
 26 #define IP_OFFSET 0x1fff
 27 
 28 #define NEXTHDR_TCP 6
 29 
 30 #define TCPOPT_NOP 1
 31 #define TCPOPT_EOL 0
 32 #define TCPOPT_MSS 2
 33 #define TCPOPT_WINDOW 3
 34 #define TCPOPT_SACK_PERM 4
 35 #define TCPOPT_TIMESTAMP 8
 36 
 37 #define TCPOLEN_MSS 4
 38 #define TCPOLEN_WINDOW 3
 39 #define TCPOLEN_SACK_PERM 2
 40 #define TCPOLEN_TIMESTAMP 10
 41 
 42 #define TCP_TS_HZ 1000
 43 #define TS_OPT_WSCALE_MASK 0xf
 44 #define TS_OPT_SACK (1 << 4)
 45 #define TS_OPT_ECN (1 << 5)
 46 #define TSBITS 6
 47 #define TSMASK (((__u32)1 << TSBITS) - 1)
 48 #define TCP_MAX_WSCALE 14U
 49 
 50 #define IPV4_MAXLEN 60
 51 #define TCP_MAXLEN 60
 52 
 53 #define DEFAULT_MSS4 1460
 54 #define DEFAULT_MSS6 1440
 55 #define DEFAULT_WSCALE 7
 56 #define DEFAULT_TTL 64
 57 #define MAX_ALLOWED_PORTS 8
 58 
 59 #define MAX_PACKET_OFF 0xffff
 60 
/* Exchange the values of two lvalues of the same type.
 * Both arguments are expanded more than once, so they must be free of side
 * effects.
 */
#define swap(a, b) \
        do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)

/* Load a value of the given type through a pointer to a packed struct, so
 * the compiler does not assume that ptr is naturally aligned.
 */
#define __get_unaligned_t(type, ptr) ({                                         \
        const struct { type x; } __attribute__((__packed__)) *__pptr = (typeof(__pptr))(ptr); \
        __pptr->x;                                                              \
})

/* Unaligned load; the value type is taken from the pointee type of ptr. */
#define get_unaligned(ptr) __get_unaligned_t(typeof(*(ptr)), (ptr))
 70 
/* Two-slot array map of 64-bit values.
 *   key 0: packed TCP/IP parameters decoded by values_get_tcpipopts():
 *          bits 0-15 IPv4 MSS, bits 16-19 window scale, bits 24-31 TTL,
 *          bits 32-47 IPv6 MSS. A value of 0 selects the DEFAULT_* constants.
 *   key 1: counter incremented by values_inc_synacks().
 */
struct {
        __uint(type, BPF_MAP_TYPE_ARRAY);
        __type(key, __u32);
        __type(value, __u64);
        __uint(max_entries, 2);
} values SEC(".maps");
 77 
/* List of TCP destination ports (host byte order, as compared in
 * check_port_allowed()) for which SYN cookies are issued. The list is
 * scanned from index 0 and ends at the first entry equal to 0.
 */
struct {
        __uint(type, BPF_MAP_TYPE_ARRAY);
        __type(key, __u32);
        __type(value, __u16);
        __uint(max_entries, MAX_ALLOWED_PORTS);
} allowed_ports SEC(".maps");
 84 
 85 /* Some symbols defined in net/netfilter/nf_conntrack_bpf.c are unavailable in
 86  * vmlinux.h if CONFIG_NF_CONNTRACK=m, so they are redefined locally.
 87  */
 88 
/* Local copy of the options structure passed to the conntrack lookup
 * functions declared below.
 */
struct bpf_ct_opts___local {
        s32 netns_id;   /* tcp_lookup() sets this to BPF_F_CURRENT_NETNS */
        s32 error;      /* read by tcp_lookup() when the lookup returns NULL */
        u8 l4proto;     /* tcp_lookup() sets this to IPPROTO_TCP */
        u8 dir;         /* not set explicitly in this file */
        u8 reserved[2]; /* not set explicitly in this file */
} __attribute__((preserve_access_index));
 96 
 97 #define BPF_F_CURRENT_NETNS (-1)
 98 
/* Conntrack lookup for XDP programs, resolved as a kernel symbol (__ksym).
 * tcp_lookup() treats a NULL return as "no entry" only when opts->error is
 * -ENOENT; a non-NULL result must be handed back via bpf_ct_release().
 */
extern struct nf_conn *bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx,
                                         struct bpf_sock_tuple *bpf_tuple,
                                         __u32 len_tuple,
                                         struct bpf_ct_opts___local *opts,
                                         __u32 len_opts) __ksym;

/* Same lookup for programs that operate on a struct __sk_buff context. */
extern struct nf_conn *bpf_skb_ct_lookup(struct __sk_buff *skb_ctx,
                                         struct bpf_sock_tuple *bpf_tuple,
                                         u32 len_tuple,
                                         struct bpf_ct_opts___local *opts,
                                         u32 len_opts) __ksym;

/* Release a conntrack entry returned by one of the lookup functions above. */
extern void bpf_ct_release(struct nf_conn *ct) __ksym;
112 
113 static __always_inline void swap_eth_addr(__u8 *a, __u8 *b)
114 {
115         __u8 tmp[ETH_ALEN];
116 
117         __builtin_memcpy(tmp, a, ETH_ALEN);
118         __builtin_memcpy(a, b, ETH_ALEN);
119         __builtin_memcpy(b, tmp, ETH_ALEN);
120 }
121 
122 static __always_inline __u16 csum_fold(__u32 csum)
123 {
124         csum = (csum & 0xffff) + (csum >> 16);
125         csum = (csum & 0xffff) + (csum >> 16);
126         return (__u16)~csum;
127 }
128 
129 static __always_inline __u16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
130                                                __u32 len, __u8 proto,
131                                                __u32 csum)
132 {
133         __u64 s = csum;
134 
135         s += (__u32)saddr;
136         s += (__u32)daddr;
137 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
138         s += proto + len;
139 #elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
140         s += (proto + len) << 8;
141 #else
142 #error Unknown endian
143 #endif
144         s = (s & 0xffffffff) + (s >> 32);
145         s = (s & 0xffffffff) + (s >> 32);
146 
147         return csum_fold((__u32)s);
148 }
149 
150 static __always_inline __u16 csum_ipv6_magic(const struct in6_addr *saddr,
151                                              const struct in6_addr *daddr,
152                                              __u32 len, __u8 proto, __u32 csum)
153 {
154         __u64 sum = csum;
155         int i;
156 
157         __pragma_loop_unroll
158         for (i = 0; i < 4; i++)
159                 sum += (__u32)saddr->in6_u.u6_addr32[i];
160 
161         __pragma_loop_unroll
162         for (i = 0; i < 4; i++)
163                 sum += (__u32)daddr->in6_u.u6_addr32[i];
164 
165         /* Don't combine additions to avoid 32-bit overflow. */
166         sum += bpf_htonl(len);
167         sum += bpf_htonl(proto);
168 
169         sum = (sum & 0xffffffff) + (sum >> 32);
170         sum = (sum & 0xffffffff) + (sum >> 32);
171 
172         return csum_fold((__u32)sum);
173 }
174 
175 static __always_inline __u64 tcp_clock_ns(void)
176 {
177         return bpf_ktime_get_ns();
178 }
179 
180 static __always_inline __u32 tcp_ns_to_ts(__u64 ns)
181 {
182         return ns / (NSEC_PER_SEC / TCP_TS_HZ);
183 }
184 
185 static __always_inline __u32 tcp_clock_ms(void)
186 {
187         return tcp_ns_to_ts(tcp_clock_ns());
188 }
189 
/* State shared between tscookie_init() and the TCP option parser that it
 * runs through bpf_loop().
 */
struct tcpopt_context {
        void *data;             /* start of packet data */
        void *data_end;         /* end of packet data */
        __be32 *tsecr;          /* where the client's TSval is stored */
        __u8 wscale;            /* window scale found, clamped to TCP_MAX_WSCALE */
        bool option_timestamp;  /* a well-formed timestamp option was seen */
        bool option_sack;       /* a well-formed SACK-permitted option was seen */
        __u32 off;              /* current parse offset relative to data */
};
199 
200 static __always_inline u8 *next(struct tcpopt_context *ctx, __u32 sz)
201 {
202         __u64 off = ctx->off;
203         __u8 *data;
204 
205         /* Verifier forbids access to packet when offset exceeds MAX_PACKET_OFF */
206         if (off > MAX_PACKET_OFF - sz)
207                 return NULL;
208 
209         data = ctx->data + off;
210         barrier_var(data);
211         if (data + sz >= ctx->data_end)
212                 return NULL;
213 
214         ctx->off += sz;
215         return data;
216 }
217 
218 static int tscookie_tcpopt_parse(struct tcpopt_context *ctx)
219 {
220         __u8 *opcode, *opsize, *wscale, *tsecr;
221         __u32 off = ctx->off;
222 
223         opcode = next(ctx, 1);
224         if (!opcode)
225                 return 1;
226 
227         if (*opcode == TCPOPT_EOL)
228                 return 1;
229         if (*opcode == TCPOPT_NOP)
230                 return 0;
231 
232         opsize = next(ctx, 1);
233         if (!opsize || *opsize < 2)
234                 return 1;
235 
236         switch (*opcode) {
237         case TCPOPT_WINDOW:
238                 wscale = next(ctx, 1);
239                 if (!wscale)
240                         return 1;
241                 if (*opsize == TCPOLEN_WINDOW)
242                         ctx->wscale = *wscale < TCP_MAX_WSCALE ? *wscale : TCP_MAX_WSCALE;
243                 break;
244         case TCPOPT_TIMESTAMP:
245                 tsecr = next(ctx, 4);
246                 if (!tsecr)
247                         return 1;
248                 if (*opsize == TCPOLEN_TIMESTAMP) {
249                         ctx->option_timestamp = true;
250                         /* Client's tsval becomes our tsecr. */
251                         *ctx->tsecr = get_unaligned((__be32 *)tsecr);
252                 }
253                 break;
254         case TCPOPT_SACK_PERM:
255                 if (*opsize == TCPOLEN_SACK_PERM)
256                         ctx->option_sack = true;
257                 break;
258         }
259 
260         ctx->off = off + *opsize;
261 
262         return 0;
263 }
264 
265 static int tscookie_tcpopt_parse_batch(__u32 index, void *context)
266 {
267         int i;
268 
269         for (i = 0; i < 7; i++)
270                 if (tscookie_tcpopt_parse(context))
271                         return 1;
272         return 0;
273 }
274 
275 static __always_inline bool tscookie_init(struct tcphdr *tcp_header,
276                                           __u16 tcp_len, __be32 *tsval,
277                                           __be32 *tsecr, void *data, void *data_end)
278 {
279         struct tcpopt_context loop_ctx = {
280                 .data = data,
281                 .data_end = data_end,
282                 .tsecr = tsecr,
283                 .wscale = TS_OPT_WSCALE_MASK,
284                 .option_timestamp = false,
285                 .option_sack = false,
286                 /* Note: currently verifier would track .off as unbound scalar.
287                  *       In case if verifier would at some point get smarter and
288                  *       compute bounded value for this var, beware that it might
289                  *       hinder bpf_loop() convergence validation.
290                  */
291                 .off = (__u8 *)(tcp_header + 1) - (__u8 *)data,
292         };
293         u32 cookie;
294 
295         bpf_loop(6, tscookie_tcpopt_parse_batch, &loop_ctx, 0);
296 
297         if (!loop_ctx.option_timestamp)
298                 return false;
299 
300         cookie = tcp_clock_ms() & ~TSMASK;
301         cookie |= loop_ctx.wscale & TS_OPT_WSCALE_MASK;
302         if (loop_ctx.option_sack)
303                 cookie |= TS_OPT_SACK;
304         if (tcp_header->ece && tcp_header->cwr)
305                 cookie |= TS_OPT_ECN;
306         *tsval = bpf_htonl(cookie);
307 
308         return true;
309 }
310 
311 static __always_inline void values_get_tcpipopts(__u16 *mss, __u8 *wscale,
312                                                  __u8 *ttl, bool ipv6)
313 {
314         __u32 key = 0;
315         __u64 *value;
316 
317         value = bpf_map_lookup_elem(&values, &key);
318         if (value && *value != 0) {
319                 if (ipv6)
320                         *mss = (*value >> 32) & 0xffff;
321                 else
322                         *mss = *value & 0xffff;
323                 *wscale = (*value >> 16) & 0xf;
324                 *ttl = (*value >> 24) & 0xff;
325                 return;
326         }
327 
328         *mss = ipv6 ? DEFAULT_MSS6 : DEFAULT_MSS4;
329         *wscale = DEFAULT_WSCALE;
330         *ttl = DEFAULT_TTL;
331 }
332 
333 static __always_inline void values_inc_synacks(void)
334 {
335         __u32 key = 1;
336         __u64 *value;
337 
338         value = bpf_map_lookup_elem(&values, &key);
339         if (value)
340                 __sync_fetch_and_add(value, 1);
341 }
342 
343 static __always_inline bool check_port_allowed(__u16 port)
344 {
345         __u32 i;
346 
347         for (i = 0; i < MAX_ALLOWED_PORTS; i++) {
348                 __u32 key = i;
349                 __u16 *value;
350 
351                 value = bpf_map_lookup_elem(&allowed_ports, &key);
352 
353                 if (!value)
354                         break;
355                 /* 0 is a terminator value. Check it first to avoid matching on
356                  * a forbidden port == 0 and returning true.
357                  */
358                 if (*value == 0)
359                         break;
360 
361                 if (*value == port)
362                         return true;
363         }
364 
365         return false;
366 }
367 
/* Pointers into the packet filled in by tcp_dissect(). Exactly one of
 * ipv4/ipv6 is non-NULL once dissection reaches the TCP header.
 */
struct header_pointers {
        struct ethhdr *eth;     /* Ethernet header at the start of the packet */
        struct iphdr *ipv4;     /* IPv4 header, or NULL for IPv6 packets */
        struct ipv6hdr *ipv6;   /* IPv6 header, or NULL for IPv4 packets */
        struct tcphdr *tcp;     /* TCP header */
        __u16 tcp_len;          /* TCP header length in bytes (doff * 4) */
};
375 
376 static __always_inline int tcp_dissect(void *data, void *data_end,
377                                        struct header_pointers *hdr)
378 {
379         hdr->eth = data;
380         if (hdr->eth + 1 > data_end)
381                 return XDP_DROP;
382 
383         switch (bpf_ntohs(hdr->eth->h_proto)) {
384         case ETH_P_IP:
385                 hdr->ipv6 = NULL;
386 
387                 hdr->ipv4 = (void *)hdr->eth + sizeof(*hdr->eth);
388                 if (hdr->ipv4 + 1 > data_end)
389                         return XDP_DROP;
390                 if (hdr->ipv4->ihl * 4 < sizeof(*hdr->ipv4))
391                         return XDP_DROP;
392                 if (hdr->ipv4->version != 4)
393                         return XDP_DROP;
394 
395                 if (hdr->ipv4->protocol != IPPROTO_TCP)
396                         return XDP_PASS;
397 
398                 hdr->tcp = (void *)hdr->ipv4 + hdr->ipv4->ihl * 4;
399                 break;
400         case ETH_P_IPV6:
401                 hdr->ipv4 = NULL;
402 
403                 hdr->ipv6 = (void *)hdr->eth + sizeof(*hdr->eth);
404                 if (hdr->ipv6 + 1 > data_end)
405                         return XDP_DROP;
406                 if (hdr->ipv6->version != 6)
407                         return XDP_DROP;
408 
409                 /* XXX: Extension headers are not supported and could circumvent
410                  * XDP SYN flood protection.
411                  */
412                 if (hdr->ipv6->nexthdr != NEXTHDR_TCP)
413                         return XDP_PASS;
414 
415                 hdr->tcp = (void *)hdr->ipv6 + sizeof(*hdr->ipv6);
416                 break;
417         default:
418                 /* XXX: VLANs will circumvent XDP SYN flood protection. */
419                 return XDP_PASS;
420         }
421 
422         if (hdr->tcp + 1 > data_end)
423                 return XDP_DROP;
424         hdr->tcp_len = hdr->tcp->doff * 4;
425         if (hdr->tcp_len < sizeof(*hdr->tcp))
426                 return XDP_DROP;
427 
428         return XDP_TX;
429 }
430 
431 static __always_inline int tcp_lookup(void *ctx, struct header_pointers *hdr, bool xdp)
432 {
433         struct bpf_ct_opts___local ct_lookup_opts = {
434                 .netns_id = BPF_F_CURRENT_NETNS,
435                 .l4proto = IPPROTO_TCP,
436         };
437         struct bpf_sock_tuple tup = {};
438         struct nf_conn *ct;
439         __u32 tup_size;
440 
441         if (hdr->ipv4) {
442                 /* TCP doesn't normally use fragments, and XDP can't reassemble
443                  * them.
444                  */
445                 if ((hdr->ipv4->frag_off & bpf_htons(IP_DF | IP_MF | IP_OFFSET)) != bpf_htons(IP_DF))
446                         return XDP_DROP;
447 
448                 tup.ipv4.saddr = hdr->ipv4->saddr;
449                 tup.ipv4.daddr = hdr->ipv4->daddr;
450                 tup.ipv4.sport = hdr->tcp->source;
451                 tup.ipv4.dport = hdr->tcp->dest;
452                 tup_size = sizeof(tup.ipv4);
453         } else if (hdr->ipv6) {
454                 __builtin_memcpy(tup.ipv6.saddr, &hdr->ipv6->saddr, sizeof(tup.ipv6.saddr));
455                 __builtin_memcpy(tup.ipv6.daddr, &hdr->ipv6->daddr, sizeof(tup.ipv6.daddr));
456                 tup.ipv6.sport = hdr->tcp->source;
457                 tup.ipv6.dport = hdr->tcp->dest;
458                 tup_size = sizeof(tup.ipv6);
459         } else {
460                 /* The verifier can't track that either ipv4 or ipv6 is not
461                  * NULL.
462                  */
463                 return XDP_ABORTED;
464         }
465         if (xdp)
466                 ct = bpf_xdp_ct_lookup(ctx, &tup, tup_size, &ct_lookup_opts, sizeof(ct_lookup_opts));
467         else
468                 ct = bpf_skb_ct_lookup(ctx, &tup, tup_size, &ct_lookup_opts, sizeof(ct_lookup_opts));
469         if (ct) {
470                 unsigned long status = ct->status;
471 
472                 bpf_ct_release(ct);
473                 if (status & IPS_CONFIRMED)
474                         return XDP_PASS;
475         } else if (ct_lookup_opts.error != -ENOENT) {
476                 return XDP_ABORTED;
477         }
478 
479         /* error == -ENOENT || !(status & IPS_CONFIRMED) */
480         return XDP_TX;
481 }
482 
483 static __always_inline __u8 tcp_mkoptions(__be32 *buf, __be32 *tsopt, __u16 mss,
484                                           __u8 wscale)
485 {
486         __be32 *start = buf;
487 
488         *buf++ = bpf_htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss);
489 
490         if (!tsopt)
491                 return buf - start;
492 
493         if (tsopt[0] & bpf_htonl(1 << 4))
494                 *buf++ = bpf_htonl((TCPOPT_SACK_PERM << 24) |
495                                    (TCPOLEN_SACK_PERM << 16) |
496                                    (TCPOPT_TIMESTAMP << 8) |
497                                    TCPOLEN_TIMESTAMP);
498         else
499                 *buf++ = bpf_htonl((TCPOPT_NOP << 24) |
500                                    (TCPOPT_NOP << 16) |
501                                    (TCPOPT_TIMESTAMP << 8) |
502                                    TCPOLEN_TIMESTAMP);
503         *buf++ = tsopt[0];
504         *buf++ = tsopt[1];
505 
506         if ((tsopt[0] & bpf_htonl(0xf)) != bpf_htonl(0xf))
507                 *buf++ = bpf_htonl((TCPOPT_NOP << 24) |
508                                    (TCPOPT_WINDOW << 16) |
509                                    (TCPOLEN_WINDOW << 8) |
510                                    wscale);
511 
512         return buf - start;
513 }
514 
515 static __always_inline void tcp_gen_synack(struct tcphdr *tcp_header,
516                                            __u32 cookie, __be32 *tsopt,
517                                            __u16 mss, __u8 wscale)
518 {
519         void *tcp_options;
520 
521         tcp_flag_word(tcp_header) = TCP_FLAG_SYN | TCP_FLAG_ACK;
522         if (tsopt && (tsopt[0] & bpf_htonl(1 << 5)))
523                 tcp_flag_word(tcp_header) |= TCP_FLAG_ECE;
524         tcp_header->doff = 5; /* doff is part of tcp_flag_word. */
525         swap(tcp_header->source, tcp_header->dest);
526         tcp_header->ack_seq = bpf_htonl(bpf_ntohl(tcp_header->seq) + 1);
527         tcp_header->seq = bpf_htonl(cookie);
528         tcp_header->window = 0;
529         tcp_header->urg_ptr = 0;
530         tcp_header->check = 0; /* Calculate checksum later. */
531 
532         tcp_options = (void *)(tcp_header + 1);
533         tcp_header->doff += tcp_mkoptions(tcp_options, tsopt, mss, wscale);
534 }
535 
536 static __always_inline void tcpv4_gen_synack(struct header_pointers *hdr,
537                                              __u32 cookie, __be32 *tsopt)
538 {
539         __u8 wscale;
540         __u16 mss;
541         __u8 ttl;
542 
543         values_get_tcpipopts(&mss, &wscale, &ttl, false);
544 
545         swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest);
546 
547         swap(hdr->ipv4->saddr, hdr->ipv4->daddr);
548         hdr->ipv4->check = 0; /* Calculate checksum later. */
549         hdr->ipv4->tos = 0;
550         hdr->ipv4->id = 0;
551         hdr->ipv4->ttl = ttl;
552 
553         tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale);
554 
555         hdr->tcp_len = hdr->tcp->doff * 4;
556         hdr->ipv4->tot_len = bpf_htons(sizeof(*hdr->ipv4) + hdr->tcp_len);
557 }
558 
559 static __always_inline void tcpv6_gen_synack(struct header_pointers *hdr,
560                                              __u32 cookie, __be32 *tsopt)
561 {
562         __u8 wscale;
563         __u16 mss;
564         __u8 ttl;
565 
566         values_get_tcpipopts(&mss, &wscale, &ttl, true);
567 
568         swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest);
569 
570         swap(hdr->ipv6->saddr, hdr->ipv6->daddr);
571         *(__be32 *)hdr->ipv6 = bpf_htonl(0x60000000);
572         hdr->ipv6->hop_limit = ttl;
573 
574         tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale);
575 
576         hdr->tcp_len = hdr->tcp->doff * 4;
577         hdr->ipv6->payload_len = bpf_htons(hdr->tcp_len);
578 }
579 
/* Answer a SYN with a SYN-ACK that carries a SYN cookie.
 *
 * The received packet is validated (flags, destination port, checksums),
 * rewritten in place into the SYN-ACK, re-checksummed and resized to the
 * length of the new headers.
 *
 * @hdr:  header pointers into the current packet.
 * @ctx:  program context; a struct xdp_md when @xdp is true, otherwise a
 *        struct __sk_buff.
 * @data, @data_end: current packet bounds.
 *
 * Returns XDP_TX when the SYN-ACK is ready to be sent, XDP_DROP when the
 * SYN is rejected (FIN or RST set, port not allowed, bad checksum) and
 * XDP_ABORTED when a helper fails or the packet is too short.
 */
static __always_inline int syncookie_handle_syn(struct header_pointers *hdr,
                                                void *ctx,
                                                void *data, void *data_end,
                                                bool xdp)
{
        __u32 old_pkt_size, new_pkt_size;
        /* Unlike clang 10, clang 11 and 12 generate code that doesn't pass the
         * BPF verifier if tsopt is not volatile. Volatile forces it to store
         * the pointer value and use it directly, otherwise tcp_mkoptions is
         * (mis)compiled like this:
         *   if (!tsopt)
         *       return buf - start;
         *   reg = stored_return_value_of_tscookie_init;
         *   if (reg)
         *       tsopt = tsopt_buf;
         *   else
         *       tsopt = NULL;
         *   ...
         *   *buf++ = tsopt[1];
         * It creates a dead branch where tsopt is assigned NULL, but the
         * verifier can't prove it's dead and blocks the program.
         */
        __be32 * volatile tsopt = NULL;
        __be32 tsopt_buf[2] = {};
        __u16 ip_len;
        __u32 cookie;
        __s64 value;

        /* Checksum is not yet verified, but both checksum failure and TCP
         * header checks return XDP_DROP, so the order doesn't matter.
         */
        if (hdr->tcp->fin || hdr->tcp->rst)
                return XDP_DROP;

        /* Issue SYN cookies on allowed ports, drop SYN packets on blocked
         * ports.
         */
        if (!check_port_allowed(bpf_ntohs(hdr->tcp->dest)))
                return XDP_DROP;

        if (hdr->ipv4) {
                /* Check the IPv4 and TCP checksums before creating a SYNACK. */
                value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, hdr->ipv4->ihl * 4, 0);
                if (value < 0)
                        return XDP_ABORTED;
                if (csum_fold(value) != 0)
                        return XDP_DROP; /* Bad IPv4 checksum. */

                value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);
                if (value < 0)
                        return XDP_ABORTED;
                if (csum_tcpudp_magic(hdr->ipv4->saddr, hdr->ipv4->daddr,
                                      hdr->tcp_len, IPPROTO_TCP, value) != 0)
                        return XDP_DROP; /* Bad TCP checksum. */

                /* The SYN-ACK is built with an option-less IPv4 header. */
                ip_len = sizeof(*hdr->ipv4);

                value = bpf_tcp_raw_gen_syncookie_ipv4(hdr->ipv4, hdr->tcp,
                                                       hdr->tcp_len);
        } else if (hdr->ipv6) {
                /* Check the TCP checksum before creating a SYNACK. */
                value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);
                if (value < 0)
                        return XDP_ABORTED;
                if (csum_ipv6_magic(&hdr->ipv6->saddr, &hdr->ipv6->daddr,
                                    hdr->tcp_len, IPPROTO_TCP, value) != 0)
                        return XDP_DROP; /* Bad TCP checksum. */

                ip_len = sizeof(*hdr->ipv6);

                value = bpf_tcp_raw_gen_syncookie_ipv6(hdr->ipv6, hdr->tcp,
                                                       hdr->tcp_len);
        } else {
                return XDP_ABORTED;
        }

        /* value now holds the result of the syncookie helper. */
        if (value < 0)
                return XDP_ABORTED;
        cookie = (__u32)value;

        /* The options must be parsed before the headers are rewritten below. */
        if (tscookie_init((void *)hdr->tcp, hdr->tcp_len,
                          &tsopt_buf[0], &tsopt_buf[1], data, data_end))
                tsopt = tsopt_buf;

        /* Check that there is enough space for a SYNACK. It also covers
         * the check that the destination of the __builtin_memmove below
         * doesn't overflow.
         */
        if (data + sizeof(*hdr->eth) + ip_len + TCP_MAXLEN > data_end)
                return XDP_ABORTED;

        if (hdr->ipv4) {
                if (hdr->ipv4->ihl * 4 > sizeof(*hdr->ipv4)) {
                        struct tcphdr *new_tcp_header;

                        /* Drop the IPv4 options: move the fixed TCP header
                         * directly behind the fixed IPv4 header. The TCP
                         * options are regenerated, so they are not copied.
                         */
                        new_tcp_header = data + sizeof(*hdr->eth) + sizeof(*hdr->ipv4);
                        __builtin_memmove(new_tcp_header, hdr->tcp, sizeof(*hdr->tcp));
                        hdr->tcp = new_tcp_header;

                        hdr->ipv4->ihl = sizeof(*hdr->ipv4) / 4;
                }

                tcpv4_gen_synack(hdr, cookie, tsopt);
        } else if (hdr->ipv6) {
                tcpv6_gen_synack(hdr, cookie, tsopt);
        } else {
                return XDP_ABORTED;
        }

        /* Recalculate checksums. */
        hdr->tcp->check = 0;
        value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);
        if (value < 0)
                return XDP_ABORTED;
        if (hdr->ipv4) {
                hdr->tcp->check = csum_tcpudp_magic(hdr->ipv4->saddr,
                                                    hdr->ipv4->daddr,
                                                    hdr->tcp_len,
                                                    IPPROTO_TCP,
                                                    value);

                hdr->ipv4->check = 0;
                value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, sizeof(*hdr->ipv4), 0);
                if (value < 0)
                        return XDP_ABORTED;
                hdr->ipv4->check = csum_fold(value);
        } else if (hdr->ipv6) {
                hdr->tcp->check = csum_ipv6_magic(&hdr->ipv6->saddr,
                                                  &hdr->ipv6->daddr,
                                                  hdr->tcp_len,
                                                  IPPROTO_TCP,
                                                  value);
        } else {
                return XDP_ABORTED;
        }

        /* Set the new packet size. */
        old_pkt_size = data_end - data;
        new_pkt_size = sizeof(*hdr->eth) + ip_len + hdr->tcp->doff * 4;
        if (xdp) {
                /* The XDP helper takes a delta relative to the current size. */
                if (bpf_xdp_adjust_tail(ctx, new_pkt_size - old_pkt_size))
                        return XDP_ABORTED;
        } else {
                /* The skb helper takes the new absolute length. */
                if (bpf_skb_change_tail(ctx, new_pkt_size, 0))
                        return XDP_ABORTED;
        }

        values_inc_synacks();

        return XDP_TX;
}
731 
732 static __always_inline int syncookie_handle_ack(struct header_pointers *hdr)
733 {
734         int err;
735 
736         if (hdr->tcp->rst)
737                 return XDP_DROP;
738 
739         if (hdr->ipv4)
740                 err = bpf_tcp_raw_check_syncookie_ipv4(hdr->ipv4, hdr->tcp);
741         else if (hdr->ipv6)
742                 err = bpf_tcp_raw_check_syncookie_ipv6(hdr->ipv6, hdr->tcp);
743         else
744                 return XDP_ABORTED;
745         if (err)
746                 return XDP_DROP;
747 
748         return XDP_PASS;
749 }
750 
/* First stage of SYN cookie processing.
 *
 * Dissects the packet, passes packets of confirmed connections, drops
 * segments that are neither a pure SYN nor a pure ACK, and finally grows
 * the packet tail by TCP_MAXLEN - hdr->tcp_len bytes.
 *
 * The helpers called at the end resize the packet, so the caller has to
 * reload data/data_end from the context before running syncookie_part2().
 *
 * Returns XDP_TX to continue with syncookie_part2(), otherwise the verdict
 * for the packet.
 */
static __always_inline int syncookie_part1(void *ctx, void *data, void *data_end,
                                           struct header_pointers *hdr, bool xdp)
{
        int ret;

        ret = tcp_dissect(data, data_end, hdr);
        if (ret != XDP_TX)
                return ret;

        ret = tcp_lookup(ctx, hdr, xdp);
        if (ret != XDP_TX)
                return ret;

        /* Packet is TCP and doesn't belong to an established connection. */

        /* Exactly one of SYN and ACK must be set. */
        if ((hdr->tcp->syn ^ hdr->tcp->ack) != 1)
                return XDP_DROP;

        /* Grow the TCP header to TCP_MAXLEN to be able to pass any hdr->tcp_len
         * to bpf_tcp_raw_gen_syncookie_ipv{4,6} and pass the verifier.
         */
        if (xdp) {
                if (bpf_xdp_adjust_tail(ctx, TCP_MAXLEN - hdr->tcp_len))
                        return XDP_ABORTED;
        } else {
                /* Without volatile the verifier throws this error:
                 * R9 32-bit pointer arithmetic prohibited
                 */
                volatile u64 old_len = data_end - data;

                if (bpf_skb_change_tail(ctx, old_len + TCP_MAXLEN - hdr->tcp_len, 0))
                        return XDP_ABORTED;
        }

        return XDP_TX;
}
787 
/* Second stage of SYN cookie processing.
 *
 * Recomputes the header pointers from the reloaded @data/@data_end
 * (hdr->ipv4 / hdr->ipv6 from the first stage only select the address
 * family), re-validates the bounds and dispatches to the SYN or ACK
 * handler.
 *
 * Returns the handler's verdict, or XDP_ABORTED when the bounds checks
 * fail.
 */
static __always_inline int syncookie_part2(void *ctx, void *data, void *data_end,
                                           struct header_pointers *hdr, bool xdp)
{
        if (hdr->ipv4) {
                hdr->eth = data;
                hdr->ipv4 = (void *)hdr->eth + sizeof(*hdr->eth);
                /* IPV4_MAXLEN is needed when calculating checksum.
                 * At least sizeof(struct iphdr) is needed here to access ihl.
                 */
                if ((void *)hdr->ipv4 + IPV4_MAXLEN > data_end)
                        return XDP_ABORTED;
                hdr->tcp = (void *)hdr->ipv4 + hdr->ipv4->ihl * 4;
        } else if (hdr->ipv6) {
                hdr->eth = data;
                hdr->ipv6 = (void *)hdr->eth + sizeof(*hdr->eth);
                hdr->tcp = (void *)hdr->ipv6 + sizeof(*hdr->ipv6);
        } else {
                return XDP_ABORTED;
        }

        /* A full TCP_MAXLEN bytes must be readable from the TCP header on. */
        if ((void *)hdr->tcp + TCP_MAXLEN > data_end)
                return XDP_ABORTED;

        /* We run out of registers, tcp_len gets spilled to the stack, and the
         * verifier forgets its min and max values checked above in tcp_dissect.
         */
        hdr->tcp_len = hdr->tcp->doff * 4;
        if (hdr->tcp_len < sizeof(*hdr->tcp))
                return XDP_ABORTED;

        return hdr->tcp->syn ? syncookie_handle_syn(hdr, ctx, data, data_end, xdp) :
                               syncookie_handle_ack(hdr);
}
821 
822 SEC("xdp")
823 int syncookie_xdp(struct xdp_md *ctx)
824 {
825         void *data_end = (void *)(long)ctx->data_end;
826         void *data = (void *)(long)ctx->data;
827         struct header_pointers hdr;
828         int ret;
829 
830         ret = syncookie_part1(ctx, data, data_end, &hdr, true);
831         if (ret != XDP_TX)
832                 return ret;
833 
834         data_end = (void *)(long)ctx->data_end;
835         data = (void *)(long)ctx->data;
836 
837         return syncookie_part2(ctx, data, data_end, &hdr, true);
838 }
839 
840 SEC("tc")
841 int syncookie_tc(struct __sk_buff *skb)
842 {
843         void *data_end = (void *)(long)skb->data_end;
844         void *data = (void *)(long)skb->data;
845         struct header_pointers hdr;
846         int ret;
847 
848         ret = syncookie_part1(skb, data, data_end, &hdr, false);
849         if (ret != XDP_TX)
850                 return ret == XDP_PASS ? TC_ACT_OK : TC_ACT_SHOT;
851 
852         data_end = (void *)(long)skb->data_end;
853         data = (void *)(long)skb->data;
854 
855         ret = syncookie_part2(skb, data, data_end, &hdr, false);
856         switch (ret) {
857         case XDP_PASS:
858                 return TC_ACT_OK;
859         case XDP_TX:
860                 return bpf_redirect(skb->ifindex, 0);
861         default:
862                 return TC_ACT_SHOT;
863         }
864 }
865 
866 char _license[] SEC("license") = "GPL";
867 

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

kernel.org | git.kernel.org | LWN.net | Project Home | SVN repository | Mail admin

Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.

sflogo.php