1 // SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause 2 /* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ 3 4 #define BPF_NO_KFUNC_PROTOTYPES 5 #include "vmlinux.h" 6 7 #include <bpf/bpf_helpers.h> 8 #include <bpf/bpf_endian.h> 9 #include <asm/errno.h> 10 11 #include "bpf_compiler.h" 12 13 #define TC_ACT_OK 0 14 #define TC_ACT_SHOT 2 15 16 #define NSEC_PER_SEC 1000000000L 17 18 #define ETH_ALEN 6 19 #define ETH_P_IP 0x0800 20 #define ETH_P_IPV6 0x86DD 21 22 #define tcp_flag_word(tp) (((union tcp_word_hdr *)(tp))->words[3]) 23 24 #define IP_DF 0x4000 25 #define IP_MF 0x2000 26 #define IP_OFFSET 0x1fff 27 28 #define NEXTHDR_TCP 6 29 30 #define TCPOPT_NOP 1 31 #define TCPOPT_EOL 0 32 #define TCPOPT_MSS 2 33 #define TCPOPT_WINDOW 3 34 #define TCPOPT_SACK_PERM 4 35 #define TCPOPT_TIMESTAMP 8 36 37 #define TCPOLEN_MSS 4 38 #define TCPOLEN_WINDOW 3 39 #define TCPOLEN_SACK_PERM 2 40 #define TCPOLEN_TIMESTAMP 10 41 42 #define TCP_TS_HZ 1000 43 #define TS_OPT_WSCALE_MASK 0xf 44 #define TS_OPT_SACK (1 << 4) 45 #define TS_OPT_ECN (1 << 5) 46 #define TSBITS 6 47 #define TSMASK (((__u32)1 << TSBITS) - 1) 48 #define TCP_MAX_WSCALE 14U 49 50 #define IPV4_MAXLEN 60 51 #define TCP_MAXLEN 60 52 53 #define DEFAULT_MSS4 1460 54 #define DEFAULT_MSS6 1440 55 #define DEFAULT_WSCALE 7 56 #define DEFAULT_TTL 64 57 #define MAX_ALLOWED_PORTS 8 58 59 #define MAX_PACKET_OFF 0xffff 60 61 #define swap(a, b) \ 62 do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) 63 64 #define __get_unaligned_t(type, ptr) ({ \ 65 const struct { type x; } __attribute__((__packed__)) *__pptr = (typeof(__pptr))(ptr); \ 66 __pptr->x; \ 67 }) 68 69 #define get_unaligned(ptr) __get_unaligned_t(typeof(*(ptr)), (ptr)) 70 71 struct { 72 __uint(type, BPF_MAP_TYPE_ARRAY); 73 __type(key, __u32); 74 __type(value, __u64); 75 __uint(max_entries, 2); 76 } values SEC(".maps"); 77 78 struct { 79 __uint(type, BPF_MAP_TYPE_ARRAY); 80 __type(key, __u32); 81 __type(value, __u16); 82 
__uint(max_entries, MAX_ALLOWED_PORTS); 83 } allowed_ports SEC(".maps"); 84 85 /* Some symbols defined in net/netfilter/nf_conntrack_bpf.c are unavailable in 86 * vmlinux.h if CONFIG_NF_CONNTRACK=m, so they are redefined locally. 87 */ 88 89 struct bpf_ct_opts___local { 90 s32 netns_id; 91 s32 error; 92 u8 l4proto; 93 u8 dir; 94 u8 reserved[2]; 95 } __attribute__((preserve_access_index)); 96 97 #define BPF_F_CURRENT_NETNS (-1) 98 99 extern struct nf_conn *bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, 100 struct bpf_sock_tuple *bpf_tuple, 101 __u32 len_tuple, 102 struct bpf_ct_opts___local *opts, 103 __u32 len_opts) __ksym; 104 105 extern struct nf_conn *bpf_skb_ct_lookup(struct __sk_buff *skb_ctx, 106 struct bpf_sock_tuple *bpf_tuple, 107 u32 len_tuple, 108 struct bpf_ct_opts___local *opts, 109 u32 len_opts) __ksym; 110 111 extern void bpf_ct_release(struct nf_conn *ct) __ksym; 112 113 static __always_inline void swap_eth_addr(__u8 *a, __u8 *b) 114 { 115 __u8 tmp[ETH_ALEN]; 116 117 __builtin_memcpy(tmp, a, ETH_ALEN); 118 __builtin_memcpy(a, b, ETH_ALEN); 119 __builtin_memcpy(b, tmp, ETH_ALEN); 120 } 121 122 static __always_inline __u16 csum_fold(__u32 csum) 123 { 124 csum = (csum & 0xffff) + (csum >> 16); 125 csum = (csum & 0xffff) + (csum >> 16); 126 return (__u16)~csum; 127 } 128 129 static __always_inline __u16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, 130 __u32 len, __u8 proto, 131 __u32 csum) 132 { 133 __u64 s = csum; 134 135 s += (__u32)saddr; 136 s += (__u32)daddr; 137 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ 138 s += proto + len; 139 #elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ 140 s += (proto + len) << 8; 141 #else 142 #error Unknown endian 143 #endif 144 s = (s & 0xffffffff) + (s >> 32); 145 s = (s & 0xffffffff) + (s >> 32); 146 147 return csum_fold((__u32)s); 148 } 149 150 static __always_inline __u16 csum_ipv6_magic(const struct in6_addr *saddr, 151 const struct in6_addr *daddr, 152 __u32 len, __u8 proto, __u32 csum) 153 { 154 __u64 sum = csum; 155 
int i; 156 157 __pragma_loop_unroll 158 for (i = 0; i < 4; i++) 159 sum += (__u32)saddr->in6_u.u6_addr32[i]; 160 161 __pragma_loop_unroll 162 for (i = 0; i < 4; i++) 163 sum += (__u32)daddr->in6_u.u6_addr32[i]; 164 165 /* Don't combine additions to avoid 32-bit overflow. */ 166 sum += bpf_htonl(len); 167 sum += bpf_htonl(proto); 168 169 sum = (sum & 0xffffffff) + (sum >> 32); 170 sum = (sum & 0xffffffff) + (sum >> 32); 171 172 return csum_fold((__u32)sum); 173 } 174 175 static __always_inline __u64 tcp_clock_ns(void) 176 { 177 return bpf_ktime_get_ns(); 178 } 179 180 static __always_inline __u32 tcp_ns_to_ts(__u64 ns) 181 { 182 return ns / (NSEC_PER_SEC / TCP_TS_HZ); 183 } 184 185 static __always_inline __u32 tcp_clock_ms(void) 186 { 187 return tcp_ns_to_ts(tcp_clock_ns()); 188 } 189 190 struct tcpopt_context { 191 void *data; 192 void *data_end; 193 __be32 *tsecr; 194 __u8 wscale; 195 bool option_timestamp; 196 bool option_sack; 197 __u32 off; 198 }; 199 200 static __always_inline u8 *next(struct tcpopt_context *ctx, __u32 sz) 201 { 202 __u64 off = ctx->off; 203 __u8 *data; 204 205 /* Verifier forbids access to packet when offset exceeds MAX_PACKET_OFF */ 206 if (off > MAX_PACKET_OFF - sz) 207 return NULL; 208 209 data = ctx->data + off; 210 barrier_var(data); 211 if (data + sz >= ctx->data_end) 212 return NULL; 213 214 ctx->off += sz; 215 return data; 216 } 217 218 static int tscookie_tcpopt_parse(struct tcpopt_context *ctx) 219 { 220 __u8 *opcode, *opsize, *wscale, *tsecr; 221 __u32 off = ctx->off; 222 223 opcode = next(ctx, 1); 224 if (!opcode) 225 return 1; 226 227 if (*opcode == TCPOPT_EOL) 228 return 1; 229 if (*opcode == TCPOPT_NOP) 230 return 0; 231 232 opsize = next(ctx, 1); 233 if (!opsize || *opsize < 2) 234 return 1; 235 236 switch (*opcode) { 237 case TCPOPT_WINDOW: 238 wscale = next(ctx, 1); 239 if (!wscale) 240 return 1; 241 if (*opsize == TCPOLEN_WINDOW) 242 ctx->wscale = *wscale < TCP_MAX_WSCALE ? 
*wscale : TCP_MAX_WSCALE; 243 break; 244 case TCPOPT_TIMESTAMP: 245 tsecr = next(ctx, 4); 246 if (!tsecr) 247 return 1; 248 if (*opsize == TCPOLEN_TIMESTAMP) { 249 ctx->option_timestamp = true; 250 /* Client's tsval becomes our tsecr. */ 251 *ctx->tsecr = get_unaligned((__be32 *)tsecr); 252 } 253 break; 254 case TCPOPT_SACK_PERM: 255 if (*opsize == TCPOLEN_SACK_PERM) 256 ctx->option_sack = true; 257 break; 258 } 259 260 ctx->off = off + *opsize; 261 262 return 0; 263 } 264 265 static int tscookie_tcpopt_parse_batch(__u32 index, void *context) 266 { 267 int i; 268 269 for (i = 0; i < 7; i++) 270 if (tscookie_tcpopt_parse(context)) 271 return 1; 272 return 0; 273 } 274 275 static __always_inline bool tscookie_init(struct tcphdr *tcp_header, 276 __u16 tcp_len, __be32 *tsval, 277 __be32 *tsecr, void *data, void *data_end) 278 { 279 struct tcpopt_context loop_ctx = { 280 .data = data, 281 .data_end = data_end, 282 .tsecr = tsecr, 283 .wscale = TS_OPT_WSCALE_MASK, 284 .option_timestamp = false, 285 .option_sack = false, 286 /* Note: currently verifier would track .off as unbound scalar. 287 * In case if verifier would at some point get smarter and 288 * compute bounded value for this var, beware that it might 289 * hinder bpf_loop() convergence validation. 
290 */ 291 .off = (__u8 *)(tcp_header + 1) - (__u8 *)data, 292 }; 293 u32 cookie; 294 295 bpf_loop(6, tscookie_tcpopt_parse_batch, &loop_ctx, 0); 296 297 if (!loop_ctx.option_timestamp) 298 return false; 299 300 cookie = tcp_clock_ms() & ~TSMASK; 301 cookie |= loop_ctx.wscale & TS_OPT_WSCALE_MASK; 302 if (loop_ctx.option_sack) 303 cookie |= TS_OPT_SACK; 304 if (tcp_header->ece && tcp_header->cwr) 305 cookie |= TS_OPT_ECN; 306 *tsval = bpf_htonl(cookie); 307 308 return true; 309 } 310 311 static __always_inline void values_get_tcpipopts(__u16 *mss, __u8 *wscale, 312 __u8 *ttl, bool ipv6) 313 { 314 __u32 key = 0; 315 __u64 *value; 316 317 value = bpf_map_lookup_elem(&values, &key); 318 if (value && *value != 0) { 319 if (ipv6) 320 *mss = (*value >> 32) & 0xffff; 321 else 322 *mss = *value & 0xffff; 323 *wscale = (*value >> 16) & 0xf; 324 *ttl = (*value >> 24) & 0xff; 325 return; 326 } 327 328 *mss = ipv6 ? DEFAULT_MSS6 : DEFAULT_MSS4; 329 *wscale = DEFAULT_WSCALE; 330 *ttl = DEFAULT_TTL; 331 } 332 333 static __always_inline void values_inc_synacks(void) 334 { 335 __u32 key = 1; 336 __u64 *value; 337 338 value = bpf_map_lookup_elem(&values, &key); 339 if (value) 340 __sync_fetch_and_add(value, 1); 341 } 342 343 static __always_inline bool check_port_allowed(__u16 port) 344 { 345 __u32 i; 346 347 for (i = 0; i < MAX_ALLOWED_PORTS; i++) { 348 __u32 key = i; 349 __u16 *value; 350 351 value = bpf_map_lookup_elem(&allowed_ports, &key); 352 353 if (!value) 354 break; 355 /* 0 is a terminator value. Check it first to avoid matching on 356 * a forbidden port == 0 and returning true. 
357 */ 358 if (*value == 0) 359 break; 360 361 if (*value == port) 362 return true; 363 } 364 365 return false; 366 } 367 368 struct header_pointers { 369 struct ethhdr *eth; 370 struct iphdr *ipv4; 371 struct ipv6hdr *ipv6; 372 struct tcphdr *tcp; 373 __u16 tcp_len; 374 }; 375 376 static __always_inline int tcp_dissect(void *data, void *data_end, 377 struct header_pointers *hdr) 378 { 379 hdr->eth = data; 380 if (hdr->eth + 1 > data_end) 381 return XDP_DROP; 382 383 switch (bpf_ntohs(hdr->eth->h_proto)) { 384 case ETH_P_IP: 385 hdr->ipv6 = NULL; 386 387 hdr->ipv4 = (void *)hdr->eth + sizeof(*hdr->eth); 388 if (hdr->ipv4 + 1 > data_end) 389 return XDP_DROP; 390 if (hdr->ipv4->ihl * 4 < sizeof(*hdr->ipv4)) 391 return XDP_DROP; 392 if (hdr->ipv4->version != 4) 393 return XDP_DROP; 394 395 if (hdr->ipv4->protocol != IPPROTO_TCP) 396 return XDP_PASS; 397 398 hdr->tcp = (void *)hdr->ipv4 + hdr->ipv4->ihl * 4; 399 break; 400 case ETH_P_IPV6: 401 hdr->ipv4 = NULL; 402 403 hdr->ipv6 = (void *)hdr->eth + sizeof(*hdr->eth); 404 if (hdr->ipv6 + 1 > data_end) 405 return XDP_DROP; 406 if (hdr->ipv6->version != 6) 407 return XDP_DROP; 408 409 /* XXX: Extension headers are not supported and could circumvent 410 * XDP SYN flood protection. 411 */ 412 if (hdr->ipv6->nexthdr != NEXTHDR_TCP) 413 return XDP_PASS; 414 415 hdr->tcp = (void *)hdr->ipv6 + sizeof(*hdr->ipv6); 416 break; 417 default: 418 /* XXX: VLANs will circumvent XDP SYN flood protection. 
*/ 419 return XDP_PASS; 420 } 421 422 if (hdr->tcp + 1 > data_end) 423 return XDP_DROP; 424 hdr->tcp_len = hdr->tcp->doff * 4; 425 if (hdr->tcp_len < sizeof(*hdr->tcp)) 426 return XDP_DROP; 427 428 return XDP_TX; 429 } 430 431 static __always_inline int tcp_lookup(void *ctx, struct header_pointers *hdr, bool xdp) 432 { 433 struct bpf_ct_opts___local ct_lookup_opts = { 434 .netns_id = BPF_F_CURRENT_NETNS, 435 .l4proto = IPPROTO_TCP, 436 }; 437 struct bpf_sock_tuple tup = {}; 438 struct nf_conn *ct; 439 __u32 tup_size; 440 441 if (hdr->ipv4) { 442 /* TCP doesn't normally use fragments, and XDP can't reassemble 443 * them. 444 */ 445 if ((hdr->ipv4->frag_off & bpf_htons(IP_DF | IP_MF | IP_OFFSET)) != bpf_htons(IP_DF)) 446 return XDP_DROP; 447 448 tup.ipv4.saddr = hdr->ipv4->saddr; 449 tup.ipv4.daddr = hdr->ipv4->daddr; 450 tup.ipv4.sport = hdr->tcp->source; 451 tup.ipv4.dport = hdr->tcp->dest; 452 tup_size = sizeof(tup.ipv4); 453 } else if (hdr->ipv6) { 454 __builtin_memcpy(tup.ipv6.saddr, &hdr->ipv6->saddr, sizeof(tup.ipv6.saddr)); 455 __builtin_memcpy(tup.ipv6.daddr, &hdr->ipv6->daddr, sizeof(tup.ipv6.daddr)); 456 tup.ipv6.sport = hdr->tcp->source; 457 tup.ipv6.dport = hdr->tcp->dest; 458 tup_size = sizeof(tup.ipv6); 459 } else { 460 /* The verifier can't track that either ipv4 or ipv6 is not 461 * NULL. 
462 */ 463 return XDP_ABORTED; 464 } 465 if (xdp) 466 ct = bpf_xdp_ct_lookup(ctx, &tup, tup_size, &ct_lookup_opts, sizeof(ct_lookup_opts)); 467 else 468 ct = bpf_skb_ct_lookup(ctx, &tup, tup_size, &ct_lookup_opts, sizeof(ct_lookup_opts)); 469 if (ct) { 470 unsigned long status = ct->status; 471 472 bpf_ct_release(ct); 473 if (status & IPS_CONFIRMED) 474 return XDP_PASS; 475 } else if (ct_lookup_opts.error != -ENOENT) { 476 return XDP_ABORTED; 477 } 478 479 /* error == -ENOENT || !(status & IPS_CONFIRMED) */ 480 return XDP_TX; 481 } 482 483 static __always_inline __u8 tcp_mkoptions(__be32 *buf, __be32 *tsopt, __u16 mss, 484 __u8 wscale) 485 { 486 __be32 *start = buf; 487 488 *buf++ = bpf_htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss); 489 490 if (!tsopt) 491 return buf - start; 492 493 if (tsopt[0] & bpf_htonl(1 << 4)) 494 *buf++ = bpf_htonl((TCPOPT_SACK_PERM << 24) | 495 (TCPOLEN_SACK_PERM << 16) | 496 (TCPOPT_TIMESTAMP << 8) | 497 TCPOLEN_TIMESTAMP); 498 else 499 *buf++ = bpf_htonl((TCPOPT_NOP << 24) | 500 (TCPOPT_NOP << 16) | 501 (TCPOPT_TIMESTAMP << 8) | 502 TCPOLEN_TIMESTAMP); 503 *buf++ = tsopt[0]; 504 *buf++ = tsopt[1]; 505 506 if ((tsopt[0] & bpf_htonl(0xf)) != bpf_htonl(0xf)) 507 *buf++ = bpf_htonl((TCPOPT_NOP << 24) | 508 (TCPOPT_WINDOW << 16) | 509 (TCPOLEN_WINDOW << 8) | 510 wscale); 511 512 return buf - start; 513 } 514 515 static __always_inline void tcp_gen_synack(struct tcphdr *tcp_header, 516 __u32 cookie, __be32 *tsopt, 517 __u16 mss, __u8 wscale) 518 { 519 void *tcp_options; 520 521 tcp_flag_word(tcp_header) = TCP_FLAG_SYN | TCP_FLAG_ACK; 522 if (tsopt && (tsopt[0] & bpf_htonl(1 << 5))) 523 tcp_flag_word(tcp_header) |= TCP_FLAG_ECE; 524 tcp_header->doff = 5; /* doff is part of tcp_flag_word. 
*/ 525 swap(tcp_header->source, tcp_header->dest); 526 tcp_header->ack_seq = bpf_htonl(bpf_ntohl(tcp_header->seq) + 1); 527 tcp_header->seq = bpf_htonl(cookie); 528 tcp_header->window = 0; 529 tcp_header->urg_ptr = 0; 530 tcp_header->check = 0; /* Calculate checksum later. */ 531 532 tcp_options = (void *)(tcp_header + 1); 533 tcp_header->doff += tcp_mkoptions(tcp_options, tsopt, mss, wscale); 534 } 535 536 static __always_inline void tcpv4_gen_synack(struct header_pointers *hdr, 537 __u32 cookie, __be32 *tsopt) 538 { 539 __u8 wscale; 540 __u16 mss; 541 __u8 ttl; 542 543 values_get_tcpipopts(&mss, &wscale, &ttl, false); 544 545 swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest); 546 547 swap(hdr->ipv4->saddr, hdr->ipv4->daddr); 548 hdr->ipv4->check = 0; /* Calculate checksum later. */ 549 hdr->ipv4->tos = 0; 550 hdr->ipv4->id = 0; 551 hdr->ipv4->ttl = ttl; 552 553 tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale); 554 555 hdr->tcp_len = hdr->tcp->doff * 4; 556 hdr->ipv4->tot_len = bpf_htons(sizeof(*hdr->ipv4) + hdr->tcp_len); 557 } 558 559 static __always_inline void tcpv6_gen_synack(struct header_pointers *hdr, 560 __u32 cookie, __be32 *tsopt) 561 { 562 __u8 wscale; 563 __u16 mss; 564 __u8 ttl; 565 566 values_get_tcpipopts(&mss, &wscale, &ttl, true); 567 568 swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest); 569 570 swap(hdr->ipv6->saddr, hdr->ipv6->daddr); 571 *(__be32 *)hdr->ipv6 = bpf_htonl(0x60000000); 572 hdr->ipv6->hop_limit = ttl; 573 574 tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale); 575 576 hdr->tcp_len = hdr->tcp->doff * 4; 577 hdr->ipv6->payload_len = bpf_htons(hdr->tcp_len); 578 } 579 580 static __always_inline int syncookie_handle_syn(struct header_pointers *hdr, 581 void *ctx, 582 void *data, void *data_end, 583 bool xdp) 584 { 585 __u32 old_pkt_size, new_pkt_size; 586 /* Unlike clang 10, clang 11 and 12 generate code that doesn't pass the 587 * BPF verifier if tsopt is not volatile. 
Volatile forces it to store 588 * the pointer value and use it directly, otherwise tcp_mkoptions is 589 * (mis)compiled like this: 590 * if (!tsopt) 591 * return buf - start; 592 * reg = stored_return_value_of_tscookie_init; 593 * if (reg) 594 * tsopt = tsopt_buf; 595 * else 596 * tsopt = NULL; 597 * ... 598 * *buf++ = tsopt[1]; 599 * It creates a dead branch where tsopt is assigned NULL, but the 600 * verifier can't prove it's dead and blocks the program. 601 */ 602 __be32 * volatile tsopt = NULL; 603 __be32 tsopt_buf[2] = {}; 604 __u16 ip_len; 605 __u32 cookie; 606 __s64 value; 607 608 /* Checksum is not yet verified, but both checksum failure and TCP 609 * header checks return XDP_DROP, so the order doesn't matter. 610 */ 611 if (hdr->tcp->fin || hdr->tcp->rst) 612 return XDP_DROP; 613 614 /* Issue SYN cookies on allowed ports, drop SYN packets on blocked 615 * ports. 616 */ 617 if (!check_port_allowed(bpf_ntohs(hdr->tcp->dest))) 618 return XDP_DROP; 619 620 if (hdr->ipv4) { 621 /* Check the IPv4 and TCP checksums before creating a SYNACK. */ 622 value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, hdr->ipv4->ihl * 4, 0); 623 if (value < 0) 624 return XDP_ABORTED; 625 if (csum_fold(value) != 0) 626 return XDP_DROP; /* Bad IPv4 checksum. */ 627 628 value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0); 629 if (value < 0) 630 return XDP_ABORTED; 631 if (csum_tcpudp_magic(hdr->ipv4->saddr, hdr->ipv4->daddr, 632 hdr->tcp_len, IPPROTO_TCP, value) != 0) 633 return XDP_DROP; /* Bad TCP checksum. */ 634 635 ip_len = sizeof(*hdr->ipv4); 636 637 value = bpf_tcp_raw_gen_syncookie_ipv4(hdr->ipv4, hdr->tcp, 638 hdr->tcp_len); 639 } else if (hdr->ipv6) { 640 /* Check the TCP checksum before creating a SYNACK. */ 641 value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0); 642 if (value < 0) 643 return XDP_ABORTED; 644 if (csum_ipv6_magic(&hdr->ipv6->saddr, &hdr->ipv6->daddr, 645 hdr->tcp_len, IPPROTO_TCP, value) != 0) 646 return XDP_DROP; /* Bad TCP checksum. 
*/ 647 648 ip_len = sizeof(*hdr->ipv6); 649 650 value = bpf_tcp_raw_gen_syncookie_ipv6(hdr->ipv6, hdr->tcp, 651 hdr->tcp_len); 652 } else { 653 return XDP_ABORTED; 654 } 655 656 if (value < 0) 657 return XDP_ABORTED; 658 cookie = (__u32)value; 659 660 if (tscookie_init((void *)hdr->tcp, hdr->tcp_len, 661 &tsopt_buf[0], &tsopt_buf[1], data, data_end)) 662 tsopt = tsopt_buf; 663 664 /* Check that there is enough space for a SYNACK. It also covers 665 * the check that the destination of the __builtin_memmove below 666 * doesn't overflow. 667 */ 668 if (data + sizeof(*hdr->eth) + ip_len + TCP_MAXLEN > data_end) 669 return XDP_ABORTED; 670 671 if (hdr->ipv4) { 672 if (hdr->ipv4->ihl * 4 > sizeof(*hdr->ipv4)) { 673 struct tcphdr *new_tcp_header; 674 675 new_tcp_header = data + sizeof(*hdr->eth) + sizeof(*hdr->ipv4); 676 __builtin_memmove(new_tcp_header, hdr->tcp, sizeof(*hdr->tcp)); 677 hdr->tcp = new_tcp_header; 678 679 hdr->ipv4->ihl = sizeof(*hdr->ipv4) / 4; 680 } 681 682 tcpv4_gen_synack(hdr, cookie, tsopt); 683 } else if (hdr->ipv6) { 684 tcpv6_gen_synack(hdr, cookie, tsopt); 685 } else { 686 return XDP_ABORTED; 687 } 688 689 /* Recalculate checksums. */ 690 hdr->tcp->check = 0; 691 value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0); 692 if (value < 0) 693 return XDP_ABORTED; 694 if (hdr->ipv4) { 695 hdr->tcp->check = csum_tcpudp_magic(hdr->ipv4->saddr, 696 hdr->ipv4->daddr, 697 hdr->tcp_len, 698 IPPROTO_TCP, 699 value); 700 701 hdr->ipv4->check = 0; 702 value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, sizeof(*hdr->ipv4), 0); 703 if (value < 0) 704 return XDP_ABORTED; 705 hdr->ipv4->check = csum_fold(value); 706 } else if (hdr->ipv6) { 707 hdr->tcp->check = csum_ipv6_magic(&hdr->ipv6->saddr, 708 &hdr->ipv6->daddr, 709 hdr->tcp_len, 710 IPPROTO_TCP, 711 value); 712 } else { 713 return XDP_ABORTED; 714 } 715 716 /* Set the new packet size. 
*/ 717 old_pkt_size = data_end - data; 718 new_pkt_size = sizeof(*hdr->eth) + ip_len + hdr->tcp->doff * 4; 719 if (xdp) { 720 if (bpf_xdp_adjust_tail(ctx, new_pkt_size - old_pkt_size)) 721 return XDP_ABORTED; 722 } else { 723 if (bpf_skb_change_tail(ctx, new_pkt_size, 0)) 724 return XDP_ABORTED; 725 } 726 727 values_inc_synacks(); 728 729 return XDP_TX; 730 } 731 732 static __always_inline int syncookie_handle_ack(struct header_pointers *hdr) 733 { 734 int err; 735 736 if (hdr->tcp->rst) 737 return XDP_DROP; 738 739 if (hdr->ipv4) 740 err = bpf_tcp_raw_check_syncookie_ipv4(hdr->ipv4, hdr->tcp); 741 else if (hdr->ipv6) 742 err = bpf_tcp_raw_check_syncookie_ipv6(hdr->ipv6, hdr->tcp); 743 else 744 return XDP_ABORTED; 745 if (err) 746 return XDP_DROP; 747 748 return XDP_PASS; 749 } 750 751 static __always_inline int syncookie_part1(void *ctx, void *data, void *data_end, 752 struct header_pointers *hdr, bool xdp) 753 { 754 int ret; 755 756 ret = tcp_dissect(data, data_end, hdr); 757 if (ret != XDP_TX) 758 return ret; 759 760 ret = tcp_lookup(ctx, hdr, xdp); 761 if (ret != XDP_TX) 762 return ret; 763 764 /* Packet is TCP and doesn't belong to an established connection. */ 765 766 if ((hdr->tcp->syn ^ hdr->tcp->ack) != 1) 767 return XDP_DROP; 768 769 /* Grow the TCP header to TCP_MAXLEN to be able to pass any hdr->tcp_len 770 * to bpf_tcp_raw_gen_syncookie_ipv{4,6} and pass the verifier. 
771 */ 772 if (xdp) { 773 if (bpf_xdp_adjust_tail(ctx, TCP_MAXLEN - hdr->tcp_len)) 774 return XDP_ABORTED; 775 } else { 776 /* Without volatile the verifier throws this error: 777 * R9 32-bit pointer arithmetic prohibited 778 */ 779 volatile u64 old_len = data_end - data; 780 781 if (bpf_skb_change_tail(ctx, old_len + TCP_MAXLEN - hdr->tcp_len, 0)) 782 return XDP_ABORTED; 783 } 784 785 return XDP_TX; 786 } 787 788 static __always_inline int syncookie_part2(void *ctx, void *data, void *data_end, 789 struct header_pointers *hdr, bool xdp) 790 { 791 if (hdr->ipv4) { 792 hdr->eth = data; 793 hdr->ipv4 = (void *)hdr->eth + sizeof(*hdr->eth); 794 /* IPV4_MAXLEN is needed when calculating checksum. 795 * At least sizeof(struct iphdr) is needed here to access ihl. 796 */ 797 if ((void *)hdr->ipv4 + IPV4_MAXLEN > data_end) 798 return XDP_ABORTED; 799 hdr->tcp = (void *)hdr->ipv4 + hdr->ipv4->ihl * 4; 800 } else if (hdr->ipv6) { 801 hdr->eth = data; 802 hdr->ipv6 = (void *)hdr->eth + sizeof(*hdr->eth); 803 hdr->tcp = (void *)hdr->ipv6 + sizeof(*hdr->ipv6); 804 } else { 805 return XDP_ABORTED; 806 } 807 808 if ((void *)hdr->tcp + TCP_MAXLEN > data_end) 809 return XDP_ABORTED; 810 811 /* We run out of registers, tcp_len gets spilled to the stack, and the 812 * verifier forgets its min and max values checked above in tcp_dissect. 813 */ 814 hdr->tcp_len = hdr->tcp->doff * 4; 815 if (hdr->tcp_len < sizeof(*hdr->tcp)) 816 return XDP_ABORTED; 817 818 return hdr->tcp->syn ? 
syncookie_handle_syn(hdr, ctx, data, data_end, xdp) : 819 syncookie_handle_ack(hdr); 820 } 821 822 SEC("xdp") 823 int syncookie_xdp(struct xdp_md *ctx) 824 { 825 void *data_end = (void *)(long)ctx->data_end; 826 void *data = (void *)(long)ctx->data; 827 struct header_pointers hdr; 828 int ret; 829 830 ret = syncookie_part1(ctx, data, data_end, &hdr, true); 831 if (ret != XDP_TX) 832 return ret; 833 834 data_end = (void *)(long)ctx->data_end; 835 data = (void *)(long)ctx->data; 836 837 return syncookie_part2(ctx, data, data_end, &hdr, true); 838 } 839 840 SEC("tc") 841 int syncookie_tc(struct __sk_buff *skb) 842 { 843 void *data_end = (void *)(long)skb->data_end; 844 void *data = (void *)(long)skb->data; 845 struct header_pointers hdr; 846 int ret; 847 848 ret = syncookie_part1(skb, data, data_end, &hdr, false); 849 if (ret != XDP_TX) 850 return ret == XDP_PASS ? TC_ACT_OK : TC_ACT_SHOT; 851 852 data_end = (void *)(long)skb->data_end; 853 data = (void *)(long)skb->data; 854 855 ret = syncookie_part2(skb, data, data_end, &hdr, false); 856 switch (ret) { 857 case XDP_PASS: 858 return TC_ACT_OK; 859 case XDP_TX: 860 return bpf_redirect(skb->ifindex, 0); 861 default: 862 return TC_ACT_SHOT; 863 } 864 } 865 866 char _license[] SEC("license") = "GPL"; 867
/*
 * Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
 * TOMOYO® is a registered trademark of NTT DATA CORPORATION.
 */