1 // SPDX-License-Identifier: GPL-2.0-only 2 3 /* Highlights: 4 * 1. The major difference between this bpf program and tcp_cubic.c 5 * is that this bpf program relies on `cong_control` rather than 6 * `cong_avoid` in the struct tcp_congestion_ops. 7 * 2. Logic such as tcp_cwnd_reduction, tcp_cong_avoid, and 8 * tcp_update_pacing_rate is bypassed when `cong_control` is 9 * defined, so moving these logic to `cong_control`. 10 * 3. WARNING: This bpf program is NOT the same as tcp_cubic.c. 11 * The main purpose is to show use cases of the arguments in 12 * `cong_control`. For simplicity's sake, it reuses tcp cubic's 13 * kernel functions. 14 */ 15 16 #include "bpf_tracing_net.h" 17 #include <bpf/bpf_helpers.h> 18 #include <bpf/bpf_tracing.h> 19 20 #define USEC_PER_SEC 1000000UL 21 #define TCP_PACING_SS_RATIO (200) 22 #define TCP_PACING_CA_RATIO (120) 23 #define TCP_REORDERING (12) 24 25 #define min(a, b) ((a) < (b) ? (a) : (b)) 26 #define max(a, b) ((a) > (b) ? (a) : (b)) 27 #define after(seq2, seq1) before(seq1, seq2) 28 29 extern void cubictcp_init(struct sock *sk) __ksym; 30 extern void cubictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) __ksym; 31 extern __u32 cubictcp_recalc_ssthresh(struct sock *sk) __ksym; 32 extern void cubictcp_state(struct sock *sk, __u8 new_state) __ksym; 33 extern __u32 tcp_reno_undo_cwnd(struct sock *sk) __ksym; 34 extern void cubictcp_acked(struct sock *sk, const struct ack_sample *sample) __ksym; 35 extern void cubictcp_cong_avoid(struct sock *sk, __u32 ack, __u32 acked) __ksym; 36 37 static bool before(__u32 seq1, __u32 seq2) 38 { 39 return (__s32)(seq1-seq2) < 0; 40 } 41 42 static __u64 div64_u64(__u64 dividend, __u64 divisor) 43 { 44 return dividend / divisor; 45 } 46 47 static void tcp_update_pacing_rate(struct sock *sk) 48 { 49 const struct tcp_sock *tp = tcp_sk(sk); 50 __u64 rate; 51 52 /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */ 53 rate = (__u64)tp->mss_cache * ((USEC_PER_SEC / 100) << 3); 54 55 /* current rate is (cwnd * mss) / srtt 56 * In Slow Start [1], set sk_pacing_rate to 200 % the current rate. 57 * In Congestion Avoidance phase, set it to 120 % the current rate. 58 * 59 * [1] : Normal Slow Start condition is (tp->snd_cwnd < tp->snd_ssthresh) 60 * If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching 61 * end of slow start and should slow down. 62 */ 63 if (tp->snd_cwnd < tp->snd_ssthresh / 2) 64 rate *= TCP_PACING_SS_RATIO; 65 else 66 rate *= TCP_PACING_CA_RATIO; 67 68 rate *= max(tp->snd_cwnd, tp->packets_out); 69 70 if (tp->srtt_us) 71 rate = div64_u64(rate, (__u64)tp->srtt_us); 72 73 sk->sk_pacing_rate = min(rate, sk->sk_max_pacing_rate); 74 } 75 76 static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, 77 int newly_lost, int flag) 78 { 79 struct tcp_sock *tp = tcp_sk(sk); 80 int sndcnt = 0; 81 __u32 pkts_in_flight = tp->packets_out - (tp->sacked_out + tp->lost_out) + tp->retrans_out; 82 int delta = tp->snd_ssthresh - pkts_in_flight; 83 84 if (newly_acked_sacked <= 0 || !tp->prior_cwnd) 85 return; 86 87 __u32 prr_delivered = tp->prr_delivered + newly_acked_sacked; 88 89 if (delta < 0) { 90 __u64 dividend = 91 (__u64)tp->snd_ssthresh * prr_delivered + tp->prior_cwnd - 1; 92 sndcnt = (__u32)div64_u64(dividend, (__u64)tp->prior_cwnd) - tp->prr_out; 93 } else { 94 sndcnt = max(prr_delivered - tp->prr_out, newly_acked_sacked); 95 if (flag & FLAG_SND_UNA_ADVANCED && !newly_lost) 96 sndcnt++; 97 sndcnt = min(delta, sndcnt); 98 } 99 /* Force a fast retransmit upon entering fast recovery */ 100 sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1)); 101 tp->snd_cwnd = pkts_in_flight + sndcnt; 102 } 103 104 /* Decide wheather to run the increase function of congestion control. */ 105 static bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) 106 { 107 if (tcp_sk(sk)->reordering > TCP_REORDERING) 108 return flag & FLAG_FORWARD_PROGRESS; 109 110 return flag & FLAG_DATA_ACKED; 111 } 112 113 SEC("struct_ops") 114 void BPF_PROG(bpf_cubic_init, struct sock *sk) 115 { 116 cubictcp_init(sk); 117 } 118 119 SEC("struct_ops") 120 void BPF_PROG(bpf_cubic_cwnd_event, struct sock *sk, enum tcp_ca_event event) 121 { 122 cubictcp_cwnd_event(sk, event); 123 } 124 125 SEC("struct_ops") 126 void BPF_PROG(bpf_cubic_cong_control, struct sock *sk, __u32 ack, int flag, 127 const struct rate_sample *rs) 128 { 129 struct tcp_sock *tp = tcp_sk(sk); 130 131 if (((1<<TCP_CA_CWR) | (1<<TCP_CA_Recovery)) & 132 (1 << inet_csk(sk)->icsk_ca_state)) { 133 /* Reduce cwnd if state mandates */ 134 tcp_cwnd_reduction(sk, rs->acked_sacked, rs->losses, flag); 135 136 if (!before(tp->snd_una, tp->high_seq)) { 137 /* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */ 138 if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH && 139 inet_csk(sk)->icsk_ca_state == TCP_CA_CWR) { 140 tp->snd_cwnd = tp->snd_ssthresh; 141 tp->snd_cwnd_stamp = tcp_jiffies32; 142 } 143 } 144 } else if (tcp_may_raise_cwnd(sk, flag)) { 145 /* Advance cwnd if state allows */ 146 cubictcp_cong_avoid(sk, ack, rs->acked_sacked); 147 tp->snd_cwnd_stamp = tcp_jiffies32; 148 } 149 150 tcp_update_pacing_rate(sk); 151 } 152 153 SEC("struct_ops") 154 __u32 BPF_PROG(bpf_cubic_recalc_ssthresh, struct sock *sk) 155 { 156 return cubictcp_recalc_ssthresh(sk); 157 } 158 159 SEC("struct_ops") 160 void BPF_PROG(bpf_cubic_state, struct sock *sk, __u8 new_state) 161 { 162 cubictcp_state(sk, new_state); 163 } 164 165 SEC("struct_ops") 166 void BPF_PROG(bpf_cubic_acked, struct sock *sk, const struct ack_sample *sample) 167 { 168 cubictcp_acked(sk, sample); 169 } 170 171 SEC("struct_ops") 172 __u32 BPF_PROG(bpf_cubic_undo_cwnd, struct sock *sk) 173 { 174 return tcp_reno_undo_cwnd(sk); 175 } 176 177 SEC(".struct_ops") 178 struct tcp_congestion_ops cc_cubic = { 179 .init = (void *)bpf_cubic_init, 180 .ssthresh = (void *)bpf_cubic_recalc_ssthresh, 181 .cong_control = (void *)bpf_cubic_cong_control, 182 .set_state = (void *)bpf_cubic_state, 183 .undo_cwnd = (void *)bpf_cubic_undo_cwnd, 184 .cwnd_event = (void *)bpf_cubic_cwnd_event, 185 .pkts_acked = (void *)bpf_cubic_acked, 186 .name = "bpf_cc_cubic", 187 }; 188 189 char _license[] SEC("license") = "GPL"; 190
Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.