// SPDX-License-Identifier: GPL-2.0-only
/*
 * CAIA Delay-Gradient (CDG) congestion control
 *
 * This implementation is based on the paper:
 *   D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using
 *   delay gradients." In IFIP Networking, pages 328-341. Springer, 2011.
 *
 * Scavenger traffic (Less-than-Best-Effort) should disable coexistence
 * heuristics using parameters use_shadow=0 and use_ineff=0.
 *
 * Parameters window, backoff_beta, and backoff_factor are crucial for
 * throughput and delay. Future work is needed to determine better defaults,
 * and to provide guidelines for use in different environments/contexts.
 *
 * Except for window, knobs are configured via /sys/module/tcp_cdg/parameters/.
 * Parameter window is only configurable when loading tcp_cdg as a module.
 *
 * Notable differences from paper/FreeBSD:
 *   o Using Hybrid Slow start and Proportional Rate Reduction.
 *   o Add toggle for shadow window mechanism. Suggested by David Hayes.
 *   o Add toggle for non-congestion loss tolerance.
 *   o Scaling parameter G is changed to a backoff factor;
 *     conversion is given by: backoff_factor = 1000/(G * window).
 *   o Limit shadow window to 2 * cwnd, or to cwnd when application limited.
 *   o More accurate e^-x.
 */
#include <linux/kernel.h>
#include <linux/random.h>
#include <linux/module.h>
#include <linux/sched/clock.h>

#include <net/tcp.h>

#define HYSTART_ACK_TRAIN	1
#define HYSTART_DELAY		2

static int window __read_mostly = 8;
static unsigned int backoff_beta __read_mostly = 0.7071 * 1024; /* sqrt 0.5 */
static unsigned int backoff_factor __read_mostly = 42;
static unsigned int hystart_detect __read_mostly = 3;
static unsigned int use_ineff __read_mostly = 5;
static bool use_shadow __read_mostly = true;
static bool use_tolerance __read_mostly;

module_param(window, int, 0444);
MODULE_PARM_DESC(window, "gradient window size (power of two <= 256)");
module_param(backoff_beta, uint, 0644);
MODULE_PARM_DESC(backoff_beta, "backoff beta (0-1024)");
module_param(backoff_factor, uint, 0644);
MODULE_PARM_DESC(backoff_factor, "backoff probability scale factor");
module_param(hystart_detect, uint, 0644);
MODULE_PARM_DESC(hystart_detect, "use Hybrid Slow start "
		 "(0: disabled, 1: ACK train, 2: delay threshold, 3: both)");
module_param(use_ineff, uint, 0644);
MODULE_PARM_DESC(use_ineff, "use ineffectual backoff detection (threshold)");
module_param(use_shadow, bool, 0644);
MODULE_PARM_DESC(use_shadow, "use shadow window heuristic");
module_param(use_tolerance, bool, 0644);
MODULE_PARM_DESC(use_tolerance, "use loss tolerance heuristic");
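/* Worked example (illustrative, not part of the original sources): with the
 * defaults window = 8 and backoff_factor = 42, the conversion noted in the
 * header gives a scaling parameter of roughly G = 1000 / (42 * 8) ~= 3, and
 * backoff_beta defaults to 0.7071 * 1024 = 724, i.e. a backoff scales cwnd
 * by 724/1024 ~= 0.707. Runtime knobs live under sysfs, e.g. for scavenger
 * (Less-than-Best-Effort) traffic:
 *
 *	echo 0 > /sys/module/tcp_cdg/parameters/use_shadow
 *	echo 0 > /sys/module/tcp_cdg/parameters/use_ineff
 */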
struct cdg_minmax {
	union {
		struct {
			s32 min;
			s32 max;
		};
		u64 v64;
	};
};

enum cdg_state {
	CDG_UNKNOWN = 0,
	CDG_NONFULL = 1,
	CDG_FULL = 2,
	CDG_BACKOFF = 3,
};

struct cdg {
	struct cdg_minmax rtt;
	struct cdg_minmax rtt_prev;
	struct cdg_minmax *gradients;
	struct cdg_minmax gsum;
	bool gfilled;
	u8  tail;
	u8  state;
	u8  delack;
	u32 rtt_seq;
	u32 shadow_wnd;
	u16 backoff_cnt;
	u16 sample_cnt;
	s32 delay_min;
	u32 last_ack;
	u32 round_start;
};

/**
 * nexp_u32 - negative base-e exponential
 * @ux: x in units of micro
 *
 * Returns exp(ux * -1e-6) * U32_MAX.
 */
static u32 __pure nexp_u32(u32 ux)
{
	static const u16 v[] = {
		/* exp(-x)*65536-1 for x = 0, 0.000256, 0.000512, ... */
		65535,
		65518, 65501, 65468, 65401, 65267, 65001, 64470, 63422,
		61378, 57484, 50423, 38795, 22965, 8047,  987,   14,
	};
	u32 msb = ux >> 8;
	u32 res;
	int i;

	/* Cut off when ux >= 2^24 (actual result is <= 222/U32_MAX). */
	if (msb > U16_MAX)
		return 0;

	/* Scale first eight bits linearly: */
	res = U32_MAX - (ux & 0xff) * (U32_MAX / 1000000);

	/* Obtain e^(x + y + ...) by computing e^x * e^y * ...: */
	for (i = 1; msb; i++, msb >>= 1) {
		u32 y = v[i & -(msb & 1)] + U32_C(1);

		res = ((u64)res * y) >> 16;
	}

	return res;
}
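/* Sketch of the approximation above (illustrative, not from the original
 * sources): each table entry holds exp(-x) * 65536 - 1 at x = 256e-6 * 2^k,
 * so exp(-ux * 1e-6) is assembled as a product over the set bits of
 * ux >> 8, with the low eight bits interpolated linearly. A few sample
 * points, up to rounding error:
 *
 *	nexp_u32(0)        == U32_MAX		(exp(0) = 1)
 *	nexp_u32(1000000)  ~= 0.368 * U32_MAX	(exp(-1))
 *	nexp_u32(1 << 24)  == 0			(cut off above)
 */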
/* Based on the HyStart algorithm (by Ha et al.) that is implemented in
 * tcp_cubic. Differences/experimental changes:
 *   o Using Hayes' delayed ACK filter.
 *   o Using a usec clock for the ACK train.
 *   o Reset ACK train when application limited.
 *   o Invoked at any cwnd (i.e. also when cwnd < 16).
 *   o Invoked only when cwnd < ssthresh (i.e. not when cwnd == ssthresh).
 */
static void tcp_cdg_hystart_update(struct sock *sk)
{
	struct cdg *ca = inet_csk_ca(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	ca->delay_min = min_not_zero(ca->delay_min, ca->rtt.min);
	if (ca->delay_min == 0)
		return;

	if (hystart_detect & HYSTART_ACK_TRAIN) {
		u32 now_us = tp->tcp_mstamp;

		if (ca->last_ack == 0 || !tcp_is_cwnd_limited(sk)) {
			ca->last_ack = now_us;
			ca->round_start = now_us;
		} else if (before(now_us, ca->last_ack + 3000)) {
			u32 base_owd = max(ca->delay_min / 2U, 125U);

			ca->last_ack = now_us;
			if (after(now_us, ca->round_start + base_owd)) {
				NET_INC_STATS(sock_net(sk),
					      LINUX_MIB_TCPHYSTARTTRAINDETECT);
				NET_ADD_STATS(sock_net(sk),
					      LINUX_MIB_TCPHYSTARTTRAINCWND,
					      tcp_snd_cwnd(tp));
				tp->snd_ssthresh = tcp_snd_cwnd(tp);
				return;
			}
		}
	}

	if (hystart_detect & HYSTART_DELAY) {
		if (ca->sample_cnt < 8) {
			ca->sample_cnt++;
		} else {
			s32 thresh = max(ca->delay_min + ca->delay_min / 8U,
					 125U);

			if (ca->rtt.min > thresh) {
				NET_INC_STATS(sock_net(sk),
					      LINUX_MIB_TCPHYSTARTDELAYDETECT);
				NET_ADD_STATS(sock_net(sk),
					      LINUX_MIB_TCPHYSTARTDELAYCWND,
					      tcp_snd_cwnd(tp));
				tp->snd_ssthresh = tcp_snd_cwnd(tp);
			}
		}
	}
}

static s32 tcp_cdg_grad(struct cdg *ca)
{
	s32 gmin = ca->rtt.min - ca->rtt_prev.min;
	s32 gmax = ca->rtt.max - ca->rtt_prev.max;
	s32 grad;

	if (ca->gradients) {
		ca->gsum.min += gmin - ca->gradients[ca->tail].min;
		ca->gsum.max += gmax - ca->gradients[ca->tail].max;
		ca->gradients[ca->tail].min = gmin;
		ca->gradients[ca->tail].max = gmax;
		ca->tail = (ca->tail + 1) & (window - 1);
		gmin = ca->gsum.min;
		gmax = ca->gsum.max;
	}

	/* We keep sums to ignore gradients during cwnd reductions;
	 * the paper's smoothed gradients otherwise simplify to:
	 *	(rtt_latest - rtt_oldest) / window.
	 *
	 * We also drop division by window here.
	 */
	grad = gmin > 0 ? gmin : gmax;

	/* Extrapolate missing values in gradient window: */
	if (!ca->gfilled) {
		if (!ca->gradients && window > 1)
			grad *= window; /* Memory allocation failed. */
		else if (ca->tail == 0)
			ca->gfilled = true;
		else
			grad = (grad * window) / (int)ca->tail;
	}

	/* Backoff was effectual: */
	if (gmin <= -32 || gmax <= -32)
		ca->backoff_cnt = 0;

	if (use_tolerance) {
		/* Reduce small variations to zero: */
		gmin = DIV_ROUND_CLOSEST(gmin, 64);
		gmax = DIV_ROUND_CLOSEST(gmax, 64);

		if (gmin > 0 && gmax <= 0)
			ca->state = CDG_FULL;
		else if ((gmin > 0 && gmax > 0) || gmax < 0)
			ca->state = CDG_NONFULL;
	}
	return grad;
}

static bool tcp_cdg_backoff(struct sock *sk, u32 grad)
{
	struct cdg *ca = inet_csk_ca(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	if (get_random_u32() <= nexp_u32(grad * backoff_factor))
		return false;

	if (use_ineff) {
		ca->backoff_cnt++;
		if (ca->backoff_cnt > use_ineff)
			return false;
	}

	ca->shadow_wnd = max(ca->shadow_wnd, tcp_snd_cwnd(tp));
	ca->state = CDG_BACKOFF;
	tcp_enter_cwr(sk);
	return true;
}
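/* Backoff probability, for illustration (derived from the random check
 * above, not stated in the original sources): since get_random_u32() is
 * uniform over [0, U32_MAX], a backoff is taken with probability
 * p ~= 1 - exp(-grad * backoff_factor * 1e-6), where grad is the summed
 * RTT gradient in microseconds. With the default backoff_factor = 42:
 *
 *	grad =  1000 (1 ms)	->  p = 1 - exp(-0.042) ~= 4.1%
 *	grad = 10000 (10 ms)	->  p = 1 - exp(-0.42)  ~= 34.3%
 */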
/* Not called in CWR or Recovery state. */
static void tcp_cdg_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
	struct cdg *ca = inet_csk_ca(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	u32 prior_snd_cwnd;
	u32 incr;

	if (tcp_in_slow_start(tp) && hystart_detect)
		tcp_cdg_hystart_update(sk);

	if (after(ack, ca->rtt_seq) && ca->rtt.v64) {
		s32 grad = 0;

		if (ca->rtt_prev.v64)
			grad = tcp_cdg_grad(ca);
		ca->rtt_seq = tp->snd_nxt;
		ca->rtt_prev = ca->rtt;
		ca->rtt.v64 = 0;
		ca->last_ack = 0;
		ca->sample_cnt = 0;

		if (grad > 0 && tcp_cdg_backoff(sk, grad))
			return;
	}

	if (!tcp_is_cwnd_limited(sk)) {
		ca->shadow_wnd = min(ca->shadow_wnd, tcp_snd_cwnd(tp));
		return;
	}

	prior_snd_cwnd = tcp_snd_cwnd(tp);
	tcp_reno_cong_avoid(sk, ack, acked);

	incr = tcp_snd_cwnd(tp) - prior_snd_cwnd;
	ca->shadow_wnd = max(ca->shadow_wnd, ca->shadow_wnd + incr);
}

static void tcp_cdg_acked(struct sock *sk, const struct ack_sample *sample)
{
	struct cdg *ca = inet_csk_ca(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	if (sample->rtt_us <= 0)
		return;

	/* A heuristic for filtering delayed ACKs, adapted from:
	 * D.A. Hayes. "Timing enhancements to the FreeBSD kernel to support
	 * delay and rate based TCP mechanisms." TR 100219A. CAIA, 2010.
	 */
	if (tp->sacked_out == 0) {
		if (sample->pkts_acked == 1 && ca->delack) {
			/* A delayed ACK is only used for the minimum if it is
			 * provably lower than an existing non-zero minimum.
			 */
			ca->rtt.min = min(ca->rtt.min, sample->rtt_us);
			ca->delack--;
			return;
		} else if (sample->pkts_acked > 1 && ca->delack < 5) {
			ca->delack++;
		}
	}

	ca->rtt.min = min_not_zero(ca->rtt.min, sample->rtt_us);
	ca->rtt.max = max(ca->rtt.max, sample->rtt_us);
}

static u32 tcp_cdg_ssthresh(struct sock *sk)
{
	struct cdg *ca = inet_csk_ca(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	if (ca->state == CDG_BACKOFF)
		return max(2U, (tcp_snd_cwnd(tp) * min(1024U, backoff_beta)) >> 10);

	if (ca->state == CDG_NONFULL && use_tolerance)
		return tcp_snd_cwnd(tp);

	ca->shadow_wnd = min(ca->shadow_wnd >> 1, tcp_snd_cwnd(tp));
	if (use_shadow)
		return max3(2U, ca->shadow_wnd, tcp_snd_cwnd(tp) >> 1);
	return max(2U, tcp_snd_cwnd(tp) >> 1);
}

static void tcp_cdg_cwnd_event(struct sock *sk, const enum tcp_ca_event ev)
{
	struct cdg *ca = inet_csk_ca(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct cdg_minmax *gradients;

	switch (ev) {
	case CA_EVENT_CWND_RESTART:
		gradients = ca->gradients;
		if (gradients)
			memset(gradients, 0, window * sizeof(gradients[0]));
		memset(ca, 0, sizeof(*ca));

		ca->gradients = gradients;
		ca->rtt_seq = tp->snd_nxt;
		ca->shadow_wnd = tcp_snd_cwnd(tp);
		break;
	case CA_EVENT_COMPLETE_CWR:
		ca->state = CDG_UNKNOWN;
		ca->rtt_seq = tp->snd_nxt;
		ca->rtt_prev = ca->rtt;
		ca->rtt.v64 = 0;
		break;
	default:
		break;
	}
}

static void tcp_cdg_init(struct sock *sk)
{
	struct cdg *ca = inet_csk_ca(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	ca->gradients = NULL;
	/* We silently fall back to window = 1 if allocation fails. */
	if (window > 1)
		ca->gradients = kcalloc(window, sizeof(ca->gradients[0]),
					GFP_NOWAIT | __GFP_NOWARN);
	ca->rtt_seq = tp->snd_nxt;
	ca->shadow_wnd = tcp_snd_cwnd(tp);
}

static void tcp_cdg_release(struct sock *sk)
{
	struct cdg *ca = inet_csk_ca(sk);

	kfree(ca->gradients);
	ca->gradients = NULL;
}

static struct tcp_congestion_ops tcp_cdg __read_mostly = {
	.cong_avoid = tcp_cdg_cong_avoid,
	.cwnd_event = tcp_cdg_cwnd_event,
	.pkts_acked = tcp_cdg_acked,
	.undo_cwnd = tcp_reno_undo_cwnd,
	.ssthresh = tcp_cdg_ssthresh,
	.release = tcp_cdg_release,
	.init = tcp_cdg_init,
	.owner = THIS_MODULE,
	.name = "cdg",
};

static int __init tcp_cdg_register(void)
{
	if (backoff_beta > 1024 || window < 1 || window > 256)
		return -ERANGE;
	if (!is_power_of_2(window))
		return -EINVAL;

	BUILD_BUG_ON(sizeof(struct cdg) > ICSK_CA_PRIV_SIZE);
	tcp_register_congestion_control(&tcp_cdg);
	return 0;
}

static void __exit tcp_cdg_unregister(void)
{
	tcp_unregister_congestion_control(&tcp_cdg);
}

module_init(tcp_cdg_register);
module_exit(tcp_cdg_unregister);
MODULE_AUTHOR("Kenneth Klette Jonassen");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("TCP CDG");
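/* Usage sketch (illustrative, not part of the original file): after loading
 * the module, CDG can be selected system-wide or per socket, e.g.:
 *
 *	modprobe tcp_cdg window=16
 *	sysctl -w net.ipv4.tcp_congestion_control=cdg
 *
 * or from an application, on an open TCP socket fd:
 *
 *	setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, "cdg", 3);
 */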