// SPDX-License-Identifier: GPL-2.0-only
/*
 * VMware vSockets Driver
 *
 * Copyright (C) 2009-2013 VMware, Inc. All rights reserved.
 */

#include <linux/types.h>
#include <linux/socket.h>
#include <linux/stddef.h>
#include <net/sock.h>

#include "vmci_transport_notify.h"

#define PKT_FIELD(vsk, field_name) \
	(vmci_trans(vsk)->notify.pkt_q_state.field_name)

static bool vmci_transport_notify_waiting_write(struct vsock_sock *vsk)
{
	bool retval;
	u64 notify_limit;

	if (!PKT_FIELD(vsk, peer_waiting_write))
		return false;

	/* When the sender blocks, we take that as a sign that the sender is
	 * faster than the receiver. To reduce the transmit rate of the sender,
	 * we delay the sending of the read notification by decreasing the
	 * write_notify_window. The notification is delayed until the number of
	 * bytes used in the queue drops below the write_notify_window.
	 */

	if (!PKT_FIELD(vsk, peer_waiting_write_detected)) {
		PKT_FIELD(vsk, peer_waiting_write_detected) = true;
		if (PKT_FIELD(vsk, write_notify_window) < PAGE_SIZE) {
			PKT_FIELD(vsk, write_notify_window) =
				PKT_FIELD(vsk, write_notify_min_window);
		} else {
			PKT_FIELD(vsk, write_notify_window) -= PAGE_SIZE;
			if (PKT_FIELD(vsk, write_notify_window) <
			    PKT_FIELD(vsk, write_notify_min_window))
				PKT_FIELD(vsk, write_notify_window) =
					PKT_FIELD(vsk, write_notify_min_window);
		}
	}
	notify_limit = vmci_trans(vsk)->consume_size -
		PKT_FIELD(vsk, write_notify_window);

	/* The notify_limit is used to delay notifications in the case where
	 * flow control is enabled. Below, the test is expressed in terms of
	 * free space in the queue: if free_space > ConsumeSize -
	 * write_notify_window then notify. An alternate way of expressing this
	 * is to rewrite the expression to use the data ready in the receive
	 * queue: if write_notify_window > bufferReady then notify, as
	 * free_space == ConsumeSize - bufferReady.
	 */

	retval = vmci_qpair_consume_free_space(vmci_trans(vsk)->qpair) >
		notify_limit;

	if (retval) {
		/* Once we notify the peer, we reset the detected flag so the
		 * next wait will again cause a decrease in the window size.
		 */

		PKT_FIELD(vsk, peer_waiting_write_detected) = false;
	}
	return retval;
}
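
/* Worked example (editorial illustration; the byte values are assumed, not
 * taken from the driver): suppose consume_size is 64 KiB and repeated sender
 * stalls have shrunk write_notify_window to 8 KiB. Then
 *
 *	notify_limit = 65536 - 8192 = 57344 bytes,
 *
 * so the READ notification is sent only once more than 57344 bytes of the
 * consume queue are free, i.e. once fewer than 8 KiB of unread data remain.
 * Equivalently, write_notify_window (8192) must exceed bufferReady, since
 * free_space == ConsumeSize - bufferReady.
 */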

static void
vmci_transport_handle_read(struct sock *sk,
			   struct vmci_transport_packet *pkt,
			   bool bottom_half,
			   struct sockaddr_vm *dst, struct sockaddr_vm *src)
{
	sk->sk_write_space(sk);
}

static void
vmci_transport_handle_wrote(struct sock *sk,
			    struct vmci_transport_packet *pkt,
			    bool bottom_half,
			    struct sockaddr_vm *dst, struct sockaddr_vm *src)
{
	vsock_data_ready(sk);
}

static void vsock_block_update_write_window(struct sock *sk)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	if (PKT_FIELD(vsk, write_notify_window) < vmci_trans(vsk)->consume_size)
		PKT_FIELD(vsk, write_notify_window) =
			min(PKT_FIELD(vsk, write_notify_window) + PAGE_SIZE,
			    vmci_trans(vsk)->consume_size);
}
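
/* Illustrative sketch (values assumed for this editorial note):
 * vsock_block_update_write_window() is the counterpart to the shrinking in
 * vmci_transport_notify_waiting_write(). Each time the receiver is about to
 * block waiting for data, the window grows by one page: with a 4 KiB
 * PAGE_SIZE and a 64 KiB consume queue, a window of 8192 becomes 12288,
 * then 16384, and so on, capped at consume_size (65536). A blocked receiver
 * therefore asks to be notified earlier and earlier, while a stalled sender
 * shrinks the window again.
 */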

static int vmci_transport_send_read_notification(struct sock *sk)
{
	struct vsock_sock *vsk;
	bool sent_read;
	unsigned int retries;
	int err;

	vsk = vsock_sk(sk);
	sent_read = false;
	retries = 0;
	err = 0;

	if (vmci_transport_notify_waiting_write(vsk)) {
		/* Notify the peer that we have read, retrying the send on
		 * failure up to our maximum value. XXX For now we just log
		 * the failure, but later we should schedule a work item to
		 * handle the resend until it succeeds. That would require
		 * keeping track of work items in the vsk and cleaning them up
		 * upon socket close.
		 */
		while (!(vsk->peer_shutdown & RCV_SHUTDOWN) &&
		       !sent_read &&
		       retries < VMCI_TRANSPORT_MAX_DGRAM_RESENDS) {
			err = vmci_transport_send_read(sk);
			if (err >= 0)
				sent_read = true;

			retries++;
		}

		if (retries >= VMCI_TRANSPORT_MAX_DGRAM_RESENDS && !sent_read)
			pr_err("%p unable to send read notification to peer\n",
			       sk);
		else
			PKT_FIELD(vsk, peer_waiting_write) = false;
	}
	return err;
}

static void vmci_transport_notify_pkt_socket_init(struct sock *sk)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	PKT_FIELD(vsk, write_notify_window) = PAGE_SIZE;
	PKT_FIELD(vsk, write_notify_min_window) = PAGE_SIZE;
	PKT_FIELD(vsk, peer_waiting_write) = false;
	PKT_FIELD(vsk, peer_waiting_write_detected) = false;
}

static void vmci_transport_notify_pkt_socket_destruct(struct vsock_sock *vsk)
{
	PKT_FIELD(vsk, write_notify_window) = PAGE_SIZE;
	PKT_FIELD(vsk, write_notify_min_window) = PAGE_SIZE;
	PKT_FIELD(vsk, peer_waiting_write) = false;
	PKT_FIELD(vsk, peer_waiting_write_detected) = false;
}

static int
vmci_transport_notify_pkt_poll_in(struct sock *sk,
				  size_t target, bool *data_ready_now)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	if (vsock_stream_has_data(vsk) >= target) {
		*data_ready_now = true;
	} else {
		/* We can't read right now because there is not enough data
		 * in the queue. Ask for notifications when there is something
		 * to read.
		 */
		if (sk->sk_state == TCP_ESTABLISHED)
			vsock_block_update_write_window(sk);
		*data_ready_now = false;
	}

	return 0;
}

static int
vmci_transport_notify_pkt_poll_out(struct sock *sk,
				   size_t target, bool *space_avail_now)
{
	s64 produce_q_free_space;
	struct vsock_sock *vsk = vsock_sk(sk);

	produce_q_free_space = vsock_stream_has_space(vsk);
	if (produce_q_free_space > 0) {
		*space_avail_now = true;
		return 0;
	} else if (produce_q_free_space == 0) {
		/* This is a connected socket but we can't currently send data.
		 * Nothing else to do.
		 */
		*space_avail_now = false;
	}

	return 0;
}

static int
vmci_transport_notify_pkt_recv_init(
			struct sock *sk,
			size_t target,
			struct vmci_transport_recv_notify_data *data)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	data->consume_head = 0;
	data->produce_tail = 0;
	data->notify_on_block = false;

	if (PKT_FIELD(vsk, write_notify_min_window) < target + 1) {
		PKT_FIELD(vsk, write_notify_min_window) = target + 1;
		if (PKT_FIELD(vsk, write_notify_window) <
		    PKT_FIELD(vsk, write_notify_min_window)) {
			/* If the current window is smaller than the new
			 * minimal window size, we need to reevaluate whether
			 * we need to notify the sender. If the number of ready
			 * bytes is smaller than the new window, we need to
			 * send a notification to the sender before we block.
			 */

			PKT_FIELD(vsk, write_notify_window) =
				PKT_FIELD(vsk, write_notify_min_window);
			data->notify_on_block = true;
		}
	}

	return 0;
}
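
/* Worked example (editorial illustration; the target value is assumed):
 * a receiver whose low-water mark yields target = 16384 raises
 * write_notify_min_window to 16385. If write_notify_window was still at its
 * initial PAGE_SIZE, it is bumped to 16385 as well and notify_on_block is
 * set, so a READ notification goes out in recv_pre_block() before this side
 * sleeps waiting for the full 16 KiB to arrive.
 */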

static int
vmci_transport_notify_pkt_recv_pre_block(
				struct sock *sk,
				size_t target,
				struct vmci_transport_recv_notify_data *data)
{
	int err = 0;

	vsock_block_update_write_window(sk);

	if (data->notify_on_block) {
		err = vmci_transport_send_read_notification(sk);
		if (err < 0)
			return err;
		data->notify_on_block = false;
	}

	return err;
}

static int
vmci_transport_notify_pkt_recv_post_dequeue(
				struct sock *sk,
				size_t target,
				ssize_t copied,
				bool data_read,
				struct vmci_transport_recv_notify_data *data)
{
	struct vsock_sock *vsk;
	int err;
	bool was_full = false;
	u64 free_space;

	vsk = vsock_sk(sk);
	err = 0;

	if (data_read) {
		smp_mb();

		free_space =
			vmci_qpair_consume_free_space(vmci_trans(vsk)->qpair);
		was_full = free_space == copied;

		if (was_full)
			PKT_FIELD(vsk, peer_waiting_write) = true;

		err = vmci_transport_send_read_notification(sk);
		if (err < 0)
			return err;

		/* See the comment in
		 * vmci_transport_notify_pkt_send_post_enqueue().
		 */
		vsock_data_ready(sk);
	}

	return err;
}

static int
vmci_transport_notify_pkt_send_init(
			struct sock *sk,
			struct vmci_transport_send_notify_data *data)
{
	data->consume_head = 0;
	data->produce_tail = 0;

	return 0;
}

static int
vmci_transport_notify_pkt_send_post_enqueue(
				struct sock *sk,
				ssize_t written,
				struct vmci_transport_send_notify_data *data)
{
	int err = 0;
	struct vsock_sock *vsk;
	bool sent_wrote = false;
	bool was_empty;
	int retries = 0;

	vsk = vsock_sk(sk);

	smp_mb();

	was_empty =
		vmci_qpair_produce_buf_ready(vmci_trans(vsk)->qpair) == written;
	if (was_empty) {
		while (!(vsk->peer_shutdown & RCV_SHUTDOWN) &&
		       !sent_wrote &&
		       retries < VMCI_TRANSPORT_MAX_DGRAM_RESENDS) {
			err = vmci_transport_send_wrote(sk);
			if (err >= 0)
				sent_wrote = true;

			retries++;
		}
	}

	if (retries >= VMCI_TRANSPORT_MAX_DGRAM_RESENDS && !sent_wrote) {
		pr_err("%p unable to send wrote notification to peer\n",
		       sk);
		return err;
	}

	return err;
}

static void
vmci_transport_notify_pkt_handle_pkt(
			struct sock *sk,
			struct vmci_transport_packet *pkt,
			bool bottom_half,
			struct sockaddr_vm *dst,
			struct sockaddr_vm *src, bool *pkt_processed)
{
	bool processed = false;

	switch (pkt->type) {
	case VMCI_TRANSPORT_PACKET_TYPE_WROTE:
		vmci_transport_handle_wrote(sk, pkt, bottom_half, dst, src);
		processed = true;
		break;
	case VMCI_TRANSPORT_PACKET_TYPE_READ:
		vmci_transport_handle_read(sk, pkt, bottom_half, dst, src);
		processed = true;
		break;
	}

	if (pkt_processed)
		*pkt_processed = processed;
}

static void vmci_transport_notify_pkt_process_request(struct sock *sk)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	PKT_FIELD(vsk, write_notify_window) = vmci_trans(vsk)->consume_size;
	if (vmci_trans(vsk)->consume_size <
	    PKT_FIELD(vsk, write_notify_min_window))
		PKT_FIELD(vsk, write_notify_min_window) =
			vmci_trans(vsk)->consume_size;
}

static void vmci_transport_notify_pkt_process_negotiate(struct sock *sk)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	PKT_FIELD(vsk, write_notify_window) = vmci_trans(vsk)->consume_size;
	if (vmci_trans(vsk)->consume_size <
	    PKT_FIELD(vsk, write_notify_min_window))
		PKT_FIELD(vsk, write_notify_min_window) =
			vmci_trans(vsk)->consume_size;
}

static int
vmci_transport_notify_pkt_recv_pre_dequeue(
				struct sock *sk,
				size_t target,
				struct vmci_transport_recv_notify_data *data)
{
	return 0; /* NOP for QState. */
}

static int
vmci_transport_notify_pkt_send_pre_block(
				struct sock *sk,
				struct vmci_transport_send_notify_data *data)
{
	return 0; /* NOP for QState. */
}

static int
vmci_transport_notify_pkt_send_pre_enqueue(
				struct sock *sk,
				struct vmci_transport_send_notify_data *data)
{
	return 0; /* NOP for QState. */
}

/* Socket always on control packet based operations. */
const struct vmci_transport_notify_ops vmci_transport_notify_pkt_q_state_ops = {
	.socket_init = vmci_transport_notify_pkt_socket_init,
	.socket_destruct = vmci_transport_notify_pkt_socket_destruct,
	.poll_in = vmci_transport_notify_pkt_poll_in,
	.poll_out = vmci_transport_notify_pkt_poll_out,
	.handle_notify_pkt = vmci_transport_notify_pkt_handle_pkt,
	.recv_init = vmci_transport_notify_pkt_recv_init,
	.recv_pre_block = vmci_transport_notify_pkt_recv_pre_block,
	.recv_pre_dequeue = vmci_transport_notify_pkt_recv_pre_dequeue,
	.recv_post_dequeue = vmci_transport_notify_pkt_recv_post_dequeue,
	.send_init = vmci_transport_notify_pkt_send_init,
	.send_pre_block = vmci_transport_notify_pkt_send_pre_block,
	.send_pre_enqueue = vmci_transport_notify_pkt_send_pre_enqueue,
	.send_post_enqueue = vmci_transport_notify_pkt_send_post_enqueue,
	.process_request = vmci_transport_notify_pkt_process_request,
	.process_negotiate = vmci_transport_notify_pkt_process_negotiate,
};
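
/* Usage sketch (editorial; the exact call site lives outside this file, in
 * vmci_transport.c, and is assumed here rather than shown): the transport is
 * expected to pick this table over the legacy vmci_transport_notify_pkt_ops
 * when the peers negotiate the queue-state notification protocol, roughly:
 *
 *	vmci_trans(vsk)->notify_ops = &vmci_transport_notify_pkt_q_state_ops;
 *
 * after which the core vsock code invokes these hooks only through
 * notify_ops, never by name.
 */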