// SPDX-License-Identifier: GPL-2.0-only
/* Network filesystem read subrequest result collection, assessment and
 * retrying.
 *
 * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/export.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/task_io_accounting_ops.h>
#include "internal.h"

/*
 * Clear the unread part of an I/O request.
 */
static void netfs_clear_unread(struct netfs_io_subrequest *subreq)
{
	netfs_reset_iter(subreq);
	WARN_ON_ONCE(subreq->len - subreq->transferred != iov_iter_count(&subreq->io_iter));
	iov_iter_zero(iov_iter_count(&subreq->io_iter), &subreq->io_iter);
	if (subreq->start + subreq->transferred >= subreq->rreq->i_size)
		__set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags);
}

/*
 * Flush, mark and unlock a folio that's now completely read.  If we want to
 * cache the folio, we set the group to NETFS_FOLIO_COPY_TO_CACHE, mark it
 * dirty and let writeback handle it.
 */
static void netfs_unlock_read_folio(struct netfs_io_subrequest *subreq,
				    struct netfs_io_request *rreq,
				    struct folio_queue *folioq,
				    int slot)
{
	struct netfs_folio *finfo;
	struct folio *folio = folioq_folio(folioq, slot);

	flush_dcache_folio(folio);
	folio_mark_uptodate(folio);

	if (!test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)) {
		finfo = netfs_folio_info(folio);
		if (finfo) {
			trace_netfs_folio(folio, netfs_folio_trace_filled_gaps);
			if (finfo->netfs_group)
				folio_change_private(folio, finfo->netfs_group);
			else
				folio_detach_private(folio);
			kfree(finfo);
		}

		if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) {
			if (!WARN_ON_ONCE(folio_get_private(folio) != NULL)) {
				trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache);
				folio_attach_private(folio, NETFS_FOLIO_COPY_TO_CACHE);
				folio_mark_dirty(folio);
			}
		} else {
			trace_netfs_folio(folio, netfs_folio_trace_read_done);
		}
	} else {
		// TODO: Use of PG_private_2 is deprecated.
		if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags))
			netfs_pgpriv2_mark_copy_to_cache(subreq, rreq, folioq, slot);
	}

	if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
		if (folio->index == rreq->no_unlock_folio &&
		    test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) {
			_debug("no unlock");
		} else {
			trace_netfs_folio(folio, netfs_folio_trace_read_unlock);
			folio_unlock(folio);
		}
	}

	folioq_clear(folioq, slot);
}

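/*
 * Illustration (informal, added for clarity): subrequests and folios need
 * not line up, so a folio may straddle two subrequests, and it can only be
 * unlocked once all of its bytes are accounted to a single subrequest.  For
 * example, with 4KiB folios, a subrequest covering 0x0000-0x17ff that
 * completes first cannot unlock the folio at 0x1000 on its own; it donates
 * the 0x800 bytes it read of that folio to the following subrequest (via
 * next->prev_donated), and whichever subrequest ends up holding the whole
 * folio unlocks it in netfs_unlock_read_folio().
 */
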
/*
 * Unlock any folios that are now completely read.  Returns true if the
 * subrequest is removed from the list.
 */
static bool netfs_consume_read_data(struct netfs_io_subrequest *subreq, bool was_async)
{
	struct netfs_io_subrequest *prev, *next;
	struct netfs_io_request *rreq = subreq->rreq;
	struct folio_queue *folioq = subreq->curr_folioq;
	size_t avail, prev_donated, next_donated, fsize, part, excess;
	loff_t fpos, start;
	loff_t fend;
	int slot = subreq->curr_folioq_slot;

	if (WARN(subreq->transferred > subreq->len,
		 "Subreq overread: R%x[%x] %zu > %zu",
		 rreq->debug_id, subreq->debug_index,
		 subreq->transferred, subreq->len))
		subreq->transferred = subreq->len;

next_folio:
	fsize = PAGE_SIZE << subreq->curr_folio_order;
	fpos = round_down(subreq->start + subreq->consumed, fsize);
	fend = fpos + fsize;

	if (WARN_ON_ONCE(!folioq) ||
	    WARN_ON_ONCE(!folioq_folio(folioq, slot)) ||
	    WARN_ON_ONCE(folioq_folio(folioq, slot)->index != fpos / PAGE_SIZE)) {
		pr_err("R=%08x[%x] s=%llx-%llx ctl=%zx/%zx/%zx sl=%u\n",
		       rreq->debug_id, subreq->debug_index,
		       subreq->start, subreq->start + subreq->transferred - 1,
		       subreq->consumed, subreq->transferred, subreq->len,
		       slot);
		if (folioq) {
			struct folio *folio = folioq_folio(folioq, slot);

			pr_err("folioq: orders=%02x%02x%02x%02x\n",
			       folioq->orders[0], folioq->orders[1],
			       folioq->orders[2], folioq->orders[3]);
			if (folio)
				pr_err("folio: %llx-%llx ix=%llx o=%u qo=%u\n",
				       fpos, fend - 1, folio_pos(folio), folio_order(folio),
				       folioq_folio_order(folioq, slot));
		}
	}

donation_changed:
	/* Try to consume the current folio if we've hit or passed the end of
	 * it.  There's a possibility that this subreq doesn't start at the
	 * beginning of the folio, in which case we need to donate to/from the
	 * preceding subreq.
	 *
	 * We also need to include any potential donation back from the
	 * following subreq.
	 */
	prev_donated = READ_ONCE(subreq->prev_donated);
	next_donated = READ_ONCE(subreq->next_donated);
	if (prev_donated || next_donated) {
		spin_lock_bh(&rreq->lock);
		prev_donated = subreq->prev_donated;
		next_donated = subreq->next_donated;
		subreq->start -= prev_donated;
		subreq->len += prev_donated;
		subreq->transferred += prev_donated;
		prev_donated = subreq->prev_donated = 0;
		if (subreq->transferred == subreq->len) {
			subreq->len += next_donated;
			subreq->transferred += next_donated;
			next_donated = subreq->next_donated = 0;
		}
		trace_netfs_sreq(subreq, netfs_sreq_trace_add_donations);
		spin_unlock_bh(&rreq->lock);
	}

	avail = subreq->transferred;
	if (avail == subreq->len)
		avail += next_donated;
	start = subreq->start;
	if (subreq->consumed == 0) {
		start -= prev_donated;
		avail += prev_donated;
	} else {
		start += subreq->consumed;
		avail -= subreq->consumed;
	}
	part = umin(avail, fsize);

	trace_netfs_progress(subreq, start, avail, part);

	if (start + avail >= fend) {
		if (fpos == start) {
			/* Flush, unlock and mark for caching any folio we've just read. */
			subreq->consumed = fend - subreq->start;
			netfs_unlock_read_folio(subreq, rreq, folioq, slot);
			folioq_mark2(folioq, slot);
			if (subreq->consumed >= subreq->len)
				goto remove_subreq;
		} else if (fpos < start) {
			excess = fend - subreq->start;

			spin_lock_bh(&rreq->lock);
			/* If we complete first on a folio split with the
			 * preceding subreq, donate to that subreq - otherwise
			 * we get the responsibility.
			 */
			if (subreq->prev_donated != prev_donated) {
				spin_unlock_bh(&rreq->lock);
				goto donation_changed;
			}

			if (list_is_first(&subreq->rreq_link, &rreq->subrequests)) {
				spin_unlock_bh(&rreq->lock);
				pr_err("Can't donate prior to front\n");
				goto bad;
			}

			prev = list_prev_entry(subreq, rreq_link);
			WRITE_ONCE(prev->next_donated, prev->next_donated + excess);
			subreq->start += excess;
			subreq->len -= excess;
			subreq->transferred -= excess;
			trace_netfs_donate(rreq, subreq, prev, excess,
					   netfs_trace_donate_tail_to_prev);
			trace_netfs_sreq(subreq, netfs_sreq_trace_donate_to_prev);

			if (subreq->consumed >= subreq->len)
				goto remove_subreq_locked;
			spin_unlock_bh(&rreq->lock);
		} else {
			pr_err("fpos > start\n");
			goto bad;
		}

		/* Advance the rolling buffer to the next folio. */
		slot++;
		if (slot >= folioq_nr_slots(folioq)) {
			slot = 0;
			folioq = folioq->next;
			subreq->curr_folioq = folioq;
		}
		subreq->curr_folioq_slot = slot;
		if (folioq && folioq_folio(folioq, slot))
			subreq->curr_folio_order = folioq->orders[slot];
		if (!was_async)
			cond_resched();
		goto next_folio;
	}

	/* Deal with partial progress. */
	if (subreq->transferred < subreq->len)
		return false;

	/* Donate the remaining downloaded data to one of the neighbouring
	 * subrequests.  Note that we may race with them doing the same thing.
	 */
	spin_lock_bh(&rreq->lock);

	if (subreq->prev_donated != prev_donated ||
	    subreq->next_donated != next_donated) {
		spin_unlock_bh(&rreq->lock);
		cond_resched();
		goto donation_changed;
	}

	/* Deal with the trickiest case: that this subreq is in the middle of a
	 * folio, not touching either edge, but finishes first.  In such a
	 * case, we donate to the previous subreq, if there is one, so that the
	 * donation is only handled when that completes - and remove this
	 * subreq from the list.
	 *
	 * If the previous subreq finished first, we will have acquired their
	 * donation and should be able to unlock folios and/or donate nextwards.
	 */
	if (!subreq->consumed &&
	    !prev_donated &&
	    !list_is_first(&subreq->rreq_link, &rreq->subrequests)) {
		prev = list_prev_entry(subreq, rreq_link);
		WRITE_ONCE(prev->next_donated, prev->next_donated + subreq->len);
		trace_netfs_donate(rreq, subreq, prev, subreq->len,
				   netfs_trace_donate_to_prev);
		subreq->start += subreq->len;
		subreq->len = 0;
		subreq->transferred = 0;
		trace_netfs_sreq(subreq, netfs_sreq_trace_donate_to_prev);
		goto remove_subreq_locked;
	}

	/* If we can't donate down the chain, donate up the chain instead. */
	excess = subreq->len - subreq->consumed + next_donated;

	if (!subreq->consumed)
		excess += prev_donated;

	if (list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
		rreq->prev_donated = excess;
		trace_netfs_donate(rreq, subreq, NULL, excess,
				   netfs_trace_donate_to_deferred_next);
	} else {
		next = list_next_entry(subreq, rreq_link);
		WRITE_ONCE(next->prev_donated, excess);
		trace_netfs_donate(rreq, subreq, next, excess,
				   netfs_trace_donate_to_next);
	}
	trace_netfs_sreq(subreq, netfs_sreq_trace_donate_to_next);
	subreq->len = subreq->consumed;
	subreq->transferred = subreq->consumed;
	goto remove_subreq_locked;

remove_subreq:
	spin_lock_bh(&rreq->lock);
remove_subreq_locked:
	subreq->consumed = subreq->len;
	list_del(&subreq->rreq_link);
	spin_unlock_bh(&rreq->lock);
	netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_consumed);
	return true;

bad:
	/* Errr... prev and next both donated to us, but insufficient to finish
	 * the folio.
	 */
	printk("R=%08x[%x] s=%llx-%llx %zx/%zx/%zx\n",
	       rreq->debug_id, subreq->debug_index,
	       subreq->start, subreq->start + subreq->transferred - 1,
	       subreq->consumed, subreq->transferred, subreq->len);
	printk("folio: %llx-%llx\n", fpos, fend - 1);
	printk("donated: prev=%zx next=%zx\n", prev_donated, next_donated);
	printk("s=%llx av=%zx part=%zx\n", start, avail, part);
	BUG();
}

/*
 * Do page flushing and suchlike after DIO.
 */
static void netfs_rreq_assess_dio(struct netfs_io_request *rreq)
{
	struct netfs_io_subrequest *subreq;
	unsigned int i;

	/* Collect unbuffered reads and direct reads, adding up the transfer
	 * sizes until we find the first short or failed subrequest.
	 */
	list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
		rreq->transferred += subreq->transferred;

		if (subreq->transferred < subreq->len ||
		    test_bit(NETFS_SREQ_FAILED, &subreq->flags)) {
			rreq->error = subreq->error;
			break;
		}
	}

	if (rreq->origin == NETFS_DIO_READ) {
		for (i = 0; i < rreq->direct_bv_count; i++) {
			flush_dcache_page(rreq->direct_bv[i].bv_page);
			// TODO: cifs marks pages in the destination buffer
			// dirty under some circumstances after a read.  Do we
			// need to do that too?
			set_page_dirty(rreq->direct_bv[i].bv_page);
		}
	}

	if (rreq->iocb) {
		rreq->iocb->ki_pos += rreq->transferred;
		if (rreq->iocb->ki_complete)
			rreq->iocb->ki_complete(
				rreq->iocb, rreq->error ? rreq->error : rreq->transferred);
	}
	if (rreq->netfs_ops->done)
		rreq->netfs_ops->done(rreq);
	if (rreq->origin == NETFS_DIO_READ)
		inode_dio_end(rreq->inode);
}

/*
 * Assess the state of a read request and decide what to do next.
 *
 * Note that we're in normal kernel thread context at this point, possibly
 * running on a workqueue.
 */
static void netfs_rreq_assess(struct netfs_io_request *rreq)
{
	trace_netfs_rreq(rreq, netfs_rreq_trace_assess);

	//netfs_rreq_is_still_valid(rreq);

	if (test_and_clear_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags)) {
		netfs_retry_reads(rreq);
		return;
	}

	if (rreq->origin == NETFS_DIO_READ ||
	    rreq->origin == NETFS_READ_GAPS)
		netfs_rreq_assess_dio(rreq);
	task_io_account_read(rreq->transferred);

	trace_netfs_rreq(rreq, netfs_rreq_trace_wake_ip);
	clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
	wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS);

	trace_netfs_rreq(rreq, netfs_rreq_trace_done);
	netfs_clear_subrequests(rreq, false);
	netfs_unlock_abandoned_read_pages(rreq);
	if (unlikely(test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)))
		netfs_pgpriv2_write_to_the_cache(rreq);
}

void netfs_read_termination_worker(struct work_struct *work)
{
	struct netfs_io_request *rreq =
		container_of(work, struct netfs_io_request, work);
	netfs_see_request(rreq, netfs_rreq_trace_see_work);
	netfs_rreq_assess(rreq);
	netfs_put_request(rreq, false, netfs_rreq_trace_put_work_complete);
}

/*
 * Handle the completion of all outstanding I/O operations on a read request.
 * We inherit a ref from the caller.
 */
void netfs_rreq_terminated(struct netfs_io_request *rreq, bool was_async)
{
	if (!was_async)
		return netfs_rreq_assess(rreq);
	if (!work_pending(&rreq->work)) {
		netfs_get_request(rreq, netfs_rreq_trace_get_work);
		if (!queue_work(system_unbound_wq, &rreq->work))
			netfs_put_request(rreq, was_async, netfs_rreq_trace_put_work_nq);
	}
}

/**
 * netfs_read_subreq_progress - Note progress of a read operation.
 * @subreq: The read request that has made progress.
 * @was_async: True if we're in an asynchronous context.
 *
 * This tells the read side of netfs lib that a contributory I/O operation has
 * made some progress and that it may be possible to unlock some folios.
 *
 * Before calling, the filesystem should update subreq->transferred to track
 * the amount of data copied into the output buffer.
 *
 * If @was_async is true, the caller might be running in softirq or interrupt
 * context and we can't sleep.
 */
void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq,
				bool was_async)
{
	struct netfs_io_request *rreq = subreq->rreq;

	trace_netfs_sreq(subreq, netfs_sreq_trace_progress);

	if (subreq->transferred > subreq->consumed &&
	    (rreq->origin == NETFS_READAHEAD ||
	     rreq->origin == NETFS_READPAGE ||
	     rreq->origin == NETFS_READ_FOR_WRITE)) {
		netfs_consume_read_data(subreq, was_async);
		__clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
	}
}
EXPORT_SYMBOL(netfs_read_subreq_progress);

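/*
 * Example (illustrative sketch, not part of this API): a filesystem that
 * receives read data in chunks might report progress from its transport
 * callback roughly as follows, where chunk_len is a hypothetical count of
 * bytes just copied into the subrequest's buffer:
 *
 *	subreq->transferred += chunk_len;
 *	netfs_read_subreq_progress(subreq, was_async);
 */
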
/**
 * netfs_read_subreq_terminated - Note the termination of an I/O operation.
 * @subreq: The I/O request that has terminated.
 * @error: Error code indicating type of completion.
 * @was_async: The termination was asynchronous
 *
 * This tells the read helper that a contributory I/O operation has terminated,
 * one way or another, and that it should integrate the results.
 *
 * The caller indicates the outcome of the operation through @error, supplying
 * 0 to indicate a successful or retryable transfer (if NETFS_SREQ_NEED_RETRY
 * is set) or a negative error code.  The helper will look after reissuing I/O
 * operations as appropriate and writing downloaded data to the cache.
 *
 * Before calling, the filesystem should update subreq->transferred to track
 * the amount of data copied into the output buffer.
 *
 * If @was_async is true, the caller might be running in softirq or interrupt
 * context and we can't sleep.
 */
void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq,
				  int error, bool was_async)
{
	struct netfs_io_request *rreq = subreq->rreq;

	switch (subreq->source) {
	case NETFS_READ_FROM_CACHE:
		netfs_stat(&netfs_n_rh_read_done);
		break;
	case NETFS_DOWNLOAD_FROM_SERVER:
		netfs_stat(&netfs_n_rh_download_done);
		break;
	default:
		break;
	}

	if (rreq->origin != NETFS_DIO_READ) {
		/* Collect buffered reads.
		 *
		 * If the read completed validly short, then we can clear the
		 * tail before going on to unlock the folios.
		 */
		if (error == 0 && subreq->transferred < subreq->len &&
		    (test_bit(NETFS_SREQ_HIT_EOF, &subreq->flags) ||
		     test_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags))) {
			netfs_clear_unread(subreq);
			subreq->transferred = subreq->len;
			trace_netfs_sreq(subreq, netfs_sreq_trace_clear);
		}
		if (subreq->transferred > subreq->consumed &&
		    (rreq->origin == NETFS_READAHEAD ||
		     rreq->origin == NETFS_READPAGE ||
		     rreq->origin == NETFS_READ_FOR_WRITE)) {
			netfs_consume_read_data(subreq, was_async);
			__clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
		}
		rreq->transferred += subreq->transferred;
	}

	/* Deal with retry requests, short reads and errors.  If we retry
	 * but don't make progress, we abandon the attempt.
	 */
	if (!error && subreq->transferred < subreq->len) {
		if (test_bit(NETFS_SREQ_HIT_EOF, &subreq->flags)) {
			trace_netfs_sreq(subreq, netfs_sreq_trace_hit_eof);
		} else {
			trace_netfs_sreq(subreq, netfs_sreq_trace_short);
			if (subreq->transferred > subreq->consumed) {
				__set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
				__clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
				set_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags);
			} else if (!__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) {
				__set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
				set_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags);
			} else {
				__set_bit(NETFS_SREQ_FAILED, &subreq->flags);
				error = -ENODATA;
			}
		}
	}

	subreq->error = error;
	trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);

	if (unlikely(error < 0)) {
		trace_netfs_failure(rreq, subreq, error, netfs_fail_read);
		if (subreq->source == NETFS_READ_FROM_CACHE) {
			netfs_stat(&netfs_n_rh_read_failed);
		} else {
			netfs_stat(&netfs_n_rh_download_failed);
			set_bit(NETFS_RREQ_FAILED, &rreq->flags);
			rreq->error = subreq->error;
		}
	}

	if (atomic_dec_and_test(&rreq->nr_outstanding))
		netfs_rreq_terminated(rreq, was_async);

	netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated);
}
EXPORT_SYMBOL(netfs_read_subreq_terminated);
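
/*
 * Example (illustrative sketch, not part of this file): a network
 * filesystem's transport completion handler might finish a subrequest
 * roughly as follows, where my_fs_read_done() is a hypothetical handler
 * and result is the number of bytes received or a negative error code:
 *
 *	static void my_fs_read_done(struct netfs_io_subrequest *subreq,
 *				    ssize_t result, bool was_async)
 *	{
 *		if (result > 0)
 *			subreq->transferred += result;
 *		netfs_read_subreq_terminated(subreq,
 *					     result < 0 ? result : 0,
 *					     was_async);
 *	}
 */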