1 // SPDX-License-Identifier: GPL-2.0 1 2 /* 3 * fs/mpage.c 4 * 5 * Copyright (C) 2002, Linus Torvalds. 6 * 7 * Contains functions related to preparing and 8 * multiple pagecache pages. 9 * 10 * 15May2002 Andrew Morton 11 * Initial version 12 * 27Jun2002 axboe@suse.de 13 * use bio_add_page() to build bi 14 */ 15 16 #include <linux/kernel.h> 17 #include <linux/export.h> 18 #include <linux/mm.h> 19 #include <linux/kdev_t.h> 20 #include <linux/gfp.h> 21 #include <linux/bio.h> 22 #include <linux/fs.h> 23 #include <linux/buffer_head.h> 24 #include <linux/blkdev.h> 25 #include <linux/highmem.h> 26 #include <linux/prefetch.h> 27 #include <linux/mpage.h> 28 #include <linux/mm_inline.h> 29 #include <linux/writeback.h> 30 #include <linux/backing-dev.h> 31 #include <linux/pagevec.h> 32 #include "internal.h" 33 34 /* 35 * I/O completion handler for multipage BIOs. 36 * 37 * The mpage code never puts partial pages int 38 * If a page does not map to a contiguous run 39 * back to block_read_full_folio(). 40 * 41 * Why is this? If a page's completion depend 42 * which can complete in any order (or at the 43 * status of that page is hard. See end_buffe 44 * There is no point in duplicating all that c 45 */ 46 static void mpage_read_end_io(struct bio *bio) 47 { 48 struct folio_iter fi; 49 int err = blk_status_to_errno(bio->bi_ 50 51 bio_for_each_folio_all(fi, bio) 52 folio_end_read(fi.folio, err = 53 54 bio_put(bio); 55 } 56 57 static void mpage_write_end_io(struct bio *bio 58 { 59 struct folio_iter fi; 60 int err = blk_status_to_errno(bio->bi_ 61 62 bio_for_each_folio_all(fi, bio) { 63 if (err) 64 mapping_set_error(fi.f 65 folio_end_writeback(fi.folio); 66 } 67 68 bio_put(bio); 69 } 70 71 static struct bio *mpage_bio_submit_read(struc 72 { 73 bio->bi_end_io = mpage_read_end_io; 74 guard_bio_eod(bio); 75 submit_bio(bio); 76 return NULL; 77 } 78 79 static struct bio *mpage_bio_submit_write(stru 80 { 81 bio->bi_end_io = mpage_write_end_io; 82 guard_bio_eod(bio); 83 submit_bio(bio); 84 return NULL; 85 } 86 87 /* 88 * support function for mpage_readahead. The 89 * return an up to date buffer. This is used 90 * the page, which allows read_folio to avoid 91 * to get_block. 92 * 93 * The idea is to avoid adding buffers to page 94 * them. So when the buffer is up to date and 95 * this marks the page up to date instead of a 96 */ 97 static void map_buffer_to_folio(struct folio * 98 int page_block) 99 { 100 struct inode *inode = folio->mapping-> 101 struct buffer_head *page_bh, *head; 102 int block = 0; 103 104 head = folio_buffers(folio); 105 if (!head) { 106 /* 107 * don't make any buffers if t 108 * the folio and the folio jus 109 */ 110 if (inode->i_blkbits == PAGE_S 111 buffer_uptodate(bh)) { 112 folio_mark_uptodate(fo 113 return; 114 } 115 head = create_empty_buffers(fo 116 } 117 118 page_bh = head; 119 do { 120 if (block == page_block) { 121 page_bh->b_state = bh- 122 page_bh->b_bdev = bh-> 123 page_bh->b_blocknr = b 124 break; 125 } 126 page_bh = page_bh->b_this_page 127 block++; 128 } while (page_bh != head); 129 } 130 131 struct mpage_readpage_args { 132 struct bio *bio; 133 struct folio *folio; 134 unsigned int nr_pages; 135 bool is_readahead; 136 sector_t last_block_in_bio; 137 struct buffer_head map_bh; 138 unsigned long first_logical_block; 139 get_block_t *get_block; 140 }; 141 142 /* 143 * This is the worker routine which does all t 144 * blocks and constructs largest possible bios 145 * blocks are not contiguous on the disk. 146 * 147 * We pass a buffer_head back and forth and us 148 * represent the validity of its disk mapping 149 * get_block() call. 150 */ 151 static struct bio *do_mpage_readpage(struct mp 152 { 153 struct folio *folio = args->folio; 154 struct inode *inode = folio->mapping-> 155 const unsigned blkbits = inode->i_blkb 156 const unsigned blocks_per_page = PAGE_ 157 const unsigned blocksize = 1 << blkbit 158 struct buffer_head *map_bh = &args->ma 159 sector_t block_in_file; 160 sector_t last_block; 161 sector_t last_block_in_file; 162 sector_t first_block; 163 unsigned page_block; 164 unsigned first_hole = blocks_per_page; 165 struct block_device *bdev = NULL; 166 int length; 167 int fully_mapped = 1; 168 blk_opf_t opf = REQ_OP_READ; 169 unsigned nblocks; 170 unsigned relative_block; 171 gfp_t gfp = mapping_gfp_constraint(fol 172 173 /* MAX_BUF_PER_PAGE, for example */ 174 VM_BUG_ON_FOLIO(folio_test_large(folio 175 176 if (args->is_readahead) { 177 opf |= REQ_RAHEAD; 178 gfp |= __GFP_NORETRY | __GFP_N 179 } 180 181 if (folio_buffers(folio)) 182 goto confused; 183 184 block_in_file = (sector_t)folio->index 185 last_block = block_in_file + args->nr_ 186 last_block_in_file = (i_size_read(inod 187 if (last_block > last_block_in_file) 188 last_block = last_block_in_fil 189 page_block = 0; 190 191 /* 192 * Map blocks using the result from th 193 */ 194 nblocks = map_bh->b_size >> blkbits; 195 if (buffer_mapped(map_bh) && 196 block_in_file > args-> 197 block_in_file < (args- 198 unsigned map_offset = block_in 199 unsigned last = nblocks - map_ 200 201 first_block = map_bh->b_blockn 202 for (relative_block = 0; ; rel 203 if (relative_block == 204 clear_buffer_m 205 break; 206 } 207 if (page_block == bloc 208 break; 209 page_block++; 210 block_in_file++; 211 } 212 bdev = map_bh->b_bdev; 213 } 214 215 /* 216 * Then do more get_blocks calls until 217 */ 218 map_bh->b_folio = folio; 219 while (page_block < blocks_per_page) { 220 map_bh->b_state = 0; 221 map_bh->b_size = 0; 222 223 if (block_in_file < last_block 224 map_bh->b_size = (last 225 if (args->get_block(in 226 goto confused; 227 args->first_logical_bl 228 } 229 230 if (!buffer_mapped(map_bh)) { 231 fully_mapped = 0; 232 if (first_hole == bloc 233 first_hole = p 234 page_block++; 235 block_in_file++; 236 continue; 237 } 238 239 /* some filesystems will copy 240 * the get_block call, in whic 241 * read it again. map_buffer_ 242 * we just collected from get_ 243 * so read_folio doesn't have 244 */ 245 if (buffer_uptodate(map_bh)) { 246 map_buffer_to_folio(fo 247 goto confused; 248 } 249 250 if (first_hole != blocks_per_p 251 goto confused; 252 253 /* Contiguous blocks? */ 254 if (!page_block) 255 first_block = map_bh-> 256 else if (first_block + page_bl 257 goto confused; 258 nblocks = map_bh->b_size >> bl 259 for (relative_block = 0; ; rel 260 if (relative_block == 261 clear_buffer_m 262 break; 263 } else if (page_block 264 break; 265 page_block++; 266 block_in_file++; 267 } 268 bdev = map_bh->b_bdev; 269 } 270 271 if (first_hole != blocks_per_page) { 272 folio_zero_segment(folio, firs 273 if (first_hole == 0) { 274 folio_mark_uptodate(fo 275 folio_unlock(folio); 276 goto out; 277 } 278 } else if (fully_mapped) { 279 folio_set_mappedtodisk(folio); 280 } 281 282 /* 283 * This folio will go to BIO. Do we n 284 */ 285 if (args->bio && (args->last_block_in_ 286 args->bio = mpage_bio_submit_r 287 288 alloc_new: 289 if (args->bio == NULL) { 290 args->bio = bio_alloc(bdev, bi 291 gfp); 292 if (args->bio == NULL) 293 goto confused; 294 args->bio->bi_iter.bi_sector = 295 } 296 297 length = first_hole << blkbits; 298 if (!bio_add_folio(args->bio, folio, l 299 args->bio = mpage_bio_submit_r 300 goto alloc_new; 301 } 302 303 relative_block = block_in_file - args- 304 nblocks = map_bh->b_size >> blkbits; 305 if ((buffer_boundary(map_bh) && relati 306 (first_hole != blocks_per_page)) 307 args->bio = mpage_bio_submit_r 308 else 309 args->last_block_in_bio = firs 310 out: 311 return args->bio; 312 313 confused: 314 if (args->bio) 315 args->bio = mpage_bio_submit_r 316 if (!folio_test_uptodate(folio)) 317 block_read_full_folio(folio, a 318 else 319 folio_unlock(folio); 320 goto out; 321 } 322 323 /** 324 * mpage_readahead - start reads against pages 325 * @rac: Describes which pages to read. 326 * @get_block: The filesystem's block mapper f 327 * 328 * This function walks the pages and the block 329 * emitting large BIOs. 330 * 331 * If anything unusual happens, such as: 332 * 333 * - encountering a page which has buffers 334 * - encountering a page which has a non-hole 335 * - encountering a page with non-contiguous b 336 * 337 * then this code just gives up and calls the 338 * It does handle a page which has holes at th 339 * the end-of-file on blocksize < PAGE_SIZE se 340 * 341 * BH_Boundary explanation: 342 * 343 * There is a problem. The mpage read code as 344 * their disk mappings, and then submits them 345 * the disk mappings may require I/O. Reads o 346 * 347 * So an mpage read of the first 16 blocks of 348 * submitted in the following order: 349 * 350 * 12 0 1 2 3 4 5 6 7 8 9 10 11 13 14 15 351 * 352 * because the indirect block has to be read t 353 * 13,14,15,16. Obviously, this impacts perfo 354 * 355 * So what we do it to allow the filesystem's 356 * BH_Boundary when it maps block 11. BH_Boun 357 * after this one will require I/O against a b 358 * this one. So you should push what I/O you 359 * 360 * This all causes the disk requests to be iss 361 */ 362 void mpage_readahead(struct readahead_control 363 { 364 struct folio *folio; 365 struct mpage_readpage_args args = { 366 .get_block = get_block, 367 .is_readahead = true, 368 }; 369 370 while ((folio = readahead_folio(rac))) 371 prefetchw(&folio->flags); 372 args.folio = folio; 373 args.nr_pages = readahead_coun 374 args.bio = do_mpage_readpage(& 375 } 376 if (args.bio) 377 mpage_bio_submit_read(args.bio 378 } 379 EXPORT_SYMBOL(mpage_readahead); 380 381 /* 382 * This isn't called much at all 383 */ 384 int mpage_read_folio(struct folio *folio, get_ 385 { 386 struct mpage_readpage_args args = { 387 .folio = folio, 388 .nr_pages = 1, 389 .get_block = get_block, 390 }; 391 392 args.bio = do_mpage_readpage(&args); 393 if (args.bio) 394 mpage_bio_submit_read(args.bio 395 return 0; 396 } 397 EXPORT_SYMBOL(mpage_read_folio); 398 399 /* 400 * Writing is not so simple. 401 * 402 * If the page has buffers then they will be u 403 * mapping. We only support pages which are f 404 * special case for pages which are unmapped a 405 * 406 * If the page has no buffers (preferred) then 407 * 408 * If all blocks are found to be contiguous th 409 * BIO. Otherwise fall back to the mapping's 410 * 411 * FIXME: This code wants an estimate of how m 412 * written, so it can intelligently allocate a 413 * just allocate full-size (16-page) BIOs. 414 */ 415 416 struct mpage_data { 417 struct bio *bio; 418 sector_t last_block_in_bio; 419 get_block_t *get_block; 420 }; 421 422 /* 423 * We have our BIO, so we can now mark the buf 424 * sure to only clean buffers which we know we 425 */ 426 static void clean_buffers(struct folio *folio, 427 { 428 unsigned buffer_counter = 0; 429 struct buffer_head *bh, *head = folio_ 430 431 if (!head) 432 return; 433 bh = head; 434 435 do { 436 if (buffer_counter++ == first_ 437 break; 438 clear_buffer_dirty(bh); 439 bh = bh->b_this_page; 440 } while (bh != head); 441 442 /* 443 * we cannot drop the bh if the page i 444 * read_folio would fail to serialize 445 * disk before we reach the platter. 446 */ 447 if (buffer_heads_over_limit && folio_t 448 try_to_free_buffers(folio); 449 } 450 451 static int __mpage_writepage(struct folio *fol 452 void *data) 453 { 454 struct mpage_data *mpd = data; 455 struct bio *bio = mpd->bio; 456 struct address_space *mapping = folio- 457 struct inode *inode = mapping->host; 458 const unsigned blkbits = inode->i_blkb 459 const unsigned blocks_per_page = PAGE_ 460 sector_t last_block; 461 sector_t block_in_file; 462 sector_t first_block; 463 unsigned page_block; 464 unsigned first_unmapped = blocks_per_p 465 struct block_device *bdev = NULL; 466 int boundary = 0; 467 sector_t boundary_block = 0; 468 struct block_device *boundary_bdev = N 469 size_t length; 470 struct buffer_head map_bh; 471 loff_t i_size = i_size_read(inode); 472 int ret = 0; 473 struct buffer_head *head = folio_buffe 474 475 if (head) { 476 struct buffer_head *bh = head; 477 478 /* If they're all mapped and d 479 page_block = 0; 480 do { 481 BUG_ON(buffer_locked(b 482 if (!buffer_mapped(bh) 483 /* 484 * unmapped di 485 * block_dirty 486 */ 487 if (buffer_dir 488 goto c 489 if (first_unma 490 first_ 491 continue; 492 } 493 494 if (first_unmapped != 495 goto confused; 496 497 if (!buffer_dirty(bh) 498 goto confused; 499 if (page_block) { 500 if (bh->b_bloc 501 goto c 502 } else { 503 first_block = 504 } 505 page_block++; 506 boundary = buffer_boun 507 if (boundary) { 508 boundary_block 509 boundary_bdev 510 } 511 bdev = bh->b_bdev; 512 } while ((bh = bh->b_this_page 513 514 if (first_unmapped) 515 goto page_is_mapped; 516 517 /* 518 * Page has buffers, but they 519 * created by pagein or read o 520 * block_read_full_folio(). I 521 * using mpage_readahead then 522 */ 523 goto confused; 524 } 525 526 /* 527 * The page has no buffers: map it to 528 */ 529 BUG_ON(!folio_test_uptodate(folio)); 530 block_in_file = (sector_t)folio->index 531 /* 532 * Whole page beyond EOF? Skip allocat 533 * space. 534 */ 535 if (block_in_file >= (i_size + (1 << b 536 goto page_is_mapped; 537 last_block = (i_size - 1) >> blkbits; 538 map_bh.b_folio = folio; 539 for (page_block = 0; page_block < bloc 540 541 map_bh.b_state = 0; 542 map_bh.b_size = 1 << blkbits; 543 if (mpd->get_block(inode, bloc 544 goto confused; 545 if (!buffer_mapped(&map_bh)) 546 goto confused; 547 if (buffer_new(&map_bh)) 548 clean_bdev_bh_alias(&m 549 if (buffer_boundary(&map_bh)) 550 boundary_block = map_b 551 boundary_bdev = map_bh 552 } 553 if (page_block) { 554 if (map_bh.b_blocknr ! 555 goto confused; 556 } else { 557 first_block = map_bh.b 558 } 559 page_block++; 560 boundary = buffer_boundary(&ma 561 bdev = map_bh.b_bdev; 562 if (block_in_file == last_bloc 563 break; 564 block_in_file++; 565 } 566 BUG_ON(page_block == 0); 567 568 first_unmapped = page_block; 569 570 page_is_mapped: 571 /* Don't bother writing beyond EOF, tr 572 if (folio_pos(folio) >= i_size) 573 goto confused; 574 length = folio_size(folio); 575 if (folio_pos(folio) + length > i_size 576 /* 577 * The page straddles i_size. 578 * and every writepage invocat 579 * "A file is mapped in multip 580 * that is not a multiple of t 581 * is zeroed when mapped, and 582 * written out to the file." 583 */ 584 length = i_size - folio_pos(fo 585 folio_zero_segment(folio, leng 586 } 587 588 /* 589 * This page will go to BIO. Do we ne 590 */ 591 if (bio && mpd->last_block_in_bio != f 592 bio = mpage_bio_submit_write(b 593 594 alloc_new: 595 if (bio == NULL) { 596 bio = bio_alloc(bdev, BIO_MAX_ 597 REQ_OP_WRITE | 598 GFP_NOFS); 599 bio->bi_iter.bi_sector = first 600 wbc_init_bio(wbc, bio); 601 bio->bi_write_hint = inode->i_ 602 } 603 604 /* 605 * Must try to add the page before mar 606 * the confused fail path above (OOM) 607 * it finds all bh marked clean (i.e. 608 */ 609 wbc_account_cgroup_owner(wbc, &folio-> 610 length = first_unmapped << blkbits; 611 if (!bio_add_folio(bio, folio, length, 612 bio = mpage_bio_submit_write(b 613 goto alloc_new; 614 } 615 616 clean_buffers(folio, first_unmapped); 617 618 BUG_ON(folio_test_writeback(folio)); 619 folio_start_writeback(folio); 620 folio_unlock(folio); 621 if (boundary || (first_unmapped != blo 622 bio = mpage_bio_submit_write(b 623 if (boundary_block) { 624 write_boundary_block(b 625 bounda 626 } 627 } else { 628 mpd->last_block_in_bio = first 629 } 630 goto out; 631 632 confused: 633 if (bio) 634 bio = mpage_bio_submit_write(b 635 636 /* 637 * The caller has a ref on the inode, 638 */ 639 ret = block_write_full_folio(folio, wb 640 mapping_set_error(mapping, ret); 641 out: 642 mpd->bio = bio; 643 return ret; 644 } 645 646 /** 647 * mpage_writepages - walk the list of dirty p 648 * @mapping: address space structure to write 649 * @wbc: subtract the number of written pages 650 * @get_block: the filesystem's block mapper f 651 * 652 * This is a library function, which implement 653 * address_space_operation. 654 */ 655 int 656 mpage_writepages(struct address_space *mapping 657 struct writeback_control *wbc, 658 { 659 struct mpage_data mpd = { 660 .get_block = get_block, 661 }; 662 struct blk_plug plug; 663 int ret; 664 665 blk_start_plug(&plug); 666 ret = write_cache_pages(mapping, wbc, 667 if (mpd.bio) 668 mpage_bio_submit_write(mpd.bio 669 blk_finish_plug(&plug); 670 return ret; 671 } 672 EXPORT_SYMBOL(mpage_writepages); 673
Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.