1 // SPDX-License-Identifier: GPL-2.0 1 // SPDX-License-Identifier: GPL-2.0 2 /* 2 /* 3 * fs/mpage.c 3 * fs/mpage.c 4 * 4 * 5 * Copyright (C) 2002, Linus Torvalds. 5 * Copyright (C) 2002, Linus Torvalds. 6 * 6 * 7 * Contains functions related to preparing and 7 * Contains functions related to preparing and submitting BIOs which contain 8 * multiple pagecache pages. 8 * multiple pagecache pages. 9 * 9 * 10 * 15May2002 Andrew Morton 10 * 15May2002 Andrew Morton 11 * Initial version 11 * Initial version 12 * 27Jun2002 axboe@suse.de 12 * 27Jun2002 axboe@suse.de 13 * use bio_add_page() to build bi 13 * use bio_add_page() to build bio's just the right size 14 */ 14 */ 15 15 16 #include <linux/kernel.h> 16 #include <linux/kernel.h> 17 #include <linux/export.h> 17 #include <linux/export.h> 18 #include <linux/mm.h> 18 #include <linux/mm.h> 19 #include <linux/kdev_t.h> 19 #include <linux/kdev_t.h> 20 #include <linux/gfp.h> 20 #include <linux/gfp.h> 21 #include <linux/bio.h> 21 #include <linux/bio.h> 22 #include <linux/fs.h> 22 #include <linux/fs.h> 23 #include <linux/buffer_head.h> 23 #include <linux/buffer_head.h> 24 #include <linux/blkdev.h> 24 #include <linux/blkdev.h> 25 #include <linux/highmem.h> 25 #include <linux/highmem.h> 26 #include <linux/prefetch.h> 26 #include <linux/prefetch.h> 27 #include <linux/mpage.h> 27 #include <linux/mpage.h> 28 #include <linux/mm_inline.h> 28 #include <linux/mm_inline.h> 29 #include <linux/writeback.h> 29 #include <linux/writeback.h> 30 #include <linux/backing-dev.h> 30 #include <linux/backing-dev.h> 31 #include <linux/pagevec.h> 31 #include <linux/pagevec.h> >> 32 #include <linux/cleancache.h> 32 #include "internal.h" 33 #include "internal.h" 33 34 34 /* 35 /* 35 * I/O completion handler for multipage BIOs. 36 * I/O completion handler for multipage BIOs. 36 * 37 * 37 * The mpage code never puts partial pages int 38 * The mpage code never puts partial pages into a BIO (except for end-of-file). 38 * If a page does not map to a contiguous run 39 * If a page does not map to a contiguous run of blocks then it simply falls 39 * back to block_read_full_folio(). !! 40 * back to block_read_full_page(). 40 * 41 * 41 * Why is this? If a page's completion depend 42 * Why is this? If a page's completion depends on a number of different BIOs 42 * which can complete in any order (or at the 43 * which can complete in any order (or at the same time) then determining the 43 * status of that page is hard. See end_buffe 44 * status of that page is hard. See end_buffer_async_read() for the details. 44 * There is no point in duplicating all that c 45 * There is no point in duplicating all that complexity. 45 */ 46 */ 46 static void mpage_read_end_io(struct bio *bio) !! 47 static void mpage_end_io(struct bio *bio) 47 { 48 { 48 struct folio_iter fi; !! 49 struct bio_vec *bv; 49 int err = blk_status_to_errno(bio->bi_ !! 50 int i; 50 51 51 bio_for_each_folio_all(fi, bio) !! 52 bio_for_each_segment_all(bv, bio, i) { 52 folio_end_read(fi.folio, err = !! 53 struct page *page = bv->bv_page; 53 !! 54 page_endio(page, bio_op(bio), 54 bio_put(bio); !! 55 blk_status_to_errno(bio->bi_status)); 55 } << 56 << 57 static void mpage_write_end_io(struct bio *bio << 58 { << 59 struct folio_iter fi; << 60 int err = blk_status_to_errno(bio->bi_ << 61 << 62 bio_for_each_folio_all(fi, bio) { << 63 if (err) << 64 mapping_set_error(fi.f << 65 folio_end_writeback(fi.folio); << 66 } 56 } 67 57 68 bio_put(bio); 58 bio_put(bio); 69 } 59 } 70 60 71 static struct bio *mpage_bio_submit_read(struc !! 61 static struct bio *mpage_bio_submit(int op, int op_flags, struct bio *bio) 72 { 62 { 73 bio->bi_end_io = mpage_read_end_io; !! 63 bio->bi_end_io = mpage_end_io; 74 guard_bio_eod(bio); !! 64 bio_set_op_attrs(bio, op, op_flags); >> 65 guard_bio_eod(op, bio); 75 submit_bio(bio); 66 submit_bio(bio); 76 return NULL; 67 return NULL; 77 } 68 } 78 69 79 static struct bio *mpage_bio_submit_write(stru !! 70 static struct bio * >> 71 mpage_alloc(struct block_device *bdev, >> 72 sector_t first_sector, int nr_vecs, >> 73 gfp_t gfp_flags) 80 { 74 { 81 bio->bi_end_io = mpage_write_end_io; !! 75 struct bio *bio; 82 guard_bio_eod(bio); !! 76 83 submit_bio(bio); !! 77 /* Restrict the given (page cache) mask for slab allocations */ 84 return NULL; !! 78 gfp_flags &= GFP_KERNEL; >> 79 bio = bio_alloc(gfp_flags, nr_vecs); >> 80 >> 81 if (bio == NULL && (current->flags & PF_MEMALLOC)) { >> 82 while (!bio && (nr_vecs /= 2)) >> 83 bio = bio_alloc(gfp_flags, nr_vecs); >> 84 } >> 85 >> 86 if (bio) { >> 87 bio_set_dev(bio, bdev); >> 88 bio->bi_iter.bi_sector = first_sector; >> 89 } >> 90 return bio; 85 } 91 } 86 92 87 /* 93 /* 88 * support function for mpage_readahead. The !! 94 * support function for mpage_readpages. The fs supplied get_block might 89 * return an up to date buffer. This is used 95 * return an up to date buffer. This is used to map that buffer into 90 * the page, which allows read_folio to avoid !! 96 * the page, which allows readpage to avoid triggering a duplicate call 91 * to get_block. 97 * to get_block. 92 * 98 * 93 * The idea is to avoid adding buffers to page 99 * The idea is to avoid adding buffers to pages that don't already have 94 * them. So when the buffer is up to date and 100 * them. So when the buffer is up to date and the page size == block size, 95 * this marks the page up to date instead of a 101 * this marks the page up to date instead of adding new buffers. 96 */ 102 */ 97 static void map_buffer_to_folio(struct folio * !! 103 static void 98 int page_block) !! 104 map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block) 99 { 105 { 100 struct inode *inode = folio->mapping-> !! 106 struct inode *inode = page->mapping->host; 101 struct buffer_head *page_bh, *head; 107 struct buffer_head *page_bh, *head; 102 int block = 0; 108 int block = 0; 103 109 104 head = folio_buffers(folio); !! 110 if (!page_has_buffers(page)) { 105 if (!head) { << 106 /* 111 /* 107 * don't make any buffers if t 112 * don't make any buffers if there is only one buffer on 108 * the folio and the folio jus !! 113 * the page and the page just needs to be set up to date 109 */ 114 */ 110 if (inode->i_blkbits == PAGE_S 115 if (inode->i_blkbits == PAGE_SHIFT && 111 buffer_uptodate(bh)) { 116 buffer_uptodate(bh)) { 112 folio_mark_uptodate(fo !! 117 SetPageUptodate(page); 113 return; 118 return; 114 } 119 } 115 head = create_empty_buffers(fo !! 120 create_empty_buffers(page, i_blocksize(inode), 0); 116 } 121 } 117 !! 122 head = page_buffers(page); 118 page_bh = head; 123 page_bh = head; 119 do { 124 do { 120 if (block == page_block) { 125 if (block == page_block) { 121 page_bh->b_state = bh- 126 page_bh->b_state = bh->b_state; 122 page_bh->b_bdev = bh-> 127 page_bh->b_bdev = bh->b_bdev; 123 page_bh->b_blocknr = b 128 page_bh->b_blocknr = bh->b_blocknr; 124 break; 129 break; 125 } 130 } 126 page_bh = page_bh->b_this_page 131 page_bh = page_bh->b_this_page; 127 block++; 132 block++; 128 } while (page_bh != head); 133 } while (page_bh != head); 129 } 134 } 130 135 131 struct mpage_readpage_args { 136 struct mpage_readpage_args { 132 struct bio *bio; 137 struct bio *bio; 133 struct folio *folio; !! 138 struct page *page; 134 unsigned int nr_pages; 139 unsigned int nr_pages; 135 bool is_readahead; 140 bool is_readahead; 136 sector_t last_block_in_bio; 141 sector_t last_block_in_bio; 137 struct buffer_head map_bh; 142 struct buffer_head map_bh; 138 unsigned long first_logical_block; 143 unsigned long first_logical_block; 139 get_block_t *get_block; 144 get_block_t *get_block; 140 }; 145 }; 141 146 142 /* 147 /* 143 * This is the worker routine which does all t 148 * This is the worker routine which does all the work of mapping the disk 144 * blocks and constructs largest possible bios 149 * blocks and constructs largest possible bios, submits them for IO if the 145 * blocks are not contiguous on the disk. 150 * blocks are not contiguous on the disk. 146 * 151 * 147 * We pass a buffer_head back and forth and us 152 * We pass a buffer_head back and forth and use its buffer_mapped() flag to 148 * represent the validity of its disk mapping 153 * represent the validity of its disk mapping and to decide when to do the next 149 * get_block() call. 154 * get_block() call. 150 */ 155 */ 151 static struct bio *do_mpage_readpage(struct mp 156 static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) 152 { 157 { 153 struct folio *folio = args->folio; !! 158 struct page *page = args->page; 154 struct inode *inode = folio->mapping-> !! 159 struct inode *inode = page->mapping->host; 155 const unsigned blkbits = inode->i_blkb 160 const unsigned blkbits = inode->i_blkbits; 156 const unsigned blocks_per_page = PAGE_ 161 const unsigned blocks_per_page = PAGE_SIZE >> blkbits; 157 const unsigned blocksize = 1 << blkbit 162 const unsigned blocksize = 1 << blkbits; 158 struct buffer_head *map_bh = &args->ma 163 struct buffer_head *map_bh = &args->map_bh; 159 sector_t block_in_file; 164 sector_t block_in_file; 160 sector_t last_block; 165 sector_t last_block; 161 sector_t last_block_in_file; 166 sector_t last_block_in_file; 162 sector_t first_block; !! 167 sector_t blocks[MAX_BUF_PER_PAGE]; 163 unsigned page_block; 168 unsigned page_block; 164 unsigned first_hole = blocks_per_page; 169 unsigned first_hole = blocks_per_page; 165 struct block_device *bdev = NULL; 170 struct block_device *bdev = NULL; 166 int length; 171 int length; 167 int fully_mapped = 1; 172 int fully_mapped = 1; 168 blk_opf_t opf = REQ_OP_READ; !! 173 int op_flags; 169 unsigned nblocks; 174 unsigned nblocks; 170 unsigned relative_block; 175 unsigned relative_block; 171 gfp_t gfp = mapping_gfp_constraint(fol !! 176 gfp_t gfp; 172 << 173 /* MAX_BUF_PER_PAGE, for example */ << 174 VM_BUG_ON_FOLIO(folio_test_large(folio << 175 177 176 if (args->is_readahead) { 178 if (args->is_readahead) { 177 opf |= REQ_RAHEAD; !! 179 op_flags = REQ_RAHEAD; 178 gfp |= __GFP_NORETRY | __GFP_N !! 180 gfp = readahead_gfp_mask(page->mapping); >> 181 } else { >> 182 op_flags = 0; >> 183 gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL); 179 } 184 } 180 185 181 if (folio_buffers(folio)) !! 186 if (page_has_buffers(page)) 182 goto confused; 187 goto confused; 183 188 184 block_in_file = (sector_t)folio->index !! 189 block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits); 185 last_block = block_in_file + args->nr_ 190 last_block = block_in_file + args->nr_pages * blocks_per_page; 186 last_block_in_file = (i_size_read(inod 191 last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits; 187 if (last_block > last_block_in_file) 192 if (last_block > last_block_in_file) 188 last_block = last_block_in_fil 193 last_block = last_block_in_file; 189 page_block = 0; 194 page_block = 0; 190 195 191 /* 196 /* 192 * Map blocks using the result from th 197 * Map blocks using the result from the previous get_blocks call first. 193 */ 198 */ 194 nblocks = map_bh->b_size >> blkbits; 199 nblocks = map_bh->b_size >> blkbits; 195 if (buffer_mapped(map_bh) && 200 if (buffer_mapped(map_bh) && 196 block_in_file > args-> 201 block_in_file > args->first_logical_block && 197 block_in_file < (args- 202 block_in_file < (args->first_logical_block + nblocks)) { 198 unsigned map_offset = block_in 203 unsigned map_offset = block_in_file - args->first_logical_block; 199 unsigned last = nblocks - map_ 204 unsigned last = nblocks - map_offset; 200 205 201 first_block = map_bh->b_blockn << 202 for (relative_block = 0; ; rel 206 for (relative_block = 0; ; relative_block++) { 203 if (relative_block == 207 if (relative_block == last) { 204 clear_buffer_m 208 clear_buffer_mapped(map_bh); 205 break; 209 break; 206 } 210 } 207 if (page_block == bloc 211 if (page_block == blocks_per_page) 208 break; 212 break; >> 213 blocks[page_block] = map_bh->b_blocknr + map_offset + >> 214 relative_block; 209 page_block++; 215 page_block++; 210 block_in_file++; 216 block_in_file++; 211 } 217 } 212 bdev = map_bh->b_bdev; 218 bdev = map_bh->b_bdev; 213 } 219 } 214 220 215 /* 221 /* 216 * Then do more get_blocks calls until !! 222 * Then do more get_blocks calls until we are done with this page. 217 */ 223 */ 218 map_bh->b_folio = folio; !! 224 map_bh->b_page = page; 219 while (page_block < blocks_per_page) { 225 while (page_block < blocks_per_page) { 220 map_bh->b_state = 0; 226 map_bh->b_state = 0; 221 map_bh->b_size = 0; 227 map_bh->b_size = 0; 222 228 223 if (block_in_file < last_block 229 if (block_in_file < last_block) { 224 map_bh->b_size = (last 230 map_bh->b_size = (last_block-block_in_file) << blkbits; 225 if (args->get_block(in 231 if (args->get_block(inode, block_in_file, map_bh, 0)) 226 goto confused; 232 goto confused; 227 args->first_logical_bl 233 args->first_logical_block = block_in_file; 228 } 234 } 229 235 230 if (!buffer_mapped(map_bh)) { 236 if (!buffer_mapped(map_bh)) { 231 fully_mapped = 0; 237 fully_mapped = 0; 232 if (first_hole == bloc 238 if (first_hole == blocks_per_page) 233 first_hole = p 239 first_hole = page_block; 234 page_block++; 240 page_block++; 235 block_in_file++; 241 block_in_file++; 236 continue; 242 continue; 237 } 243 } 238 244 239 /* some filesystems will copy 245 /* some filesystems will copy data into the page during 240 * the get_block call, in whic 246 * the get_block call, in which case we don't want to 241 * read it again. map_buffer_ !! 247 * read it again. map_buffer_to_page copies the data 242 * we just collected from get_ !! 248 * we just collected from get_block into the page's buffers 243 * so read_folio doesn't have !! 249 * so readpage doesn't have to repeat the get_block call 244 */ 250 */ 245 if (buffer_uptodate(map_bh)) { 251 if (buffer_uptodate(map_bh)) { 246 map_buffer_to_folio(fo !! 252 map_buffer_to_page(page, map_bh, page_block); 247 goto confused; 253 goto confused; 248 } 254 } 249 255 250 if (first_hole != blocks_per_p 256 if (first_hole != blocks_per_page) 251 goto confused; 257 goto confused; /* hole -> non-hole */ 252 258 253 /* Contiguous blocks? */ 259 /* Contiguous blocks? */ 254 if (!page_block) !! 260 if (page_block && blocks[page_block-1] != map_bh->b_blocknr-1) 255 first_block = map_bh-> << 256 else if (first_block + page_bl << 257 goto confused; 261 goto confused; 258 nblocks = map_bh->b_size >> bl 262 nblocks = map_bh->b_size >> blkbits; 259 for (relative_block = 0; ; rel 263 for (relative_block = 0; ; relative_block++) { 260 if (relative_block == 264 if (relative_block == nblocks) { 261 clear_buffer_m 265 clear_buffer_mapped(map_bh); 262 break; 266 break; 263 } else if (page_block 267 } else if (page_block == blocks_per_page) 264 break; 268 break; >> 269 blocks[page_block] = map_bh->b_blocknr+relative_block; 265 page_block++; 270 page_block++; 266 block_in_file++; 271 block_in_file++; 267 } 272 } 268 bdev = map_bh->b_bdev; 273 bdev = map_bh->b_bdev; 269 } 274 } 270 275 271 if (first_hole != blocks_per_page) { 276 if (first_hole != blocks_per_page) { 272 folio_zero_segment(folio, firs !! 277 zero_user_segment(page, first_hole << blkbits, PAGE_SIZE); 273 if (first_hole == 0) { 278 if (first_hole == 0) { 274 folio_mark_uptodate(fo !! 279 SetPageUptodate(page); 275 folio_unlock(folio); !! 280 unlock_page(page); 276 goto out; 281 goto out; 277 } 282 } 278 } else if (fully_mapped) { 283 } else if (fully_mapped) { 279 folio_set_mappedtodisk(folio); !! 284 SetPageMappedToDisk(page); >> 285 } >> 286 >> 287 if (fully_mapped && blocks_per_page == 1 && !PageUptodate(page) && >> 288 cleancache_get_page(page) == 0) { >> 289 SetPageUptodate(page); >> 290 goto confused; 280 } 291 } 281 292 282 /* 293 /* 283 * This folio will go to BIO. Do we n !! 294 * This page will go to BIO. Do we need to send this BIO off first? 284 */ 295 */ 285 if (args->bio && (args->last_block_in_ !! 296 if (args->bio && (args->last_block_in_bio != blocks[0] - 1)) 286 args->bio = mpage_bio_submit_r !! 297 args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio); 287 298 288 alloc_new: 299 alloc_new: 289 if (args->bio == NULL) { 300 if (args->bio == NULL) { 290 args->bio = bio_alloc(bdev, bi !! 301 if (first_hole == blocks_per_page) { 291 gfp); !! 302 if (!bdev_read_page(bdev, blocks[0] << (blkbits - 9), >> 303 page)) >> 304 goto out; >> 305 } >> 306 args->bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), >> 307 min_t(int, args->nr_pages, >> 308 BIO_MAX_PAGES), >> 309 gfp); 292 if (args->bio == NULL) 310 if (args->bio == NULL) 293 goto confused; 311 goto confused; 294 args->bio->bi_iter.bi_sector = << 295 } 312 } 296 313 297 length = first_hole << blkbits; 314 length = first_hole << blkbits; 298 if (!bio_add_folio(args->bio, folio, l !! 315 if (bio_add_page(args->bio, page, length, 0) < length) { 299 args->bio = mpage_bio_submit_r !! 316 args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio); 300 goto alloc_new; 317 goto alloc_new; 301 } 318 } 302 319 303 relative_block = block_in_file - args- 320 relative_block = block_in_file - args->first_logical_block; 304 nblocks = map_bh->b_size >> blkbits; 321 nblocks = map_bh->b_size >> blkbits; 305 if ((buffer_boundary(map_bh) && relati 322 if ((buffer_boundary(map_bh) && relative_block == nblocks) || 306 (first_hole != blocks_per_page)) 323 (first_hole != blocks_per_page)) 307 args->bio = mpage_bio_submit_r !! 324 args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio); 308 else 325 else 309 args->last_block_in_bio = firs !! 326 args->last_block_in_bio = blocks[blocks_per_page - 1]; 310 out: 327 out: 311 return args->bio; 328 return args->bio; 312 329 313 confused: 330 confused: 314 if (args->bio) 331 if (args->bio) 315 args->bio = mpage_bio_submit_r !! 332 args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio); 316 if (!folio_test_uptodate(folio)) !! 333 if (!PageUptodate(page)) 317 block_read_full_folio(folio, a !! 334 block_read_full_page(page, args->get_block); 318 else 335 else 319 folio_unlock(folio); !! 336 unlock_page(page); 320 goto out; 337 goto out; 321 } 338 } 322 339 323 /** 340 /** 324 * mpage_readahead - start reads against pages !! 341 * mpage_readpages - populate an address space with some pages & start reads against them 325 * @rac: Describes which pages to read. !! 342 * @mapping: the address_space >> 343 * @pages: The address of a list_head which contains the target pages. These >> 344 * pages have their ->index populated and are otherwise uninitialised. >> 345 * The page at @pages->prev has the lowest file offset, and reads should be >> 346 * issued in @pages->prev to @pages->next order. >> 347 * @nr_pages: The number of pages at *@pages 326 * @get_block: The filesystem's block mapper f 348 * @get_block: The filesystem's block mapper function. 327 * 349 * 328 * This function walks the pages and the block 350 * This function walks the pages and the blocks within each page, building and 329 * emitting large BIOs. 351 * emitting large BIOs. 330 * 352 * 331 * If anything unusual happens, such as: 353 * If anything unusual happens, such as: 332 * 354 * 333 * - encountering a page which has buffers 355 * - encountering a page which has buffers 334 * - encountering a page which has a non-hole 356 * - encountering a page which has a non-hole after a hole 335 * - encountering a page with non-contiguous b 357 * - encountering a page with non-contiguous blocks 336 * 358 * 337 * then this code just gives up and calls the 359 * then this code just gives up and calls the buffer_head-based read function. 338 * It does handle a page which has holes at th 360 * It does handle a page which has holes at the end - that is a common case: 339 * the end-of-file on blocksize < PAGE_SIZE se 361 * the end-of-file on blocksize < PAGE_SIZE setups. 340 * 362 * 341 * BH_Boundary explanation: 363 * BH_Boundary explanation: 342 * 364 * 343 * There is a problem. The mpage read code as 365 * There is a problem. The mpage read code assembles several pages, gets all 344 * their disk mappings, and then submits them 366 * their disk mappings, and then submits them all. That's fine, but obtaining 345 * the disk mappings may require I/O. Reads o 367 * the disk mappings may require I/O. Reads of indirect blocks, for example. 346 * 368 * 347 * So an mpage read of the first 16 blocks of 369 * So an mpage read of the first 16 blocks of an ext2 file will cause I/O to be 348 * submitted in the following order: 370 * submitted in the following order: 349 * 371 * 350 * 12 0 1 2 3 4 5 6 7 8 9 10 11 13 14 15 372 * 12 0 1 2 3 4 5 6 7 8 9 10 11 13 14 15 16 351 * 373 * 352 * because the indirect block has to be read t 374 * because the indirect block has to be read to get the mappings of blocks 353 * 13,14,15,16. Obviously, this impacts perfo 375 * 13,14,15,16. Obviously, this impacts performance. 354 * 376 * 355 * So what we do it to allow the filesystem's 377 * So what we do it to allow the filesystem's get_block() function to set 356 * BH_Boundary when it maps block 11. BH_Boun 378 * BH_Boundary when it maps block 11. BH_Boundary says: mapping of the block 357 * after this one will require I/O against a b 379 * after this one will require I/O against a block which is probably close to 358 * this one. So you should push what I/O you 380 * this one. So you should push what I/O you have currently accumulated. 359 * 381 * 360 * This all causes the disk requests to be iss 382 * This all causes the disk requests to be issued in the correct order. 361 */ 383 */ 362 void mpage_readahead(struct readahead_control !! 384 int >> 385 mpage_readpages(struct address_space *mapping, struct list_head *pages, >> 386 unsigned nr_pages, get_block_t get_block) 363 { 387 { 364 struct folio *folio; << 365 struct mpage_readpage_args args = { 388 struct mpage_readpage_args args = { 366 .get_block = get_block, 389 .get_block = get_block, 367 .is_readahead = true, 390 .is_readahead = true, 368 }; 391 }; >> 392 unsigned page_idx; 369 393 370 while ((folio = readahead_folio(rac))) !! 394 for (page_idx = 0; page_idx < nr_pages; page_idx++) { 371 prefetchw(&folio->flags); !! 395 struct page *page = lru_to_page(pages); 372 args.folio = folio; !! 396 373 args.nr_pages = readahead_coun !! 397 prefetchw(&page->flags); 374 args.bio = do_mpage_readpage(& !! 398 list_del(&page->lru); >> 399 if (!add_to_page_cache_lru(page, mapping, >> 400 page->index, >> 401 readahead_gfp_mask(mapping))) { >> 402 args.page = page; >> 403 args.nr_pages = nr_pages - page_idx; >> 404 args.bio = do_mpage_readpage(&args); >> 405 } >> 406 put_page(page); 375 } 407 } >> 408 BUG_ON(!list_empty(pages)); 376 if (args.bio) 409 if (args.bio) 377 mpage_bio_submit_read(args.bio !! 410 mpage_bio_submit(REQ_OP_READ, REQ_RAHEAD, args.bio); >> 411 return 0; 378 } 412 } 379 EXPORT_SYMBOL(mpage_readahead); !! 413 EXPORT_SYMBOL(mpage_readpages); 380 414 381 /* 415 /* 382 * This isn't called much at all 416 * This isn't called much at all 383 */ 417 */ 384 int mpage_read_folio(struct folio *folio, get_ !! 418 int mpage_readpage(struct page *page, get_block_t get_block) 385 { 419 { 386 struct mpage_readpage_args args = { 420 struct mpage_readpage_args args = { 387 .folio = folio, !! 421 .page = page, 388 .nr_pages = 1, 422 .nr_pages = 1, 389 .get_block = get_block, 423 .get_block = get_block, 390 }; 424 }; 391 425 392 args.bio = do_mpage_readpage(&args); 426 args.bio = do_mpage_readpage(&args); 393 if (args.bio) 427 if (args.bio) 394 mpage_bio_submit_read(args.bio !! 428 mpage_bio_submit(REQ_OP_READ, 0, args.bio); 395 return 0; 429 return 0; 396 } 430 } 397 EXPORT_SYMBOL(mpage_read_folio); !! 431 EXPORT_SYMBOL(mpage_readpage); 398 432 399 /* 433 /* 400 * Writing is not so simple. 434 * Writing is not so simple. 401 * 435 * 402 * If the page has buffers then they will be u 436 * If the page has buffers then they will be used for obtaining the disk 403 * mapping. We only support pages which are f 437 * mapping. We only support pages which are fully mapped-and-dirty, with a 404 * special case for pages which are unmapped a 438 * special case for pages which are unmapped at the end: end-of-file. 405 * 439 * 406 * If the page has no buffers (preferred) then 440 * If the page has no buffers (preferred) then the page is mapped here. 407 * 441 * 408 * If all blocks are found to be contiguous th 442 * If all blocks are found to be contiguous then the page can go into the 409 * BIO. Otherwise fall back to the mapping's 443 * BIO. Otherwise fall back to the mapping's writepage(). 410 * 444 * 411 * FIXME: This code wants an estimate of how m 445 * FIXME: This code wants an estimate of how many pages are still to be 412 * written, so it can intelligently allocate a 446 * written, so it can intelligently allocate a suitably-sized BIO. For now, 413 * just allocate full-size (16-page) BIOs. 447 * just allocate full-size (16-page) BIOs. 414 */ 448 */ 415 449 416 struct mpage_data { 450 struct mpage_data { 417 struct bio *bio; 451 struct bio *bio; 418 sector_t last_block_in_bio; 452 sector_t last_block_in_bio; 419 get_block_t *get_block; 453 get_block_t *get_block; >> 454 unsigned use_writepage; 420 }; 455 }; 421 456 422 /* 457 /* 423 * We have our BIO, so we can now mark the buf 458 * We have our BIO, so we can now mark the buffers clean. Make 424 * sure to only clean buffers which we know we 459 * sure to only clean buffers which we know we'll be writing. 425 */ 460 */ 426 static void clean_buffers(struct folio *folio, !! 461 static void clean_buffers(struct page *page, unsigned first_unmapped) 427 { 462 { 428 unsigned buffer_counter = 0; 463 unsigned buffer_counter = 0; 429 struct buffer_head *bh, *head = folio_ !! 464 struct buffer_head *bh, *head; 430 !! 465 if (!page_has_buffers(page)) 431 if (!head) << 432 return; 466 return; >> 467 head = page_buffers(page); 433 bh = head; 468 bh = head; 434 469 435 do { 470 do { 436 if (buffer_counter++ == first_ 471 if (buffer_counter++ == first_unmapped) 437 break; 472 break; 438 clear_buffer_dirty(bh); 473 clear_buffer_dirty(bh); 439 bh = bh->b_this_page; 474 bh = bh->b_this_page; 440 } while (bh != head); 475 } while (bh != head); 441 476 442 /* 477 /* 443 * we cannot drop the bh if the page i 478 * we cannot drop the bh if the page is not uptodate or a concurrent 444 * read_folio would fail to serialize !! 479 * readpage would fail to serialize with the bh and it would read from 445 * disk before we reach the platter. 480 * disk before we reach the platter. 446 */ 481 */ 447 if (buffer_heads_over_limit && folio_t !! 482 if (buffer_heads_over_limit && PageUptodate(page)) 448 try_to_free_buffers(folio); !! 483 try_to_free_buffers(page); 449 } 484 } 450 485 451 static int __mpage_writepage(struct folio *fol !! 486 /* >> 487 * For situations where we want to clean all buffers attached to a page. >> 488 * We don't need to calculate how many buffers are attached to the page, >> 489 * we just need to specify a number larger than the maximum number of buffers. >> 490 */ >> 491 void clean_page_buffers(struct page *page) >> 492 { >> 493 clean_buffers(page, ~0U); >> 494 } >> 495 >> 496 static int __mpage_writepage(struct page *page, struct writeback_control *wbc, 452 void *data) 497 void *data) 453 { 498 { 454 struct mpage_data *mpd = data; 499 struct mpage_data *mpd = data; 455 struct bio *bio = mpd->bio; 500 struct bio *bio = mpd->bio; 456 struct address_space *mapping = folio- !! 501 struct address_space *mapping = page->mapping; 457 struct inode *inode = mapping->host; !! 502 struct inode *inode = page->mapping->host; 458 const unsigned blkbits = inode->i_blkb 503 const unsigned blkbits = inode->i_blkbits; >> 504 unsigned long end_index; 459 const unsigned blocks_per_page = PAGE_ 505 const unsigned blocks_per_page = PAGE_SIZE >> blkbits; 460 sector_t last_block; 506 sector_t last_block; 461 sector_t block_in_file; 507 sector_t block_in_file; 462 sector_t first_block; !! 508 sector_t blocks[MAX_BUF_PER_PAGE]; 463 unsigned page_block; 509 unsigned page_block; 464 unsigned first_unmapped = blocks_per_p 510 unsigned first_unmapped = blocks_per_page; 465 struct block_device *bdev = NULL; 511 struct block_device *bdev = NULL; 466 int boundary = 0; 512 int boundary = 0; 467 sector_t boundary_block = 0; 513 sector_t boundary_block = 0; 468 struct block_device *boundary_bdev = N 514 struct block_device *boundary_bdev = NULL; 469 size_t length; !! 515 int length; 470 struct buffer_head map_bh; 516 struct buffer_head map_bh; 471 loff_t i_size = i_size_read(inode); 517 loff_t i_size = i_size_read(inode); 472 int ret = 0; 518 int ret = 0; 473 struct buffer_head *head = folio_buffe !! 519 int op_flags = wbc_to_write_flags(wbc); 474 520 475 if (head) { !! 521 if (page_has_buffers(page)) { >> 522 struct buffer_head *head = page_buffers(page); 476 struct buffer_head *bh = head; 523 struct buffer_head *bh = head; 477 524 478 /* If they're all mapped and d 525 /* If they're all mapped and dirty, do it */ 479 page_block = 0; 526 page_block = 0; 480 do { 527 do { 481 BUG_ON(buffer_locked(b 528 BUG_ON(buffer_locked(bh)); 482 if (!buffer_mapped(bh) 529 if (!buffer_mapped(bh)) { 483 /* 530 /* 484 * unmapped di 531 * unmapped dirty buffers are created by 485 * block_dirty !! 532 * __set_page_dirty_buffers -> mmapped data 486 */ 533 */ 487 if (buffer_dir 534 if (buffer_dirty(bh)) 488 goto c 535 goto confused; 489 if (first_unma 536 if (first_unmapped == blocks_per_page) 490 first_ 537 first_unmapped = page_block; 491 continue; 538 continue; 492 } 539 } 493 540 494 if (first_unmapped != 541 if (first_unmapped != blocks_per_page) 495 goto confused; 542 goto confused; /* hole -> non-hole */ 496 543 497 if (!buffer_dirty(bh) 544 if (!buffer_dirty(bh) || !buffer_uptodate(bh)) 498 goto confused; 545 goto confused; 499 if (page_block) { 546 if (page_block) { 500 if (bh->b_bloc !! 547 if (bh->b_blocknr != blocks[page_block-1] + 1) 501 goto c 548 goto confused; 502 } else { << 503 first_block = << 504 } 549 } 505 page_block++; !! 550 blocks[page_block++] = bh->b_blocknr; 506 boundary = buffer_boun 551 boundary = buffer_boundary(bh); 507 if (boundary) { 552 if (boundary) { 508 boundary_block 553 boundary_block = bh->b_blocknr; 509 boundary_bdev 554 boundary_bdev = bh->b_bdev; 510 } 555 } 511 bdev = bh->b_bdev; 556 bdev = bh->b_bdev; 512 } while ((bh = bh->b_this_page 557 } while ((bh = bh->b_this_page) != head); 513 558 514 if (first_unmapped) 559 if (first_unmapped) 515 goto page_is_mapped; 560 goto page_is_mapped; 516 561 517 /* 562 /* 518 * Page has buffers, but they 563 * Page has buffers, but they are all unmapped. The page was 519 * created by pagein or read o 564 * created by pagein or read over a hole which was handled by 520 * block_read_full_folio(). I !! 565 * block_read_full_page(). If this address_space is also 521 * using mpage_readahead then !! 566 * using mpage_readpages then this can rarely happen. 522 */ 567 */ 523 goto confused; 568 goto confused; 524 } 569 } 525 570 526 /* 571 /* 527 * The page has no buffers: map it to 572 * The page has no buffers: map it to disk 528 */ 573 */ 529 BUG_ON(!folio_test_uptodate(folio)); !! 574 BUG_ON(!PageUptodate(page)); 530 block_in_file = (sector_t)folio->index !! 575 block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits); 531 /* << 532 * Whole page beyond EOF? Skip allocat << 533 * space. << 534 */ << 535 if (block_in_file >= (i_size + (1 << b << 536 goto page_is_mapped; << 537 last_block = (i_size - 1) >> blkbits; 576 last_block = (i_size - 1) >> blkbits; 538 map_bh.b_folio = folio; !! 577 map_bh.b_page = page; 539 for (page_block = 0; page_block < bloc 578 for (page_block = 0; page_block < blocks_per_page; ) { 540 579 541 map_bh.b_state = 0; 580 map_bh.b_state = 0; 542 map_bh.b_size = 1 << blkbits; 581 map_bh.b_size = 1 << blkbits; 543 if (mpd->get_block(inode, bloc 582 if (mpd->get_block(inode, block_in_file, &map_bh, 1)) 544 goto confused; 583 goto confused; 545 if (!buffer_mapped(&map_bh)) << 546 goto confused; << 547 if (buffer_new(&map_bh)) 584 if (buffer_new(&map_bh)) 548 clean_bdev_bh_alias(&m 585 clean_bdev_bh_alias(&map_bh); 549 if (buffer_boundary(&map_bh)) 586 if (buffer_boundary(&map_bh)) { 550 boundary_block = map_b 587 boundary_block = map_bh.b_blocknr; 551 boundary_bdev = map_bh 588 boundary_bdev = map_bh.b_bdev; 552 } 589 } 553 if (page_block) { 590 if (page_block) { 554 if (map_bh.b_blocknr ! !! 591 if (map_bh.b_blocknr != blocks[page_block-1] + 1) 555 goto confused; 592 goto confused; 556 } else { << 557 first_block = map_bh.b << 558 } 593 } 559 page_block++; !! 594 blocks[page_block++] = map_bh.b_blocknr; 560 boundary = buffer_boundary(&ma 595 boundary = buffer_boundary(&map_bh); 561 bdev = map_bh.b_bdev; 596 bdev = map_bh.b_bdev; 562 if (block_in_file == last_bloc 597 if (block_in_file == last_block) 563 break; 598 break; 564 block_in_file++; 599 block_in_file++; 565 } 600 } 566 BUG_ON(page_block == 0); 601 BUG_ON(page_block == 0); 567 602 568 first_unmapped = page_block; 603 first_unmapped = page_block; 569 604 570 page_is_mapped: 605 page_is_mapped: 571 /* Don't bother writing beyond EOF, tr !! 606 end_index = i_size >> PAGE_SHIFT; 572 if (folio_pos(folio) >= i_size) !! 607 if (page->index >= end_index) { 573 goto confused; << 574 length = folio_size(folio); << 575 if (folio_pos(folio) + length > i_size << 576 /* 608 /* 577 * The page straddles i_size. 609 * The page straddles i_size. It must be zeroed out on each 578 * and every writepage invocat 610 * and every writepage invocation because it may be mmapped. 579 * "A file is mapped in multip 611 * "A file is mapped in multiples of the page size. For a file 580 * that is not a multiple of t 612 * that is not a multiple of the page size, the remaining memory 581 * is zeroed when mapped, and 613 * is zeroed when mapped, and writes to that region are not 582 * written out to the file." 614 * written out to the file." 583 */ 615 */ 584 length = i_size - folio_pos(fo !! 616 unsigned offset = i_size & (PAGE_SIZE - 1); 585 folio_zero_segment(folio, leng !! 617 >> 618 if (page->index > end_index || !offset) >> 619 goto confused; >> 620 zero_user_segment(page, offset, PAGE_SIZE); 586 } 621 } 587 622 588 /* 623 /* 589 * This page will go to BIO. Do we ne 624 * This page will go to BIO. Do we need to send this BIO off first? 590 */ 625 */ 591 if (bio && mpd->last_block_in_bio != f !! 626 if (bio && mpd->last_block_in_bio != blocks[0] - 1) 592 bio = mpage_bio_submit_write(b !! 627 bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio); 593 628 594 alloc_new: 629 alloc_new: 595 if (bio == NULL) { 630 if (bio == NULL) { 596 bio = bio_alloc(bdev, BIO_MAX_ !! 631 if (first_unmapped == blocks_per_page) { 597 REQ_OP_WRITE | !! 632 if (!bdev_write_page(bdev, blocks[0] << (blkbits - 9), 598 GFP_NOFS); !! 633 page, wbc)) 599 bio->bi_iter.bi_sector = first !! 634 goto out; >> 635 } >> 636 bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), >> 637 BIO_MAX_PAGES, GFP_NOFS|__GFP_HIGH); >> 638 if (bio == NULL) >> 639 goto confused; >> 640 600 wbc_init_bio(wbc, bio); 641 wbc_init_bio(wbc, bio); 601 bio->bi_write_hint = inode->i_ 642 bio->bi_write_hint = inode->i_write_hint; 602 } 643 } 603 644 604 /* 645 /* 605 * Must try to add the page before mar 646 * Must try to add the page before marking the buffer clean or 606 * the confused fail path above (OOM) 647 * the confused fail path above (OOM) will be very confused when 607 * it finds all bh marked clean (i.e. 648 * it finds all bh marked clean (i.e. it will not write anything) 608 */ 649 */ 609 wbc_account_cgroup_owner(wbc, &folio-> !! 650 wbc_account_io(wbc, page, PAGE_SIZE); 610 length = first_unmapped << blkbits; 651 length = first_unmapped << blkbits; 611 if (!bio_add_folio(bio, folio, length, !! 652 if (bio_add_page(bio, page, length, 0) < length) { 612 bio = mpage_bio_submit_write(b !! 653 bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio); 613 goto alloc_new; 654 goto alloc_new; 614 } 655 } 615 656 616 clean_buffers(folio, first_unmapped); !! 657 clean_buffers(page, first_unmapped); 617 658 618 BUG_ON(folio_test_writeback(folio)); !! 659 BUG_ON(PageWriteback(page)); 619 folio_start_writeback(folio); !! 660 set_page_writeback(page); 620 folio_unlock(folio); !! 661 unlock_page(page); 621 if (boundary || (first_unmapped != blo 662 if (boundary || (first_unmapped != blocks_per_page)) { 622 bio = mpage_bio_submit_write(b !! 663 bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio); 623 if (boundary_block) { 664 if (boundary_block) { 624 write_boundary_block(b 665 write_boundary_block(boundary_bdev, 625 bounda 666 boundary_block, 1 << blkbits); 626 } 667 } 627 } else { 668 } else { 628 mpd->last_block_in_bio = first !! 669 mpd->last_block_in_bio = blocks[blocks_per_page - 1]; 629 } 670 } 630 goto out; 671 goto out; 631 672 632 confused: 673 confused: 633 if (bio) 674 if (bio) 634 bio = mpage_bio_submit_write(b !! 675 bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio); 635 676 >> 677 if (mpd->use_writepage) { >> 678 ret = mapping->a_ops->writepage(page, wbc); >> 679 } else { >> 680 ret = -EAGAIN; >> 681 goto out; >> 682 } 636 /* 683 /* 637 * The caller has a ref on the inode, 684 * The caller has a ref on the inode, so *mapping is stable 638 */ 685 */ 639 ret = block_write_full_folio(folio, wb << 640 mapping_set_error(mapping, ret); 686 mapping_set_error(mapping, ret); 641 out: 687 out: 642 mpd->bio = bio; 688 mpd->bio = bio; 643 return ret; 689 return ret; 644 } 690 } 645 691 646 /** 692 /** 647 * mpage_writepages - walk the list of dirty p 693 * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them 648 * @mapping: address space structure to write 694 * @mapping: address space structure to write 649 * @wbc: subtract the number of written pages 695 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 650 * @get_block: the filesystem's block mapper f 696 * @get_block: the filesystem's block mapper function. >> 697 * If this is NULL then use a_ops->writepage. Otherwise, go >> 698 * direct-to-BIO. 651 * 699 * 652 * This is a library function, which implement 700 * This is a library function, which implements the writepages() 653 * address_space_operation. 701 * address_space_operation. >> 702 * >> 703 * If a page is already under I/O, generic_writepages() skips it, even >> 704 * if it's dirty. This is desirable behaviour for memory-cleaning writeback, >> 705 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() >> 706 * and msync() need to guarantee that all the data which was dirty at the time >> 707 * the call was made get new I/O started against them. If wbc->sync_mode is >> 708 * WB_SYNC_ALL then we were called for data integrity and we must wait for >> 709 * existing IO to complete. 654 */ 710 */ 655 int 711 int 656 mpage_writepages(struct address_space *mapping 712 mpage_writepages(struct address_space *mapping, 657 struct writeback_control *wbc, 713 struct writeback_control *wbc, get_block_t get_block) 658 { 714 { 659 struct mpage_data mpd = { << 660 .get_block = get_block, << 661 }; << 662 struct blk_plug plug; 715 struct blk_plug plug; 663 int ret; 716 int ret; 664 717 665 blk_start_plug(&plug); 718 blk_start_plug(&plug); 666 ret = write_cache_pages(mapping, wbc, !! 719 667 if (mpd.bio) !! 720 if (!get_block) 668 mpage_bio_submit_write(mpd.bio !! 721 ret = generic_writepages(mapping, wbc); >> 722 else { >> 723 struct mpage_data mpd = { >> 724 .bio = NULL, >> 725 .last_block_in_bio = 0, >> 726 .get_block = get_block, >> 727 .use_writepage = 1, >> 728 }; >> 729 >> 730 ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd); >> 731 if (mpd.bio) { >> 732 int op_flags = (wbc->sync_mode == WB_SYNC_ALL ? >> 733 REQ_SYNC : 0); >> 734 mpage_bio_submit(REQ_OP_WRITE, op_flags, mpd.bio); >> 735 } >> 736 } 669 blk_finish_plug(&plug); 737 blk_finish_plug(&plug); 670 return ret; 738 return ret; 671 } 739 } 672 EXPORT_SYMBOL(mpage_writepages); 740 EXPORT_SYMBOL(mpage_writepages); >> 741 >> 742 int mpage_writepage(struct page *page, get_block_t get_block, >> 743 struct writeback_control *wbc) >> 744 { >> 745 struct mpage_data mpd = { >> 746 .bio = NULL, >> 747 .last_block_in_bio = 0, >> 748 .get_block = get_block, >> 749 .use_writepage = 0, >> 750 }; >> 751 int ret = __mpage_writepage(page, wbc, &mpd); >> 752 if (mpd.bio) { >> 753 int op_flags = (wbc->sync_mode == WB_SYNC_ALL ? >> 754 REQ_SYNC : 0); >> 755 mpage_bio_submit(REQ_OP_WRITE, op_flags, mpd.bio); >> 756 } >> 757 return ret; >> 758 } >> 759 EXPORT_SYMBOL(mpage_writepage); 673 760
Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.