~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

TOMOYO Linux Cross Reference
Linux/fs/btrfs/bio.c

Version: ~ [ linux-6.11-rc3 ] ~ [ linux-6.10.4 ] ~ [ linux-6.9.12 ] ~ [ linux-6.8.12 ] ~ [ linux-6.7.12 ] ~ [ linux-6.6.45 ] ~ [ linux-6.5.13 ] ~ [ linux-6.4.16 ] ~ [ linux-6.3.13 ] ~ [ linux-6.2.16 ] ~ [ linux-6.1.104 ] ~ [ linux-6.0.19 ] ~ [ linux-5.19.17 ] ~ [ linux-5.18.19 ] ~ [ linux-5.17.15 ] ~ [ linux-5.16.20 ] ~ [ linux-5.15.164 ] ~ [ linux-5.14.21 ] ~ [ linux-5.13.19 ] ~ [ linux-5.12.19 ] ~ [ linux-5.11.22 ] ~ [ linux-5.10.223 ] ~ [ linux-5.9.16 ] ~ [ linux-5.8.18 ] ~ [ linux-5.7.19 ] ~ [ linux-5.6.19 ] ~ [ linux-5.5.19 ] ~ [ linux-5.4.281 ] ~ [ linux-5.3.18 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.319 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.336 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.337 ] ~ [ linux-4.4.302 ] ~ [ linux-3.10.108 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.9 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 // SPDX-License-Identifier: GPL-2.0
  2 /*
  3  * Copyright (C) 2007 Oracle.  All rights reserved.
  4  * Copyright (C) 2022 Christoph Hellwig.
  5  */
  6 
  7 #include <linux/bio.h>
  8 #include "bio.h"
  9 #include "ctree.h"
 10 #include "volumes.h"
 11 #include "raid56.h"
 12 #include "async-thread.h"
 13 #include "dev-replace.h"
 14 #include "zoned.h"
 15 #include "file-item.h"
 16 #include "raid-stripe-tree.h"
 17 
 18 static struct bio_set btrfs_bioset;
 19 static struct bio_set btrfs_clone_bioset;
 20 static struct bio_set btrfs_repair_bioset;
 21 static mempool_t btrfs_failed_bio_pool;
 22 
 23 struct btrfs_failed_bio {
 24         struct btrfs_bio *bbio;
 25         int num_copies;
 26         atomic_t repair_count;
 27 };
 28 
 29 /* Is this a data path I/O that needs storage layer checksum and repair? */
 30 static inline bool is_data_bbio(struct btrfs_bio *bbio)
 31 {
 32         return bbio->inode && is_data_inode(bbio->inode);
 33 }
 34 
 35 static bool bbio_has_ordered_extent(struct btrfs_bio *bbio)
 36 {
 37         return is_data_bbio(bbio) && btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE;
 38 }
 39 
 40 /*
 41  * Initialize a btrfs_bio structure.  This skips the embedded bio itself as it
 42  * is already initialized by the block layer.
 43  */
 44 void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info,
 45                     btrfs_bio_end_io_t end_io, void *private)
 46 {
 47         memset(bbio, 0, offsetof(struct btrfs_bio, bio));
 48         bbio->fs_info = fs_info;
 49         bbio->end_io = end_io;
 50         bbio->private = private;
 51         atomic_set(&bbio->pending_ios, 1);
 52 }
 53 
 54 /*
 55  * Allocate a btrfs_bio structure.  The btrfs_bio is the main I/O container for
 56  * btrfs, and is used for all I/O submitted through btrfs_submit_bio.
 57  *
 58  * Just like the underlying bio_alloc_bioset it will not fail as it is backed by
 59  * a mempool.
 60  */
 61 struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
 62                                   struct btrfs_fs_info *fs_info,
 63                                   btrfs_bio_end_io_t end_io, void *private)
 64 {
 65         struct btrfs_bio *bbio;
 66         struct bio *bio;
 67 
 68         bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset);
 69         bbio = btrfs_bio(bio);
 70         btrfs_bio_init(bbio, fs_info, end_io, private);
 71         return bbio;
 72 }
 73 
 74 static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
 75                                          struct btrfs_bio *orig_bbio,
 76                                          u64 map_length, bool use_append)
 77 {
 78         struct btrfs_bio *bbio;
 79         struct bio *bio;
 80 
 81         if (use_append) {
 82                 unsigned int nr_segs;
 83 
 84                 bio = bio_split_rw(&orig_bbio->bio, &fs_info->limits, &nr_segs,
 85                                    &btrfs_clone_bioset, map_length);
 86         } else {
 87                 bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT,
 88                                 GFP_NOFS, &btrfs_clone_bioset);
 89         }
 90         bbio = btrfs_bio(bio);
 91         btrfs_bio_init(bbio, fs_info, NULL, orig_bbio);
 92         bbio->inode = orig_bbio->inode;
 93         bbio->file_offset = orig_bbio->file_offset;
 94         orig_bbio->file_offset += map_length;
 95         if (bbio_has_ordered_extent(bbio)) {
 96                 refcount_inc(&orig_bbio->ordered->refs);
 97                 bbio->ordered = orig_bbio->ordered;
 98         }
 99         atomic_inc(&orig_bbio->pending_ios);
100         return bbio;
101 }
102 
103 /* Free a bio that was never submitted to the underlying device. */
104 static void btrfs_cleanup_bio(struct btrfs_bio *bbio)
105 {
106         if (bbio_has_ordered_extent(bbio))
107                 btrfs_put_ordered_extent(bbio->ordered);
108         bio_put(&bbio->bio);
109 }
110 
111 static void __btrfs_bio_end_io(struct btrfs_bio *bbio)
112 {
113         if (bbio_has_ordered_extent(bbio)) {
114                 struct btrfs_ordered_extent *ordered = bbio->ordered;
115 
116                 bbio->end_io(bbio);
117                 btrfs_put_ordered_extent(ordered);
118         } else {
119                 bbio->end_io(bbio);
120         }
121 }
122 
123 void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
124 {
125         bbio->bio.bi_status = status;
126         __btrfs_bio_end_io(bbio);
127 }
128 
129 static void btrfs_orig_write_end_io(struct bio *bio);
130 
131 static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio,
132                                        struct btrfs_bio *orig_bbio)
133 {
134         /*
135          * For writes we tolerate nr_mirrors - 1 write failures, so we can't
136          * just blindly propagate a write failure here.  Instead increment the
137          * error count in the original I/O context so that it is guaranteed to
138          * be larger than the error tolerance.
139          */
140         if (bbio->bio.bi_end_io == &btrfs_orig_write_end_io) {
141                 struct btrfs_io_stripe *orig_stripe = orig_bbio->bio.bi_private;
142                 struct btrfs_io_context *orig_bioc = orig_stripe->bioc;
143 
144                 atomic_add(orig_bioc->max_errors, &orig_bioc->error);
145         } else {
146                 orig_bbio->bio.bi_status = bbio->bio.bi_status;
147         }
148 }
149 
150 static void btrfs_orig_bbio_end_io(struct btrfs_bio *bbio)
151 {
152         if (bbio->bio.bi_pool == &btrfs_clone_bioset) {
153                 struct btrfs_bio *orig_bbio = bbio->private;
154 
155                 if (bbio->bio.bi_status)
156                         btrfs_bbio_propagate_error(bbio, orig_bbio);
157                 btrfs_cleanup_bio(bbio);
158                 bbio = orig_bbio;
159         }
160 
161         if (atomic_dec_and_test(&bbio->pending_ios))
162                 __btrfs_bio_end_io(bbio);
163 }
164 
165 static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror)
166 {
167         if (cur_mirror == fbio->num_copies)
168                 return cur_mirror + 1 - fbio->num_copies;
169         return cur_mirror + 1;
170 }
171 
172 static int prev_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror)
173 {
174         if (cur_mirror == 1)
175                 return fbio->num_copies;
176         return cur_mirror - 1;
177 }
178 
179 static void btrfs_repair_done(struct btrfs_failed_bio *fbio)
180 {
181         if (atomic_dec_and_test(&fbio->repair_count)) {
182                 btrfs_orig_bbio_end_io(fbio->bbio);
183                 mempool_free(fbio, &btrfs_failed_bio_pool);
184         }
185 }
186 
187 static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
188                                  struct btrfs_device *dev)
189 {
190         struct btrfs_failed_bio *fbio = repair_bbio->private;
191         struct btrfs_inode *inode = repair_bbio->inode;
192         struct btrfs_fs_info *fs_info = inode->root->fs_info;
193         struct bio_vec *bv = bio_first_bvec_all(&repair_bbio->bio);
194         int mirror = repair_bbio->mirror_num;
195 
196         /*
197          * We can only trigger this for data bio, which doesn't support larger
198          * folios yet.
199          */
200         ASSERT(folio_order(page_folio(bv->bv_page)) == 0);
201 
202         if (repair_bbio->bio.bi_status ||
203             !btrfs_data_csum_ok(repair_bbio, dev, 0, bv)) {
204                 bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ);
205                 repair_bbio->bio.bi_iter = repair_bbio->saved_iter;
206 
207                 mirror = next_repair_mirror(fbio, mirror);
208                 if (mirror == fbio->bbio->mirror_num) {
209                         btrfs_debug(fs_info, "no mirror left");
210                         fbio->bbio->bio.bi_status = BLK_STS_IOERR;
211                         goto done;
212                 }
213 
214                 btrfs_submit_bio(repair_bbio, mirror);
215                 return;
216         }
217 
218         do {
219                 mirror = prev_repair_mirror(fbio, mirror);
220                 btrfs_repair_io_failure(fs_info, btrfs_ino(inode),
221                                   repair_bbio->file_offset, fs_info->sectorsize,
222                                   repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT,
223                                   page_folio(bv->bv_page), bv->bv_offset, mirror);
224         } while (mirror != fbio->bbio->mirror_num);
225 
226 done:
227         btrfs_repair_done(fbio);
228         bio_put(&repair_bbio->bio);
229 }
230 
231 /*
232  * Try to kick off a repair read to the next available mirror for a bad sector.
233  *
234  * This primarily tries to recover good data to serve the actual read request,
235  * but also tries to write the good data back to the bad mirror(s) when a
236  * read succeeded to restore the redundancy.
237  */
238 static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio,
239                                                   u32 bio_offset,
240                                                   struct bio_vec *bv,
241                                                   struct btrfs_failed_bio *fbio)
242 {
243         struct btrfs_inode *inode = failed_bbio->inode;
244         struct btrfs_fs_info *fs_info = inode->root->fs_info;
245         const u32 sectorsize = fs_info->sectorsize;
246         const u64 logical = (failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT);
247         struct btrfs_bio *repair_bbio;
248         struct bio *repair_bio;
249         int num_copies;
250         int mirror;
251 
252         btrfs_debug(fs_info, "repair read error: read error at %llu",
253                     failed_bbio->file_offset + bio_offset);
254 
255         num_copies = btrfs_num_copies(fs_info, logical, sectorsize);
256         if (num_copies == 1) {
257                 btrfs_debug(fs_info, "no copy to repair from");
258                 failed_bbio->bio.bi_status = BLK_STS_IOERR;
259                 return fbio;
260         }
261 
262         if (!fbio) {
263                 fbio = mempool_alloc(&btrfs_failed_bio_pool, GFP_NOFS);
264                 fbio->bbio = failed_bbio;
265                 fbio->num_copies = num_copies;
266                 atomic_set(&fbio->repair_count, 1);
267         }
268 
269         atomic_inc(&fbio->repair_count);
270 
271         repair_bio = bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS,
272                                       &btrfs_repair_bioset);
273         repair_bio->bi_iter.bi_sector = failed_bbio->saved_iter.bi_sector;
274         __bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset);
275 
276         repair_bbio = btrfs_bio(repair_bio);
277         btrfs_bio_init(repair_bbio, fs_info, NULL, fbio);
278         repair_bbio->inode = failed_bbio->inode;
279         repair_bbio->file_offset = failed_bbio->file_offset + bio_offset;
280 
281         mirror = next_repair_mirror(fbio, failed_bbio->mirror_num);
282         btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror);
283         btrfs_submit_bio(repair_bbio, mirror);
284         return fbio;
285 }
286 
287 static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *dev)
288 {
289         struct btrfs_inode *inode = bbio->inode;
290         struct btrfs_fs_info *fs_info = inode->root->fs_info;
291         u32 sectorsize = fs_info->sectorsize;
292         struct bvec_iter *iter = &bbio->saved_iter;
293         blk_status_t status = bbio->bio.bi_status;
294         struct btrfs_failed_bio *fbio = NULL;
295         u32 offset = 0;
296 
297         /* Read-repair requires the inode field to be set by the submitter. */
298         ASSERT(inode);
299 
300         /*
301          * Hand off repair bios to the repair code as there is no upper level
302          * submitter for them.
303          */
304         if (bbio->bio.bi_pool == &btrfs_repair_bioset) {
305                 btrfs_end_repair_bio(bbio, dev);
306                 return;
307         }
308 
309         /* Clear the I/O error. A failed repair will reset it. */
310         bbio->bio.bi_status = BLK_STS_OK;
311 
312         while (iter->bi_size) {
313                 struct bio_vec bv = bio_iter_iovec(&bbio->bio, *iter);
314 
315                 bv.bv_len = min(bv.bv_len, sectorsize);
316                 if (status || !btrfs_data_csum_ok(bbio, dev, offset, &bv))
317                         fbio = repair_one_sector(bbio, offset, &bv, fbio);
318 
319                 bio_advance_iter_single(&bbio->bio, iter, sectorsize);
320                 offset += sectorsize;
321         }
322 
323         if (bbio->csum != bbio->csum_inline)
324                 kfree(bbio->csum);
325 
326         if (fbio)
327                 btrfs_repair_done(fbio);
328         else
329                 btrfs_orig_bbio_end_io(bbio);
330 }
331 
332 static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev)
333 {
334         if (!dev || !dev->bdev)
335                 return;
336         if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET)
337                 return;
338 
339         if (btrfs_op(bio) == BTRFS_MAP_WRITE)
340                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
341         else if (!(bio->bi_opf & REQ_RAHEAD))
342                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
343         if (bio->bi_opf & REQ_PREFLUSH)
344                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS);
345 }
346 
347 static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_fs_info *fs_info,
348                                                 struct bio *bio)
349 {
350         if (bio->bi_opf & REQ_META)
351                 return fs_info->endio_meta_workers;
352         return fs_info->endio_workers;
353 }
354 
355 static void btrfs_end_bio_work(struct work_struct *work)
356 {
357         struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);
358 
359         /* Metadata reads are checked and repaired by the submitter. */
360         if (is_data_bbio(bbio))
361                 btrfs_check_read_bio(bbio, bbio->bio.bi_private);
362         else
363                 btrfs_orig_bbio_end_io(bbio);
364 }
365 
366 static void btrfs_simple_end_io(struct bio *bio)
367 {
368         struct btrfs_bio *bbio = btrfs_bio(bio);
369         struct btrfs_device *dev = bio->bi_private;
370         struct btrfs_fs_info *fs_info = bbio->fs_info;
371 
372         btrfs_bio_counter_dec(fs_info);
373 
374         if (bio->bi_status)
375                 btrfs_log_dev_io_error(bio, dev);
376 
377         if (bio_op(bio) == REQ_OP_READ) {
378                 INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work);
379                 queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work);
380         } else {
381                 if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status)
382                         btrfs_record_physical_zoned(bbio);
383                 btrfs_orig_bbio_end_io(bbio);
384         }
385 }
386 
387 static void btrfs_raid56_end_io(struct bio *bio)
388 {
389         struct btrfs_io_context *bioc = bio->bi_private;
390         struct btrfs_bio *bbio = btrfs_bio(bio);
391 
392         btrfs_bio_counter_dec(bioc->fs_info);
393         bbio->mirror_num = bioc->mirror_num;
394         if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio))
395                 btrfs_check_read_bio(bbio, NULL);
396         else
397                 btrfs_orig_bbio_end_io(bbio);
398 
399         btrfs_put_bioc(bioc);
400 }
401 
402 static void btrfs_orig_write_end_io(struct bio *bio)
403 {
404         struct btrfs_io_stripe *stripe = bio->bi_private;
405         struct btrfs_io_context *bioc = stripe->bioc;
406         struct btrfs_bio *bbio = btrfs_bio(bio);
407 
408         btrfs_bio_counter_dec(bioc->fs_info);
409 
410         if (bio->bi_status) {
411                 atomic_inc(&bioc->error);
412                 btrfs_log_dev_io_error(bio, stripe->dev);
413         }
414 
415         /*
416          * Only send an error to the higher layers if it is beyond the tolerance
417          * threshold.
418          */
419         if (atomic_read(&bioc->error) > bioc->max_errors)
420                 bio->bi_status = BLK_STS_IOERR;
421         else
422                 bio->bi_status = BLK_STS_OK;
423 
424         if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status)
425                 stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
426 
427         btrfs_orig_bbio_end_io(bbio);
428         btrfs_put_bioc(bioc);
429 }
430 
431 static void btrfs_clone_write_end_io(struct bio *bio)
432 {
433         struct btrfs_io_stripe *stripe = bio->bi_private;
434 
435         if (bio->bi_status) {
436                 atomic_inc(&stripe->bioc->error);
437                 btrfs_log_dev_io_error(bio, stripe->dev);
438         } else if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
439                 stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
440         }
441 
442         /* Pass on control to the original bio this one was cloned from */
443         bio_endio(stripe->bioc->orig_bio);
444         bio_put(bio);
445 }
446 
447 static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
448 {
449         if (!dev || !dev->bdev ||
450             test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
451             (btrfs_op(bio) == BTRFS_MAP_WRITE &&
452              !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
453                 bio_io_error(bio);
454                 return;
455         }
456 
457         bio_set_dev(bio, dev->bdev);
458 
459         /*
460          * For zone append writing, bi_sector must point the beginning of the
461          * zone
462          */
463         if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
464                 u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
465                 u64 zone_start = round_down(physical, dev->fs_info->zone_size);
466 
467                 ASSERT(btrfs_dev_is_sequential(dev, physical));
468                 bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
469         }
470         btrfs_debug_in_rcu(dev->fs_info,
471         "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
472                 __func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
473                 (unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev),
474                 dev->devid, bio->bi_iter.bi_size);
475 
476         if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT)
477                 blkcg_punt_bio_submit(bio);
478         else
479                 submit_bio(bio);
480 }
481 
482 static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
483 {
484         struct bio *orig_bio = bioc->orig_bio, *bio;
485 
486         ASSERT(bio_op(orig_bio) != REQ_OP_READ);
487 
488         /* Reuse the bio embedded into the btrfs_bio for the last mirror */
489         if (dev_nr == bioc->num_stripes - 1) {
490                 bio = orig_bio;
491                 bio->bi_end_io = btrfs_orig_write_end_io;
492         } else {
493                 bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &fs_bio_set);
494                 bio_inc_remaining(orig_bio);
495                 bio->bi_end_io = btrfs_clone_write_end_io;
496         }
497 
498         bio->bi_private = &bioc->stripes[dev_nr];
499         bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT;
500         bioc->stripes[dev_nr].bioc = bioc;
501         bioc->size = bio->bi_iter.bi_size;
502         btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio);
503 }
504 
505 static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc,
506                                struct btrfs_io_stripe *smap, int mirror_num)
507 {
508         if (!bioc) {
509                 /* Single mirror read/write fast path. */
510                 btrfs_bio(bio)->mirror_num = mirror_num;
511                 bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT;
512                 if (bio_op(bio) != REQ_OP_READ)
513                         btrfs_bio(bio)->orig_physical = smap->physical;
514                 bio->bi_private = smap->dev;
515                 bio->bi_end_io = btrfs_simple_end_io;
516                 btrfs_submit_dev_bio(smap->dev, bio);
517         } else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
518                 /* Parity RAID write or read recovery. */
519                 bio->bi_private = bioc;
520                 bio->bi_end_io = btrfs_raid56_end_io;
521                 if (bio_op(bio) == REQ_OP_READ)
522                         raid56_parity_recover(bio, bioc, mirror_num);
523                 else
524                         raid56_parity_write(bio, bioc);
525         } else {
526                 /* Write to multiple mirrors. */
527                 int total_devs = bioc->num_stripes;
528 
529                 bioc->orig_bio = bio;
530                 for (int dev_nr = 0; dev_nr < total_devs; dev_nr++)
531                         btrfs_submit_mirrored_bio(bioc, dev_nr);
532         }
533 }
534 
535 static blk_status_t btrfs_bio_csum(struct btrfs_bio *bbio)
536 {
537         if (bbio->bio.bi_opf & REQ_META)
538                 return btree_csum_one_bio(bbio);
539         return btrfs_csum_one_bio(bbio);
540 }
541 
542 /*
543  * Async submit bios are used to offload expensive checksumming onto the worker
544  * threads.
545  */
546 struct async_submit_bio {
547         struct btrfs_bio *bbio;
548         struct btrfs_io_context *bioc;
549         struct btrfs_io_stripe smap;
550         int mirror_num;
551         struct btrfs_work work;
552 };
553 
554 /*
555  * In order to insert checksums into the metadata in large chunks, we wait
556  * until bio submission time.   All the pages in the bio are checksummed and
557  * sums are attached onto the ordered extent record.
558  *
559  * At IO completion time the csums attached on the ordered extent record are
560  * inserted into the btree.
561  */
562 static void run_one_async_start(struct btrfs_work *work)
563 {
564         struct async_submit_bio *async =
565                 container_of(work, struct async_submit_bio, work);
566         blk_status_t ret;
567 
568         ret = btrfs_bio_csum(async->bbio);
569         if (ret)
570                 async->bbio->bio.bi_status = ret;
571 }
572 
573 /*
574  * In order to insert checksums into the metadata in large chunks, we wait
575  * until bio submission time.   All the pages in the bio are checksummed and
576  * sums are attached onto the ordered extent record.
577  *
578  * At IO completion time the csums attached on the ordered extent record are
579  * inserted into the tree.
580  *
581  * If called with @do_free == true, then it will free the work struct.
582  */
583 static void run_one_async_done(struct btrfs_work *work, bool do_free)
584 {
585         struct async_submit_bio *async =
586                 container_of(work, struct async_submit_bio, work);
587         struct bio *bio = &async->bbio->bio;
588 
589         if (do_free) {
590                 kfree(container_of(work, struct async_submit_bio, work));
591                 return;
592         }
593 
594         /* If an error occurred we just want to clean up the bio and move on. */
595         if (bio->bi_status) {
596                 btrfs_orig_bbio_end_io(async->bbio);
597                 return;
598         }
599 
600         /*
601          * All of the bios that pass through here are from async helpers.
602          * Use REQ_BTRFS_CGROUP_PUNT to issue them from the owning cgroup's
603          * context.  This changes nothing when cgroups aren't in use.
604          */
605         bio->bi_opf |= REQ_BTRFS_CGROUP_PUNT;
606         __btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num);
607 }
608 
609 static bool should_async_write(struct btrfs_bio *bbio)
610 {
611         bool auto_csum_mode = true;
612 
613 #ifdef CONFIG_BTRFS_DEBUG
614         struct btrfs_fs_devices *fs_devices = bbio->fs_info->fs_devices;
615         enum btrfs_offload_csum_mode csum_mode = READ_ONCE(fs_devices->offload_csum_mode);
616 
617         if (csum_mode == BTRFS_OFFLOAD_CSUM_FORCE_OFF)
618                 return false;
619 
620         auto_csum_mode = (csum_mode == BTRFS_OFFLOAD_CSUM_AUTO);
621 #endif
622 
623         /* Submit synchronously if the checksum implementation is fast. */
624         if (auto_csum_mode && test_bit(BTRFS_FS_CSUM_IMPL_FAST, &bbio->fs_info->flags))
625                 return false;
626 
627         /*
628          * Try to defer the submission to a workqueue to parallelize the
629          * checksum calculation unless the I/O is issued synchronously.
630          */
631         if (op_is_sync(bbio->bio.bi_opf))
632                 return false;
633 
634         /* Zoned devices require I/O to be submitted in order. */
635         if ((bbio->bio.bi_opf & REQ_META) && btrfs_is_zoned(bbio->fs_info))
636                 return false;
637 
638         return true;
639 }
640 
641 /*
642  * Submit bio to an async queue.
643  *
644  * Return true if the work has been successfully submitted, else false.
645  */
646 static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
647                                 struct btrfs_io_context *bioc,
648                                 struct btrfs_io_stripe *smap, int mirror_num)
649 {
650         struct btrfs_fs_info *fs_info = bbio->fs_info;
651         struct async_submit_bio *async;
652 
653         async = kmalloc(sizeof(*async), GFP_NOFS);
654         if (!async)
655                 return false;
656 
657         async->bbio = bbio;
658         async->bioc = bioc;
659         async->smap = *smap;
660         async->mirror_num = mirror_num;
661 
662         btrfs_init_work(&async->work, run_one_async_start, run_one_async_done);
663         btrfs_queue_work(fs_info->workers, &async->work);
664         return true;
665 }
666 
667 static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
668 {
669         struct btrfs_inode *inode = bbio->inode;
670         struct btrfs_fs_info *fs_info = bbio->fs_info;
671         struct btrfs_bio *orig_bbio = bbio;
672         struct bio *bio = &bbio->bio;
673         u64 logical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
674         u64 length = bio->bi_iter.bi_size;
675         u64 map_length = length;
676         bool use_append = btrfs_use_zone_append(bbio);
677         struct btrfs_io_context *bioc = NULL;
678         struct btrfs_io_stripe smap;
679         blk_status_t ret;
680         int error;
681 
682         smap.is_scrub = !bbio->inode;
683 
684         btrfs_bio_counter_inc_blocked(fs_info);
685         error = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
686                                 &bioc, &smap, &mirror_num);
687         if (error) {
688                 ret = errno_to_blk_status(error);
689                 goto fail;
690         }
691 
692         map_length = min(map_length, length);
693         if (use_append)
694                 map_length = min(map_length, fs_info->max_zone_append_size);
695 
696         if (map_length < length) {
697                 bbio = btrfs_split_bio(fs_info, bbio, map_length, use_append);
698                 bio = &bbio->bio;
699         }
700 
701         /*
702          * Save the iter for the end_io handler and preload the checksums for
703          * data reads.
704          */
705         if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio)) {
706                 bbio->saved_iter = bio->bi_iter;
707                 ret = btrfs_lookup_bio_sums(bbio);
708                 if (ret)
709                         goto fail_put_bio;
710         }
711 
712         if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
713                 if (use_append) {
714                         bio->bi_opf &= ~REQ_OP_WRITE;
715                         bio->bi_opf |= REQ_OP_ZONE_APPEND;
716                 }
717 
718                 if (is_data_bbio(bbio) && bioc &&
719                     btrfs_need_stripe_tree_update(bioc->fs_info, bioc->map_type)) {
720                         /*
721                          * No locking for the list update, as we only add to
722                          * the list in the I/O submission path, and list
723                          * iteration only happens in the completion path, which
724                          * can't happen until after the last submission.
725                          */
726                         btrfs_get_bioc(bioc);
727                         list_add_tail(&bioc->rst_ordered_entry, &bbio->ordered->bioc_list);
728                 }
729 
730                 /*
731                  * Csum items for reloc roots have already been cloned at this
732                  * point, so they are handled as part of the no-checksum case.
733                  */
734                 if (inode && !(inode->flags & BTRFS_INODE_NODATASUM) &&
735                     !test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state) &&
736                     !btrfs_is_data_reloc_root(inode->root)) {
737                         if (should_async_write(bbio) &&
738                             btrfs_wq_submit_bio(bbio, bioc, &smap, mirror_num))
739                                 goto done;
740 
741                         ret = btrfs_bio_csum(bbio);
742                         if (ret)
743                                 goto fail_put_bio;
744                 } else if (use_append ||
745                            (btrfs_is_zoned(fs_info) && inode &&
746                             inode->flags & BTRFS_INODE_NODATASUM)) {
747                         ret = btrfs_alloc_dummy_sum(bbio);
748                         if (ret)
749                                 goto fail_put_bio;
750                 }
751         }
752 
753         __btrfs_submit_bio(bio, bioc, &smap, mirror_num);
754 done:
755         return map_length == length;
756 
757 fail_put_bio:
758         if (map_length < length)
759                 btrfs_cleanup_bio(bbio);
760 fail:
761         btrfs_bio_counter_dec(fs_info);
762         btrfs_bio_end_io(orig_bbio, ret);
763         /* Do not submit another chunk */
764         return true;
765 }
766 
767 void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num)
768 {
769         /* If bbio->inode is not populated, its file_offset must be 0. */
770         ASSERT(bbio->inode || bbio->file_offset == 0);
771 
772         while (!btrfs_submit_chunk(bbio, mirror_num))
773                 ;
774 }
775 
776 /*
777  * Submit a repair write.
778  *
779  * This bypasses btrfs_submit_bio deliberately, as that writes all copies in a
780  * RAID setup.  Here we only want to write the one bad copy, so we do the
781  * mapping ourselves and submit the bio directly.
782  *
783  * The I/O is issued synchronously to block the repair read completion from
784  * freeing the bio.
785  */
786 int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
787                             u64 length, u64 logical, struct folio *folio,
788                             unsigned int folio_offset, int mirror_num)
789 {
790         struct btrfs_io_stripe smap = { 0 };
791         struct bio_vec bvec;
792         struct bio bio;
793         int ret = 0;
794 
795         ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
796         BUG_ON(!mirror_num);
797 
798         if (btrfs_repair_one_zone(fs_info, logical))
799                 return 0;
800 
801         /*
802          * Avoid races with device replace and make sure our bioc has devices
803          * associated to its stripes that don't go away while we are doing the
804          * read repair operation.
805          */
806         btrfs_bio_counter_inc_blocked(fs_info);
807         ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num);
808         if (ret < 0)
809                 goto out_counter_dec;
810 
811         if (!smap.dev->bdev ||
812             !test_bit(BTRFS_DEV_STATE_WRITEABLE, &smap.dev->dev_state)) {
813                 ret = -EIO;
814                 goto out_counter_dec;
815         }
816 
817         bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
818         bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
819         ret = bio_add_folio(&bio, folio, length, folio_offset);
820         ASSERT(ret);
821         ret = submit_bio_wait(&bio);
822         if (ret) {
823                 /* try to remap that extent elsewhere? */
824                 btrfs_dev_stat_inc_and_print(smap.dev, BTRFS_DEV_STAT_WRITE_ERRS);
825                 goto out_bio_uninit;
826         }
827 
828         btrfs_info_rl_in_rcu(fs_info,
829                 "read error corrected: ino %llu off %llu (dev %s sector %llu)",
830                              ino, start, btrfs_dev_name(smap.dev),
831                              smap.physical >> SECTOR_SHIFT);
832         ret = 0;
833 
834 out_bio_uninit:
835         bio_uninit(&bio);
836 out_counter_dec:
837         btrfs_bio_counter_dec(fs_info);
838         return ret;
839 }
840 
841 /*
842  * Submit a btrfs_bio based repair write.
843  *
844  * If @dev_replace is true, the write would be submitted to dev-replace target.
845  */
846 void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace)
847 {
848         struct btrfs_fs_info *fs_info = bbio->fs_info;
849         u64 logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
850         u64 length = bbio->bio.bi_iter.bi_size;
851         struct btrfs_io_stripe smap = { 0 };
852         int ret;
853 
854         ASSERT(fs_info);
855         ASSERT(mirror_num > 0);
856         ASSERT(btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE);
857         ASSERT(!bbio->inode);
858 
859         btrfs_bio_counter_inc_blocked(fs_info);
860         ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num);
861         if (ret < 0)
862                 goto fail;
863 
864         if (dev_replace) {
865                 ASSERT(smap.dev == fs_info->dev_replace.srcdev);
866                 smap.dev = fs_info->dev_replace.tgtdev;
867         }
868         __btrfs_submit_bio(&bbio->bio, NULL, &smap, mirror_num);
869         return;
870 
871 fail:
872         btrfs_bio_counter_dec(fs_info);
873         btrfs_bio_end_io(bbio, errno_to_blk_status(ret));
874 }
875 
876 int __init btrfs_bioset_init(void)
877 {
878         if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
879                         offsetof(struct btrfs_bio, bio),
880                         BIOSET_NEED_BVECS))
881                 return -ENOMEM;
882         if (bioset_init(&btrfs_clone_bioset, BIO_POOL_SIZE,
883                         offsetof(struct btrfs_bio, bio), 0))
884                 goto out_free_bioset;
885         if (bioset_init(&btrfs_repair_bioset, BIO_POOL_SIZE,
886                         offsetof(struct btrfs_bio, bio),
887                         BIOSET_NEED_BVECS))
888                 goto out_free_clone_bioset;
889         if (mempool_init_kmalloc_pool(&btrfs_failed_bio_pool, BIO_POOL_SIZE,
890                                       sizeof(struct btrfs_failed_bio)))
891                 goto out_free_repair_bioset;
892         return 0;
893 
894 out_free_repair_bioset:
895         bioset_exit(&btrfs_repair_bioset);
896 out_free_clone_bioset:
897         bioset_exit(&btrfs_clone_bioset);
898 out_free_bioset:
899         bioset_exit(&btrfs_bioset);
900         return -ENOMEM;
901 }
902 
903 void __cold btrfs_bioset_exit(void)
904 {
905         mempool_exit(&btrfs_failed_bio_pool);
906         bioset_exit(&btrfs_repair_bioset);
907         bioset_exit(&btrfs_clone_bioset);
908         bioset_exit(&btrfs_bioset);
909 }
910 

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

kernel.org | git.kernel.org | LWN.net | Project Home | SVN repository | Mail admin

Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.

sflogo.php