TOMOYO Linux Cross Reference
Linux/fs/btrfs/direct-io.c
  1 // SPDX-License-Identifier: GPL-2.0
  2 
  3 #include <linux/fsverity.h>
  4 #include <linux/iomap.h>
  5 #include "ctree.h"
  6 #include "delalloc-space.h"
  7 #include "direct-io.h"
  8 #include "extent-tree.h"
  9 #include "file.h"
 10 #include "fs.h"
 11 #include "transaction.h"
 12 #include "volumes.h"
 13 
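/*
 * Per-request state for one direct I/O operation.  btrfs_dio_read() and
 * btrfs_dio_write() allocate this on their stack and hand it to iomap as
 * private data, so the iomap callbacks below can find the data reservation,
 * the in-flight ordered extent and how many bytes were actually submitted.
 */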
 14 struct btrfs_dio_data {
 15         ssize_t submitted;
 16         struct extent_changeset *data_reserved;
 17         struct btrfs_ordered_extent *ordered;
 18         bool data_space_reserved;
 19         bool nocow_done;
 20 };
 21 
 22 struct btrfs_dio_private {
 23         /* Range of I/O */
 24         u64 file_offset;
 25         u32 bytes;
 26 
 27         /* This must be last */
 28         struct btrfs_bio bbio;
 29 };
 30 
 31 static struct bio_set btrfs_dio_bioset;
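A minimal userspace sketch (not kernel code) of why "This must be last" matters above: btrfs_init_dio() at the bottom of this file creates btrfs_dio_bioset with offsetof(struct btrfs_dio_private, bbio.bio) bytes of front padding, so every bio allocated from it sits at the tail of a btrfs_dio_private and the wrapper can be recovered with container_of(), as btrfs_dio_end_io() and btrfs_dio_submit_io() do.  The mock_* types here are simplified stand-ins, not the real kernel structures.

	#include <stddef.h>
	#include <stdio.h>

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct mock_bio { int opf; };                   /* stand-in for struct bio */
	struct mock_btrfs_bio { struct mock_bio bio; }; /* stand-in for struct btrfs_bio */
	struct mock_dio_private {                       /* stand-in for btrfs_dio_private */
		unsigned long long file_offset;
		unsigned int bytes;
		struct mock_btrfs_bio bbio;             /* must be last */
	};

	int main(void)
	{
		struct mock_dio_private dip = { .file_offset = 4096, .bytes = 512 };
		struct mock_bio *bio = &dip.bbio.bio;
		/* What the bioset front padding and container_of() rely on. */
		struct mock_dio_private *back =
			container_of(bio, struct mock_dio_private, bbio.bio);

		printf("front_pad = %zu\n", offsetof(struct mock_dio_private, bbio.bio));
		printf("bytes via container_of = %u\n", back->bytes);
		return 0;
	}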
 32 
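/*
 * Lock the extent range [lockstart, lockend] for a direct I/O, making sure
 * no ordered extent and, for a write, no buffered page covers it.
 *
 * Returns 0 with the range locked, -EAGAIN if IOMAP_NOWAIT is set and we
 * would have to block, or -ENOTBLK if the operation must fall back to
 * buffered I/O.
 */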
 33 static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
 34                               struct extent_state **cached_state,
 35                               unsigned int iomap_flags)
 36 {
 37         const bool writing = (iomap_flags & IOMAP_WRITE);
 38         const bool nowait = (iomap_flags & IOMAP_NOWAIT);
 39         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 40         struct btrfs_ordered_extent *ordered;
 41         int ret = 0;
 42 
 43         while (1) {
 44                 if (nowait) {
 45                         if (!try_lock_extent(io_tree, lockstart, lockend,
 46                                              cached_state))
 47                                 return -EAGAIN;
 48                 } else {
 49                         lock_extent(io_tree, lockstart, lockend, cached_state);
 50                 }
 51                 /*
 52                  * We're concerned with the entire range that we're going to be
 53                  * doing DIO to, so we need to make sure there are no ordered
 54                  * extents in this range.
 55                  */
 56                 ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,
 57                                                      lockend - lockstart + 1);
 58 
 59                 /*
 60                  * We need to make sure there are no buffered pages in this
 61                  * range either; we could have raced between the invalidate in
 62                  * generic_file_direct_write and locking the extent.  The
 63                  * invalidate needs to happen so that reads after a write do not
 64                  * get stale data.
 65                  */
 66                 if (!ordered &&
 67                     (!writing || !filemap_range_has_page(inode->i_mapping,
 68                                                          lockstart, lockend)))
 69                         break;
 70 
 71                 unlock_extent(io_tree, lockstart, lockend, cached_state);
 72 
 73                 if (ordered) {
 74                         if (nowait) {
 75                                 btrfs_put_ordered_extent(ordered);
 76                                 ret = -EAGAIN;
 77                                 break;
 78                         }
 79                         /*
 80                          * If we are doing a DIO read and the ordered extent we
 81                          * found is for a buffered write, we can not wait for it
 82                          * to complete and retry, because if we do so we can
 83                          * deadlock with concurrent buffered writes on page
 84                          * locks. This happens only if our DIO read covers more
 85                          * than one extent map, if at this point it has already
 86                          * created an ordered extent for a previous extent map
 87                          * and locked its range in the inode's io tree, and a
 88                          * concurrent write against that previous extent map's
 89                          * range and this range started (we unlock the ranges
 90                          * in the io tree only when the bios complete and
 91                          * buffered writes always lock pages before attempting
 92                          * to lock the range in the io tree).
 93                          */
 94                         if (writing ||
 95                             test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
 96                                 btrfs_start_ordered_extent(ordered);
 97                         else
 98                                 ret = nowait ? -EAGAIN : -ENOTBLK;
 99                         btrfs_put_ordered_extent(ordered);
100                 } else {
101                         /*
102                          * We could trigger writeback for this range (and wait
103                          * for it to complete) and then invalidate the pages for
104                          * this range (through invalidate_inode_pages2_range()),
105                          * but that can lead us to a deadlock with a concurrent
106                          * call to readahead (a buffered read or a defrag call
107                          * triggered a readahead) on a page lock due to an
108                          * ordered dio extent we created before but did not yet
109                          * have a corresponding bio submitted (hence it cannot
110                          * complete), which makes readahead wait for that
111                          * ordered extent to complete while holding a lock on
112                          * that page.
113                          */
114                         ret = nowait ? -EAGAIN : -ENOTBLK;
115                 }
116 
117                 if (ret)
118                         break;
119 
120                 cond_resched();
121         }
122 
123         return ret;
124 }
125 
126 static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
127                                                   struct btrfs_dio_data *dio_data,
128                                                   const u64 start,
129                                                   const struct btrfs_file_extent *file_extent,
130                                                   const int type)
131 {
132         struct extent_map *em = NULL;
133         struct btrfs_ordered_extent *ordered;
134 
135         if (type != BTRFS_ORDERED_NOCOW) {
136                 em = btrfs_create_io_em(inode, start, file_extent, type);
137                 if (IS_ERR(em))
138                         goto out;
139         }
140 
141         ordered = btrfs_alloc_ordered_extent(inode, start, file_extent,
142                                              (1 << type) |
143                                              (1 << BTRFS_ORDERED_DIRECT));
144         if (IS_ERR(ordered)) {
145                 if (em) {
146                         free_extent_map(em);
147                         btrfs_drop_extent_map_range(inode, start,
148                                         start + file_extent->num_bytes - 1, false);
149                 }
150                 em = ERR_CAST(ordered);
151         } else {
152                 ASSERT(!dio_data->ordered);
153                 dio_data->ordered = ordered;
154         }
155  out:
156 
157         return em;
158 }
159 
160 static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
161                                                   struct btrfs_dio_data *dio_data,
162                                                   u64 start, u64 len)
163 {
164         struct btrfs_root *root = inode->root;
165         struct btrfs_fs_info *fs_info = root->fs_info;
166         struct btrfs_file_extent file_extent;
167         struct extent_map *em;
168         struct btrfs_key ins;
169         u64 alloc_hint;
170         int ret;
171 
172         alloc_hint = btrfs_get_extent_allocation_hint(inode, start, len);
173 again:
174         ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
175                                    0, alloc_hint, &ins, 1, 1);
176         if (ret == -EAGAIN) {
177                 ASSERT(btrfs_is_zoned(fs_info));
178                 wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH,
179                                TASK_UNINTERRUPTIBLE);
180                 goto again;
181         }
182         if (ret)
183                 return ERR_PTR(ret);
184 
185         file_extent.disk_bytenr = ins.objectid;
186         file_extent.disk_num_bytes = ins.offset;
187         file_extent.num_bytes = ins.offset;
188         file_extent.ram_bytes = ins.offset;
189         file_extent.offset = 0;
190         file_extent.compression = BTRFS_COMPRESS_NONE;
191         em = btrfs_create_dio_extent(inode, dio_data, start, &file_extent,
192                                      BTRFS_ORDERED_REGULAR);
193         btrfs_dec_block_group_reservations(fs_info, ins.objectid);
194         if (IS_ERR(em))
195                 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset,
196                                            1);
197 
198         return em;
199 }
200 
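/*
 * Prepare the mapping for a direct write at [start, start + *lenp).  If a
 * NOCOW write is possible (NODATACOW inode or PREALLOC extent, confirmed by
 * can_nocow_extent()), reuse the existing extent and reserve only metadata;
 * otherwise drop the input extent map and allocate a new COW extent with
 * btrfs_new_extent_direct().  *lenp is trimmed to what is actually covered
 * and i_size is updated under the extent lock for writes past EOF.
 */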
201 static int btrfs_get_blocks_direct_write(struct extent_map **map,
202                                          struct inode *inode,
203                                          struct btrfs_dio_data *dio_data,
204                                          u64 start, u64 *lenp,
205                                          unsigned int iomap_flags)
206 {
207         const bool nowait = (iomap_flags & IOMAP_NOWAIT);
208         struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
209         struct btrfs_file_extent file_extent;
210         struct extent_map *em = *map;
211         int type;
212         u64 block_start;
213         struct btrfs_block_group *bg;
214         bool can_nocow = false;
215         bool space_reserved = false;
216         u64 len = *lenp;
217         u64 prev_len;
218         int ret = 0;
219 
220         /*
221          * We don't allocate a new extent in the following cases
222          *
223          * 1) The inode is marked as NODATACOW. In this case we'll just use the
224          * existing extent.
225          * 2) The extent is marked as PREALLOC. We're good to go here and can
226          * just use the extent.
227          *
228          */
229         if ((em->flags & EXTENT_FLAG_PREALLOC) ||
230             ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
231              em->disk_bytenr != EXTENT_MAP_HOLE)) {
232                 if (em->flags & EXTENT_FLAG_PREALLOC)
233                         type = BTRFS_ORDERED_PREALLOC;
234                 else
235                         type = BTRFS_ORDERED_NOCOW;
236                 len = min(len, em->len - (start - em->start));
237                 block_start = extent_map_block_start(em) + (start - em->start);
238 
239                 if (can_nocow_extent(inode, start, &len,
240                                      &file_extent, false, false) == 1) {
241                         bg = btrfs_inc_nocow_writers(fs_info, block_start);
242                         if (bg)
243                                 can_nocow = true;
244                 }
245         }
246 
247         prev_len = len;
248         if (can_nocow) {
249                 struct extent_map *em2;
250 
251                 /* We can NOCOW, so only need to reserve metadata space. */
252                 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
253                                                       nowait);
254                 if (ret < 0) {
255                         /* Our caller expects us to free the input extent map. */
256                         free_extent_map(em);
257                         *map = NULL;
258                         btrfs_dec_nocow_writers(bg);
259                         if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
260                                 ret = -EAGAIN;
261                         goto out;
262                 }
263                 space_reserved = true;
264 
265                 em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start,
266                                               &file_extent, type);
267                 btrfs_dec_nocow_writers(bg);
268                 if (type == BTRFS_ORDERED_PREALLOC) {
269                         free_extent_map(em);
270                         *map = em2;
271                         em = em2;
272                 }
273 
274                 if (IS_ERR(em2)) {
275                         ret = PTR_ERR(em2);
276                         goto out;
277                 }
278 
279                 dio_data->nocow_done = true;
280         } else {
281                 /* Our caller expects us to free the input extent map. */
282                 free_extent_map(em);
283                 *map = NULL;
284 
285                 if (nowait) {
286                         ret = -EAGAIN;
287                         goto out;
288                 }
289 
290                 /*
291                  * If we could not allocate data space before locking the file
292                  * range and we can't do a NOCOW write, then we have to fail.
293                  */
294                 if (!dio_data->data_space_reserved) {
295                         ret = -ENOSPC;
296                         goto out;
297                 }
298 
299                 /*
300                  * We have to COW and we have already reserved data space before,
301                  * so now we reserve only metadata.
302                  */
303                 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
304                                                       false);
305                 if (ret < 0)
306                         goto out;
307                 space_reserved = true;
308 
309                 em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len);
310                 if (IS_ERR(em)) {
311                         ret = PTR_ERR(em);
312                         goto out;
313                 }
314                 *map = em;
315                 len = min(len, em->len - (start - em->start));
316                 if (len < prev_len)
317                         btrfs_delalloc_release_metadata(BTRFS_I(inode),
318                                                         prev_len - len, true);
319         }
320 
321         /*
322          * We have created our ordered extent, so we can now release our reservation
323          * for an outstanding extent.
324          */
325         btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len);
326 
327         /*
328          * Need to update the i_size under the extent lock so buffered
329          * readers will get the updated i_size when we unlock.
330          */
331         if (start + len > i_size_read(inode))
332                 i_size_write(inode, start + len);
333 out:
334         if (ret && space_reserved) {
335                 btrfs_delalloc_release_extents(BTRFS_I(inode), len);
336                 btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
337         }
338         *lenp = len;
339         return ret;
340 }
341 
342 static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
343                 loff_t length, unsigned int flags, struct iomap *iomap,
344                 struct iomap *srcmap)
345 {
346         struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
347         struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
348         struct extent_map *em;
349         struct extent_state *cached_state = NULL;
350         struct btrfs_dio_data *dio_data = iter->private;
351         u64 lockstart, lockend;
352         const bool write = !!(flags & IOMAP_WRITE);
353         int ret = 0;
354         u64 len = length;
355         const u64 data_alloc_len = length;
356         bool unlock_extents = false;
357 
358         /*
359          * We could potentially fault if we have a buffer > PAGE_SIZE, and if
360          * we're NOWAIT we may submit a bio for a partial range and return
361          * EIOCBQUEUED, which would result in an errant short read.
362          *
363          * The best way to handle this would be to allow for partial completions
364          * of iocb's, so we could submit the partial bio, return and fault in
365          * the rest of the pages, and then submit the io for the rest of the
366          * range.  However we don't have that currently, so simply return
367          * -EAGAIN at this point so that the normal path is used.
368          */
369         if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE)
370                 return -EAGAIN;
371 
372         /*
373          * Cap the size of reads to that usually seen in buffered I/O as we need
374          * to allocate a contiguous array for the checksums.
375          */
376         if (!write)
377                 len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS);
378 
379         lockstart = start;
380         lockend = start + len - 1;
381 
382         /*
383          * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't
384          * enough if we've written compressed pages to this area, so we need to
385          * flush the dirty pages again to make absolutely sure that any
386          * outstanding dirty pages are on disk - the first flush only starts
387          * compression on the data, while keeping the pages locked, so by the
388          * time the second flush returns we know bios for the compressed pages
389          * were submitted and finished, and the pages are no longer under writeback.
390          *
391          * If we have a NOWAIT request and we have any pages in the range that
392          * are locked, likely due to compression still in progress, we don't want
393          * to block on page locks. We also don't want to block on pages marked as
394          * dirty or under writeback (same as for the non-compression case).
395          * iomap_dio_rw() did the same check, but after that and before we got
396          * here, mmap'ed writes may have happened or buffered reads started
397          * (readpage() and readahead(), which lock pages), as we haven't locked
398          * the file range yet.
399          */
400         if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
401                      &BTRFS_I(inode)->runtime_flags)) {
402                 if (flags & IOMAP_NOWAIT) {
403                         if (filemap_range_needs_writeback(inode->i_mapping,
404                                                           lockstart, lockend))
405                                 return -EAGAIN;
406                 } else {
407                         ret = filemap_fdatawrite_range(inode->i_mapping, start,
408                                                        start + length - 1);
409                         if (ret)
410                                 return ret;
411                 }
412         }
413 
414         memset(dio_data, 0, sizeof(*dio_data));
415 
416         /*
417          * We always try to allocate data space and must do it before locking
418          * the file range, to avoid deadlocks with concurrent writes to the same
419          * range if the range has several extents and the writes don't expand the
420          * current i_size (the inode lock is taken in shared mode). If we fail to
421          * allocate data space here we continue and later, after locking the
422          * file range, we fail with ENOSPC only if we figure out we can not do a
423          * NOCOW write.
424          */
425         if (write && !(flags & IOMAP_NOWAIT)) {
426                 ret = btrfs_check_data_free_space(BTRFS_I(inode),
427                                                   &dio_data->data_reserved,
428                                                   start, data_alloc_len, false);
429                 if (!ret)
430                         dio_data->data_space_reserved = true;
431                 else if (ret && !(BTRFS_I(inode)->flags &
432                                   (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
433                         goto err;
434         }
435 
436         /*
437          * If this errors out it's because we couldn't invalidate pagecache for
438          * this range and we need to fall back to buffered IO, or we are doing a
439          * NOWAIT read/write and we would need to block.
440          */
441         ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags);
442         if (ret < 0)
443                 goto err;
444 
445         em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len);
446         if (IS_ERR(em)) {
447                 ret = PTR_ERR(em);
448                 goto unlock_err;
449         }
450 
451         /*
452          * Ok, for INLINE and COMPRESSED extents we need to fall back to buffered
453          * io.  INLINE is special, and we could probably kludge it in here, but
454          * it's still buffered so for safety let's just fall back to the generic
455          * buffered path.
456          *
457          * For COMPRESSED we _have_ to read the entire extent in so we can
458          * decompress it, so there will be buffering required no matter what we
459          * do, so go ahead and fallback to buffered.
460          *
461          * We return -ENOTBLK because that's what makes DIO go ahead and go back
462          * to buffered IO.  Don't blame me, this is the price we pay for using
463          * the generic code.
464          */
465         if (extent_map_is_compressed(em) || em->disk_bytenr == EXTENT_MAP_INLINE) {
466                 free_extent_map(em);
467                 /*
468                  * If we are in a NOWAIT context, return -EAGAIN in order to
469                  * fallback to buffered IO. This is not only because we can
470                  * block with buffered IO (no support for NOWAIT semantics at
471                  * the moment) but also to avoid returning short reads to user
472                  * space - this happens if we were able to read some data from
473                  * previous non-compressed extents and then when we fallback to
474                  * buffered IO, at btrfs_file_read_iter() by calling
475                  * filemap_read(), we fail to fault in pages for the read buffer,
476                  * in which case filemap_read() returns a short read (the number
477                  * of bytes previously read is > 0, so it does not return -EFAULT).
478                  */
479                 ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK;
480                 goto unlock_err;
481         }
482 
483         len = min(len, em->len - (start - em->start));
484 
485         /*
486          * If we have a NOWAIT request and the range contains multiple extents
487          * (or a mix of extents and holes), then we return -EAGAIN to make the
488          * caller fall back to a context where it can do a blocking (without
489          * NOWAIT) request. This way we avoid doing partial IO and returning
490          * success to the caller, which is not optimal for writes and for reads
491          * it can result in unexpected behaviour for an application.
492          *
493          * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
494          * iomap_dio_rw(), we can end up returning less data than what the caller
495          * asked for, resulting in an unexpected, and incorrect, short read.
496          * That is, the caller asked to read N bytes and we return less than that,
497          * which is wrong unless we are crossing EOF. This happens if we get a
498          * page fault error when trying to fault in pages for the buffer that is
499          * associated to the struct iov_iter passed to iomap_dio_rw(), and we
500          * have previously submitted bios for other extents in the range, in
501          * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
502          * those bios have completed by the time we get the page fault error,
503          * which we return back to our caller - we should only return EIOCBQUEUED
504          * after we have submitted bios for all the extents in the range.
505          */
506         if ((flags & IOMAP_NOWAIT) && len < length) {
507                 free_extent_map(em);
508                 ret = -EAGAIN;
509                 goto unlock_err;
510         }
511 
512         if (write) {
513                 ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
514                                                     start, &len, flags);
515                 if (ret < 0)
516                         goto unlock_err;
517                 unlock_extents = true;
518                 /* Recalc len in case the new em is smaller than requested */
519                 len = min(len, em->len - (start - em->start));
520                 if (dio_data->data_space_reserved) {
521                         u64 release_offset;
522                         u64 release_len = 0;
523 
524                         if (dio_data->nocow_done) {
525                                 release_offset = start;
526                                 release_len = data_alloc_len;
527                         } else if (len < data_alloc_len) {
528                                 release_offset = start + len;
529                                 release_len = data_alloc_len - len;
530                         }
531 
532                         if (release_len > 0)
533                                 btrfs_free_reserved_data_space(BTRFS_I(inode),
534                                                                dio_data->data_reserved,
535                                                                release_offset,
536                                                                release_len);
537                 }
538         } else {
539                 /*
540                  * We need to unlock only the end area that we aren't using.
541                  * The rest is going to be unlocked by the endio routine.
542                  */
543                 lockstart = start + len;
544                 if (lockstart < lockend)
545                         unlock_extents = true;
546         }
547 
548         if (unlock_extents)
549                 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
550                               &cached_state);
551         else
552                 free_extent_state(cached_state);
553 
554         /*
555          * Translate extent map information to iomap.
556          * We trim the extents (and move the addr) even though iomap code does
557          * that, since we have locked only the parts we are performing I/O in.
558          */
559         if ((em->disk_bytenr == EXTENT_MAP_HOLE) ||
560             ((em->flags & EXTENT_FLAG_PREALLOC) && !write)) {
561                 iomap->addr = IOMAP_NULL_ADDR;
562                 iomap->type = IOMAP_HOLE;
563         } else {
564                 iomap->addr = extent_map_block_start(em) + (start - em->start);
565                 iomap->type = IOMAP_MAPPED;
566         }
567         iomap->offset = start;
568         iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
569         iomap->length = len;
570         free_extent_map(em);
571 
572         return 0;
573 
574 unlock_err:
575         unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
576                       &cached_state);
577 err:
578         if (dio_data->data_space_reserved) {
579                 btrfs_free_reserved_data_space(BTRFS_I(inode),
580                                                dio_data->data_reserved,
581                                                start, data_alloc_len);
582                 extent_changeset_free(dio_data->data_reserved);
583         }
584 
585         return ret;
586 }
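A userspace-only sketch of the extent_map to iomap translation done at the end of btrfs_dio_iomap_begin() above; all of the numbers are made up for illustration.

	#include <stdio.h>

	int main(void)
	{
		unsigned long long em_start = 0;               /* extent begins at file offset 0 */
		unsigned long long em_len = 1048576;           /* 1 MiB extent */
		unsigned long long em_disk_start = 270532608;  /* extent_map_block_start(em) */
		unsigned long long start = 65536;              /* DIO begins 64 KiB into the file */
		unsigned long long len = 2097152;              /* caller asked for 2 MiB */

		/* len = min(len, em->len - (start - em->start)) */
		if (len > em_len - (start - em_start))
			len = em_len - (start - em_start);

		printf("iomap.offset = %llu\n", start);
		printf("iomap.addr   = %llu\n", em_disk_start + (start - em_start));
		printf("iomap.length = %llu\n", len);          /* 983040: trimmed to the extent */
		return 0;
	}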
587 
588 static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
589                 ssize_t written, unsigned int flags, struct iomap *iomap)
590 {
591         struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
592         struct btrfs_dio_data *dio_data = iter->private;
593         size_t submitted = dio_data->submitted;
594         const bool write = !!(flags & IOMAP_WRITE);
595         int ret = 0;
596 
597         if (!write && (iomap->type == IOMAP_HOLE)) {
598                 /* If reading from a hole, unlock and return */
599                 unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1,
600                               NULL);
601                 return 0;
602         }
603 
604         if (submitted < length) {
605                 pos += submitted;
606                 length -= submitted;
607                 if (write)
608                         btrfs_finish_ordered_extent(dio_data->ordered, NULL,
609                                                     pos, length, false);
610                 else
611                         unlock_extent(&BTRFS_I(inode)->io_tree, pos,
612                                       pos + length - 1, NULL);
613                 ret = -ENOTBLK;
614         }
615         if (write) {
616                 btrfs_put_ordered_extent(dio_data->ordered);
617                 dio_data->ordered = NULL;
618         }
619 
620         if (write)
621                 extent_changeset_free(dio_data->data_reserved);
622         return ret;
623 }
624 
625 static void btrfs_dio_end_io(struct btrfs_bio *bbio)
626 {
627         struct btrfs_dio_private *dip =
628                 container_of(bbio, struct btrfs_dio_private, bbio);
629         struct btrfs_inode *inode = bbio->inode;
630         struct bio *bio = &bbio->bio;
631 
632         if (bio->bi_status) {
633                 btrfs_warn(inode->root->fs_info,
634                 "direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d",
635                            btrfs_ino(inode), bio->bi_opf,
636                            dip->file_offset, dip->bytes, bio->bi_status);
637         }
638 
639         if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
640                 btrfs_finish_ordered_extent(bbio->ordered, NULL,
641                                             dip->file_offset, dip->bytes,
642                                             !bio->bi_status);
643         } else {
644                 unlock_extent(&inode->io_tree, dip->file_offset,
645                               dip->file_offset + dip->bytes - 1, NULL);
646         }
647 
648         bbio->bio.bi_private = bbio->private;
649         iomap_dio_bio_end_io(bio);
650 }
651 
652 static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,
653                                         struct btrfs_ordered_extent *ordered)
654 {
655         u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
656         u64 len = bbio->bio.bi_iter.bi_size;
657         struct btrfs_ordered_extent *new;
658         int ret;
659 
660         /* Must always be called for the beginning of an ordered extent. */
661         if (WARN_ON_ONCE(start != ordered->disk_bytenr))
662                 return -EINVAL;
663 
664         /* No need to split if the ordered extent covers the entire bio. */
665         if (ordered->disk_num_bytes == len) {
666                 refcount_inc(&ordered->refs);
667                 bbio->ordered = ordered;
668                 return 0;
669         }
670 
671         /*
672          * Don't split the extent_map for NOCOW extents, as we're writing into
673          * a pre-existing one.
674          */
675         if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
676                 ret = split_extent_map(bbio->inode, bbio->file_offset,
677                                        ordered->num_bytes, len,
678                                        ordered->disk_bytenr);
679                 if (ret)
680                         return ret;
681         }
682 
683         new = btrfs_split_ordered_extent(ordered, len);
684         if (IS_ERR(new))
685                 return PTR_ERR(new);
686         bbio->ordered = new;
687         return 0;
688 }
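A userspace-only sketch of the arithmetic behind btrfs_extract_ordered_extent(): when a bio covers only the front of an ordered extent (a partial direct write), the ordered extent is split so that one piece matches the bio and the remainder stays pending.  The numbers are made up.

	#include <stdio.h>

	int main(void)
	{
		unsigned long long ordered_disk_num_bytes = 1048576;  /* 1 MiB ordered extent */
		unsigned long long bio_len = 262144;                  /* bio covers 256 KiB */

		if (ordered_disk_num_bytes == bio_len) {
			printf("no split needed\n");
		} else {
			/* btrfs_split_ordered_extent(ordered, len): one piece matches
			 * the bio, the remainder stays pending until its own bios. */
			printf("piece for this bio = %llu bytes\n", bio_len);
			printf("remainder          = %llu bytes\n",
			       ordered_disk_num_bytes - bio_len);
		}
		return 0;
	}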
689 
690 static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
691                                 loff_t file_offset)
692 {
693         struct btrfs_bio *bbio = btrfs_bio(bio);
694         struct btrfs_dio_private *dip =
695                 container_of(bbio, struct btrfs_dio_private, bbio);
696         struct btrfs_dio_data *dio_data = iter->private;
697 
698         btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info,
699                        btrfs_dio_end_io, bio->bi_private);
700         bbio->inode = BTRFS_I(iter->inode);
701         bbio->file_offset = file_offset;
702 
703         dip->file_offset = file_offset;
704         dip->bytes = bio->bi_iter.bi_size;
705 
706         dio_data->submitted += bio->bi_iter.bi_size;
707 
708         /*
709          * Check if we are doing a partial write.  If we are, we need to split
710          * the ordered extent to match the submitted bio.  Hang on to the
711          * remaining unfinishable ordered_extent in dio_data so that it can be
712          * cancelled in iomap_end to avoid a deadlock wherein faulting the
713          * remaining pages is blocked on the outstanding ordered extent.
714          */
715         if (iter->flags & IOMAP_WRITE) {
716                 int ret;
717 
718                 ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered);
719                 if (ret) {
720                         btrfs_finish_ordered_extent(dio_data->ordered, NULL,
721                                                     file_offset, dip->bytes,
722                                                     !ret);
723                         bio->bi_status = errno_to_blk_status(ret);
724                         iomap_dio_bio_end_io(bio);
725                         return;
726                 }
727         }
728 
729         btrfs_submit_bio(bbio, 0);
730 }
731 
732 static const struct iomap_ops btrfs_dio_iomap_ops = {
733         .iomap_begin            = btrfs_dio_iomap_begin,
734         .iomap_end              = btrfs_dio_iomap_end,
735 };
736 
737 static const struct iomap_dio_ops btrfs_dio_ops = {
738         .submit_io              = btrfs_dio_submit_io,
739         .bio_set                = &btrfs_dio_bioset,
740 };
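/*
 * btrfs_dio_read() and btrfs_dio_write() below hand these two ops tables to
 * iomap_dio_rw()/__iomap_dio_rw().  Roughly, iomap iterates the requested
 * range, calling .iomap_begin for each mapping, building bios from
 * btrfs_dio_bioset and passing them to .submit_io, and calling .iomap_end
 * when it is done with that mapping.  IOMAP_DIO_PARTIAL allows the iteration
 * to stop early (e.g. on a page fault with ->nofault set) so that the
 * callers can fault pages in and retry, with 'done_before' accounting for
 * the progress already made.
 */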
741 
742 static ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter,
743                               size_t done_before)
744 {
745         struct btrfs_dio_data data = { 0 };
746 
747         return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
748                             IOMAP_DIO_PARTIAL, &data, done_before);
749 }
750 
751 static struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
752                                          size_t done_before)
753 {
754         struct btrfs_dio_data data = { 0 };
755 
756         return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
757                             IOMAP_DIO_PARTIAL, &data, done_before);
758 }
759 
760 static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
761                                const struct iov_iter *iter, loff_t offset)
762 {
763         const u32 blocksize_mask = fs_info->sectorsize - 1;
764 
765         if (offset & blocksize_mask)
766                 return -EINVAL;
767 
768         if (iov_iter_alignment(iter) & blocksize_mask)
769                 return -EINVAL;
770 
771         return 0;
772 }
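A userspace example of a direct read that passes check_direct_IO(): the file offset, the buffer address and the length are all multiples of the sector size.  The 4096-byte alignment and the file name are assumptions for illustration; the value actually enforced is fs_info->sectorsize.

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>

	int main(void)
	{
		const size_t align = 4096;      /* assumed sectorsize */
		const size_t len = 16 * 4096;   /* length: multiple of the sector size */
		const off_t offset = 8 * 4096;  /* sector-aligned file offset */
		void *buf;
		ssize_t n;
		int fd;

		fd = open("testfile", O_RDONLY | O_DIRECT);  /* hypothetical file */
		if (fd < 0) {
			perror("open");
			return 1;
		}
		if (posix_memalign(&buf, align, len)) {      /* aligned buffer address */
			close(fd);
			return 1;
		}

		n = pread(fd, buf, len, offset);  /* on btrfs, served by the direct I/O path above */
		if (n < 0)
			perror("pread");
		else
			printf("read %zd bytes\n", n);

		free(buf);
		close(fd);
		return 0;
	}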
773 
774 ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
775 {
776         struct file *file = iocb->ki_filp;
777         struct inode *inode = file_inode(file);
778         struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
779         loff_t pos;
780         ssize_t written = 0;
781         ssize_t written_buffered;
782         size_t prev_left = 0;
783         loff_t endbyte;
784         ssize_t ret;
785         unsigned int ilock_flags = 0;
786         struct iomap_dio *dio;
787 
788         if (iocb->ki_flags & IOCB_NOWAIT)
789                 ilock_flags |= BTRFS_ILOCK_TRY;
790 
791         /*
792          * If the write DIO is within EOF and security bits will likely not be
793          * dropped by file_remove_privs() called from btrfs_write_check(), use a
794          * shared lock. Both conditions need to be rechecked after the lock is
795          * acquired.
796          */
797         if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode))
798                 ilock_flags |= BTRFS_ILOCK_SHARED;
799 
800 relock:
801         ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
802         if (ret < 0)
803                 return ret;
804 
805         /* Shared lock cannot be used with security bits set. */
806         if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) {
807                 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
808                 ilock_flags &= ~BTRFS_ILOCK_SHARED;
809                 goto relock;
810         }
811 
812         ret = generic_write_checks(iocb, from);
813         if (ret <= 0) {
814                 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
815                 return ret;
816         }
817 
818         ret = btrfs_write_check(iocb, from, ret);
819         if (ret < 0) {
820                 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
821                 goto out;
822         }
823 
824         pos = iocb->ki_pos;
825         /*
826          * Re-check since file size may have changed just before taking the
827          * lock or pos may have changed because of O_APPEND in generic_write_checks().
828          */
829         if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
830             pos + iov_iter_count(from) > i_size_read(inode)) {
831                 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
832                 ilock_flags &= ~BTRFS_ILOCK_SHARED;
833                 goto relock;
834         }
835 
836         if (check_direct_IO(fs_info, from, pos)) {
837                 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
838                 goto buffered;
839         }
840 
841         /*
842          * The iov_iter can be mapped to the same file range we are writing to.
843          * If that's the case, then we will deadlock in the iomap code, because
844          * it first calls our callback btrfs_dio_iomap_begin(), which will create
845          * an ordered extent, and after that it will fault in the pages that the
846          * iov_iter refers to. During the fault in we end up in the readahead
847          * pages code (starting at btrfs_readahead()), which will lock the range,
848          * find that ordered extent and then wait for it to complete (at
849          * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
850          * obviously the ordered extent can never complete as we haven't yet
851          * submitted the respective bio(s). This always happens when the buffer is
852          * memory mapped to the same file range, since the iomap DIO code always
853          * invalidates pages in the target file range (after starting and waiting
854          * for any writeback).
855          *
856          * So here we disable page faults in the iov_iter and then retry if we
857          * got -EFAULT, faulting in the pages before the retry.
858          */
859 again:
860         from->nofault = true;
861         dio = btrfs_dio_write(iocb, from, written);
862         from->nofault = false;
863 
864         if (IS_ERR_OR_NULL(dio)) {
865                 ret = PTR_ERR_OR_ZERO(dio);
866         } else {
867                 /*
868                  * If we have a synchronous write, we must make sure the fsync
869                  * triggered by the iomap_dio_complete() call below doesn't
870                  * deadlock on the inode lock - we are already holding it and we
871                  * can't call it after unlocking because we may need to complete
872                  * partial writes due to the input buffer (or parts of it) not
873                  * being already faulted in.
874                  */
875                 ASSERT(current->journal_info == NULL);
876                 current->journal_info = BTRFS_TRANS_DIO_WRITE_STUB;
877                 ret = iomap_dio_complete(dio);
878                 current->journal_info = NULL;
879         }
880 
881         /* No increment (+=) because iomap returns a cumulative value. */
882         if (ret > 0)
883                 written = ret;
884 
885         if (iov_iter_count(from) > 0 && (ret == -EFAULT || ret > 0)) {
886                 const size_t left = iov_iter_count(from);
887                 /*
888                  * We have more data left to write. Try to fault in as many of the
889                  * remaining pages as possible and retry. We do this without
890                  * releasing and locking again the inode, to prevent races with
891                  * truncate.
892                  *
893                  * Also, in case the iov refers to pages in the file range of the
894                  * file we want to write to (due to a mmap), we could enter an
895                  * infinite loop if we retry after faulting the pages in, since
896                  * iomap will invalidate any pages in the range early on, before
897                  * it tries to fault in the pages of the iov. So we keep track of
898                  * how much was left of iov in the previous EFAULT and fall back
899                  * to buffered IO in case we haven't made any progress.
900                  */
901                 if (left == prev_left) {
902                         ret = -ENOTBLK;
903                 } else {
904                         fault_in_iov_iter_readable(from, left);
905                         prev_left = left;
906                         goto again;
907                 }
908         }
909 
910         btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
911 
912         /*
913          * If 'ret' is -ENOTBLK or we have not written all data, then it means
914          * we must fall back to buffered IO.
915          */
916         if ((ret < 0 && ret != -ENOTBLK) || !iov_iter_count(from))
917                 goto out;
918 
919 buffered:
920         /*
921          * If we are in a NOWAIT context, then return -EAGAIN to signal the caller
922          * it must retry the operation in a context where blocking is acceptable,
923          * because even if we end up not blocking during the buffered IO attempt
924          * below, we will block when flushing and waiting for the IO.
925          */
926         if (iocb->ki_flags & IOCB_NOWAIT) {
927                 ret = -EAGAIN;
928                 goto out;
929         }
930 
931         pos = iocb->ki_pos;
932         written_buffered = btrfs_buffered_write(iocb, from);
933         if (written_buffered < 0) {
934                 ret = written_buffered;
935                 goto out;
936         }
937         /*
938          * Ensure all data is persisted. We want the next direct IO read to be
939          * able to read what was just written.
940          */
941         endbyte = pos + written_buffered - 1;
942         ret = btrfs_fdatawrite_range(BTRFS_I(inode), pos, endbyte);
943         if (ret)
944                 goto out;
945         ret = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
946         if (ret)
947                 goto out;
948         written += written_buffered;
949         iocb->ki_pos = pos + written_buffered;
950         invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
951                                  endbyte >> PAGE_SHIFT);
952 out:
953         return ret < 0 ? ret : written;
954 }
955 
956 static int check_direct_read(struct btrfs_fs_info *fs_info,
957                              const struct iov_iter *iter, loff_t offset)
958 {
959         int ret;
960         int i, seg;
961 
962         ret = check_direct_IO(fs_info, iter, offset);
963         if (ret < 0)
964                 return ret;
965 
966         if (!iter_is_iovec(iter))
967                 return 0;
968 
969         for (seg = 0; seg < iter->nr_segs; seg++) {
970                 for (i = seg + 1; i < iter->nr_segs; i++) {
971                         const struct iovec *iov1 = iter_iov(iter) + seg;
972                         const struct iovec *iov2 = iter_iov(iter) + i;
973 
974                         if (iov1->iov_base == iov2->iov_base)
975                                 return -EINVAL;
976                 }
977         }
978         return 0;
979 }
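A small userspace illustration of the iovec shape that check_direct_read() rejects: two segments sharing the same base address.  For such a vector btrfs_direct_read() below returns 0 without doing any direct I/O.  The file name and sizes are assumptions for illustration.

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/uio.h>
	#include <unistd.h>

	int main(void)
	{
		void *buf;
		ssize_t n;
		int fd = open("testfile", O_RDONLY | O_DIRECT);  /* hypothetical file */

		if (fd < 0) {
			perror("open");
			return 1;
		}
		if (posix_memalign(&buf, 4096, 4096)) {
			close(fd);
			return 1;
		}

		struct iovec iov[2] = {
			{ .iov_base = buf, .iov_len = 4096 },
			{ .iov_base = buf, .iov_len = 4096 },  /* same base as segment 0 */
		};

		/* Both segments point at the same buffer, so check_direct_read()
		 * returns -EINVAL and btrfs_direct_read() bails out with 0, i.e. the
		 * request is not serviced by the direct I/O path. */
		n = preadv(fd, iov, 2, 0);
		printf("preadv returned %zd\n", n);

		free(buf);
		close(fd);
		return 0;
	}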
980 
981 ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
982 {
983         struct inode *inode = file_inode(iocb->ki_filp);
984         size_t prev_left = 0;
985         ssize_t read = 0;
986         ssize_t ret;
987 
988         if (fsverity_active(inode))
989                 return 0;
990 
991         if (check_direct_read(inode_to_fs_info(inode), to, iocb->ki_pos))
992                 return 0;
993 
994         btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
995 again:
996         /*
997          * This is similar to what we do for direct IO writes, see the comment
998          * at btrfs_direct_write(), but we also disable page faults in addition
999          * to disabling them only at the iov_iter level. This is because when
1000          * reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
1001          * which can still trigger page fault-ins despite having set ->nofault
1002          * to true on our 'to' iov_iter.
1003          *
1004          * The difference from direct IO writes is that we deadlock when trying
1005          * to lock the extent range in the inode's tree during the page reads
1006          * triggered by the fault in (while for writes it is due to waiting for
1007          * our own ordered extent). This is because for direct IO reads,
1008          * btrfs_dio_iomap_begin() returns with the extent range locked, which
1009          * is only unlocked in the endio callback (end_bio_extent_readpage()).
1010          */
1011         pagefault_disable();
1012         to->nofault = true;
1013         ret = btrfs_dio_read(iocb, to, read);
1014         to->nofault = false;
1015         pagefault_enable();
1016 
1017         /* No increment (+=) because iomap returns a cumulative value. */
1018         if (ret > 0)
1019                 read = ret;
1020 
1021         if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
1022                 const size_t left = iov_iter_count(to);
1023 
1024                 if (left == prev_left) {
1025                         /*
1026                          * We didn't make any progress since the last attempt,
1027                          * fallback to a buffered read for the remainder of the
1028                          * fall back to a buffered read for the remainder of the
1029                          * for too long.
1030                          */
1031                         ret = read;
1032                 } else {
1033                         /*
1034                          * We made some progress since the last retry or this is
1035                          * the first time we are retrying. Fault in as many pages
1036                          * as possible and retry.
1037                          */
1038                         fault_in_iov_iter_writeable(to, left);
1039                         prev_left = left;
1040                         goto again;
1041                 }
1042         }
1043         btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
1044         return ret < 0 ? ret : read;
1045 }
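/*
 * btrfs_direct_write() and btrfs_direct_read() are the externally visible
 * entry points of this file (besides the init/exit helpers below); in this
 * kernel version they are called from the ->write_iter and ->read_iter
 * paths in fs/btrfs/file.c when IOCB_DIRECT is set on the kiocb.
 */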
1046 
1047 int __init btrfs_init_dio(void)
1048 {
1049         if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
1050                         offsetof(struct btrfs_dio_private, bbio.bio),
1051                         BIOSET_NEED_BVECS))
1052                 return -ENOMEM;
1053 
1054         return 0;
1055 }
1056 
1057 void __cold btrfs_destroy_dio(void)
1058 {
1059         bioset_exit(&btrfs_dio_bioset);
1060 }
1061 
