TOMOYO Linux Cross Reference
Linux/fs/btrfs/fiemap.c

  1 // SPDX-License-Identifier: GPL-2.0
  2 
  3 #include "backref.h"
  4 #include "btrfs_inode.h"
  5 #include "fiemap.h"
  6 #include "file.h"
  7 #include "file-item.h"
  8 
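    /*
     * One extent entry ready to be copied to the fiemap user buffer. The
     * fields mirror those of struct fiemap_extent (logical offset, physical
     * address, length and flags), as passed to fiemap_fill_next_extent().
     */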
  9 struct btrfs_fiemap_entry {
 10         u64 offset;
 11         u64 phys;
 12         u64 len;
 13         u32 flags;
 14 };
 15 
 16 /*
 17  * Indicate to the caller of emit_fiemap_extent() that it needs to unlock the
 18  * file range from the inode's io tree, unlock the subvolume tree search path,
 19  * flush the fiemap cache, relock the file range and re-search the subvolume
 20  * tree. The value is negative so it can't be confused with a valid errno
 21  * value, and it is different from 1 because 1 is also a return value from
 22  * fiemap_fill_next_extent() and is often used to mean some btree search did
 23  * not find a key, so make it some distinct negative value.
 24  */
 25 #define BTRFS_FIEMAP_FLUSH_CACHE (-(MAX_ERRNO + 1))
 26 
 27 /*
 28  * Used to:
 29  *
 30  * - Cache the next entry to be emitted to the fiemap buffer, so that we can
 31  *   merge extents that are contiguous and can be grouped as a single one;
 32  *
 33  * - Store extents ready to be written to the fiemap buffer in an intermediary
 34  *   buffer. This intermediary buffer is to ensure that in case the fiemap
 35  *   buffer is memory mapped to the fiemap target file, we don't deadlock
 36  *   during btrfs_page_mkwrite(). This is because during fiemap we are locking
 37  *   an extent range in order to prevent races with delalloc flushing and
 38  *   ordered extent completion, which is needed in order to reliably detect
 39  *   delalloc in holes and prealloc extents. And this can lead to a deadlock
 40  *   if the fiemap buffer is memory mapped to the file we are running fiemap
 41  *   against (a silly scenario, useless in practice, but possible) because
 42  *   btrfs_page_mkwrite() will try to lock the same extent range.
 43  */
 44 struct fiemap_cache {
 45         /* An array of ready fiemap entries. */
 46         struct btrfs_fiemap_entry *entries;
 47         /* Capacity of the entries array. */
 48         int entries_size;
 49         /* Index of the next entry in the entries array to write to. */
 50         int entries_pos;
 51         /*
 52          * Once the entries array is full, this indicates the offset of the
 53          * next file extent item we must search for in the inode's subvolume
 54          * tree after unlocking the extent range in the inode's io tree and
 55          * releasing the search path.
 56          */
 57         u64 next_search_offset;
 58         /*
 59          * This matches struct fiemap_extent_info::fi_mapped_extents: we use it
 60          * to count the emitted extents ourselves and stop, instead of relying on
 61          * fiemap_fill_next_extent(), because we buffer ready fiemap entries in
 62          * the @entries array and we want to stop as soon as we hit the max
 63          * number of extents to map, not just to save time but also to make the
 64          * logic at extent_fiemap() simpler.
 65          */
 66         unsigned int extents_mapped;
 67         /* Fields for the currently cached (not yet submitted) extent. */
 68         u64 offset;
 69         u64 phys;
 70         u64 len;
 71         u32 flags;
 72         bool cached;
 73 };
 74 
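    /*
     * Copy the buffered entries to the fiemap user buffer. A return value of
     * 1 from fiemap_fill_next_extent() (maximum number of extents reached) is
     * ignored here because emit_fiemap_extent() already tracks that limit
     * through cache->extents_mapped.
     */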
 75 static int flush_fiemap_cache(struct fiemap_extent_info *fieinfo,
 76                               struct fiemap_cache *cache)
 77 {
 78         for (int i = 0; i < cache->entries_pos; i++) {
 79                 struct btrfs_fiemap_entry *entry = &cache->entries[i];
 80                 int ret;
 81 
 82                 ret = fiemap_fill_next_extent(fieinfo, entry->offset,
 83                                               entry->phys, entry->len,
 84                                               entry->flags);
 85                 /*
 86                  * Ignore 1 (reached max entries) because we keep track of that
 87                  * ourselves in emit_fiemap_extent().
 88                  */
 89                 if (ret < 0)
 90                         return ret;
 91         }
 92         cache->entries_pos = 0;
 93 
 94         return 0;
 95 }
 96 
 97 /*
 98  * Helper to submit fiemap extent.
 99  *
100  * Will try to merge the current fiemap extent, specified by @offset, @phys,
101  * @len and @flags, with the cached one.
102  * Only when the merge fails is the cached one submitted as a fiemap
103  * extent.
104  *
105  * Return value is the same as fiemap_fill_next_extent().
106  */
107 static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
108                                 struct fiemap_cache *cache,
109                                 u64 offset, u64 phys, u64 len, u32 flags)
110 {
111         struct btrfs_fiemap_entry *entry;
112         u64 cache_end;
113 
114         /* Set at the end of extent_fiemap(). */
115         ASSERT((flags & FIEMAP_EXTENT_LAST) == 0);
116 
117         if (!cache->cached)
118                 goto assign;
119 
120         /*
121          * When iterating the extents of the inode, at extent_fiemap(), we may
122          * find an extent that starts at an offset behind the end offset of the
123          * previous extent we processed. This happens if fiemap is called
124          * without FIEMAP_FLAG_SYNC and there are ordered extents completing
125          * after we had to unlock the file range, release the search path, emit
126          * the fiemap extents stored in the buffer (cache->entries array) and
127          * then lock the remainder of the range and re-search the btree.
128          *
129          * For example we are in leaf X processing its last item, which is the
130          * file extent item for file range [512K, 1M[, and after
131          * btrfs_next_leaf() releases the path, there's an ordered extent that
132          * completes for the file range [768K, 2M[, and that results in trimming
133          * the file extent item so that it now corresponds to the file range
134          * [512K, 768K[ and a new file extent item is inserted for the file
135          * range [768K, 2M[, which may end up as the last item of leaf X or as
136          * the first item of the next leaf - in either case btrfs_next_leaf()
137          * will leave us with a path pointing to the new extent item, for the
138          * file range [768K, 2M[, since that's the first key that follows the
139          * last one we processed. So in order not to report overlapping extents
140          * to user space, we trim the length of the previously cached extent and
141          * emit it.
142          *
143          * Upon calling btrfs_next_leaf() we may also find an extent with an
144          * offset smaller than or equal to cache->offset, and this happens
145          * when we had a hole or prealloc extent with several delalloc ranges in
146          * it, but after btrfs_next_leaf() released the path, delalloc was
147          * flushed and the resulting ordered extents were completed, so we can
148          * now have found a file extent item for an offset that is smaller than
149          * or equal to what we have in cache->offset. We deal with this as
150          * described below.
151          */
152         cache_end = cache->offset + cache->len;
153         if (cache_end > offset) {
154                 if (offset == cache->offset) {
155                         /*
156                          * We cached a delalloc range (found in the io tree) for
157                          * a hole or prealloc extent and we have now found a
158                          * file extent item for the same offset. What we have
159                          * now is more recent and up to date, so discard what
160                          * we had in the cache and use what we have just found.
161                          */
162                         goto assign;
163                 } else if (offset > cache->offset) {
164                         /*
165                          * The extent range we previously found ends after the
166                          * offset of the file extent item we found and that
167                          * offset falls somewhere in the middle of that previous
168                          * extent range. So adjust the range we previously found
169                          * to end at the offset of the file extent item we have
170                          * just found, since this extent is more up to date.
171                          * Emit that adjusted range and cache the file extent
172                          * item we have just found. This corresponds to the case
173                          * where a previously found file extent item was split
174                          * due to an ordered extent completing.
175                          */
176                         cache->len = offset - cache->offset;
177                         goto emit;
178                 } else {
179                         const u64 range_end = offset + len;
180 
181                         /*
182                          * The offset of the file extent item we have just found
183                          * is behind the cached offset. This means we were
184                          * processing a hole or prealloc extent for which we
185                          * have found delalloc ranges (in the io tree), so what
186                          * we have in the cache is the last delalloc range we
187                          * found while the file extent item we found can be
188                          * either for a whole delalloc range we previously
189                          * emitted or only a part of that range.
190                          *
191                          * We have two cases here:
192                          *
193                          * 1) The file extent item's range ends at or behind the
194                          *    cached extent's end. In this case just ignore the
195                          *    current file extent item because we don't want to
196                          *    overlap with previous ranges that may have been
197                          *    emitted already;
198                          *
199                          * 2) The file extent item starts behind the currently
200                          *    cached extent but its end offset goes beyond the
201                          *    end offset of the cached extent. We don't want to
202                          *    overlap with a previous range that may have been
203                          *    emitted already, so we emit the currently cached
204                          *    extent and then partially store the current file
205                          *    extent item's range in the cache, for the subrange
206                          *    going from the cached extent's end to the end of the
207                          *    file extent item.
208                          */
209                         if (range_end <= cache_end)
210                                 return 0;
211 
212                         if (!(flags & (FIEMAP_EXTENT_ENCODED | FIEMAP_EXTENT_DELALLOC)))
213                                 phys += cache_end - offset;
214 
215                         offset = cache_end;
216                         len = range_end - cache_end;
217                         goto emit;
218                 }
219         }
220 
221         /*
222          * Only merge fiemap extents if:
223          * 1) Their logical addresses are continuous
224          *
225          * 2) Their physical addresses are continuous
226          *    So truly compressed (physical size smaller than logical size)
227          *    extents won't get merged with each other
228          *
229          * 3) They share the same flags
230          */
231         if (cache->offset + cache->len == offset &&
232             cache->phys + cache->len == phys &&
233             cache->flags == flags) {
234                 cache->len += len;
235                 return 0;
236         }
237 
238 emit:
239         /* Not mergeable, need to submit cached one */
240 
241         if (cache->entries_pos == cache->entries_size) {
242                 /*
243                  * We will need to re-search from the end offset of the last
244                  * stored extent and not from the current offset, because after
245                  * unlocking the range and releasing the path, if there's a hole
246                  * between that end offset and this current offset, a new extent
247                  * may have been inserted due to a new write, so we don't want
248                  * to miss it.
249                  */
250                 entry = &cache->entries[cache->entries_size - 1];
251                 cache->next_search_offset = entry->offset + entry->len;
252                 cache->cached = false;
253 
254                 return BTRFS_FIEMAP_FLUSH_CACHE;
255         }
256 
257         entry = &cache->entries[cache->entries_pos];
258         entry->offset = cache->offset;
259         entry->phys = cache->phys;
260         entry->len = cache->len;
261         entry->flags = cache->flags;
262         cache->entries_pos++;
263         cache->extents_mapped++;
264 
265         if (cache->extents_mapped == fieinfo->fi_extents_max) {
266                 cache->cached = false;
267                 return 1;
268         }
269 assign:
270         cache->cached = true;
271         cache->offset = offset;
272         cache->phys = phys;
273         cache->len = len;
274         cache->flags = flags;
275 
276         return 0;
277 }
278 
279 /*
280  * Emit last fiemap cache
281  *
282  * The last fiemap cache may still be cached in the following case:
283  * 0                  4k                    8k
284  * |<- Fiemap range ->|
285  * |<------------  First extent ----------->|
286  *
287  * In this case, the first extent range will be cached but not emitted.
288  * So we must emit it before ending extent_fiemap().
289  */
290 static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
291                                   struct fiemap_cache *cache)
292 {
293         int ret;
294 
295         if (!cache->cached)
296                 return 0;
297 
298         ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
299                                       cache->len, cache->flags);
300         cache->cached = false;
301         if (ret > 0)
302                 ret = 0;
303         return ret;
304 }
305 
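    /*
     * Advance to the next file extent item. If the current (cloned) leaf is
     * exhausted, move to the next leaf and copy it into the existing cloned
     * extent buffer, so that it can keep being used without holding a lock on
     * the subvolume tree. Returns 0 if there is a next file extent item for
     * the inode, 1 if there are no more, and a negative errno on error.
     */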
306 static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *path)
307 {
308         struct extent_buffer *clone = path->nodes[0];
309         struct btrfs_key key;
310         int slot;
311         int ret;
312 
313         path->slots[0]++;
314         if (path->slots[0] < btrfs_header_nritems(path->nodes[0]))
315                 return 0;
316 
317         /*
318          * Add a temporary extra ref to an already cloned extent buffer to
319          * prevent btrfs_next_leaf() from freeing it; we want to reuse it to avoid
320          * the cost of allocating a new one.
321          */
322         ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, &clone->bflags));
323         atomic_inc(&clone->refs);
324 
325         ret = btrfs_next_leaf(inode->root, path);
326         if (ret != 0)
327                 goto out;
328 
329         /*
330          * Don't bother with cloning if there are no more file extent items for
331          * our inode.
332          */
333         btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
334         if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) {
335                 ret = 1;
336                 goto out;
337         }
338 
339         /*
340          * Important to preserve the start field, for the optimizations when
341          * checking if extents are shared (see extent_fiemap()).
342          *
343          * We must set ->start before calling copy_extent_buffer_full().  If we
344          * are on sub-pagesize blocksize, we use ->start to determine the offset
345          * into the folio where our eb exists, and if we update ->start after
346          * the fact then any subsequent reads of the eb may read from a
347          * different offset in the folio than where we originally copied into.
348          */
349         clone->start = path->nodes[0]->start;
350         /* See the comment at fiemap_search_slot() about why we clone. */
351         copy_extent_buffer_full(clone, path->nodes[0]);
352 
353         slot = path->slots[0];
354         btrfs_release_path(path);
355         path->nodes[0] = clone;
356         path->slots[0] = slot;
357 out:
358         if (ret)
359                 free_extent_buffer(clone);
360 
361         return ret;
362 }
363 
364 /*
365  * Search for the first file extent item that starts at a given file offset or
366  * the one that starts immediately before that offset.
367  * Returns: 0 on success, < 0 on error, 1 if not found.
368  */
369 static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path,
370                               u64 file_offset)
371 {
372         const u64 ino = btrfs_ino(inode);
373         struct btrfs_root *root = inode->root;
374         struct extent_buffer *clone;
375         struct btrfs_key key;
376         int slot;
377         int ret;
378 
379         key.objectid = ino;
380         key.type = BTRFS_EXTENT_DATA_KEY;
381         key.offset = file_offset;
382 
383         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
384         if (ret < 0)
385                 return ret;
386 
387         if (ret > 0 && path->slots[0] > 0) {
388                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
389                 if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
390                         path->slots[0]--;
391         }
392 
393         if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
394                 ret = btrfs_next_leaf(root, path);
395                 if (ret != 0)
396                         return ret;
397 
398                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
399                 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
400                         return 1;
401         }
402 
403         /*
404          * We clone the leaf and use it during fiemap. This is because while
405          * using the leaf we do expensive things like checking if an extent is
406          * shared, which can take a long time. In order to prevent blocking
407          * other tasks for too long, we use a clone of the leaf. We have locked
408          * the file range in the inode's io tree, so we know none of our file
409          * extent items can change. This way we avoid blocking other tasks that
410          * want to insert items for other inodes in the same leaf or b+tree
411          * rebalance operations (triggered for example when someone is trying
412          * to push items into this leaf when trying to insert an item in a
413          * neighbour leaf).
414          * We also need the private clone because holding a read lock on an
415          * extent buffer of the subvolume's b+tree will make lockdep unhappy
416          * when we check if extents are shared, as backref walking may need to
417          * lock the same leaf we are processing.
418          */
419         clone = btrfs_clone_extent_buffer(path->nodes[0]);
420         if (!clone)
421                 return -ENOMEM;
422 
423         slot = path->slots[0];
424         btrfs_release_path(path);
425         path->nodes[0] = clone;
426         path->slots[0] = slot;
427 
428         return 0;
429 }
430 
431 /*
432  * Process a range which is a hole or a prealloc extent in the inode's subvolume
433  * btree. If @disk_bytenr is 0, we are dealing with a hole, otherwise a prealloc
434  * extent. The end offset (@end) is inclusive.
435  */
436 static int fiemap_process_hole(struct btrfs_inode *inode,
437                                struct fiemap_extent_info *fieinfo,
438                                struct fiemap_cache *cache,
439                                struct extent_state **delalloc_cached_state,
440                                struct btrfs_backref_share_check_ctx *backref_ctx,
441                                u64 disk_bytenr, u64 extent_offset,
442                                u64 extent_gen,
443                                u64 start, u64 end)
444 {
445         const u64 i_size = i_size_read(&inode->vfs_inode);
446         u64 cur_offset = start;
447         u64 last_delalloc_end = 0;
448         u32 prealloc_flags = FIEMAP_EXTENT_UNWRITTEN;
449         bool checked_extent_shared = false;
450         int ret;
451 
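            /*
             * Iterate over the delalloc ranges (found in the io tree) that
             * overlap the given range. For a prealloc extent, report every
             * section of it that has no delalloc, and report the delalloc
             * ranges themselves as unknown/delalloc extents.
             */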
452         /*
453          * There can be no delalloc past i_size, so don't waste time looking for
454          * it beyond i_size.
455          */
456         while (cur_offset < end && cur_offset < i_size) {
457                 u64 delalloc_start;
458                 u64 delalloc_end;
459                 u64 prealloc_start;
460                 u64 prealloc_len = 0;
461                 bool delalloc;
462 
463                 delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end,
464                                                         delalloc_cached_state,
465                                                         &delalloc_start,
466                                                         &delalloc_end);
467                 if (!delalloc)
468                         break;
469 
470                 /*
471                  * If this is a prealloc extent we have to report every section
472                  * of it that has no delalloc.
473                  */
474                 if (disk_bytenr != 0) {
475                         if (last_delalloc_end == 0) {
476                                 prealloc_start = start;
477                                 prealloc_len = delalloc_start - start;
478                         } else {
479                                 prealloc_start = last_delalloc_end + 1;
480                                 prealloc_len = delalloc_start - prealloc_start;
481                         }
482                 }
483 
484                 if (prealloc_len > 0) {
485                         if (!checked_extent_shared && fieinfo->fi_extents_max) {
486                                 ret = btrfs_is_data_extent_shared(inode,
487                                                                   disk_bytenr,
488                                                                   extent_gen,
489                                                                   backref_ctx);
490                                 if (ret < 0)
491                                         return ret;
492                                 else if (ret > 0)
493                                         prealloc_flags |= FIEMAP_EXTENT_SHARED;
494 
495                                 checked_extent_shared = true;
496                         }
497                         ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
498                                                  disk_bytenr + extent_offset,
499                                                  prealloc_len, prealloc_flags);
500                         if (ret)
501                                 return ret;
502                         extent_offset += prealloc_len;
503                 }
504 
505                 ret = emit_fiemap_extent(fieinfo, cache, delalloc_start, 0,
506                                          delalloc_end + 1 - delalloc_start,
507                                          FIEMAP_EXTENT_DELALLOC |
508                                          FIEMAP_EXTENT_UNKNOWN);
509                 if (ret)
510                         return ret;
511 
512                 last_delalloc_end = delalloc_end;
513                 cur_offset = delalloc_end + 1;
514                 extent_offset += cur_offset - delalloc_start;
515                 cond_resched();
516         }
517 
518         /*
519          * Either we found no delalloc for the whole prealloc extent or we have
520          * a prealloc extent that spans i_size or starts at or after i_size.
521          */
522         if (disk_bytenr != 0 && last_delalloc_end < end) {
523                 u64 prealloc_start;
524                 u64 prealloc_len;
525 
526                 if (last_delalloc_end == 0) {
527                         prealloc_start = start;
528                         prealloc_len = end + 1 - start;
529                 } else {
530                         prealloc_start = last_delalloc_end + 1;
531                         prealloc_len = end + 1 - prealloc_start;
532                 }
533 
534                 if (!checked_extent_shared && fieinfo->fi_extents_max) {
535                         ret = btrfs_is_data_extent_shared(inode,
536                                                           disk_bytenr,
537                                                           extent_gen,
538                                                           backref_ctx);
539                         if (ret < 0)
540                                 return ret;
541                         else if (ret > 0)
542                                 prealloc_flags |= FIEMAP_EXTENT_SHARED;
543                 }
544                 ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
545                                          disk_bytenr + extent_offset,
546                                          prealloc_len, prealloc_flags);
547                 if (ret)
548                         return ret;
549         }
550 
551         return 0;
552 }
553 
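    /*
     * Find the end offset of the last file extent item that is not a hole.
     * extent_fiemap() uses it to decide when the cached extent is the last
     * one and should get the FIEMAP_EXTENT_LAST flag. Sets *last_extent_end_ret
     * to 0 if the inode has no file extent items other than holes.
     */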
554 static int fiemap_find_last_extent_offset(struct btrfs_inode *inode,
555                                           struct btrfs_path *path,
556                                           u64 *last_extent_end_ret)
557 {
558         const u64 ino = btrfs_ino(inode);
559         struct btrfs_root *root = inode->root;
560         struct extent_buffer *leaf;
561         struct btrfs_file_extent_item *ei;
562         struct btrfs_key key;
563         u64 disk_bytenr;
564         int ret;
565 
566         /*
567          * Look up the last file extent. We're not using i_size here because
568          * there might be preallocation past i_size.
569          */
570         ret = btrfs_lookup_file_extent(NULL, root, path, ino, (u64)-1, 0);
571         /* There can't be a file extent item at offset (u64)-1 */
572         ASSERT(ret != 0);
573         if (ret < 0)
574                 return ret;
575 
576         /*
577          * For a non-existing key, btrfs_search_slot() always leaves us at a
578          * slot > 0, except if the btree is empty, which is impossible because
579          * it has at least the inode item for this inode and all the items for
580          * the root inode 256.
581          */
582         ASSERT(path->slots[0] > 0);
583         path->slots[0]--;
584         leaf = path->nodes[0];
585         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
586         if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
587                 /* No file extent items in the subvolume tree. */
588                 *last_extent_end_ret = 0;
589                 return 0;
590         }
591 
592         /*
593          * For an inline extent, the disk_bytenr is where the inline data starts,
594          * so first check if we have an inline extent item before checking if we
595          * have an implicit hole (disk_bytenr == 0).
596          */
597         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
598         if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) {
599                 *last_extent_end_ret = btrfs_file_extent_end(path);
600                 return 0;
601         }
602 
603         /*
604          * Find the last file extent item that is not a hole (when NO_HOLES is
605          * not enabled). This should take at most 2 iterations in the worst
606          * case: we have one hole file extent item at slot 0 of a leaf and
607          * another hole file extent item as the last item in the previous leaf.
608          * This is because we merge file extent items that represent holes.
609          */
610         disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
611         while (disk_bytenr == 0) {
612                 ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
613                 if (ret < 0) {
614                         return ret;
615                 } else if (ret > 0) {
616                         /* No file extent items that are not holes. */
617                         *last_extent_end_ret = 0;
618                         return 0;
619                 }
620                 leaf = path->nodes[0];
621                 ei = btrfs_item_ptr(leaf, path->slots[0],
622                                     struct btrfs_file_extent_item);
623                 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
624         }
625 
626         *last_extent_end_ret = btrfs_file_extent_end(path);
627         return 0;
628 }
629 
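    /*
     * Core fiemap implementation: with the file range locked in the inode's
     * io tree, walk the file extent items of the subvolume tree (using cloned
     * leaves), check holes and prealloc extents for delalloc, and buffer the
     * resulting entries in the fiemap cache. When the buffer fills up
     * (BTRFS_FIEMAP_FLUSH_CACHE), unlock the range, flush the buffered
     * entries to user space and restart the search at the saved offset.
     */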
630 static int extent_fiemap(struct btrfs_inode *inode,
631                          struct fiemap_extent_info *fieinfo,
632                          u64 start, u64 len)
633 {
634         const u64 ino = btrfs_ino(inode);
635         struct extent_state *cached_state = NULL;
636         struct extent_state *delalloc_cached_state = NULL;
637         struct btrfs_path *path;
638         struct fiemap_cache cache = { 0 };
639         struct btrfs_backref_share_check_ctx *backref_ctx;
640         u64 last_extent_end = 0;
641         u64 prev_extent_end;
642         u64 range_start;
643         u64 range_end;
644         const u64 sectorsize = inode->root->fs_info->sectorsize;
645         bool stopped = false;
646         int ret;
647 
648         cache.entries_size = PAGE_SIZE / sizeof(struct btrfs_fiemap_entry);
649         cache.entries = kmalloc_array(cache.entries_size,
650                                       sizeof(struct btrfs_fiemap_entry),
651                                       GFP_KERNEL);
652         backref_ctx = btrfs_alloc_backref_share_check_ctx();
653         path = btrfs_alloc_path();
654         if (!cache.entries || !backref_ctx || !path) {
655                 ret = -ENOMEM;
656                 goto out;
657         }
658 
659 restart:
660         range_start = round_down(start, sectorsize);
661         range_end = round_up(start + len, sectorsize);
662         prev_extent_end = range_start;
663 
664         lock_extent(&inode->io_tree, range_start, range_end, &cached_state);
665 
666         ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end);
667         if (ret < 0)
668                 goto out_unlock;
669         btrfs_release_path(path);
670 
671         path->reada = READA_FORWARD;
672         ret = fiemap_search_slot(inode, path, range_start);
673         if (ret < 0) {
674                 goto out_unlock;
675         } else if (ret > 0) {
676                 /*
677                  * No file extent item found, but we may have delalloc between
678                  * the current offset and i_size. So check for that.
679                  */
680                 ret = 0;
681                 goto check_eof_delalloc;
682         }
683 
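            /*
             * Iterate over the file extent items of the inode that intersect
             * the requested range, processing implicit holes, explicit holes,
             * prealloc extents, inline extents and regular extents.
             */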
684         while (prev_extent_end < range_end) {
685                 struct extent_buffer *leaf = path->nodes[0];
686                 struct btrfs_file_extent_item *ei;
687                 struct btrfs_key key;
688                 u64 extent_end;
689                 u64 extent_len;
690                 u64 extent_offset = 0;
691                 u64 extent_gen;
692                 u64 disk_bytenr = 0;
693                 u64 flags = 0;
694                 int extent_type;
695                 u8 compression;
696 
697                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
698                 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
699                         break;
700 
701                 extent_end = btrfs_file_extent_end(path);
702 
703                 /*
704                  * The first iteration can leave us at an extent item that ends
705                  * before our range's start. Move to the next item.
706                  */
707                 if (extent_end <= range_start)
708                         goto next_item;
709 
710                 backref_ctx->curr_leaf_bytenr = leaf->start;
711 
712                 /* We have an implicit hole (NO_HOLES feature enabled). */
713                 if (prev_extent_end < key.offset) {
714                         const u64 hole_end = min(key.offset, range_end) - 1;
715 
716                         ret = fiemap_process_hole(inode, fieinfo, &cache,
717                                                   &delalloc_cached_state,
718                                                   backref_ctx, 0, 0, 0,
719                                                   prev_extent_end, hole_end);
720                         if (ret < 0) {
721                                 goto out_unlock;
722                         } else if (ret > 0) {
723                                 /* emit_fiemap_extent() told us to stop. */
724                                 stopped = true;
725                                 break;
726                         }
727 
728                         /* We've reached the end of the fiemap range, stop. */
729                         if (key.offset >= range_end) {
730                                 stopped = true;
731                                 break;
732                         }
733                 }
734 
735                 extent_len = extent_end - key.offset;
736                 ei = btrfs_item_ptr(leaf, path->slots[0],
737                                     struct btrfs_file_extent_item);
738                 compression = btrfs_file_extent_compression(leaf, ei);
739                 extent_type = btrfs_file_extent_type(leaf, ei);
740                 extent_gen = btrfs_file_extent_generation(leaf, ei);
741 
742                 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
743                         disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
744                         if (compression == BTRFS_COMPRESS_NONE)
745                                 extent_offset = btrfs_file_extent_offset(leaf, ei);
746                 }
747 
748                 if (compression != BTRFS_COMPRESS_NONE)
749                         flags |= FIEMAP_EXTENT_ENCODED;
750 
751                 if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
752                         flags |= FIEMAP_EXTENT_DATA_INLINE;
753                         flags |= FIEMAP_EXTENT_NOT_ALIGNED;
754                         ret = emit_fiemap_extent(fieinfo, &cache, key.offset, 0,
755                                                  extent_len, flags);
756                 } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
757                         ret = fiemap_process_hole(inode, fieinfo, &cache,
758                                                   &delalloc_cached_state,
759                                                   backref_ctx,
760                                                   disk_bytenr, extent_offset,
761                                                   extent_gen, key.offset,
762                                                   extent_end - 1);
763                 } else if (disk_bytenr == 0) {
764                         /* We have an explicit hole. */
765                         ret = fiemap_process_hole(inode, fieinfo, &cache,
766                                                   &delalloc_cached_state,
767                                                   backref_ctx, 0, 0, 0,
768                                                   key.offset, extent_end - 1);
769                 } else {
770                         /* We have a regular extent. */
771                         if (fieinfo->fi_extents_max) {
772                                 ret = btrfs_is_data_extent_shared(inode,
773                                                                   disk_bytenr,
774                                                                   extent_gen,
775                                                                   backref_ctx);
776                                 if (ret < 0)
777                                         goto out_unlock;
778                                 else if (ret > 0)
779                                         flags |= FIEMAP_EXTENT_SHARED;
780                         }
781 
782                         ret = emit_fiemap_extent(fieinfo, &cache, key.offset,
783                                                  disk_bytenr + extent_offset,
784                                                  extent_len, flags);
785                 }
786 
787                 if (ret < 0) {
788                         goto out_unlock;
789                 } else if (ret > 0) {
790                         /* emit_fiemap_extent() told us to stop. */
791                         stopped = true;
792                         break;
793                 }
794 
795                 prev_extent_end = extent_end;
796 next_item:
797                 if (fatal_signal_pending(current)) {
798                         ret = -EINTR;
799                         goto out_unlock;
800                 }
801 
802                 ret = fiemap_next_leaf_item(inode, path);
803                 if (ret < 0) {
804                         goto out_unlock;
805                 } else if (ret > 0) {
806                         /* No more file extent items for this inode. */
807                         break;
808                 }
809                 cond_resched();
810         }
811 
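            /*
             * Process any part of the requested range that lies beyond the
             * last file extent item found (it can only contain holes and
             * delalloc). Then, if the cached extent reaches the end of the
             * last extent of the file and there is no delalloc past it up to
             * i_size, mark it with FIEMAP_EXTENT_LAST.
             */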
812 check_eof_delalloc:
813         if (!stopped && prev_extent_end < range_end) {
814                 ret = fiemap_process_hole(inode, fieinfo, &cache,
815                                           &delalloc_cached_state, backref_ctx,
816                                           0, 0, 0, prev_extent_end, range_end - 1);
817                 if (ret < 0)
818                         goto out_unlock;
819                 prev_extent_end = range_end;
820         }
821 
822         if (cache.cached && cache.offset + cache.len >= last_extent_end) {
823                 const u64 i_size = i_size_read(&inode->vfs_inode);
824 
825                 if (prev_extent_end < i_size) {
826                         u64 delalloc_start;
827                         u64 delalloc_end;
828                         bool delalloc;
829 
830                         delalloc = btrfs_find_delalloc_in_range(inode,
831                                                                 prev_extent_end,
832                                                                 i_size - 1,
833                                                                 &delalloc_cached_state,
834                                                                 &delalloc_start,
835                                                                 &delalloc_end);
836                         if (!delalloc)
837                                 cache.flags |= FIEMAP_EXTENT_LAST;
838                 } else {
839                         cache.flags |= FIEMAP_EXTENT_LAST;
840                 }
841         }
842 
843 out_unlock:
844         unlock_extent(&inode->io_tree, range_start, range_end, &cached_state);
845 
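            /*
             * The entries buffer filled up while emitting extents. Now that
             * the io tree range is unlocked, release the path, flush the
             * buffered entries to user space and resume the search at the
             * offset saved by emit_fiemap_extent().
             */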
846         if (ret == BTRFS_FIEMAP_FLUSH_CACHE) {
847                 btrfs_release_path(path);
848                 ret = flush_fiemap_cache(fieinfo, &cache);
849                 if (ret)
850                         goto out;
851                 len -= cache.next_search_offset - start;
852                 start = cache.next_search_offset;
853                 goto restart;
854         } else if (ret < 0) {
855                 goto out;
856         }
857 
858         /*
859          * Must free the path before emitting to the fiemap buffer because we
860          * may have a non-cloned leaf and if the fiemap buffer is memory mapped
861          * to a file, a write into it (through btrfs_page_mkwrite()) may trigger
862          * waiting for an ordered extent that, in order to complete, needs to
863          * modify that leaf, therefore leading to a deadlock.
864          */
865         btrfs_free_path(path);
866         path = NULL;
867 
868         ret = flush_fiemap_cache(fieinfo, &cache);
869         if (ret)
870                 goto out;
871 
872         ret = emit_last_fiemap_cache(fieinfo, &cache);
873 out:
874         free_extent_state(delalloc_cached_state);
875         kfree(cache.entries);
876         btrfs_free_backref_share_ctx(backref_ctx);
877         btrfs_free_path(path);
878         return ret;
879 }
880 
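    /*
     * Entry point for fiemap, called from the VFS through the inode's
     * ->fiemap operation.
     */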
881 int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
882                  u64 start, u64 len)
883 {
884         struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
885         int ret;
886 
887         ret = fiemap_prep(inode, fieinfo, start, &len, 0);
888         if (ret)
889                 return ret;
890 
891         /*
892          * fiemap_prep() called filemap_write_and_wait() for the whole possible
893          * file range (0 to LLONG_MAX), but that is not enough if we have
894          * compression enabled. The first filemap_fdatawrite_range() only kicks
895          * off the compression of data (in an async thread) and will return
896          * before the compression is done and writeback is started. A second
897          * filemap_fdatawrite_range() is needed to wait for the compression to
898          * complete and writeback to start. We also need to wait for ordered
899          * extents to complete, because our fiemap implementation uses mainly
900          * file extent items to list the extents, searching for extent maps
901          * only for file ranges with holes or prealloc extents to figure out
902          * if we have delalloc in those ranges.
903          */
904         if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
905                 ret = btrfs_wait_ordered_range(btrfs_inode, 0, LLONG_MAX);
906                 if (ret)
907                         return ret;
908         }
909 
910         btrfs_inode_lock(btrfs_inode, BTRFS_ILOCK_SHARED);
911 
912         /*
913          * We did an initial flush to avoid holding the inode's lock while
914          * triggering writeback and waiting for the completion of IO and ordered
915          * extents. Now after we locked the inode we do it again, because it's
916          * possible a new write may have happened in between those two steps.
917          */
918         if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
919                 ret = btrfs_wait_ordered_range(btrfs_inode, 0, LLONG_MAX);
920                 if (ret) {
921                         btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
922                         return ret;
923                 }
924         }
925 
926         ret = extent_fiemap(btrfs_inode, fieinfo, start, len);
927         btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
928 
929         return ret;
930 }
931 
