TOMOYO Linux Cross Reference
Linux/fs/btrfs/extent-tree.c


  1 // SPDX-License-Identifier: GPL-2.0
  2 /*
  3  * Copyright (C) 2007 Oracle.  All rights reserved.
  4  */
  5 
  6 #include <linux/sched.h>
  7 #include <linux/sched/signal.h>
  8 #include <linux/pagemap.h>
  9 #include <linux/writeback.h>
 10 #include <linux/blkdev.h>
 11 #include <linux/sort.h>
 12 #include <linux/rcupdate.h>
 13 #include <linux/kthread.h>
 14 #include <linux/slab.h>
 15 #include <linux/ratelimit.h>
 16 #include <linux/percpu_counter.h>
 17 #include <linux/lockdep.h>
 18 #include <linux/crc32c.h>
 19 #include "ctree.h"
 20 #include "extent-tree.h"
 21 #include "transaction.h"
 22 #include "disk-io.h"
 23 #include "print-tree.h"
 24 #include "volumes.h"
 25 #include "raid56.h"
 26 #include "locking.h"
 27 #include "free-space-cache.h"
 28 #include "free-space-tree.h"
 29 #include "qgroup.h"
 30 #include "ref-verify.h"
 31 #include "space-info.h"
 32 #include "block-rsv.h"
 33 #include "discard.h"
 34 #include "zoned.h"
 35 #include "dev-replace.h"
 36 #include "fs.h"
 37 #include "accessors.h"
 38 #include "root-tree.h"
 39 #include "file-item.h"
 40 #include "orphan.h"
 41 #include "tree-checker.h"
 42 #include "raid-stripe-tree.h"
 43 
 44 #undef SCRAMBLE_DELAYED_REFS
 45 
 46 
 47 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 48                                struct btrfs_delayed_ref_head *href,
 49                                struct btrfs_delayed_ref_node *node,
 50                                struct btrfs_delayed_extent_op *extra_op);
 51 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
 52                                     struct extent_buffer *leaf,
 53                                     struct btrfs_extent_item *ei);
 54 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
 55                                       u64 parent, u64 root_objectid,
 56                                       u64 flags, u64 owner, u64 offset,
 57                                       struct btrfs_key *ins, int ref_mod, u64 oref_root);
 58 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 59                                      struct btrfs_delayed_ref_node *node,
 60                                      struct btrfs_delayed_extent_op *extent_op);
 61 static int find_next_key(struct btrfs_path *path, int level,
 62                          struct btrfs_key *key);
 63 
 64 static int block_group_bits(struct btrfs_block_group *cache, u64 bits)
 65 {
 66         return (cache->flags & bits) == bits;
 67 }
 68 
 69 /* simple helper to search for an existing data extent at a given offset */
 70 int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
 71 {
 72         struct btrfs_root *root = btrfs_extent_root(fs_info, start);
 73         int ret;
 74         struct btrfs_key key;
 75         struct btrfs_path *path;
 76 
 77         path = btrfs_alloc_path();
 78         if (!path)
 79                 return -ENOMEM;
 80 
 81         key.objectid = start;
 82         key.offset = len;
 83         key.type = BTRFS_EXTENT_ITEM_KEY;
 84         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 85         btrfs_free_path(path);
 86         return ret;
 87 }
 88 
 89 /*
 90  * helper function to lookup reference count and flags of a tree block.
 91  *
 92  * The head node for a delayed ref is used to store the sum of all the
 93  * reference count modifications queued up in the rbtree. The head
 94  * node may also store the extent flags to set. This way you can check
 95  * what the reference count and extent flags will be once all of the
 96  * queued delayed refs have been processed, without actually running them.
 97  */
 98 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 99                              struct btrfs_fs_info *fs_info, u64 bytenr,
100                              u64 offset, int metadata, u64 *refs, u64 *flags,
101                              u64 *owning_root)
102 {
103         struct btrfs_root *extent_root;
104         struct btrfs_delayed_ref_head *head;
105         struct btrfs_delayed_ref_root *delayed_refs;
106         struct btrfs_path *path;
107         struct btrfs_key key;
108         u64 num_refs;
109         u64 extent_flags;
110         u64 owner = 0;
111         int ret;
112 
113         /*
114          * If we don't have skinny metadata, don't bother doing anything
115          * different
116          */
117         if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
118                 offset = fs_info->nodesize;
119                 metadata = 0;
120         }
121 
122         path = btrfs_alloc_path();
123         if (!path)
124                 return -ENOMEM;
125 
126 search_again:
127         key.objectid = bytenr;
128         key.offset = offset;
129         if (metadata)
130                 key.type = BTRFS_METADATA_ITEM_KEY;
131         else
132                 key.type = BTRFS_EXTENT_ITEM_KEY;
133 
134         extent_root = btrfs_extent_root(fs_info, bytenr);
135         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
136         if (ret < 0)
137                 goto out_free;
138 
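        /*
         * The skinny METADATA_ITEM was not found. The previous slot may
         * still hold an old style EXTENT_ITEM for the same tree block
         * (key offset equal to the node size), left over from before
         * skinny metadata was enabled on this filesystem.
         */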
139         if (ret > 0 && key.type == BTRFS_METADATA_ITEM_KEY) {
140                 if (path->slots[0]) {
141                         path->slots[0]--;
142                         btrfs_item_key_to_cpu(path->nodes[0], &key,
143                                               path->slots[0]);
144                         if (key.objectid == bytenr &&
145                             key.type == BTRFS_EXTENT_ITEM_KEY &&
146                             key.offset == fs_info->nodesize)
147                                 ret = 0;
148                 }
149         }
150 
151         if (ret == 0) {
152                 struct extent_buffer *leaf = path->nodes[0];
153                 struct btrfs_extent_item *ei;
154                 const u32 item_size = btrfs_item_size(leaf, path->slots[0]);
155 
156                 if (unlikely(item_size < sizeof(*ei))) {
157                         ret = -EUCLEAN;
158                         btrfs_err(fs_info,
159                         "unexpected extent item size, has %u expect >= %zu",
160                                   item_size, sizeof(*ei));
161                         btrfs_abort_transaction(trans, ret);
162                         goto out_free;
163                 }
164 
165                 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
166                 num_refs = btrfs_extent_refs(leaf, ei);
167                 if (unlikely(num_refs == 0)) {
168                         ret = -EUCLEAN;
169                         btrfs_err(fs_info,
170                 "unexpected zero reference count for extent item (%llu %u %llu)",
171                                   key.objectid, key.type, key.offset);
172                         btrfs_abort_transaction(trans, ret);
173                         goto out_free;
174                 }
175                 extent_flags = btrfs_extent_flags(leaf, ei);
176                 owner = btrfs_get_extent_owner_root(fs_info, leaf, path->slots[0]);
177         } else {
178                 num_refs = 0;
179                 extent_flags = 0;
180                 ret = 0;
181         }
182 
183         delayed_refs = &trans->transaction->delayed_refs;
184         spin_lock(&delayed_refs->lock);
185         head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
186         if (head) {
187                 if (!mutex_trylock(&head->mutex)) {
188                         refcount_inc(&head->refs);
189                         spin_unlock(&delayed_refs->lock);
190 
191                         btrfs_release_path(path);
192 
193                         /*
194                          * Mutex was contended, block until it's released and try
195                          * again
196                          */
197                         mutex_lock(&head->mutex);
198                         mutex_unlock(&head->mutex);
199                         btrfs_put_delayed_ref_head(head);
200                         goto search_again;
201                 }
202                 spin_lock(&head->lock);
203                 if (head->extent_op && head->extent_op->update_flags)
204                         extent_flags |= head->extent_op->flags_to_set;
205 
206                 num_refs += head->ref_mod;
207                 spin_unlock(&head->lock);
208                 mutex_unlock(&head->mutex);
209         }
210         spin_unlock(&delayed_refs->lock);
211 
212         WARN_ON(num_refs == 0);
213         if (refs)
214                 *refs = num_refs;
215         if (flags)
216                 *flags = extent_flags;
217         if (owning_root)
218                 *owning_root = owner;
219 out_free:
220         btrfs_free_path(path);
221         return ret;
222 }
223 
224 /*
225  * Back reference rules.  Back refs have three main goals:
226  *
227  * 1) differentiate between all holders of references to an extent so that
228  *    when a reference is dropped we can make sure it was a valid reference
229  *    before freeing the extent.
230  *
231  * 2) Provide enough information to quickly find the holders of an extent
232  *    if we notice a given block is corrupted or bad.
233  *
234  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
235  *    maintenance.  This is actually the same as #2, but with a slightly
236  *    different use case.
237  *
 238  * There are two kinds of back refs. The implicit back ref is optimized
 239  * for pointers in non-shared tree blocks. For a given pointer in a block,
 240  * back refs of this kind provide information about the block's owner tree
 241  * and the pointer's key. This information allows us to find the block by
 242  * b-tree searching. The full back ref is for pointers in tree blocks not
 243  * referenced by their owner trees. The location of the tree block is
 244  * recorded in the back ref. The full back ref is actually generic and
 245  * can be used in all cases where the implicit back ref is used. Its major
 246  * shortcoming is its overhead: every time a tree block gets COWed, we
 247  * have to update the back ref entries for all pointers in it.
248  *
249  * For a newly allocated tree block, we use implicit back refs for
250  * pointers in it. This means most tree related operations only involve
 251  * implicit back refs. For a tree block created in an old transaction, the
 252  * only way to drop a reference to it is to COW it. So we can detect the
 253  * event that a tree block loses its owner tree's reference and do the
254  * back refs conversion.
255  *
256  * When a tree block is COWed through a tree, there are four cases:
257  *
258  * The reference count of the block is one and the tree is the block's
259  * owner tree. Nothing to do in this case.
260  *
261  * The reference count of the block is one and the tree is not the
 262  * block's owner tree. In this case, full back refs are used for pointers
 263  * in the block. Remove these full back refs and add implicit back refs
 264  * for every pointer in the new block.
265  *
266  * The reference count of the block is greater than one and the tree is
 267  * the block's owner tree. In this case, implicit back refs are used for
 268  * pointers in the block. Add full back refs for every pointer in the
 269  * block and increase the lower level extents' reference counts. The
 270  * original implicit back refs are inherited by the new block.
271  *
272  * The reference count of the block is greater than one and the tree is
273  * not the block's owner tree. Add implicit back refs for every pointer in
 274  * the new block and increase the lower level extents' reference counts.
275  *
 276  * Back reference key composition:
 277  *
 278  * The key objectid corresponds to the first byte in the extent.
 279  * The key type is used to differentiate between types of back refs.
280  * There are different meanings of the key offset for different types
281  * of back refs.
282  *
283  * File extents can be referenced by:
284  *
285  * - multiple snapshots, subvolumes, or different generations in one subvol
286  * - different files inside a single subvolume
287  * - different offsets inside a file (bookend extents in file.c)
288  *
289  * The extent ref structure for the implicit back refs has fields for:
290  *
291  * - Objectid of the subvolume root
292  * - objectid of the file holding the reference
293  * - original offset in the file
294  * - how many bookend extents
295  *
 296  * The key offset for the implicit back refs is a hash of the first
 297  * three fields.
 298  *
 299  * The extent ref structure for the full back refs has a field for:
 300  *
 301  * - number of pointers in the tree leaf
 302  *
 303  * The key offset for the full back refs is the first byte of
 304  * the tree leaf.
305  *
 306  * When a file extent is allocated, the implicit back refs are used.
 307  * The fields are filled in:
308  *
309  *     (root_key.objectid, inode objectid, offset in file, 1)
310  *
 311  * When a file extent is removed during file truncation, we find the
312  * corresponding implicit back refs and check the following fields:
313  *
314  *     (btrfs_header_owner(leaf), inode objectid, offset in file)
315  *
316  * Btree extents can be referenced by:
317  *
318  * - Different subvolumes
319  *
320  * Both the implicit back refs and the full back refs for tree blocks
 321  * only consist of a key. The key offset for the implicit back refs is
 322  * the objectid of the block's owner tree. The key offset for the full
 323  * back refs is the first byte of the parent block.
324  *
 325  * When implicit back refs are used, information about the lowest key and
 326  * level of the tree block is required. This information is stored in
 327  * the tree block info structure.
328  */
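
/*
 * Illustrative sketch of the key composition described above. The concrete
 * numbers (subvolume 5, inode 257, bytenrs X, Y and P) are hypothetical;
 * only the key layout follows the lookup/insert helpers below.
 *
 *   Data extent at bytenr X, referenced by inode 257 at file offset 0 in
 *   subvolume 5:
 *
 *     implicit back ref key:
 *         (X, BTRFS_EXTENT_DATA_REF_KEY, hash_extent_data_ref(5, 257, 0))
 *     full back ref key, P being the bytenr of the leaf holding the file
 *     extent item:
 *         (X, BTRFS_SHARED_DATA_REF_KEY, P)
 *
 *   Tree block at bytenr Y owned by tree 5, with parent block at bytenr P:
 *
 *     implicit back ref key: (Y, BTRFS_TREE_BLOCK_REF_KEY, 5)
 *     full back ref key:     (Y, BTRFS_SHARED_BLOCK_REF_KEY, P)
 */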
329 
330 /*
331  * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
 332  * is_data == BTRFS_REF_TYPE_DATA, data type is required,
333  * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
334  */
335 int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
336                                      struct btrfs_extent_inline_ref *iref,
337                                      enum btrfs_inline_ref_type is_data)
338 {
339         struct btrfs_fs_info *fs_info = eb->fs_info;
340         int type = btrfs_extent_inline_ref_type(eb, iref);
341         u64 offset = btrfs_extent_inline_ref_offset(eb, iref);
342 
343         if (type == BTRFS_EXTENT_OWNER_REF_KEY) {
344                 ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
345                 return type;
346         }
347 
348         if (type == BTRFS_TREE_BLOCK_REF_KEY ||
349             type == BTRFS_SHARED_BLOCK_REF_KEY ||
350             type == BTRFS_SHARED_DATA_REF_KEY ||
351             type == BTRFS_EXTENT_DATA_REF_KEY) {
352                 if (is_data == BTRFS_REF_TYPE_BLOCK) {
353                         if (type == BTRFS_TREE_BLOCK_REF_KEY)
354                                 return type;
355                         if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
356                                 ASSERT(fs_info);
357                                 /*
 358                                  * Every shared ref has a parent tree block,
 359                                  * which must be aligned to the sector size.
360                                  */
361                                 if (offset && IS_ALIGNED(offset, fs_info->sectorsize))
362                                         return type;
363                         }
364                 } else if (is_data == BTRFS_REF_TYPE_DATA) {
365                         if (type == BTRFS_EXTENT_DATA_REF_KEY)
366                                 return type;
367                         if (type == BTRFS_SHARED_DATA_REF_KEY) {
368                                 ASSERT(fs_info);
369                                 /*
 370                                  * Every shared ref has a parent tree block,
 371                                  * which must be aligned to the sector size.
372                                  */
373                                 if (offset &&
374                                     IS_ALIGNED(offset, fs_info->sectorsize))
375                                         return type;
376                         }
377                 } else {
378                         ASSERT(is_data == BTRFS_REF_TYPE_ANY);
379                         return type;
380                 }
381         }
382 
383         WARN_ON(1);
384         btrfs_print_leaf(eb);
385         btrfs_err(fs_info,
386                   "eb %llu iref 0x%lx invalid extent inline ref type %d",
387                   eb->start, (unsigned long)iref, type);
388 
389         return BTRFS_REF_TYPE_INVALID;
390 }
391 
392 u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
393 {
394         u32 high_crc = ~(u32)0;
395         u32 low_crc = ~(u32)0;
396         __le64 lenum;
397 
398         lenum = cpu_to_le64(root_objectid);
399         high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
400         lenum = cpu_to_le64(owner);
401         low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
402         lenum = cpu_to_le64(offset);
403         low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
404 
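        /*
         * Fold the two CRCs into a single 64 bit value. The result is stored
         * on disk as the offset of extent data ref keys, so the exact
         * combination (including the 31 bit shift) must stay as it is.
         */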
405         return ((u64)high_crc << 31) ^ (u64)low_crc;
406 }
407 
408 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
409                                      struct btrfs_extent_data_ref *ref)
410 {
411         return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
412                                     btrfs_extent_data_ref_objectid(leaf, ref),
413                                     btrfs_extent_data_ref_offset(leaf, ref));
414 }
415 
416 static int match_extent_data_ref(struct extent_buffer *leaf,
417                                  struct btrfs_extent_data_ref *ref,
418                                  u64 root_objectid, u64 owner, u64 offset)
419 {
420         if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
421             btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
422             btrfs_extent_data_ref_offset(leaf, ref) != offset)
423                 return 0;
424         return 1;
425 }
426 
427 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
428                                            struct btrfs_path *path,
429                                            u64 bytenr, u64 parent,
430                                            u64 root_objectid,
431                                            u64 owner, u64 offset)
432 {
433         struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
434         struct btrfs_key key;
435         struct btrfs_extent_data_ref *ref;
436         struct extent_buffer *leaf;
437         u32 nritems;
438         int recow;
439         int ret;
440 
441         key.objectid = bytenr;
442         if (parent) {
443                 key.type = BTRFS_SHARED_DATA_REF_KEY;
444                 key.offset = parent;
445         } else {
446                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
447                 key.offset = hash_extent_data_ref(root_objectid,
448                                                   owner, offset);
449         }
450 again:
451         recow = 0;
452         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
453         if (ret < 0)
454                 return ret;
455 
456         if (parent) {
457                 if (ret)
458                         return -ENOENT;
459                 return 0;
460         }
461 
462         ret = -ENOENT;
463         leaf = path->nodes[0];
464         nritems = btrfs_header_nritems(leaf);
465         while (1) {
466                 if (path->slots[0] >= nritems) {
467                         ret = btrfs_next_leaf(root, path);
468                         if (ret) {
469                                 if (ret > 0)
470                                         return -ENOENT;
471                                 return ret;
472                         }
473 
474                         leaf = path->nodes[0];
475                         nritems = btrfs_header_nritems(leaf);
476                         recow = 1;
477                 }
478 
479                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
480                 if (key.objectid != bytenr ||
481                     key.type != BTRFS_EXTENT_DATA_REF_KEY)
482                         goto fail;
483 
484                 ref = btrfs_item_ptr(leaf, path->slots[0],
485                                      struct btrfs_extent_data_ref);
486 
487                 if (match_extent_data_ref(leaf, ref, root_objectid,
488                                           owner, offset)) {
489                         if (recow) {
490                                 btrfs_release_path(path);
491                                 goto again;
492                         }
493                         ret = 0;
494                         break;
495                 }
496                 path->slots[0]++;
497         }
498 fail:
499         return ret;
500 }
501 
502 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
503                                            struct btrfs_path *path,
504                                            struct btrfs_delayed_ref_node *node,
505                                            u64 bytenr)
506 {
507         struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
508         struct btrfs_key key;
509         struct extent_buffer *leaf;
510         u64 owner = btrfs_delayed_ref_owner(node);
511         u64 offset = btrfs_delayed_ref_offset(node);
512         u32 size;
513         u32 num_refs;
514         int ret;
515 
516         key.objectid = bytenr;
517         if (node->parent) {
518                 key.type = BTRFS_SHARED_DATA_REF_KEY;
519                 key.offset = node->parent;
520                 size = sizeof(struct btrfs_shared_data_ref);
521         } else {
522                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
523                 key.offset = hash_extent_data_ref(node->ref_root, owner, offset);
524                 size = sizeof(struct btrfs_extent_data_ref);
525         }
526 
527         ret = btrfs_insert_empty_item(trans, root, path, &key, size);
528         if (ret && ret != -EEXIST)
529                 goto fail;
530 
531         leaf = path->nodes[0];
532         if (node->parent) {
533                 struct btrfs_shared_data_ref *ref;
534                 ref = btrfs_item_ptr(leaf, path->slots[0],
535                                      struct btrfs_shared_data_ref);
536                 if (ret == 0) {
537                         btrfs_set_shared_data_ref_count(leaf, ref, node->ref_mod);
538                 } else {
539                         num_refs = btrfs_shared_data_ref_count(leaf, ref);
540                         num_refs += node->ref_mod;
541                         btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
542                 }
543         } else {
544                 struct btrfs_extent_data_ref *ref;
545                 while (ret == -EEXIST) {
546                         ref = btrfs_item_ptr(leaf, path->slots[0],
547                                              struct btrfs_extent_data_ref);
548                         if (match_extent_data_ref(leaf, ref, node->ref_root,
549                                                   owner, offset))
550                                 break;
551                         btrfs_release_path(path);
552                         key.offset++;
553                         ret = btrfs_insert_empty_item(trans, root, path, &key,
554                                                       size);
555                         if (ret && ret != -EEXIST)
556                                 goto fail;
557 
558                         leaf = path->nodes[0];
559                 }
560                 ref = btrfs_item_ptr(leaf, path->slots[0],
561                                      struct btrfs_extent_data_ref);
562                 if (ret == 0) {
563                         btrfs_set_extent_data_ref_root(leaf, ref, node->ref_root);
564                         btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
565                         btrfs_set_extent_data_ref_offset(leaf, ref, offset);
566                         btrfs_set_extent_data_ref_count(leaf, ref, node->ref_mod);
567                 } else {
568                         num_refs = btrfs_extent_data_ref_count(leaf, ref);
569                         num_refs += node->ref_mod;
570                         btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
571                 }
572         }
573         btrfs_mark_buffer_dirty(trans, leaf);
574         ret = 0;
575 fail:
576         btrfs_release_path(path);
577         return ret;
578 }
579 
580 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
581                                            struct btrfs_root *root,
582                                            struct btrfs_path *path,
583                                            int refs_to_drop)
584 {
585         struct btrfs_key key;
586         struct btrfs_extent_data_ref *ref1 = NULL;
587         struct btrfs_shared_data_ref *ref2 = NULL;
588         struct extent_buffer *leaf;
589         u32 num_refs = 0;
590         int ret = 0;
591 
592         leaf = path->nodes[0];
593         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
594 
595         if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
596                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
597                                       struct btrfs_extent_data_ref);
598                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
599         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
600                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
601                                       struct btrfs_shared_data_ref);
602                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
603         } else {
604                 btrfs_err(trans->fs_info,
605                           "unrecognized backref key (%llu %u %llu)",
606                           key.objectid, key.type, key.offset);
607                 btrfs_abort_transaction(trans, -EUCLEAN);
608                 return -EUCLEAN;
609         }
610 
611         BUG_ON(num_refs < refs_to_drop);
612         num_refs -= refs_to_drop;
613 
614         if (num_refs == 0) {
615                 ret = btrfs_del_item(trans, root, path);
616         } else {
617                 if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
618                         btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
619                 else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
620                         btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
621                 btrfs_mark_buffer_dirty(trans, leaf);
622         }
623         return ret;
624 }
625 
626 static noinline u32 extent_data_ref_count(struct btrfs_path *path,
627                                           struct btrfs_extent_inline_ref *iref)
628 {
629         struct btrfs_key key;
630         struct extent_buffer *leaf;
631         struct btrfs_extent_data_ref *ref1;
632         struct btrfs_shared_data_ref *ref2;
633         u32 num_refs = 0;
634         int type;
635 
636         leaf = path->nodes[0];
637         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
638 
639         if (iref) {
640                 /*
641                  * If type is invalid, we should have bailed out earlier than
642                  * this call.
643                  */
644                 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
645                 ASSERT(type != BTRFS_REF_TYPE_INVALID);
646                 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
647                         ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
648                         num_refs = btrfs_extent_data_ref_count(leaf, ref1);
649                 } else {
650                         ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
651                         num_refs = btrfs_shared_data_ref_count(leaf, ref2);
652                 }
653         } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
654                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
655                                       struct btrfs_extent_data_ref);
656                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
657         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
658                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
659                                       struct btrfs_shared_data_ref);
660                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
661         } else {
662                 WARN_ON(1);
663         }
664         return num_refs;
665 }
666 
667 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
668                                           struct btrfs_path *path,
669                                           u64 bytenr, u64 parent,
670                                           u64 root_objectid)
671 {
672         struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
673         struct btrfs_key key;
674         int ret;
675 
676         key.objectid = bytenr;
677         if (parent) {
678                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
679                 key.offset = parent;
680         } else {
681                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
682                 key.offset = root_objectid;
683         }
684 
685         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
686         if (ret > 0)
687                 ret = -ENOENT;
688         return ret;
689 }
690 
691 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
692                                           struct btrfs_path *path,
693                                           struct btrfs_delayed_ref_node *node,
694                                           u64 bytenr)
695 {
696         struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
697         struct btrfs_key key;
698         int ret;
699 
700         key.objectid = bytenr;
701         if (node->parent) {
702                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
703                 key.offset = node->parent;
704         } else {
705                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
706                 key.offset = node->ref_root;
707         }
708 
709         ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
710         btrfs_release_path(path);
711         return ret;
712 }
713 
714 static inline int extent_ref_type(u64 parent, u64 owner)
715 {
716         int type;
717         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
718                 if (parent > 0)
719                         type = BTRFS_SHARED_BLOCK_REF_KEY;
720                 else
721                         type = BTRFS_TREE_BLOCK_REF_KEY;
722         } else {
723                 if (parent > 0)
724                         type = BTRFS_SHARED_DATA_REF_KEY;
725                 else
726                         type = BTRFS_EXTENT_DATA_REF_KEY;
727         }
728         return type;
729 }
730 
731 static int find_next_key(struct btrfs_path *path, int level,
732                          struct btrfs_key *key)
733 
734 {
735         for (; level < BTRFS_MAX_LEVEL; level++) {
736                 if (!path->nodes[level])
737                         break;
738                 if (path->slots[level] + 1 >=
739                     btrfs_header_nritems(path->nodes[level]))
740                         continue;
741                 if (level == 0)
742                         btrfs_item_key_to_cpu(path->nodes[level], key,
743                                               path->slots[level] + 1);
744                 else
745                         btrfs_node_key_to_cpu(path->nodes[level], key,
746                                               path->slots[level] + 1);
747                 return 0;
748         }
749         return 1;
750 }
751 
752 /*
 753  * look for an inline back ref. if the back ref is found, *ref_ret is set
 754  * to the address of the inline back ref, and 0 is returned.
 755  *
 756  * if the back ref isn't found, *ref_ret is set to the address where it
 757  * should be inserted, and -ENOENT is returned.
758  *
759  * if insert is true and there are too many inline back refs, the path
760  * points to the extent item, and -EAGAIN is returned.
761  *
762  * NOTE: inline back refs are ordered in the same way that back ref
763  *       items in the tree are ordered.
764  */
765 static noinline_for_stack
766 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
767                                  struct btrfs_path *path,
768                                  struct btrfs_extent_inline_ref **ref_ret,
769                                  u64 bytenr, u64 num_bytes,
770                                  u64 parent, u64 root_objectid,
771                                  u64 owner, u64 offset, int insert)
772 {
773         struct btrfs_fs_info *fs_info = trans->fs_info;
774         struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr);
775         struct btrfs_key key;
776         struct extent_buffer *leaf;
777         struct btrfs_extent_item *ei;
778         struct btrfs_extent_inline_ref *iref;
779         u64 flags;
780         u64 item_size;
781         unsigned long ptr;
782         unsigned long end;
783         int extra_size;
784         int type;
785         int want;
786         int ret;
787         bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
788         int needed;
789 
790         key.objectid = bytenr;
791         key.type = BTRFS_EXTENT_ITEM_KEY;
792         key.offset = num_bytes;
793 
794         want = extent_ref_type(parent, owner);
795         if (insert) {
796                 extra_size = btrfs_extent_inline_ref_size(want);
797                 path->search_for_extension = 1;
798                 path->keep_locks = 1;
799         } else
800                 extra_size = -1;
801 
802         /*
803          * Owner is our level, so we can just add one to get the level for the
804          * block we are interested in.
805          */
806         if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
807                 key.type = BTRFS_METADATA_ITEM_KEY;
808                 key.offset = owner;
809         }
810 
811 again:
812         ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
813         if (ret < 0)
814                 goto out;
815 
816         /*
817          * We may be a newly converted file system which still has the old fat
818          * extent entries for metadata, so try and see if we have one of those.
819          */
820         if (ret > 0 && skinny_metadata) {
821                 skinny_metadata = false;
822                 if (path->slots[0]) {
823                         path->slots[0]--;
824                         btrfs_item_key_to_cpu(path->nodes[0], &key,
825                                               path->slots[0]);
826                         if (key.objectid == bytenr &&
827                             key.type == BTRFS_EXTENT_ITEM_KEY &&
828                             key.offset == num_bytes)
829                                 ret = 0;
830                 }
831                 if (ret) {
832                         key.objectid = bytenr;
833                         key.type = BTRFS_EXTENT_ITEM_KEY;
834                         key.offset = num_bytes;
835                         btrfs_release_path(path);
836                         goto again;
837                 }
838         }
839 
840         if (ret && !insert) {
841                 ret = -ENOENT;
842                 goto out;
843         } else if (WARN_ON(ret)) {
844                 btrfs_print_leaf(path->nodes[0]);
845                 btrfs_err(fs_info,
846 "extent item not found for insert, bytenr %llu num_bytes %llu parent %llu root_objectid %llu owner %llu offset %llu",
847                           bytenr, num_bytes, parent, root_objectid, owner,
848                           offset);
849                 ret = -EUCLEAN;
850                 goto out;
851         }
852 
853         leaf = path->nodes[0];
854         item_size = btrfs_item_size(leaf, path->slots[0]);
855         if (unlikely(item_size < sizeof(*ei))) {
856                 ret = -EUCLEAN;
857                 btrfs_err(fs_info,
858                           "unexpected extent item size, has %llu expect >= %zu",
859                           item_size, sizeof(*ei));
860                 btrfs_abort_transaction(trans, ret);
861                 goto out;
862         }
863 
864         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
865         flags = btrfs_extent_flags(leaf, ei);
866 
867         ptr = (unsigned long)(ei + 1);
868         end = (unsigned long)ei + item_size;
869 
870         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
871                 ptr += sizeof(struct btrfs_tree_block_info);
872                 BUG_ON(ptr > end);
873         }
874 
875         if (owner >= BTRFS_FIRST_FREE_OBJECTID)
876                 needed = BTRFS_REF_TYPE_DATA;
877         else
878                 needed = BTRFS_REF_TYPE_BLOCK;
879 
880         ret = -ENOENT;
881         while (ptr < end) {
882                 iref = (struct btrfs_extent_inline_ref *)ptr;
883                 type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
884                 if (type == BTRFS_EXTENT_OWNER_REF_KEY) {
885                         ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
886                         ptr += btrfs_extent_inline_ref_size(type);
887                         continue;
888                 }
889                 if (type == BTRFS_REF_TYPE_INVALID) {
890                         ret = -EUCLEAN;
891                         goto out;
892                 }
893 
894                 if (want < type)
895                         break;
896                 if (want > type) {
897                         ptr += btrfs_extent_inline_ref_size(type);
898                         continue;
899                 }
900 
901                 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
902                         struct btrfs_extent_data_ref *dref;
903                         dref = (struct btrfs_extent_data_ref *)(&iref->offset);
904                         if (match_extent_data_ref(leaf, dref, root_objectid,
905                                                   owner, offset)) {
906                                 ret = 0;
907                                 break;
908                         }
909                         if (hash_extent_data_ref_item(leaf, dref) <
910                             hash_extent_data_ref(root_objectid, owner, offset))
911                                 break;
912                 } else {
913                         u64 ref_offset;
914                         ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
915                         if (parent > 0) {
916                                 if (parent == ref_offset) {
917                                         ret = 0;
918                                         break;
919                                 }
920                                 if (ref_offset < parent)
921                                         break;
922                         } else {
923                                 if (root_objectid == ref_offset) {
924                                         ret = 0;
925                                         break;
926                                 }
927                                 if (ref_offset < root_objectid)
928                                         break;
929                         }
930                 }
931                 ptr += btrfs_extent_inline_ref_size(type);
932         }
933 
934         if (unlikely(ptr > end)) {
935                 ret = -EUCLEAN;
936                 btrfs_print_leaf(path->nodes[0]);
937                 btrfs_crit(fs_info,
938 "overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu",
939                            path->slots[0], root_objectid, owner, offset, parent);
940                 goto out;
941         }
942 
943         if (ret == -ENOENT && insert) {
944                 if (item_size + extra_size >=
945                     BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
946                         ret = -EAGAIN;
947                         goto out;
948                 }
949                 /*
 950                  * To add a new inline back ref, we have to make sure
 951                  * there is no corresponding back ref item.
 952                  * For simplicity, we just do not add a new inline back
 953                  * ref if there is any kind of item for this block.
954                  */
955                 if (find_next_key(path, 0, &key) == 0 &&
956                     key.objectid == bytenr &&
957                     key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
958                         ret = -EAGAIN;
959                         goto out;
960                 }
961         }
962         *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
963 out:
964         if (insert) {
965                 path->keep_locks = 0;
966                 path->search_for_extension = 0;
967                 btrfs_unlock_up_safe(path, 1);
968         }
969         return ret;
970 }
971 
972 /*
 973  * helper to add a new inline back ref
974  */
975 static noinline_for_stack
976 void setup_inline_extent_backref(struct btrfs_trans_handle *trans,
977                                  struct btrfs_path *path,
978                                  struct btrfs_extent_inline_ref *iref,
979                                  u64 parent, u64 root_objectid,
980                                  u64 owner, u64 offset, int refs_to_add,
981                                  struct btrfs_delayed_extent_op *extent_op)
982 {
983         struct extent_buffer *leaf;
984         struct btrfs_extent_item *ei;
985         unsigned long ptr;
986         unsigned long end;
987         unsigned long item_offset;
988         u64 refs;
989         int size;
990         int type;
991 
992         leaf = path->nodes[0];
993         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
994         item_offset = (unsigned long)iref - (unsigned long)ei;
995 
996         type = extent_ref_type(parent, owner);
997         size = btrfs_extent_inline_ref_size(type);
998 
999         btrfs_extend_item(trans, path, size);
1000 
1001         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1002         refs = btrfs_extent_refs(leaf, ei);
1003         refs += refs_to_add;
1004         btrfs_set_extent_refs(leaf, ei, refs);
1005         if (extent_op)
1006                 __run_delayed_extent_op(extent_op, leaf, ei);
1007 
1008         ptr = (unsigned long)ei + item_offset;
1009         end = (unsigned long)ei + btrfs_item_size(leaf, path->slots[0]);
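        /*
         * btrfs_extend_item() grew the item at its end. Shift any inline
         * refs sitting at or after the insertion point towards the new end
         * so that a gap of 'size' bytes opens up for the new ref.
         */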
1010         if (ptr < end - size)
1011                 memmove_extent_buffer(leaf, ptr + size, ptr,
1012                                       end - size - ptr);
1013 
1014         iref = (struct btrfs_extent_inline_ref *)ptr;
1015         btrfs_set_extent_inline_ref_type(leaf, iref, type);
1016         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1017                 struct btrfs_extent_data_ref *dref;
1018                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1019                 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1020                 btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1021                 btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1022                 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1023         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1024                 struct btrfs_shared_data_ref *sref;
1025                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1026                 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1027                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1028         } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1029                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1030         } else {
1031                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1032         }
1033         btrfs_mark_buffer_dirty(trans, leaf);
1034 }
1035 
1036 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1037                                  struct btrfs_path *path,
1038                                  struct btrfs_extent_inline_ref **ref_ret,
1039                                  u64 bytenr, u64 num_bytes, u64 parent,
1040                                  u64 root_objectid, u64 owner, u64 offset)
1041 {
1042         int ret;
1043 
1044         ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr,
1045                                            num_bytes, parent, root_objectid,
1046                                            owner, offset, 0);
1047         if (ret != -ENOENT)
1048                 return ret;
1049 
1050         btrfs_release_path(path);
1051         *ref_ret = NULL;
1052 
1053         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1054                 ret = lookup_tree_block_ref(trans, path, bytenr, parent,
1055                                             root_objectid);
1056         } else {
1057                 ret = lookup_extent_data_ref(trans, path, bytenr, parent,
1058                                              root_objectid, owner, offset);
1059         }
1060         return ret;
1061 }
1062 
1063 /*
1064  * helper to update/remove inline back ref
1065  */
1066 static noinline_for_stack int update_inline_extent_backref(
1067                                   struct btrfs_trans_handle *trans,
1068                                   struct btrfs_path *path,
1069                                   struct btrfs_extent_inline_ref *iref,
1070                                   int refs_to_mod,
1071                                   struct btrfs_delayed_extent_op *extent_op)
1072 {
1073         struct extent_buffer *leaf = path->nodes[0];
1074         struct btrfs_fs_info *fs_info = leaf->fs_info;
1075         struct btrfs_extent_item *ei;
1076         struct btrfs_extent_data_ref *dref = NULL;
1077         struct btrfs_shared_data_ref *sref = NULL;
1078         unsigned long ptr;
1079         unsigned long end;
1080         u32 item_size;
1081         int size;
1082         int type;
1083         u64 refs;
1084 
1085         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1086         refs = btrfs_extent_refs(leaf, ei);
1087         if (unlikely(refs_to_mod < 0 && refs + refs_to_mod <= 0)) {
1088                 struct btrfs_key key;
1089                 u32 extent_size;
1090 
1091                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1092                 if (key.type == BTRFS_METADATA_ITEM_KEY)
1093                         extent_size = fs_info->nodesize;
1094                 else
1095                         extent_size = key.offset;
1096                 btrfs_print_leaf(leaf);
1097                 btrfs_err(fs_info,
1098         "invalid refs_to_mod for extent %llu num_bytes %u, has %d expect >= -%llu",
1099                           key.objectid, extent_size, refs_to_mod, refs);
1100                 return -EUCLEAN;
1101         }
1102         refs += refs_to_mod;
1103         btrfs_set_extent_refs(leaf, ei, refs);
1104         if (extent_op)
1105                 __run_delayed_extent_op(extent_op, leaf, ei);
1106 
1107         type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
1108         /*
1109          * Function btrfs_get_extent_inline_ref_type() has already printed
1110          * error messages.
1111          */
1112         if (unlikely(type == BTRFS_REF_TYPE_INVALID))
1113                 return -EUCLEAN;
1114 
1115         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1116                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1117                 refs = btrfs_extent_data_ref_count(leaf, dref);
1118         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1119                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1120                 refs = btrfs_shared_data_ref_count(leaf, sref);
1121         } else {
1122                 refs = 1;
1123                 /*
1124                  * For a tree block we can only drop one ref, and tree
1125                  * blocks should not have refs > 1.
1126                  *
1127                  * Furthermore if we're inserting a new inline backref, we
1128                  * won't reach this path either. That would be
1129                  * setup_inline_extent_backref().
1130                  */
1131                 if (unlikely(refs_to_mod != -1)) {
1132                         struct btrfs_key key;
1133 
1134                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1135 
1136                         btrfs_print_leaf(leaf);
1137                         btrfs_err(fs_info,
1138                         "invalid refs_to_mod for tree block %llu, has %d expect -1",
1139                                   key.objectid, refs_to_mod);
1140                         return -EUCLEAN;
1141                 }
1142         }
1143 
1144         if (unlikely(refs_to_mod < 0 && refs < -refs_to_mod)) {
1145                 struct btrfs_key key;
1146                 u32 extent_size;
1147 
1148                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1149                 if (key.type == BTRFS_METADATA_ITEM_KEY)
1150                         extent_size = fs_info->nodesize;
1151                 else
1152                         extent_size = key.offset;
1153                 btrfs_print_leaf(leaf);
1154                 btrfs_err(fs_info,
1155 "invalid refs_to_mod for backref entry, iref %lu extent %llu num_bytes %u, has %d expect >= -%llu",
1156                           (unsigned long)iref, key.objectid, extent_size,
1157                           refs_to_mod, refs);
1158                 return -EUCLEAN;
1159         }
1160         refs += refs_to_mod;
1161 
1162         if (refs > 0) {
1163                 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1164                         btrfs_set_extent_data_ref_count(leaf, dref, refs);
1165                 else
1166                         btrfs_set_shared_data_ref_count(leaf, sref, refs);
1167         } else {
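                /*
                 * The last reference of this type is gone: remove the inline
                 * ref by copying the tail of the item over it and shrinking
                 * the item by the ref's size.
                 */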
1168                 size =  btrfs_extent_inline_ref_size(type);
1169                 item_size = btrfs_item_size(leaf, path->slots[0]);
1170                 ptr = (unsigned long)iref;
1171                 end = (unsigned long)ei + item_size;
1172                 if (ptr + size < end)
1173                         memmove_extent_buffer(leaf, ptr, ptr + size,
1174                                               end - ptr - size);
1175                 item_size -= size;
1176                 btrfs_truncate_item(trans, path, item_size, 1);
1177         }
1178         btrfs_mark_buffer_dirty(trans, leaf);
1179         return 0;
1180 }
1181 
1182 static noinline_for_stack
1183 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1184                                  struct btrfs_path *path,
1185                                  u64 bytenr, u64 num_bytes, u64 parent,
1186                                  u64 root_objectid, u64 owner,
1187                                  u64 offset, int refs_to_add,
1188                                  struct btrfs_delayed_extent_op *extent_op)
1189 {
1190         struct btrfs_extent_inline_ref *iref;
1191         int ret;
1192 
1193         ret = lookup_inline_extent_backref(trans, path, &iref, bytenr,
1194                                            num_bytes, parent, root_objectid,
1195                                            owner, offset, 1);
1196         if (ret == 0) {
1197                 /*
1198                  * We're adding refs to a tree block we already own; this
1199                  * should not happen at all.
1200                  */
1201                 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1202                         btrfs_print_leaf(path->nodes[0]);
1203                         btrfs_crit(trans->fs_info,
1204 "adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu slot %u",
1205                                    bytenr, num_bytes, root_objectid, path->slots[0]);
1206                         return -EUCLEAN;
1207                 }
1208                 ret = update_inline_extent_backref(trans, path, iref,
1209                                                    refs_to_add, extent_op);
1210         } else if (ret == -ENOENT) {
1211                 setup_inline_extent_backref(trans, path, iref, parent,
1212                                             root_objectid, owner, offset,
1213                                             refs_to_add, extent_op);
1214                 ret = 0;
1215         }
1216         return ret;
1217 }
1218 
1219 static int remove_extent_backref(struct btrfs_trans_handle *trans,
1220                                  struct btrfs_root *root,
1221                                  struct btrfs_path *path,
1222                                  struct btrfs_extent_inline_ref *iref,
1223                                  int refs_to_drop, int is_data)
1224 {
1225         int ret = 0;
1226 
1227         BUG_ON(!is_data && refs_to_drop != 1);
1228         if (iref)
1229                 ret = update_inline_extent_backref(trans, path, iref,
1230                                                    -refs_to_drop, NULL);
1231         else if (is_data)
1232                 ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
1233         else
1234                 ret = btrfs_del_item(trans, root, path);
1235         return ret;
1236 }
1237 
1238 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
1239                                u64 *discarded_bytes)
1240 {
1241         int j, ret = 0;
1242         u64 bytes_left, end;
1243         u64 aligned_start = ALIGN(start, 1 << SECTOR_SHIFT);
1244 
1245         /* Adjust the range to be aligned to 512B sectors if necessary. */
1246         if (start != aligned_start) {
1247                 len -= aligned_start - start;
1248                 len = round_down(len, 1 << SECTOR_SHIFT);
1249                 start = aligned_start;
1250         }
1251 
1252         *discarded_bytes = 0;
1253 
1254         if (!len)
1255                 return 0;
1256 
1257         end = start + len;
1258         bytes_left = len;
1259 
1260         /* Skip any superblocks on this device. */
1261         for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
1262                 u64 sb_start = btrfs_sb_offset(j);
1263                 u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
1264                 u64 size = sb_start - start;
1265 
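                /*
                 * Skip superblock copies that do not overlap the remaining
                 * [start, end) range; only overlapping copies force us to
                 * split the discard around them.
                 */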
1266                 if (!in_range(sb_start, start, bytes_left) &&
1267                     !in_range(sb_end, start, bytes_left) &&
1268                     !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
1269                         continue;
1270 
1271                 /*
1272                  * Superblock spans beginning of range.  Adjust start and
1273                  * try again.
1274                  */
1275                 if (sb_start <= start) {
1276                         start += sb_end - start;
1277                         if (start > end) {
1278                                 bytes_left = 0;
1279                                 break;
1280                         }
1281                         bytes_left = end - start;
1282                         continue;
1283                 }
1284 
1285                 if (size) {
1286                         ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
1287                                                    size >> SECTOR_SHIFT,
1288                                                    GFP_NOFS);
1289                         if (!ret)
1290                                 *discarded_bytes += size;
1291                         else if (ret != -EOPNOTSUPP)
1292                                 return ret;
1293                 }
1294 
1295                 start = sb_end;
1296                 if (start > end) {
1297                         bytes_left = 0;
1298                         break;
1299                 }
1300                 bytes_left = end - start;
1301         }
1302 
1303         if (bytes_left) {
1304                 ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
1305                                            bytes_left >> SECTOR_SHIFT,
1306                                            GFP_NOFS);
1307                 if (!ret)
1308                         *discarded_bytes += bytes_left;
1309         }
1310         return ret;
1311 }
1312 
1313 static int do_discard_extent(struct btrfs_discard_stripe *stripe, u64 *bytes)
1314 {
1315         struct btrfs_device *dev = stripe->dev;
1316         struct btrfs_fs_info *fs_info = dev->fs_info;
1317         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
1318         u64 phys = stripe->physical;
1319         u64 len = stripe->length;
1320         u64 discarded = 0;
1321         int ret = 0;
1322 
1323         /* Zone reset on a zoned filesystem */
1324         if (btrfs_can_zone_reset(dev, phys, len)) {
1325                 u64 src_disc;
1326 
1327                 ret = btrfs_reset_device_zone(dev, phys, len, &discarded);
1328                 if (ret)
1329                         goto out;
1330 
1331                 if (!btrfs_dev_replace_is_ongoing(dev_replace) ||
1332                     dev != dev_replace->srcdev)
1333                         goto out;
1334 
1335                 src_disc = discarded;
1336 
1337                 /* Send to replace target as well */
1338                 ret = btrfs_reset_device_zone(dev_replace->tgtdev, phys, len,
1339                                               &discarded);
1340                 discarded += src_disc;
1341         } else if (bdev_max_discard_sectors(stripe->dev->bdev)) {
1342                 ret = btrfs_issue_discard(dev->bdev, phys, len, &discarded);
1343         } else {
1344                 ret = 0;
1345                 *bytes = 0;
1346         }
1347 
1348 out:
1349         *bytes = discarded;
1350         return ret;
1351 }
1352 
1353 int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
1354                          u64 num_bytes, u64 *actual_bytes)
1355 {
1356         int ret = 0;
1357         u64 discarded_bytes = 0;
1358         u64 end = bytenr + num_bytes;
1359         u64 cur = bytenr;
1360 
1361         /*
1362          * Avoid races with device replace and make sure the devices in the
1363          * stripes don't go away while we are discarding.
1364          */
1365         btrfs_bio_counter_inc_blocked(fs_info);
1366         while (cur < end) {
1367                 struct btrfs_discard_stripe *stripes;
1368                 unsigned int num_stripes;
1369                 int i;
1370 
1371                 num_bytes = end - cur;
1372                 stripes = btrfs_map_discard(fs_info, cur, &num_bytes, &num_stripes);
1373                 if (IS_ERR(stripes)) {
1374                         ret = PTR_ERR(stripes);
1375                         if (ret == -EOPNOTSUPP)
1376                                 ret = 0;
1377                         break;
1378                 }
1379 
1380                 for (i = 0; i < num_stripes; i++) {
1381                         struct btrfs_discard_stripe *stripe = stripes + i;
1382                         u64 bytes;
1383 
1384                         if (!stripe->dev->bdev) {
1385                                 ASSERT(btrfs_test_opt(fs_info, DEGRADED));
1386                                 continue;
1387                         }
1388 
1389                         if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
1390                                         &stripe->dev->dev_state))
1391                                 continue;
1392 
1393                         ret = do_discard_extent(stripe, &bytes);
1394                         if (ret) {
1395                                 /*
1396                                  * Keep going if discard is not supported by the
1397                                  * device.
1398                                  */
1399                                 if (ret != -EOPNOTSUPP)
1400                                         break;
1401                                 ret = 0;
1402                         } else {
1403                                 discarded_bytes += bytes;
1404                         }
1405                 }
1406                 kfree(stripes);
1407                 if (ret)
1408                         break;
1409                 cur += num_bytes;
1410         }
1411         btrfs_bio_counter_dec(fs_info);
1412         if (actual_bytes)
1413                 *actual_bytes = discarded_bytes;
1414         return ret;
1415 }
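
     /*
      * Editor's sketch (hypothetical caller, not upstream code): a path that
      * wants to trim a logical range and learn how much was actually
      * discarded could do roughly the following:
      *
      *     u64 trimmed = 0;
      *     int ret;
      *
      *     ret = btrfs_discard_extent(fs_info, logical_start, logical_len,
      *                                &trimmed);
      *
      * -EOPNOTSUPP from unsupported devices is already swallowed above, so a
      * non-zero return here is a real error.  This is essentially the pattern
      * used when unpinning extents at commit time with discard enabled.
      */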
1416 
1417 /* Can return -ENOMEM */
1418 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1419                          struct btrfs_ref *generic_ref)
1420 {
1421         struct btrfs_fs_info *fs_info = trans->fs_info;
1422         int ret;
1423 
1424         ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
1425                generic_ref->action);
1426         BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
1427                generic_ref->ref_root == BTRFS_TREE_LOG_OBJECTID);
1428 
1429         if (generic_ref->type == BTRFS_REF_METADATA)
1430                 ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL);
1431         else
1432                 ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0);
1433 
1434         btrfs_ref_tree_mod(fs_info, generic_ref);
1435 
1436         return ret;
1437 }
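
     /*
      * Editor's sketch: how callers typically build the btrfs_ref argument,
      * modeled on __btrfs_mod_ref() further down in this file.  The names
      * disk_bytenr, disk_num_bytes, root_id, ino and file_offset are
      * placeholders, not real variables:
      *
      *     struct btrfs_ref ref = {
      *             .action = BTRFS_ADD_DELAYED_REF,
      *             .bytenr = disk_bytenr,
      *             .num_bytes = disk_num_bytes,
      *             .parent = 0,
      *             .ref_root = root_id,
      *             .owning_root = root_id,
      *     };
      *
      *     btrfs_init_data_ref(&ref, ino, file_offset, root_id, false);
      *     ret = btrfs_inc_extent_ref(trans, &ref);
      *
      * The matching drop uses BTRFS_DROP_DELAYED_REF and btrfs_free_extent().
      */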
1438 
1439 /*
1440  * Insert backreference for a given extent.
1441  *
1442  * The counterpart is in __btrfs_free_extent(), with examples and more details
1443  * on how it works.
1444  *
1445  * @trans:          Handle of transaction
1446  *
1447  * @node:           The delayed ref node used to get the bytenr/length for
1448  *                  extent whose references are incremented.
1449  *
1450  * @extent_op:      Pointer to a structure holding information necessary when
1451  *                  updating a tree block's flags
1452  *
1453  */
1454 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1455                                   struct btrfs_delayed_ref_node *node,
1456                                   struct btrfs_delayed_extent_op *extent_op)
1457 {
1458         struct btrfs_path *path;
1459         struct extent_buffer *leaf;
1460         struct btrfs_extent_item *item;
1461         struct btrfs_key key;
1462         u64 bytenr = node->bytenr;
1463         u64 num_bytes = node->num_bytes;
1464         u64 owner = btrfs_delayed_ref_owner(node);
1465         u64 offset = btrfs_delayed_ref_offset(node);
1466         u64 refs;
1467         int refs_to_add = node->ref_mod;
1468         int ret;
1469 
1470         path = btrfs_alloc_path();
1471         if (!path)
1472                 return -ENOMEM;
1473 
1474         /* this will set up the path even if it fails to insert the back ref */
1475         ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
1476                                            node->parent, node->ref_root, owner,
1477                                            offset, refs_to_add, extent_op);
1478         if ((ret < 0 && ret != -EAGAIN) || !ret)
1479                 goto out;
1480 
1481         /*
1482          * Ok we had -EAGAIN which means we didn't have space to insert an
1483          * inline extent ref, so just update the reference count and add a
1484          * normal backref.
1485          */
1486         leaf = path->nodes[0];
1487         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1488         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1489         refs = btrfs_extent_refs(leaf, item);
1490         btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
1491         if (extent_op)
1492                 __run_delayed_extent_op(extent_op, leaf, item);
1493 
1494         btrfs_mark_buffer_dirty(trans, leaf);
1495         btrfs_release_path(path);
1496 
1497         /* now insert the actual backref */
1498         if (owner < BTRFS_FIRST_FREE_OBJECTID)
1499                 ret = insert_tree_block_ref(trans, path, node, bytenr);
1500         else
1501                 ret = insert_extent_data_ref(trans, path, node, bytenr);
1502 
1503         if (ret)
1504                 btrfs_abort_transaction(trans, ret);
1505 out:
1506         btrfs_free_path(path);
1507         return ret;
1508 }
1509 
1510 static void free_head_ref_squota_rsv(struct btrfs_fs_info *fs_info,
1511                                      struct btrfs_delayed_ref_head *href)
1512 {
1513         u64 root = href->owning_root;
1514 
1515         /*
1516          * Don't check must_insert_reserved, as this is called from contexts
1517          * where it has already been unset.
1518          */
1519         if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE ||
1520             !href->is_data || !is_fstree(root))
1521                 return;
1522 
1523         btrfs_qgroup_free_refroot(fs_info, root, href->reserved_bytes,
1524                                   BTRFS_QGROUP_RSV_DATA);
1525 }
1526 
1527 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
1528                                 struct btrfs_delayed_ref_head *href,
1529                                 struct btrfs_delayed_ref_node *node,
1530                                 struct btrfs_delayed_extent_op *extent_op,
1531                                 bool insert_reserved)
1532 {
1533         int ret = 0;
1534         u64 parent = 0;
1535         u64 flags = 0;
1536 
1537         trace_run_delayed_data_ref(trans->fs_info, node);
1538 
1539         if (node->type == BTRFS_SHARED_DATA_REF_KEY)
1540                 parent = node->parent;
1541 
1542         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1543                 struct btrfs_key key;
1544                 struct btrfs_squota_delta delta = {
1545                         .root = href->owning_root,
1546                         .num_bytes = node->num_bytes,
1547                         .is_data = true,
1548                         .is_inc = true,
1549                         .generation = trans->transid,
1550                 };
1551                 u64 owner = btrfs_delayed_ref_owner(node);
1552                 u64 offset = btrfs_delayed_ref_offset(node);
1553 
1554                 if (extent_op)
1555                         flags |= extent_op->flags_to_set;
1556 
1557                 key.objectid = node->bytenr;
1558                 key.type = BTRFS_EXTENT_ITEM_KEY;
1559                 key.offset = node->num_bytes;
1560 
1561                 ret = alloc_reserved_file_extent(trans, parent, node->ref_root,
1562                                                  flags, owner, offset, &key,
1563                                                  node->ref_mod,
1564                                                  href->owning_root);
1565                 free_head_ref_squota_rsv(trans->fs_info, href);
1566                 if (!ret)
1567                         ret = btrfs_record_squota_delta(trans->fs_info, &delta);
1568         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1569                 ret = __btrfs_inc_extent_ref(trans, node, extent_op);
1570         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
1571                 ret = __btrfs_free_extent(trans, href, node, extent_op);
1572         } else {
1573                 BUG();
1574         }
1575         return ret;
1576 }
1577 
1578 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
1579                                     struct extent_buffer *leaf,
1580                                     struct btrfs_extent_item *ei)
1581 {
1582         u64 flags = btrfs_extent_flags(leaf, ei);
1583         if (extent_op->update_flags) {
1584                 flags |= extent_op->flags_to_set;
1585                 btrfs_set_extent_flags(leaf, ei, flags);
1586         }
1587 
1588         if (extent_op->update_key) {
1589                 struct btrfs_tree_block_info *bi;
1590                 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
1591                 bi = (struct btrfs_tree_block_info *)(ei + 1);
1592                 btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
1593         }
1594 }
1595 
1596 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
1597                                  struct btrfs_delayed_ref_head *head,
1598                                  struct btrfs_delayed_extent_op *extent_op)
1599 {
1600         struct btrfs_fs_info *fs_info = trans->fs_info;
1601         struct btrfs_root *root;
1602         struct btrfs_key key;
1603         struct btrfs_path *path;
1604         struct btrfs_extent_item *ei;
1605         struct extent_buffer *leaf;
1606         u32 item_size;
1607         int ret;
1608         int metadata = 1;
1609 
1610         if (TRANS_ABORTED(trans))
1611                 return 0;
1612 
1613         if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
1614                 metadata = 0;
1615 
1616         path = btrfs_alloc_path();
1617         if (!path)
1618                 return -ENOMEM;
1619 
1620         key.objectid = head->bytenr;
1621 
1622         if (metadata) {
1623                 key.type = BTRFS_METADATA_ITEM_KEY;
1624                 key.offset = head->level;
1625         } else {
1626                 key.type = BTRFS_EXTENT_ITEM_KEY;
1627                 key.offset = head->num_bytes;
1628         }
1629 
1630         root = btrfs_extent_root(fs_info, key.objectid);
1631 again:
1632         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1633         if (ret < 0) {
1634                 goto out;
1635         } else if (ret > 0) {
1636                 if (metadata) {
1637                         if (path->slots[0] > 0) {
1638                                 path->slots[0]--;
1639                                 btrfs_item_key_to_cpu(path->nodes[0], &key,
1640                                                       path->slots[0]);
1641                                 if (key.objectid == head->bytenr &&
1642                                     key.type == BTRFS_EXTENT_ITEM_KEY &&
1643                                     key.offset == head->num_bytes)
1644                                         ret = 0;
1645                         }
1646                         if (ret > 0) {
1647                                 btrfs_release_path(path);
1648                                 metadata = 0;
1649 
1650                                 key.objectid = head->bytenr;
1651                                 key.offset = head->num_bytes;
1652                                 key.type = BTRFS_EXTENT_ITEM_KEY;
1653                                 goto again;
1654                         }
1655                 } else {
1656                         ret = -EUCLEAN;
1657                         btrfs_err(fs_info,
1658                   "missing extent item for extent %llu num_bytes %llu level %d",
1659                                   head->bytenr, head->num_bytes, head->level);
1660                         goto out;
1661                 }
1662         }
1663 
1664         leaf = path->nodes[0];
1665         item_size = btrfs_item_size(leaf, path->slots[0]);
1666 
1667         if (unlikely(item_size < sizeof(*ei))) {
1668                 ret = -EUCLEAN;
1669                 btrfs_err(fs_info,
1670                           "unexpected extent item size, has %u expect >= %zu",
1671                           item_size, sizeof(*ei));
1672                 btrfs_abort_transaction(trans, ret);
1673                 goto out;
1674         }
1675 
1676         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1677         __run_delayed_extent_op(extent_op, leaf, ei);
1678 
1679         btrfs_mark_buffer_dirty(trans, leaf);
1680 out:
1681         btrfs_free_path(path);
1682         return ret;
1683 }
1684 
1685 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
1686                                 struct btrfs_delayed_ref_head *href,
1687                                 struct btrfs_delayed_ref_node *node,
1688                                 struct btrfs_delayed_extent_op *extent_op,
1689                                 bool insert_reserved)
1690 {
1691         int ret = 0;
1692         struct btrfs_fs_info *fs_info = trans->fs_info;
1693         u64 parent = 0;
1694         u64 ref_root = 0;
1695 
1696         trace_run_delayed_tree_ref(trans->fs_info, node);
1697 
1698         if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
1699                 parent = node->parent;
1700         ref_root = node->ref_root;
1701 
1702         if (unlikely(node->ref_mod != 1)) {
1703                 btrfs_err(trans->fs_info,
1704         "btree block %llu has %d references rather than 1: action %d ref_root %llu parent %llu",
1705                           node->bytenr, node->ref_mod, node->action, ref_root,
1706                           parent);
1707                 return -EUCLEAN;
1708         }
1709         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1710                 struct btrfs_squota_delta delta = {
1711                         .root = href->owning_root,
1712                         .num_bytes = fs_info->nodesize,
1713                         .is_data = false,
1714                         .is_inc = true,
1715                         .generation = trans->transid,
1716                 };
1717 
1718                 ret = alloc_reserved_tree_block(trans, node, extent_op);
1719                 if (!ret)
1720                         btrfs_record_squota_delta(fs_info, &delta);
1721         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1722                 ret = __btrfs_inc_extent_ref(trans, node, extent_op);
1723         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
1724                 ret = __btrfs_free_extent(trans, href, node, extent_op);
1725         } else {
1726                 BUG();
1727         }
1728         return ret;
1729 }
1730 
1731 /* helper function to actually process a single delayed ref entry */
1732 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
1733                                struct btrfs_delayed_ref_head *href,
1734                                struct btrfs_delayed_ref_node *node,
1735                                struct btrfs_delayed_extent_op *extent_op,
1736                                bool insert_reserved)
1737 {
1738         int ret = 0;
1739 
1740         if (TRANS_ABORTED(trans)) {
1741                 if (insert_reserved) {
1742                         btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
1743                         free_head_ref_squota_rsv(trans->fs_info, href);
1744                 }
1745                 return 0;
1746         }
1747 
1748         if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
1749             node->type == BTRFS_SHARED_BLOCK_REF_KEY)
1750                 ret = run_delayed_tree_ref(trans, href, node, extent_op,
1751                                            insert_reserved);
1752         else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
1753                  node->type == BTRFS_SHARED_DATA_REF_KEY)
1754                 ret = run_delayed_data_ref(trans, href, node, extent_op,
1755                                            insert_reserved);
1756         else if (node->type == BTRFS_EXTENT_OWNER_REF_KEY)
1757                 ret = 0;
1758         else
1759                 BUG();
1760         if (ret && insert_reserved)
1761                 btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
1762         if (ret < 0)
1763                 btrfs_err(trans->fs_info,
1764 "failed to run delayed ref for logical %llu num_bytes %llu type %u action %u ref_mod %d: %d",
1765                           node->bytenr, node->num_bytes, node->type,
1766                           node->action, node->ref_mod, ret);
1767         return ret;
1768 }
1769 
1770 static inline struct btrfs_delayed_ref_node *
1771 select_delayed_ref(struct btrfs_delayed_ref_head *head)
1772 {
1773         struct btrfs_delayed_ref_node *ref;
1774 
1775         if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
1776                 return NULL;
1777 
1778         /*
1779          * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
1780          * This is to prevent a ref count from going down to zero, which would
1781          * delete the extent item from the extent tree while there are still
1782          * references to add; those adds would fail to find the extent item.
1783          */
1784         if (!list_empty(&head->ref_add_list))
1785                 return list_first_entry(&head->ref_add_list,
1786                                 struct btrfs_delayed_ref_node, add_list);
1787 
1788         ref = rb_entry(rb_first_cached(&head->ref_tree),
1789                        struct btrfs_delayed_ref_node, ref_node);
1790         ASSERT(list_empty(&ref->add_list));
1791         return ref;
1792 }
1793 
1794 static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
1795                                       struct btrfs_delayed_ref_head *head)
1796 {
1797         spin_lock(&delayed_refs->lock);
1798         head->processing = false;
1799         delayed_refs->num_heads_ready++;
1800         spin_unlock(&delayed_refs->lock);
1801         btrfs_delayed_ref_unlock(head);
1802 }
1803 
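     /*
      * Editor's note (added for clarity): when the head still has to insert
      * its reserved extent (must_insert_reserved), the pending extent op is
      * dropped here rather than returned, because the flags/key it carries
      * are folded into the extent item when run_delayed_data_ref() or
      * alloc_reserved_tree_block() insert it.  Otherwise the op is handed
      * back to the caller to be run explicitly.
      */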
1804 static struct btrfs_delayed_extent_op *cleanup_extent_op(
1805                                 struct btrfs_delayed_ref_head *head)
1806 {
1807         struct btrfs_delayed_extent_op *extent_op = head->extent_op;
1808 
1809         if (!extent_op)
1810                 return NULL;
1811 
1812         if (head->must_insert_reserved) {
1813                 head->extent_op = NULL;
1814                 btrfs_free_delayed_extent_op(extent_op);
1815                 return NULL;
1816         }
1817         return extent_op;
1818 }
1819 
1820 static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
1821                                      struct btrfs_delayed_ref_head *head)
1822 {
1823         struct btrfs_delayed_extent_op *extent_op;
1824         int ret;
1825 
1826         extent_op = cleanup_extent_op(head);
1827         if (!extent_op)
1828                 return 0;
1829         head->extent_op = NULL;
1830         spin_unlock(&head->lock);
1831         ret = run_delayed_extent_op(trans, head, extent_op);
1832         btrfs_free_delayed_extent_op(extent_op);
1833         return ret ? ret : 1;
1834 }
1835 
1836 u64 btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
1837                                   struct btrfs_delayed_ref_root *delayed_refs,
1838                                   struct btrfs_delayed_ref_head *head)
1839 {
1840         u64 ret = 0;
1841 
1842         /*
1843          * We had csum deletions accounted for in our delayed refs rsv, so we
1844          * need to drop the csum leaves for this update from our delayed_refs_rsv.
1845          */
1846         if (head->total_ref_mod < 0 && head->is_data) {
1847                 int nr_csums;
1848 
1849                 spin_lock(&delayed_refs->lock);
1850                 delayed_refs->pending_csums -= head->num_bytes;
1851                 spin_unlock(&delayed_refs->lock);
1852                 nr_csums = btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);
1853 
1854                 btrfs_delayed_refs_rsv_release(fs_info, 0, nr_csums);
1855 
1856                 ret = btrfs_calc_delayed_ref_csum_bytes(fs_info, nr_csums);
1857         }
1858         /* must_insert_reserved can be set only if we didn't run the head ref. */
1859         if (head->must_insert_reserved)
1860                 free_head_ref_squota_rsv(fs_info, head);
1861 
1862         return ret;
1863 }
1864 
1865 static int cleanup_ref_head(struct btrfs_trans_handle *trans,
1866                             struct btrfs_delayed_ref_head *head,
1867                             u64 *bytes_released)
1868 {
1869 
1870         struct btrfs_fs_info *fs_info = trans->fs_info;
1871         struct btrfs_delayed_ref_root *delayed_refs;
1872         int ret;
1873 
1874         delayed_refs = &trans->transaction->delayed_refs;
1875 
1876         ret = run_and_cleanup_extent_op(trans, head);
1877         if (ret < 0) {
1878                 unselect_delayed_ref_head(delayed_refs, head);
1879                 btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
1880                 return ret;
1881         } else if (ret) {
1882                 return ret;
1883         }
1884 
1885         /*
1886          * Need to drop our head ref lock and re-acquire the delayed ref lock
1887          * and then re-check to make sure nobody got added.
1888          */
1889         spin_unlock(&head->lock);
1890         spin_lock(&delayed_refs->lock);
1891         spin_lock(&head->lock);
1892         if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
1893                 spin_unlock(&head->lock);
1894                 spin_unlock(&delayed_refs->lock);
1895                 return 1;
1896         }
1897         btrfs_delete_ref_head(delayed_refs, head);
1898         spin_unlock(&head->lock);
1899         spin_unlock(&delayed_refs->lock);
1900 
1901         if (head->must_insert_reserved) {
1902                 btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1);
1903                 if (head->is_data) {
1904                         struct btrfs_root *csum_root;
1905 
1906                         csum_root = btrfs_csum_root(fs_info, head->bytenr);
1907                         ret = btrfs_del_csums(trans, csum_root, head->bytenr,
1908                                               head->num_bytes);
1909                 }
1910         }
1911 
1912         *bytes_released += btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
1913 
1914         trace_run_delayed_ref_head(fs_info, head, 0);
1915         btrfs_delayed_ref_unlock(head);
1916         btrfs_put_delayed_ref_head(head);
1917         return ret;
1918 }
1919 
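     /*
      * Added note: select and lock the next delayed ref head to process.
      * Returns NULL when nothing is ready, the locked head on success, or
      * ERR_PTR(-EAGAIN) when the head was freed while we waited for its
      * mutex and the caller should simply pick another one.
      */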
1920 static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
1921                                         struct btrfs_trans_handle *trans)
1922 {
1923         struct btrfs_delayed_ref_root *delayed_refs =
1924                 &trans->transaction->delayed_refs;
1925         struct btrfs_delayed_ref_head *head = NULL;
1926         int ret;
1927 
1928         spin_lock(&delayed_refs->lock);
1929         head = btrfs_select_ref_head(delayed_refs);
1930         if (!head) {
1931                 spin_unlock(&delayed_refs->lock);
1932                 return head;
1933         }
1934 
1935         /*
1936          * Grab the lock that says we are going to process all the refs for
1937          * this head
1938          */
1939         ret = btrfs_delayed_ref_lock(delayed_refs, head);
1940         spin_unlock(&delayed_refs->lock);
1941 
1942         /*
1943          * We may have dropped the spin lock to get the head mutex lock, and
1944          * that might have given someone else time to free the head.  If that's
1945          * true, it has been removed from our list and we can move on.
1946          */
1947         if (ret == -EAGAIN)
1948                 head = ERR_PTR(-EAGAIN);
1949 
1950         return head;
1951 }
1952 
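     /*
      * Added note: run every ref queued on an already locked head.  Returns
      * 0 once the head's ref tree is empty, -EAGAIN if a ref is still blocked
      * by a pending tree mod log sequence (the head is unlocked first), or
      * another negative errno on error.
      */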
1953 static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
1954                                            struct btrfs_delayed_ref_head *locked_ref,
1955                                            u64 *bytes_released)
1956 {
1957         struct btrfs_fs_info *fs_info = trans->fs_info;
1958         struct btrfs_delayed_ref_root *delayed_refs;
1959         struct btrfs_delayed_extent_op *extent_op;
1960         struct btrfs_delayed_ref_node *ref;
1961         bool must_insert_reserved;
1962         int ret;
1963 
1964         delayed_refs = &trans->transaction->delayed_refs;
1965 
1966         lockdep_assert_held(&locked_ref->mutex);
1967         lockdep_assert_held(&locked_ref->lock);
1968 
1969         while ((ref = select_delayed_ref(locked_ref))) {
1970                 if (ref->seq &&
1971                     btrfs_check_delayed_seq(fs_info, ref->seq)) {
1972                         spin_unlock(&locked_ref->lock);
1973                         unselect_delayed_ref_head(delayed_refs, locked_ref);
1974                         return -EAGAIN;
1975                 }
1976 
1977                 rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
1978                 RB_CLEAR_NODE(&ref->ref_node);
1979                 if (!list_empty(&ref->add_list))
1980                         list_del(&ref->add_list);
1981                 /*
1982                  * When we play the delayed ref, also correct the ref_mod on
1983                  * head
1984                  */
1985                 switch (ref->action) {
1986                 case BTRFS_ADD_DELAYED_REF:
1987                 case BTRFS_ADD_DELAYED_EXTENT:
1988                         locked_ref->ref_mod -= ref->ref_mod;
1989                         break;
1990                 case BTRFS_DROP_DELAYED_REF:
1991                         locked_ref->ref_mod += ref->ref_mod;
1992                         break;
1993                 default:
1994                         WARN_ON(1);
1995                 }
1996                 atomic_dec(&delayed_refs->num_entries);
1997 
1998                 /*
1999                  * Record the must_insert_reserved flag before we drop the
2000                  * spin lock.
2001                  */
2002                 must_insert_reserved = locked_ref->must_insert_reserved;
2003                 /*
2004                  * Unsetting this on the head ref relinquishes ownership of
2005                  * the rsv_bytes, so it is critical that every possible code
2006                  * path from here forward frees all reserves including qgroup
2007                  * reserve.
2008                  */
2009                 locked_ref->must_insert_reserved = false;
2010 
2011                 extent_op = locked_ref->extent_op;
2012                 locked_ref->extent_op = NULL;
2013                 spin_unlock(&locked_ref->lock);
2014 
2015                 ret = run_one_delayed_ref(trans, locked_ref, ref, extent_op,
2016                                           must_insert_reserved);
2017                 btrfs_delayed_refs_rsv_release(fs_info, 1, 0);
2018                 *bytes_released += btrfs_calc_delayed_ref_bytes(fs_info, 1);
2019 
2020                 btrfs_free_delayed_extent_op(extent_op);
2021                 if (ret) {
2022                         unselect_delayed_ref_head(delayed_refs, locked_ref);
2023                         btrfs_put_delayed_ref(ref);
2024                         return ret;
2025                 }
2026 
2027                 btrfs_put_delayed_ref(ref);
2028                 cond_resched();
2029 
2030                 spin_lock(&locked_ref->lock);
2031                 btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref);
2032         }
2033 
2034         return 0;
2035 }
2036 
2037 /*
2038  * Returns 0 on success or if called with an already aborted transaction.
2039  * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2040  */
2041 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2042                                              u64 min_bytes)
2043 {
2044         struct btrfs_fs_info *fs_info = trans->fs_info;
2045         struct btrfs_delayed_ref_root *delayed_refs;
2046         struct btrfs_delayed_ref_head *locked_ref = NULL;
2047         int ret;
2048         unsigned long count = 0;
2049         unsigned long max_count = 0;
2050         u64 bytes_processed = 0;
2051 
2052         delayed_refs = &trans->transaction->delayed_refs;
2053         if (min_bytes == 0) {
2054                 max_count = delayed_refs->num_heads_ready;
2055                 min_bytes = U64_MAX;
2056         }
2057 
2058         do {
2059                 if (!locked_ref) {
2060                         locked_ref = btrfs_obtain_ref_head(trans);
2061                         if (IS_ERR_OR_NULL(locked_ref)) {
2062                                 if (PTR_ERR(locked_ref) == -EAGAIN) {
2063                                         continue;
2064                                 } else {
2065                                         break;
2066                                 }
2067                         }
2068                         count++;
2069                 }
2070                 /*
2071                  * We need to try to merge add/drops of the same ref since we
2072                  * can run into issues with relocate dropping the implicit ref
2073                  * and then it being added back again before the drop can
2074                  * finish.  If we merged anything we need to re-loop so we can
2075                  * get a good ref.
2076                  * Or we can get node references of the same type that weren't
2077                  * merged when created due to bumps in the tree mod seq, and
2078                  * we need to merge them to prevent adding an inline extent
2079                  * backref before dropping it (triggering a BUG_ON at
2080                  * insert_inline_extent_backref()).
2081                  */
2082                 spin_lock(&locked_ref->lock);
2083                 btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref);
2084 
2085                 ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, &bytes_processed);
2086                 if (ret < 0 && ret != -EAGAIN) {
2087                         /*
2088                          * Error, btrfs_run_delayed_refs_for_head already
2089                          * unlocked everything so just bail out
2090                          */
2091                         return ret;
2092                 } else if (!ret) {
2093                         /*
2094                          * Success, perform the usual cleanup of a processed
2095                          * head
2096                          */
2097                         ret = cleanup_ref_head(trans, locked_ref, &bytes_processed);
2098                         if (ret > 0) {
2099                                 /* We dropped our lock, we need to loop. */
2100                                 ret = 0;
2101                                 continue;
2102                         } else if (ret) {
2103                                 return ret;
2104                         }
2105                 }
2106 
2107                 /*
2108                  * Either we succeeded, or btrfs_run_delayed_refs_for_head
2109                  * returned -EAGAIN; either way we need to select another head.
2110                  */
2111 
2112                 locked_ref = NULL;
2113                 cond_resched();
2114         } while ((min_bytes != U64_MAX && bytes_processed < min_bytes) ||
2115                  (max_count > 0 && count < max_count) ||
2116                  locked_ref);
2117 
2118         return 0;
2119 }
2120 
2121 #ifdef SCRAMBLE_DELAYED_REFS
2122 /*
2123  * Normally delayed refs get processed in ascending bytenr order. This
2124  * correlates in most cases to the order added. To expose dependencies on this
2125  * order, we start to process the tree in the middle instead of the beginning.
2126  */
2127 static u64 find_middle(struct rb_root *root)
2128 {
2129         struct rb_node *n = root->rb_node;
2130         struct btrfs_delayed_ref_node *entry;
2131         int alt = 1;
2132         u64 middle;
2133         u64 first = 0, last = 0;
2134 
2135         n = rb_first(root);
2136         if (n) {
2137                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2138                 first = entry->bytenr;
2139         }
2140         n = rb_last(root);
2141         if (n) {
2142                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2143                 last = entry->bytenr;
2144         }
2145         n = root->rb_node;
2146 
2147         while (n) {
2148                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2149                 WARN_ON(!entry->in_tree);
2150 
2151                 middle = entry->bytenr;
2152 
2153                 if (alt)
2154                         n = n->rb_left;
2155                 else
2156                         n = n->rb_right;
2157 
2158                 alt = 1 - alt;
2159         }
2160         return middle;
2161 }
2162 #endif
2163 
2164 /*
2165  * Start processing the delayed reference count updates and extent insertions
2166  * we have queued up so far.
2167  *
2168  * @trans:      Transaction handle.
2169  * @min_bytes:  How many bytes of delayed references to process. After this
2170  *              many bytes we stop processing delayed references if there are
2171  *              any more. If 0 it means to run all existing delayed references,
2172  *              but not new ones added after running all existing ones.
2173  *              Use (u64)-1 (U64_MAX) to run all existing delayed references
2174  *              plus any new ones that are added.
2175  *
2176  * Returns 0 on success or if called with an aborted transaction
2177  * Returns <0 on error and aborts the transaction
2178  */
2179 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, u64 min_bytes)
2180 {
2181         struct btrfs_fs_info *fs_info = trans->fs_info;
2182         struct btrfs_delayed_ref_root *delayed_refs;
2183         int ret;
2184 
2185         /* We'll clean this up in btrfs_cleanup_transaction */
2186         if (TRANS_ABORTED(trans))
2187                 return 0;
2188 
2189         if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
2190                 return 0;
2191 
2192         delayed_refs = &trans->transaction->delayed_refs;
2193 again:
2194 #ifdef SCRAMBLE_DELAYED_REFS
2195         delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2196 #endif
2197         ret = __btrfs_run_delayed_refs(trans, min_bytes);
2198         if (ret < 0) {
2199                 btrfs_abort_transaction(trans, ret);
2200                 return ret;
2201         }
2202 
2203         if (min_bytes == U64_MAX) {
2204                 btrfs_create_pending_block_groups(trans);
2205 
2206                 spin_lock(&delayed_refs->lock);
2207                 if (RB_EMPTY_ROOT(&delayed_refs->href_root.rb_root)) {
2208                         spin_unlock(&delayed_refs->lock);
2209                         return 0;
2210                 }
2211                 spin_unlock(&delayed_refs->lock);
2212 
2213                 cond_resched();
2214                 goto again;
2215         }
2216 
2217         return 0;
2218 }
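
     /*
      * Editor's illustration of the min_bytes convention documented above
      * (nothing beyond the values shown is assumed):
      *
      *     btrfs_run_delayed_refs(trans, 0);        refs queued so far
      *     btrfs_run_delayed_refs(trans, U64_MAX);  everything, incl. new refs
      *     btrfs_run_delayed_refs(trans, SZ_1M);    stop after ~1M worth
      */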
2219 
2220 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2221                                 struct extent_buffer *eb, u64 flags)
2222 {
2223         struct btrfs_delayed_extent_op *extent_op;
2224         int ret;
2225 
2226         extent_op = btrfs_alloc_delayed_extent_op();
2227         if (!extent_op)
2228                 return -ENOMEM;
2229 
2230         extent_op->flags_to_set = flags;
2231         extent_op->update_flags = true;
2232         extent_op->update_key = false;
2233 
2234         ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len,
2235                                           btrfs_header_level(eb), extent_op);
2236         if (ret)
2237                 btrfs_free_delayed_extent_op(extent_op);
2238         return ret;
2239 }
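
     /*
      * Editor's sketch: the COW path uses this helper to switch a tree block
      * to full backref accounting, roughly:
      *
      *     ret = btrfs_set_disk_extent_flags(trans, buf,
      *                                       BTRFS_BLOCK_FLAG_FULL_BACKREF);
      *
      * The flag is only queued as a delayed extent op here; it is written
      * into the extent item later by run_delayed_extent_op() above.
      */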
2240 
2241 static noinline int check_delayed_ref(struct btrfs_root *root,
2242                                       struct btrfs_path *path,
2243                                       u64 objectid, u64 offset, u64 bytenr)
2244 {
2245         struct btrfs_delayed_ref_head *head;
2246         struct btrfs_delayed_ref_node *ref;
2247         struct btrfs_delayed_ref_root *delayed_refs;
2248         struct btrfs_transaction *cur_trans;
2249         struct rb_node *node;
2250         int ret = 0;
2251 
2252         spin_lock(&root->fs_info->trans_lock);
2253         cur_trans = root->fs_info->running_transaction;
2254         if (cur_trans)
2255                 refcount_inc(&cur_trans->use_count);
2256         spin_unlock(&root->fs_info->trans_lock);
2257         if (!cur_trans)
2258                 return 0;
2259 
2260         delayed_refs = &cur_trans->delayed_refs;
2261         spin_lock(&delayed_refs->lock);
2262         head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
2263         if (!head) {
2264                 spin_unlock(&delayed_refs->lock);
2265                 btrfs_put_transaction(cur_trans);
2266                 return 0;
2267         }
2268 
2269         if (!mutex_trylock(&head->mutex)) {
2270                 if (path->nowait) {
2271                         spin_unlock(&delayed_refs->lock);
2272                         btrfs_put_transaction(cur_trans);
2273                         return -EAGAIN;
2274                 }
2275 
2276                 refcount_inc(&head->refs);
2277                 spin_unlock(&delayed_refs->lock);
2278 
2279                 btrfs_release_path(path);
2280 
2281                 /*
2282                  * Mutex was contended, block until it's released and let
2283                  * caller try again
2284                  */
2285                 mutex_lock(&head->mutex);
2286                 mutex_unlock(&head->mutex);
2287                 btrfs_put_delayed_ref_head(head);
2288                 btrfs_put_transaction(cur_trans);
2289                 return -EAGAIN;
2290         }
2291         spin_unlock(&delayed_refs->lock);
2292 
2293         spin_lock(&head->lock);
2294         /*
2295          * XXX: We should replace this with a proper search function in the
2296          * future.
2297          */
2298         for (node = rb_first_cached(&head->ref_tree); node;
2299              node = rb_next(node)) {
2300                 u64 ref_owner;
2301                 u64 ref_offset;
2302 
2303                 ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
2304                 /* If it's a shared ref we know a cross reference exists */
2305                 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
2306                         ret = 1;
2307                         break;
2308                 }
2309 
2310                 ref_owner = btrfs_delayed_ref_owner(ref);
2311                 ref_offset = btrfs_delayed_ref_offset(ref);
2312 
2313                 /*
2314                  * If our ref doesn't match the one we're currently looking at
2315                  * then we have a cross reference.
2316                  */
2317                 if (ref->ref_root != btrfs_root_id(root) ||
2318                     ref_owner != objectid || ref_offset != offset) {
2319                         ret = 1;
2320                         break;
2321                 }
2322         }
2323         spin_unlock(&head->lock);
2324         mutex_unlock(&head->mutex);
2325         btrfs_put_transaction(cur_trans);
2326         return ret;
2327 }
2328 
2329 static noinline int check_committed_ref(struct btrfs_root *root,
2330                                         struct btrfs_path *path,
2331                                         u64 objectid, u64 offset, u64 bytenr,
2332                                         bool strict)
2333 {
2334         struct btrfs_fs_info *fs_info = root->fs_info;
2335         struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr);
2336         struct extent_buffer *leaf;
2337         struct btrfs_extent_data_ref *ref;
2338         struct btrfs_extent_inline_ref *iref;
2339         struct btrfs_extent_item *ei;
2340         struct btrfs_key key;
2341         u32 item_size;
2342         u32 expected_size;
2343         int type;
2344         int ret;
2345 
2346         key.objectid = bytenr;
2347         key.offset = (u64)-1;
2348         key.type = BTRFS_EXTENT_ITEM_KEY;
2349 
2350         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2351         if (ret < 0)
2352                 goto out;
2353         if (ret == 0) {
2354                 /*
2355                  * Key with offset -1 found; there would have to exist an extent
2356                  * item with such an offset, but that is outside the valid range.
2357                  */
2358                 ret = -EUCLEAN;
2359                 goto out;
2360         }
2361 
2362         ret = -ENOENT;
2363         if (path->slots[0] == 0)
2364                 goto out;
2365 
2366         path->slots[0]--;
2367         leaf = path->nodes[0];
2368         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2369 
2370         if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2371                 goto out;
2372 
2373         ret = 1;
2374         item_size = btrfs_item_size(leaf, path->slots[0]);
2375         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2376         expected_size = sizeof(*ei) + btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY);
2377 
2378         /* No inline refs; we need to bail before checking for owner ref. */
2379         if (item_size == sizeof(*ei))
2380                 goto out;
2381 
2382         /* Check for an owner ref; skip over it to the real inline refs. */
2383         iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2384         type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
2385         if (btrfs_fs_incompat(fs_info, SIMPLE_QUOTA) && type == BTRFS_EXTENT_OWNER_REF_KEY) {
2386                 expected_size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY);
2387                 iref = (struct btrfs_extent_inline_ref *)(iref + 1);
2388         }
2389 
2390         /* If extent item has more than 1 inline ref then it's shared */
2391         if (item_size != expected_size)
2392                 goto out;
2393 
2394         /*
2395          * An extent created before the last snapshot is shared, unless that
2396          * snapshot was deleted. Use this heuristic only if strict is false.
2397          */
2398         if (!strict &&
2399             (btrfs_extent_generation(leaf, ei) <=
2400              btrfs_root_last_snapshot(&root->root_item)))
2401                 goto out;
2402 
2403         /* If this extent has SHARED_DATA_REF then it's shared */
2404         type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
2405         if (type != BTRFS_EXTENT_DATA_REF_KEY)
2406                 goto out;
2407 
2408         ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2409         if (btrfs_extent_refs(leaf, ei) !=
2410             btrfs_extent_data_ref_count(leaf, ref) ||
2411             btrfs_extent_data_ref_root(leaf, ref) != btrfs_root_id(root) ||
2412             btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2413             btrfs_extent_data_ref_offset(leaf, ref) != offset)
2414                 goto out;
2415 
2416         ret = 0;
2417 out:
2418         return ret;
2419 }
2420 
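     /*
      * Added note: returns 0 when no other reference to @bytenr was found
      * (only @objectid/@offset in @root refers to it), a positive value when
      * a cross reference exists or cannot be ruled out, and a negative errno
      * on error.  The loop retries for as long as check_delayed_ref()
      * reports lock contention with -EAGAIN.
      */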
2421 int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
2422                           u64 bytenr, bool strict, struct btrfs_path *path)
2423 {
2424         int ret;
2425 
2426         do {
2427                 ret = check_committed_ref(root, path, objectid,
2428                                           offset, bytenr, strict);
2429                 if (ret && ret != -ENOENT)
2430                         goto out;
2431 
2432                 ret = check_delayed_ref(root, path, objectid, offset, bytenr);
2433         } while (ret == -EAGAIN);
2434 
2435 out:
2436         btrfs_release_path(path);
2437         if (btrfs_is_data_reloc_root(root))
2438                 WARN_ON(ret > 0);
2439         return ret;
2440 }
2441 
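     /*
      * Added note: walk every item of @buf and queue one delayed ref update
      * (an add when @inc is set, a drop otherwise) for each data extent or
      * child tree block it points to.  With @full_backref the refs are
      * recorded against the block itself (parent = buf->start) rather than
      * against the owning root.
      */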
2442 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2443                            struct btrfs_root *root,
2444                            struct extent_buffer *buf,
2445                            int full_backref, int inc)
2446 {
2447         struct btrfs_fs_info *fs_info = root->fs_info;
2448         u64 parent;
2449         u64 ref_root;
2450         u32 nritems;
2451         struct btrfs_key key;
2452         struct btrfs_file_extent_item *fi;
2453         bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC);
2454         int i;
2455         int action;
2456         int level;
2457         int ret = 0;
2458 
2459         if (btrfs_is_testing(fs_info))
2460                 return 0;
2461 
2462         ref_root = btrfs_header_owner(buf);
2463         nritems = btrfs_header_nritems(buf);
2464         level = btrfs_header_level(buf);
2465 
2466         if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && level == 0)
2467                 return 0;
2468 
2469         if (full_backref)
2470                 parent = buf->start;
2471         else
2472                 parent = 0;
2473         if (inc)
2474                 action = BTRFS_ADD_DELAYED_REF;
2475         else
2476                 action = BTRFS_DROP_DELAYED_REF;
2477 
2478         for (i = 0; i < nritems; i++) {
2479                 struct btrfs_ref ref = {
2480                         .action = action,
2481                         .parent = parent,
2482                         .ref_root = ref_root,
2483                 };
2484 
2485                 if (level == 0) {
2486                         btrfs_item_key_to_cpu(buf, &key, i);
2487                         if (key.type != BTRFS_EXTENT_DATA_KEY)
2488                                 continue;
2489                         fi = btrfs_item_ptr(buf, i,
2490                                             struct btrfs_file_extent_item);
2491                         if (btrfs_file_extent_type(buf, fi) ==
2492                             BTRFS_FILE_EXTENT_INLINE)
2493                                 continue;
2494                         ref.bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2495                         if (ref.bytenr == 0)
2496                                 continue;
2497 
2498                         ref.num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
2499                         ref.owning_root = ref_root;
2500 
2501                         key.offset -= btrfs_file_extent_offset(buf, fi);
2502                         btrfs_init_data_ref(&ref, key.objectid, key.offset,
2503                                             btrfs_root_id(root), for_reloc);
2504                         if (inc)
2505                                 ret = btrfs_inc_extent_ref(trans, &ref);
2506                         else
2507                                 ret = btrfs_free_extent(trans, &ref);
2508                         if (ret)
2509                                 goto fail;
2510                 } else {
2511                         /* We don't know the owning_root, leave as 0. */
2512                         ref.bytenr = btrfs_node_blockptr(buf, i);
2513                         ref.num_bytes = fs_info->nodesize;
2514 
2515                         btrfs_init_tree_ref(&ref, level - 1,
2516                                             btrfs_root_id(root), for_reloc);
2517                         if (inc)
2518                                 ret = btrfs_inc_extent_ref(trans, &ref);
2519                         else
2520                                 ret = btrfs_free_extent(trans, &ref);
2521                         if (ret)
2522                                 goto fail;
2523                 }
2524         }
2525         return 0;
2526 fail:
2527         return ret;
2528 }
2529 
2530 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2531                   struct extent_buffer *buf, int full_backref)
2532 {
2533         return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
2534 }
2535 
2536 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2537                   struct extent_buffer *buf, int full_backref)
2538 {
2539         return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
2540 }
2541 
2542 static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
2543 {
2544         struct btrfs_fs_info *fs_info = root->fs_info;
2545         u64 flags;
2546         u64 ret;
2547 
2548         if (data)
2549                 flags = BTRFS_BLOCK_GROUP_DATA;
2550         else if (root == fs_info->chunk_root)
2551                 flags = BTRFS_BLOCK_GROUP_SYSTEM;
2552         else
2553                 flags = BTRFS_BLOCK_GROUP_METADATA;
2554 
2555         ret = btrfs_get_alloc_profile(fs_info, flags);
2556         return ret;
2557 }
2558 
2559 static u64 first_logical_byte(struct btrfs_fs_info *fs_info)
2560 {
2561         struct rb_node *leftmost;
2562         u64 bytenr = 0;
2563 
2564         read_lock(&fs_info->block_group_cache_lock);
2565         /* Get the block group with the lowest logical start address. */
2566         leftmost = rb_first_cached(&fs_info->block_group_cache_tree);
2567         if (leftmost) {
2568                 struct btrfs_block_group *bg;
2569 
2570                 bg = rb_entry(leftmost, struct btrfs_block_group, cache_node);
2571                 bytenr = bg->start;
2572         }
2573         read_unlock(&fs_info->block_group_cache_lock);
2574 
2575         return bytenr;
2576 }
2577 
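     /*
      * Added note: account @num_bytes at @bytenr as pinned in the block group
      * and its space_info (optionally moving it out of the reserved counters)
      * and mark the range EXTENT_DIRTY in the transaction's pinned_extents
      * tree, so the space is only given back to the free space accounting
      * once this transaction commits.
      */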
2578 static int pin_down_extent(struct btrfs_trans_handle *trans,
2579                            struct btrfs_block_group *cache,
2580                            u64 bytenr, u64 num_bytes, int reserved)
2581 {
2582         struct btrfs_fs_info *fs_info = cache->fs_info;
2583 
2584         spin_lock(&cache->space_info->lock);
2585         spin_lock(&cache->lock);
2586         cache->pinned += num_bytes;
2587         btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info,
2588                                              num_bytes);
2589         if (reserved) {
2590                 cache->reserved -= num_bytes;
2591                 cache->space_info->bytes_reserved -= num_bytes;
2592         }
2593         spin_unlock(&cache->lock);
2594         spin_unlock(&cache->space_info->lock);
2595 
2596         set_extent_bit(&trans->transaction->pinned_extents, bytenr,
2597                        bytenr + num_bytes - 1, EXTENT_DIRTY, NULL);
2598         return 0;
2599 }
2600 
2601 int btrfs_pin_extent(struct btrfs_trans_handle *trans,
2602                      u64 bytenr, u64 num_bytes, int reserved)
2603 {
2604         struct btrfs_block_group *cache;
2605 
2606         cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
2607         BUG_ON(!cache); /* Logic error */
2608 
2609         pin_down_extent(trans, cache, bytenr, num_bytes, reserved);
2610 
2611         btrfs_put_block_group(cache);
2612         return 0;
2613 }
2614 
2615 int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
2616                                     const struct extent_buffer *eb)
2617 {
2618         struct btrfs_block_group *cache;
2619         int ret;
2620 
2621         cache = btrfs_lookup_block_group(trans->fs_info, eb->start);
2622         if (!cache)
2623                 return -EINVAL;
2624 
2625         /*
2626          * Fully cache the free space first so that our pin removes the free space
2627          * from the cache.
2628          */
2629         ret = btrfs_cache_block_group(cache, true);
2630         if (ret)
2631                 goto out;
2632 
2633         pin_down_extent(trans, cache, eb->start, eb->len, 0);
2634 
2635         /* remove us from the free space cache (if we're there at all) */
2636         ret = btrfs_remove_free_space(cache, eb->start, eb->len);
2637 out:
2638         btrfs_put_block_group(cache);
2639         return ret;
2640 }
2641 
2642 static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
2643                                    u64 start, u64 num_bytes)
2644 {
2645         int ret;
2646         struct btrfs_block_group *block_group;
2647 
2648         block_group = btrfs_lookup_block_group(fs_info, start);
2649         if (!block_group)
2650                 return -EINVAL;
2651 
2652         ret = btrfs_cache_block_group(block_group, true);
2653         if (ret)
2654                 goto out;
2655 
2656         ret = btrfs_remove_free_space(block_group, start, num_bytes);
2657 out:
2658         btrfs_put_block_group(block_group);
2659         return ret;
2660 }
2661 
2662 int btrfs_exclude_logged_extents(struct extent_buffer *eb)
2663 {
2664         struct btrfs_fs_info *fs_info = eb->fs_info;
2665         struct btrfs_file_extent_item *item;
2666         struct btrfs_key key;
2667         int found_type;
2668         int i;
2669         int ret = 0;
2670 
2671         if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
2672                 return 0;
2673 
2674         for (i = 0; i < btrfs_header_nritems(eb); i++) {
2675                 btrfs_item_key_to_cpu(eb, &key, i);
2676                 if (key.type != BTRFS_EXTENT_DATA_KEY)
2677                         continue;
2678                 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
2679                 found_type = btrfs_file_extent_type(eb, item);
2680                 if (found_type == BTRFS_FILE_EXTENT_INLINE)
2681                         continue;
2682                 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
2683                         continue;
2684                 key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
2685                 key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
2686                 ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
2687                 if (ret)
2688                         break;
2689         }
2690 
2691         return ret;
2692 }
2693 
2694 static void
2695 btrfs_inc_block_group_reservations(struct btrfs_block_group *bg)
2696 {
2697         atomic_inc(&bg->reservations);
2698 }
2699 
2700 /*
2701  * Returns the free cluster for the given space info and sets empty_cluster to
2702  * what it should be based on the mount options.
2703  */
2704 static struct btrfs_free_cluster *
2705 fetch_cluster_info(struct btrfs_fs_info *fs_info,
2706                    struct btrfs_space_info *space_info, u64 *empty_cluster)
2707 {
2708         struct btrfs_free_cluster *ret = NULL;
2709 
2710         *empty_cluster = 0;
2711         if (btrfs_mixed_space_info(space_info))
2712                 return ret;
2713 
2714         if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
2715                 ret = &fs_info->meta_alloc_cluster;
2716                 if (btrfs_test_opt(fs_info, SSD))
2717                         *empty_cluster = SZ_2M;
2718                 else
2719                         *empty_cluster = SZ_64K;
2720         } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
2721                    btrfs_test_opt(fs_info, SSD_SPREAD)) {
2722                 *empty_cluster = SZ_2M;
2723                 ret = &fs_info->data_alloc_cluster;
2724         }
2725 
2726         return ret;
2727 }
2728 
2729 static int unpin_extent_range(struct btrfs_fs_info *fs_info,
2730                               u64 start, u64 end,
2731                               const bool return_free_space)
2732 {
2733         struct btrfs_block_group *cache = NULL;
2734         struct btrfs_space_info *space_info;
2735         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
2736         struct btrfs_free_cluster *cluster = NULL;
2737         u64 len;
2738         u64 total_unpinned = 0;
2739         u64 empty_cluster = 0;
2740         bool readonly;
2741         int ret = 0;
2742 
2743         while (start <= end) {
2744                 readonly = false;
2745                 if (!cache ||
2746                     start >= cache->start + cache->length) {
2747                         if (cache)
2748                                 btrfs_put_block_group(cache);
2749                         total_unpinned = 0;
2750                         cache = btrfs_lookup_block_group(fs_info, start);
2751                         if (cache == NULL) {
2752                                 /* Logic error, something removed the block group. */
2753                                 ret = -EUCLEAN;
2754                                 goto out;
2755                         }
2756 
2757                         cluster = fetch_cluster_info(fs_info,
2758                                                      cache->space_info,
2759                                                      &empty_cluster);
2760                         empty_cluster <<= 1;
2761                 }
2762 
2763                 len = cache->start + cache->length - start;
2764                 len = min(len, end + 1 - start);
2765 
2766                 if (return_free_space)
2767                         btrfs_add_free_space(cache, start, len);
2768 
2769                 start += len;
2770                 total_unpinned += len;
2771                 space_info = cache->space_info;
2772 
2773                 /*
2774                  * If this space cluster has been marked as fragmented and we've
2775                  * unpinned enough in this block group to potentially allow a
2776                  * cluster to be created inside of it, go ahead and clear the
2777                  * fragmented flag.
2778                  */
2779                 if (cluster && cluster->fragmented &&
2780                     total_unpinned > empty_cluster) {
2781                         spin_lock(&cluster->lock);
2782                         cluster->fragmented = 0;
2783                         spin_unlock(&cluster->lock);
2784                 }
2785 
2786                 spin_lock(&space_info->lock);
2787                 spin_lock(&cache->lock);
2788                 cache->pinned -= len;
2789                 btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
2790                 space_info->max_extent_size = 0;
2791                 if (cache->ro) {
2792                         space_info->bytes_readonly += len;
2793                         readonly = true;
2794                 } else if (btrfs_is_zoned(fs_info)) {
2795                         /* Need reset before reusing in a zoned block group */
2796                         btrfs_space_info_update_bytes_zone_unusable(fs_info, space_info,
2797                                                                     len);
2798                         readonly = true;
2799                 }
2800                 spin_unlock(&cache->lock);
2801                 if (!readonly && return_free_space &&
2802                     global_rsv->space_info == space_info) {
2803                         spin_lock(&global_rsv->lock);
2804                         if (!global_rsv->full) {
2805                                 u64 to_add = min(len, global_rsv->size -
2806                                                       global_rsv->reserved);
2807 
2808                                 global_rsv->reserved += to_add;
2809                                 btrfs_space_info_update_bytes_may_use(fs_info,
2810                                                 space_info, to_add);
2811                                 if (global_rsv->reserved >= global_rsv->size)
2812                                         global_rsv->full = 1;
2813                                 len -= to_add;
2814                         }
2815                         spin_unlock(&global_rsv->lock);
2816                 }
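                     /*
                      * Illustrative numbers (hypothetical, not from any real
                      * filesystem) for the refill above: with len = 1M,
                      * global_rsv->size = 512M and global_rsv->reserved = 511.75M,
                      * to_add = min(1M, 256K) = 256K, the reservation becomes
                      * full, and the remaining 768K of len is offered to waiting
                      * tickets below.
                      */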
2817                 /* Add to any tickets we may have */
2818                 if (!readonly && return_free_space && len)
2819                         btrfs_try_granting_tickets(fs_info, space_info);
2820                 spin_unlock(&space_info->lock);
2821         }
2822 
2823         if (cache)
2824                 btrfs_put_block_group(cache);
2825 out:
2826         return ret;
2827 }
2828 
2829 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
2830 {
2831         struct btrfs_fs_info *fs_info = trans->fs_info;
2832         struct btrfs_block_group *block_group, *tmp;
2833         struct list_head *deleted_bgs;
2834         struct extent_io_tree *unpin;
2835         u64 start;
2836         u64 end;
2837         int ret;
2838 
2839         unpin = &trans->transaction->pinned_extents;
2840 
2841         while (!TRANS_ABORTED(trans)) {
2842                 struct extent_state *cached_state = NULL;
2843 
2844                 mutex_lock(&fs_info->unused_bg_unpin_mutex);
2845                 if (!find_first_extent_bit(unpin, 0, &start, &end,
2846                                            EXTENT_DIRTY, &cached_state)) {
2847                         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
2848                         break;
2849                 }
2850 
2851                 if (btrfs_test_opt(fs_info, DISCARD_SYNC))
2852                         ret = btrfs_discard_extent(fs_info, start,
2853                                                    end + 1 - start, NULL);
2854 
2855                 clear_extent_dirty(unpin, start, end, &cached_state);
2856                 ret = unpin_extent_range(fs_info, start, end, true);
2857                 BUG_ON(ret);
2858                 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
2859                 free_extent_state(cached_state);
2860                 cond_resched();
2861         }
2862 
2863         if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
2864                 btrfs_discard_calc_delay(&fs_info->discard_ctl);
2865                 btrfs_discard_schedule_work(&fs_info->discard_ctl, true);
2866         }
2867 
2868         /*
2869          * Transaction is finished.  We don't need the lock anymore.  We
2870          * do need to clean up the block groups in case of a transaction
2871          * abort.
2872          */
2873         deleted_bgs = &trans->transaction->deleted_bgs;
2874         list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
2875                 u64 trimmed = 0;
2876 
2877                 ret = -EROFS;
2878                 if (!TRANS_ABORTED(trans))
2879                         ret = btrfs_discard_extent(fs_info,
2880                                                    block_group->start,
2881                                                    block_group->length,
2882                                                    &trimmed);
2883 
2884                 list_del_init(&block_group->bg_list);
2885                 btrfs_unfreeze_block_group(block_group);
2886                 btrfs_put_block_group(block_group);
2887 
2888                 if (ret) {
2889                         const char *errstr = btrfs_decode_error(ret);
2890                         btrfs_warn(fs_info,
2891                            "discard failed while removing blockgroup: errno=%d %s",
2892                                    ret, errstr);
2893                 }
2894         }
2895 
2896         return 0;
2897 }
2898 
2899 /*
2900  * Parse an extent item's inline extents looking for a simple quotas owner ref.
2901  *
2902  * @fs_info:    the btrfs_fs_info for this mount
2903  * @leaf:       a leaf in the extent tree containing the extent item
2904  * @slot:       the slot in the leaf where the extent item is found
2905  *
2906  * Returns the objectid of the root that originally allocated the extent item
2907  * if the inline owner ref is expected and present, otherwise 0.
2908  *
2909  * If an extent item has an owner ref item, it will be the first inline ref
2910  * item. Therefore the logic is to check whether there are any inline ref
2911  * items, then check the type of the first one.
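      *
      * For illustration only (a hypothetical layout, not dumped from a real
      * filesystem), an extent item written under simple quotas might look like:
      *
      *   [ btrfs_extent_item ][ inline BTRFS_EXTENT_OWNER_REF_KEY ref (root id) ][ other inline refs ... ]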
2912  */
2913 u64 btrfs_get_extent_owner_root(struct btrfs_fs_info *fs_info,
2914                                 struct extent_buffer *leaf, int slot)
2915 {
2916         struct btrfs_extent_item *ei;
2917         struct btrfs_extent_inline_ref *iref;
2918         struct btrfs_extent_owner_ref *oref;
2919         unsigned long ptr;
2920         unsigned long end;
2921         int type;
2922 
2923         if (!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA))
2924                 return 0;
2925 
2926         ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
2927         ptr = (unsigned long)(ei + 1);
2928         end = (unsigned long)ei + btrfs_item_size(leaf, slot);
2929 
2930         /* No inline ref items of any kind, can't check type. */
2931         if (ptr == end)
2932                 return 0;
2933 
2934         iref = (struct btrfs_extent_inline_ref *)ptr;
2935         type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
2936 
2937         /* We found an owner ref, get the root out of it. */
2938         if (type == BTRFS_EXTENT_OWNER_REF_KEY) {
2939                 oref = (struct btrfs_extent_owner_ref *)(&iref->offset);
2940                 return btrfs_extent_owner_ref_root_id(leaf, oref);
2941         }
2942 
2943         /* We have inline refs, but not an owner ref. */
2944         return 0;
2945 }
2946 
2947 static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
2948                                      u64 bytenr, struct btrfs_squota_delta *delta)
2949 {
2950         int ret;
2951         u64 num_bytes = delta->num_bytes;
2952 
2953         if (delta->is_data) {
2954                 struct btrfs_root *csum_root;
2955 
2956                 csum_root = btrfs_csum_root(trans->fs_info, bytenr);
2957                 ret = btrfs_del_csums(trans, csum_root, bytenr, num_bytes);
2958                 if (ret) {
2959                         btrfs_abort_transaction(trans, ret);
2960                         return ret;
2961                 }
2962 
2963                 ret = btrfs_delete_raid_extent(trans, bytenr, num_bytes);
2964                 if (ret) {
2965                         btrfs_abort_transaction(trans, ret);
2966                         return ret;
2967                 }
2968         }
2969 
2970         ret = btrfs_record_squota_delta(trans->fs_info, delta);
2971         if (ret) {
2972                 btrfs_abort_transaction(trans, ret);
2973                 return ret;
2974         }
2975 
2976         ret = add_to_free_space_tree(trans, bytenr, num_bytes);
2977         if (ret) {
2978                 btrfs_abort_transaction(trans, ret);
2979                 return ret;
2980         }
2981 
2982         ret = btrfs_update_block_group(trans, bytenr, num_bytes, false);
2983         if (ret)
2984                 btrfs_abort_transaction(trans, ret);
2985 
2986         return ret;
2987 }
2988 
2989 #define abort_and_dump(trans, path, fmt, args...)       \
2990 ({                                                      \
2991         btrfs_abort_transaction(trans, -EUCLEAN);       \
2992         btrfs_print_leaf(path->nodes[0]);               \
2993         btrfs_crit(trans->fs_info, fmt, ##args);        \
2994 })
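     /*
      * Note: abort_and_dump() only aborts the transaction, dumps the leaf and
      * logs the message; its callers below still set ret (typically -EUCLEAN)
      * and bail out themselves.
      */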
2995 
2996 /*
2997  * Drop one or more refs of @node.
2998  *
2999  * 1. Locate the extent refs.
3000  *    It's either inline in EXTENT/METADATA_ITEM or in keyed SHARED_* item.
3001  *    Locate it, then reduce the refs number or remove the ref line completely.
3002  *
3003  * 2. Update the refs count in EXTENT/METADATA_ITEM
3004  *
3005  * Inline backref case:
3006  *
3007  * in extent tree we have:
3008  *
3009  *      item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
3010  *              refs 2 gen 6 flags DATA
3011  *              extent data backref root FS_TREE objectid 258 offset 0 count 1
3012  *              extent data backref root FS_TREE objectid 257 offset 0 count 1
3013  *
3014  * This function gets called with:
3015  *
3016  *    node->bytenr = 13631488
3017  *    node->num_bytes = 1048576
3018  *    root_objectid = FS_TREE
3019  *    owner_objectid = 257
3020  *    owner_offset = 0
3021  *    refs_to_drop = 1
3022  *
3023  * Then we should get something like:
3024  *
3025  *      item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
3026  *              refs 1 gen 6 flags DATA
3027  *              extent data backref root FS_TREE objectid 258 offset 0 count 1
3028  *
3029  * Keyed backref case:
3030  *
3031  * in extent tree we have:
3032  *
3033  *      item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
3034  *              refs 754 gen 6 flags DATA
3035  *      [...]
3036  *      item 2 key (13631488 EXTENT_DATA_REF <HASH>) itemoff 3915 itemsize 28
3037  *              extent data backref root FS_TREE objectid 866 offset 0 count 1
3038  *
3039  * This function gets called with:
3040  *
3041  *    node->bytenr = 13631488
3042  *    node->num_bytes = 1048576
3043  *    root_objectid = FS_TREE
3044  *    owner_objectid = 866
3045  *    owner_offset = 0
3046  *    refs_to_drop = 1
3047  *
3048  * Then we should get something like:
3049  *
3050  *      item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
3051  *              refs 753 gen 6 flags DATA
3052  *
3053  * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed.
3054  */
3055 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3056                                struct btrfs_delayed_ref_head *href,
3057                                struct btrfs_delayed_ref_node *node,
3058                                struct btrfs_delayed_extent_op *extent_op)
3059 {
3060         struct btrfs_fs_info *info = trans->fs_info;
3061         struct btrfs_key key;
3062         struct btrfs_path *path;
3063         struct btrfs_root *extent_root;
3064         struct extent_buffer *leaf;
3065         struct btrfs_extent_item *ei;
3066         struct btrfs_extent_inline_ref *iref;
3067         int ret;
3068         int is_data;
3069         int extent_slot = 0;
3070         int found_extent = 0;
3071         int num_to_del = 1;
3072         int refs_to_drop = node->ref_mod;
3073         u32 item_size;
3074         u64 refs;
3075         u64 bytenr = node->bytenr;
3076         u64 num_bytes = node->num_bytes;
3077         u64 owner_objectid = btrfs_delayed_ref_owner(node);
3078         u64 owner_offset = btrfs_delayed_ref_offset(node);
3079         bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
3080         u64 delayed_ref_root = href->owning_root;
3081 
3082         extent_root = btrfs_extent_root(info, bytenr);
3083         ASSERT(extent_root);
3084 
3085         path = btrfs_alloc_path();
3086         if (!path)
3087                 return -ENOMEM;
3088 
3089         is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
3090 
3091         if (!is_data && refs_to_drop != 1) {
3092                 btrfs_crit(info,
3093 "invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u",
3094                            node->bytenr, refs_to_drop);
3095                 ret = -EINVAL;
3096                 btrfs_abort_transaction(trans, ret);
3097                 goto out;
3098         }
3099 
3100         if (is_data)
3101                 skinny_metadata = false;
3102 
3103         ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes,
3104                                     node->parent, node->ref_root, owner_objectid,
3105                                     owner_offset);
3106         if (ret == 0) {
3107                 /*
3108                  * Either the inline backref or the SHARED_DATA_REF/
3109                  * SHARED_BLOCK_REF is found
3110                  *
3111                  * Here is a quick path to locate EXTENT/METADATA_ITEM.
3112                  * It's possible the EXTENT/METADATA_ITEM is near the current slot.
3113                  */
3114                 extent_slot = path->slots[0];
3115                 while (extent_slot >= 0) {
3116                         btrfs_item_key_to_cpu(path->nodes[0], &key,
3117                                               extent_slot);
3118                         if (key.objectid != bytenr)
3119                                 break;
3120                         if (key.type == BTRFS_EXTENT_ITEM_KEY &&
3121                             key.offset == num_bytes) {
3122                                 found_extent = 1;
3123                                 break;
3124                         }
3125                         if (key.type == BTRFS_METADATA_ITEM_KEY &&
3126                             key.offset == owner_objectid) {
3127                                 found_extent = 1;
3128                                 break;
3129                         }
3130 
3131                         /* Quick path didn't find the EXTENT/METADATA_ITEM */
3132                         if (path->slots[0] - extent_slot > 5)
3133                                 break;
3134                         extent_slot--;
3135                 }
3136 
3137                 if (!found_extent) {
3138                         if (iref) {
3139                                 abort_and_dump(trans, path,
3140 "invalid iref slot %u, no EXTENT/METADATA_ITEM found but has inline extent ref",
3141                                            path->slots[0]);
3142                                 ret = -EUCLEAN;
3143                                 goto out;
3144                         }
3145                         /* Must be SHARED_* item, remove the backref first */
3146                         ret = remove_extent_backref(trans, extent_root, path,
3147                                                     NULL, refs_to_drop, is_data);
3148                         if (ret) {
3149                                 btrfs_abort_transaction(trans, ret);
3150                                 goto out;
3151                         }
3152                         btrfs_release_path(path);
3153 
3154                         /* Slow path to locate EXTENT/METADATA_ITEM */
3155                         key.objectid = bytenr;
3156                         key.type = BTRFS_EXTENT_ITEM_KEY;
3157                         key.offset = num_bytes;
3158 
3159                         if (!is_data && skinny_metadata) {
3160                                 key.type = BTRFS_METADATA_ITEM_KEY;
3161                                 key.offset = owner_objectid;
3162                         }
3163 
3164                         ret = btrfs_search_slot(trans, extent_root,
3165                                                 &key, path, -1, 1);
3166                         if (ret > 0 && skinny_metadata && path->slots[0]) {
3167                                 /*
3168                                  * Couldn't find our skinny metadata item,
3169                                  * see if we have ye olde extent item.
3170                                  */
3171                                 path->slots[0]--;
3172                                 btrfs_item_key_to_cpu(path->nodes[0], &key,
3173                                                       path->slots[0]);
3174                                 if (key.objectid == bytenr &&
3175                                     key.type == BTRFS_EXTENT_ITEM_KEY &&
3176                                     key.offset == num_bytes)
3177                                         ret = 0;
3178                         }
3179 
3180                         if (ret > 0 && skinny_metadata) {
3181                                 skinny_metadata = false;
3182                                 key.objectid = bytenr;
3183                                 key.type = BTRFS_EXTENT_ITEM_KEY;
3184                                 key.offset = num_bytes;
3185                                 btrfs_release_path(path);
3186                                 ret = btrfs_search_slot(trans, extent_root,
3187                                                         &key, path, -1, 1);
3188                         }
3189 
3190                         if (ret) {
3191                                 if (ret > 0)
3192                                         btrfs_print_leaf(path->nodes[0]);
3193                                 btrfs_err(info,
3194                         "umm, got %d back from search, was looking for %llu, slot %d",
3195                                           ret, bytenr, path->slots[0]);
3196                         }
3197                         if (ret < 0) {
3198                                 btrfs_abort_transaction(trans, ret);
3199                                 goto out;
3200                         }
3201                         extent_slot = path->slots[0];
3202                 }
3203         } else if (WARN_ON(ret == -ENOENT)) {
3204                 abort_and_dump(trans, path,
3205 "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu slot %d",
3206                                bytenr, node->parent, node->ref_root, owner_objectid,
3207                                owner_offset, path->slots[0]);
3208                 goto out;
3209         } else {
3210                 btrfs_abort_transaction(trans, ret);
3211                 goto out;
3212         }
3213 
3214         leaf = path->nodes[0];
3215         item_size = btrfs_item_size(leaf, extent_slot);
3216         if (unlikely(item_size < sizeof(*ei))) {
3217                 ret = -EUCLEAN;
3218                 btrfs_err(trans->fs_info,
3219                           "unexpected extent item size, has %u expect >= %zu",
3220                           item_size, sizeof(*ei));
3221                 btrfs_abort_transaction(trans, ret);
3222                 goto out;
3223         }
3224         ei = btrfs_item_ptr(leaf, extent_slot,
3225                             struct btrfs_extent_item);
3226         if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
3227             key.type == BTRFS_EXTENT_ITEM_KEY) {
3228                 struct btrfs_tree_block_info *bi;
3229 
3230                 if (item_size < sizeof(*ei) + sizeof(*bi)) {
3231                         abort_and_dump(trans, path,
3232 "invalid extent item size for key (%llu, %u, %llu) slot %u owner %llu, has %u expect >= %zu",
3233                                        key.objectid, key.type, key.offset,
3234                                        path->slots[0], owner_objectid, item_size,
3235                                        sizeof(*ei) + sizeof(*bi));
3236                         ret = -EUCLEAN;
3237                         goto out;
3238                 }
3239                 bi = (struct btrfs_tree_block_info *)(ei + 1);
3240                 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
3241         }
3242 
3243         refs = btrfs_extent_refs(leaf, ei);
3244         if (refs < refs_to_drop) {
3245                 abort_and_dump(trans, path,
3246                 "trying to drop %d refs but we only have %llu for bytenr %llu slot %u",
3247                                refs_to_drop, refs, bytenr, path->slots[0]);
3248                 ret = -EUCLEAN;
3249                 goto out;
3250         }
3251         refs -= refs_to_drop;
3252 
3253         if (refs > 0) {
3254                 if (extent_op)
3255                         __run_delayed_extent_op(extent_op, leaf, ei);
3256                 /*
3257                  * In the case of inline back ref, reference count will
3258                  * be updated by remove_extent_backref
3259                  */
3260                 if (iref) {
3261                         if (!found_extent) {
3262                                 abort_and_dump(trans, path,
3263 "invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found, slot %u",
3264                                                path->slots[0]);
3265                                 ret = -EUCLEAN;
3266                                 goto out;
3267                         }
3268                 } else {
3269                         btrfs_set_extent_refs(leaf, ei, refs);
3270                         btrfs_mark_buffer_dirty(trans, leaf);
3271                 }
3272                 if (found_extent) {
3273                         ret = remove_extent_backref(trans, extent_root, path,
3274                                                     iref, refs_to_drop, is_data);
3275                         if (ret) {
3276                                 btrfs_abort_transaction(trans, ret);
3277                                 goto out;
3278                         }
3279                 }
3280         } else {
3281                 struct btrfs_squota_delta delta = {
3282                         .root = delayed_ref_root,
3283                         .num_bytes = num_bytes,
3284                         .is_data = is_data,
3285                         .is_inc = false,
3286                         .generation = btrfs_extent_generation(leaf, ei),
3287                 };
3288 
3289                 /* In this branch refs == 1 */
3290                 if (found_extent) {
3291                         if (is_data && refs_to_drop !=
3292                             extent_data_ref_count(path, iref)) {
3293                                 abort_and_dump(trans, path,
3294                 "invalid refs_to_drop, current refs %u refs_to_drop %u slot %u",
3295                                                extent_data_ref_count(path, iref),
3296                                                refs_to_drop, path->slots[0]);
3297                                 ret = -EUCLEAN;
3298                                 goto out;
3299                         }
3300                         if (iref) {
3301                                 if (path->slots[0] != extent_slot) {
3302                                         abort_and_dump(trans, path,
3303 "invalid iref, extent item key (%llu %u %llu) slot %u doesn't have wanted iref",
3304                                                        key.objectid, key.type,
3305                                                        key.offset, path->slots[0]);
3306                                         ret = -EUCLEAN;
3307                                         goto out;
3308                                 }
3309                         } else {
3310                                 /*
3311                                  * No inline ref, we must be at a SHARED_* item,
3312                                  * and since it's a single ref, the layout must be:
3313                                  * |    extent_slot       ||extent_slot + 1|
3314                                  * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ]
3315                                  */
3316                                 if (path->slots[0] != extent_slot + 1) {
3317                                         abort_and_dump(trans, path,
3318         "invalid SHARED_* item slot %u, previous item is not EXTENT/METADATA_ITEM",
3319                                                        path->slots[0]);
3320                                         ret = -EUCLEAN;
3321                                         goto out;
3322                                 }
3323                                 path->slots[0] = extent_slot;
3324                                 num_to_del = 2;
3325                         }
3326                 }
3327                 /*
3328                  * We can't infer the data owner from the delayed ref, so we need
3329                  * to try to get it from the owning ref item.
3330                  *
3331                  * If it is not present, then that extent was not written under
3332                  * simple quotas mode, so we don't need to account for its deletion.
3333                  */
3334                 if (is_data)
3335                         delta.root = btrfs_get_extent_owner_root(trans->fs_info,
3336                                                                  leaf, extent_slot);
3337 
3338                 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
3339                                       num_to_del);
3340                 if (ret) {
3341                         btrfs_abort_transaction(trans, ret);
3342                         goto out;
3343                 }
3344                 btrfs_release_path(path);
3345 
3346                 ret = do_free_extent_accounting(trans, bytenr, &delta);
3347         }
3348         btrfs_release_path(path);
3349 
3350 out:
3351         btrfs_free_path(path);
3352         return ret;
3353 }
3354 
3355 /*
3356  * When we free a block, it is possible (and likely) that we free the last
3357  * delayed ref for that extent as well.  This searches the delayed ref tree for
3358  * a given extent, and if there are no other delayed refs to be processed, it
3359  * removes it from the tree.
3360  */
3361 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
3362                                       u64 bytenr)
3363 {
3364         struct btrfs_delayed_ref_head *head;
3365         struct btrfs_delayed_ref_root *delayed_refs;
3366         int ret = 0;
3367 
3368         delayed_refs = &trans->transaction->delayed_refs;
3369         spin_lock(&delayed_refs->lock);
3370         head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
3371         if (!head)
3372                 goto out_delayed_unlock;
3373 
3374         spin_lock(&head->lock);
3375         if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
3376                 goto out;
3377 
3378         if (cleanup_extent_op(head) != NULL)
3379                 goto out;
3380 
3381         /*
3382          * Waiting for the lock here would deadlock.  If someone else has it
3383          * locked, they are already in the process of dropping it anyway.
3384          */
3385         if (!mutex_trylock(&head->mutex))
3386                 goto out;
3387 
3388         btrfs_delete_ref_head(delayed_refs, head);
3389         head->processing = false;
3390 
3391         spin_unlock(&head->lock);
3392         spin_unlock(&delayed_refs->lock);
3393 
3394         BUG_ON(head->extent_op);
3395         if (head->must_insert_reserved)
3396                 ret = 1;
3397 
3398         btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
3399         mutex_unlock(&head->mutex);
3400         btrfs_put_delayed_ref_head(head);
3401         return ret;
3402 out:
3403         spin_unlock(&head->lock);
3404 
3405 out_delayed_unlock:
3406         spin_unlock(&delayed_refs->lock);
3407         return 0;
3408 }
3409 
3410 int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
3411                           u64 root_id,
3412                           struct extent_buffer *buf,
3413                           u64 parent, int last_ref)
3414 {
3415         struct btrfs_fs_info *fs_info = trans->fs_info;
3416         struct btrfs_block_group *bg;
3417         int ret;
3418 
3419         if (root_id != BTRFS_TREE_LOG_OBJECTID) {
3420                 struct btrfs_ref generic_ref = {
3421                         .action = BTRFS_DROP_DELAYED_REF,
3422                         .bytenr = buf->start,
3423                         .num_bytes = buf->len,
3424                         .parent = parent,
3425                         .owning_root = btrfs_header_owner(buf),
3426                         .ref_root = root_id,
3427                 };
3428 
3429                 /*
3430                  * Assert that the extent buffer is not cleared due to
3431                  * EXTENT_BUFFER_ZONED_ZEROOUT. Please refer to
3432                  * btrfs_clear_buffer_dirty() and btree_csum_one_bio() for
3433                  * details.
3434                  */
3435                 ASSERT(btrfs_header_bytenr(buf) != 0);
3436 
3437                 btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), 0, false);
3438                 btrfs_ref_tree_mod(fs_info, &generic_ref);
3439                 ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL);
3440                 if (ret < 0)
3441                         return ret;
3442         }
3443 
3444         if (!last_ref)
3445                 return 0;
3446 
3447         if (btrfs_header_generation(buf) != trans->transid)
3448                 goto out;
3449 
3450         if (root_id != BTRFS_TREE_LOG_OBJECTID) {
3451                 ret = check_ref_cleanup(trans, buf->start);
3452                 if (!ret)
3453                         goto out;
3454         }
3455 
3456         bg = btrfs_lookup_block_group(fs_info, buf->start);
3457 
3458         if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
3459                 pin_down_extent(trans, bg, buf->start, buf->len, 1);
3460                 btrfs_put_block_group(bg);
3461                 goto out;
3462         }
3463 
3464         /*
3465          * If there are tree mod log users we may have recorded mod log
3466          * operations for this node.  If we re-allocate this node we
3467          * could replay operations on this node that happened when it
3468          * existed in a completely different root.  For example if it
3469          * was part of root A, then was reallocated to root B, and we
3470          * are doing a btrfs_old_search_slot(root b), we could replay
3471          * operations that happened when the block was part of root A,
3472          * giving us an inconsistent view of the btree.
3473          *
3474          * We are safe from races here because at this point no other
3475          * node or root points to this extent buffer, so if after this
3476          * check a new tree mod log user joins we will not have an
3477          * existing log of operations on this node that we have to
3478          * contend with.
3479          */
3480 
3481         if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)
3482                      || btrfs_is_zoned(fs_info)) {
3483                 pin_down_extent(trans, bg, buf->start, buf->len, 1);
3484                 btrfs_put_block_group(bg);
3485                 goto out;
3486         }
3487 
3488         WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
3489 
3490         btrfs_add_free_space(bg, buf->start, buf->len);
3491         btrfs_free_reserved_bytes(bg, buf->len, 0);
3492         btrfs_put_block_group(bg);
3493         trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
3494 
3495 out:
3496 
3497         /*
3498          * Deleting the buffer, clear the corrupt flag since it doesn't
3499          * matter anymore.
3500          */
3501         clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
3502         return 0;
3503 }
3504 
3505 /* Can return -ENOMEM */
3506 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
3507 {
3508         struct btrfs_fs_info *fs_info = trans->fs_info;
3509         int ret;
3510 
3511         if (btrfs_is_testing(fs_info))
3512                 return 0;
3513 
3514         /*
3515          * tree log blocks never actually go into the extent allocation
3516          * tree, just update pinning info and exit early.
3517          */
3518         if (ref->ref_root == BTRFS_TREE_LOG_OBJECTID) {
3519                 btrfs_pin_extent(trans, ref->bytenr, ref->num_bytes, 1);
3520                 ret = 0;
3521         } else if (ref->type == BTRFS_REF_METADATA) {
3522                 ret = btrfs_add_delayed_tree_ref(trans, ref, NULL);
3523         } else {
3524                 ret = btrfs_add_delayed_data_ref(trans, ref, 0);
3525         }
3526 
3527         if (ref->ref_root != BTRFS_TREE_LOG_OBJECTID)
3528                 btrfs_ref_tree_mod(fs_info, ref);
3529 
3530         return ret;
3531 }
3532 
3533 enum btrfs_loop_type {
3534         /*
3535          * Start caching block groups but do not wait for progress or for them
3536          * to be done.
3537          */
3538         LOOP_CACHING_NOWAIT,
3539 
3540         /*
3541          * Wait for the block group's free_space to be >= the space we're
3542          * waiting for if the block group isn't cached.
3543          */
3544         LOOP_CACHING_WAIT,
3545 
3546         /*
3547          * Allow allocations to happen from block groups that do not yet have a
3548          * size classification.
3549          */
3550         LOOP_UNSET_SIZE_CLASS,
3551 
3552         /*
3553          * Allocate a chunk and then retry the allocation.
3554          */
3555         LOOP_ALLOC_CHUNK,
3556 
3557         /*
3558          * Ignore the size class restrictions for this allocation.
3559          */
3560         LOOP_WRONG_SIZE_CLASS,
3561 
3562         /*
3563          * Ignore the empty size, only try to allocate the number of bytes
3564          * needed for this allocation.
3565          */
3566         LOOP_NO_EMPTY_SIZE,
3567 };
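     /*
      * A rough sketch of how these stages are used (inferred from the enum
      * ordering above): the search starts at LOOP_CACHING_NOWAIT and, each time
      * a full pass over the candidate block groups fails, advances to the next
      * stage, relaxing one constraint at a time until LOOP_NO_EMPTY_SIZE gives
      * up on the empty-size padding entirely.
      */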
3568 
3569 static inline void
3570 btrfs_lock_block_group(struct btrfs_block_group *cache,
3571                        int delalloc)
3572 {
3573         if (delalloc)
3574                 down_read(&cache->data_rwsem);
3575 }
3576 
3577 static inline void btrfs_grab_block_group(struct btrfs_block_group *cache,
3578                        int delalloc)
3579 {
3580         btrfs_get_block_group(cache);
3581         if (delalloc)
3582                 down_read(&cache->data_rwsem);
3583 }
3584 
3585 static struct btrfs_block_group *btrfs_lock_cluster(
3586                    struct btrfs_block_group *block_group,
3587                    struct btrfs_free_cluster *cluster,
3588                    int delalloc)
3589         __acquires(&cluster->refill_lock)
3590 {
3591         struct btrfs_block_group *used_bg = NULL;
3592 
3593         spin_lock(&cluster->refill_lock);
3594         while (1) {
3595                 used_bg = cluster->block_group;
3596                 if (!used_bg)
3597                         return NULL;
3598 
3599                 if (used_bg == block_group)
3600                         return used_bg;
3601 
3602                 btrfs_get_block_group(used_bg);
3603 
3604                 if (!delalloc)
3605                         return used_bg;
3606 
3607                 if (down_read_trylock(&used_bg->data_rwsem))
3608                         return used_bg;
3609 
3610                 spin_unlock(&cluster->refill_lock);
3611 
3612                 /* We should only have one level of nesting. */
3613                 down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);
3614 
3615                 spin_lock(&cluster->refill_lock);
3616                 if (used_bg == cluster->block_group)
3617                         return used_bg;
3618 
3619                 up_read(&used_bg->data_rwsem);
3620                 btrfs_put_block_group(used_bg);
3621         }
3622 }
3623 
3624 static inline void
3625 btrfs_release_block_group(struct btrfs_block_group *cache,
3626                          int delalloc)
3627 {
3628         if (delalloc)
3629                 up_read(&cache->data_rwsem);
3630         btrfs_put_block_group(cache);
3631 }
3632 
3633 /*
3634  * Helper function for find_free_extent().
3635  *
3636  * Return -ENOENT to inform the caller that we need to fall back to unclustered mode.
3637  * Return >0 to inform the caller that we found nothing.
3638  * Return 0 means we have found a location and set ffe_ctl->found_offset.
3639  */
3640 static int find_free_extent_clustered(struct btrfs_block_group *bg,
3641                                       struct find_free_extent_ctl *ffe_ctl,
3642                                       struct btrfs_block_group **cluster_bg_ret)
3643 {
3644         struct btrfs_block_group *cluster_bg;
3645         struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
3646         u64 aligned_cluster;
3647         u64 offset;
3648         int ret;
3649 
3650         cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
3651         if (!cluster_bg)
3652                 goto refill_cluster;
3653         if (cluster_bg != bg && (cluster_bg->ro ||
3654             !block_group_bits(cluster_bg, ffe_ctl->flags)))
3655                 goto release_cluster;
3656 
3657         offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
3658                         ffe_ctl->num_bytes, cluster_bg->start,
3659                         &ffe_ctl->max_extent_size);
3660         if (offset) {
3661                 /* We have a block, we're done */
3662                 spin_unlock(&last_ptr->refill_lock);
3663                 trace_btrfs_reserve_extent_cluster(cluster_bg, ffe_ctl);
3664                 *cluster_bg_ret = cluster_bg;
3665                 ffe_ctl->found_offset = offset;
3666                 return 0;
3667         }
3668         WARN_ON(last_ptr->block_group != cluster_bg);
3669 
3670 release_cluster:
3671         /*
3672          * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new cluster, so
3673          * let's just skip it and let the allocator find whatever block it can
3674          * find. If we reach this point, we will have tried the cluster
3675          * allocator plenty of times and not have found anything, so we are
3676          * likely way too fragmented for the clustering stuff to find anything.
3677          *
3678          * However, if the cluster is taken from the current block group,
3679          * release the cluster first, so that we stand a better chance of
3680          * succeeding in the unclustered allocation.
3681          */
3682         if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
3683                 spin_unlock(&last_ptr->refill_lock);
3684                 btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
3685                 return -ENOENT;
3686         }
3687 
3688         /* This cluster didn't work out, free it and start over */
3689         btrfs_return_cluster_to_free_space(NULL, last_ptr);
3690 
3691         if (cluster_bg != bg)
3692                 btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
3693 
3694 refill_cluster:
3695         if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
3696                 spin_unlock(&last_ptr->refill_lock);
3697                 return -ENOENT;
3698         }
3699 
3700         aligned_cluster = max_t(u64,
3701                         ffe_ctl->empty_cluster + ffe_ctl->empty_size,
3702                         bg->full_stripe_len);
3703         ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start,
3704                         ffe_ctl->num_bytes, aligned_cluster);
3705         if (ret == 0) {
3706                 /* Now pull our allocation out of this cluster */
3707                 offset = btrfs_alloc_from_cluster(bg, last_ptr,
3708                                 ffe_ctl->num_bytes, ffe_ctl->search_start,
3709                                 &ffe_ctl->max_extent_size);
3710                 if (offset) {
3711                         /* We found one, proceed */
3712                         spin_unlock(&last_ptr->refill_lock);
3713                         ffe_ctl->found_offset = offset;
3714                         trace_btrfs_reserve_extent_cluster(bg, ffe_ctl);
3715                         return 0;
3716                 }
3717         }
3718         /*
3719          * At this point we either didn't find a cluster or we weren't able to
3720          * allocate a block from our cluster.  Free the cluster we've been
3721          * trying to use, and go to the next block group.
3722          */
3723         btrfs_return_cluster_to_free_space(NULL, last_ptr);
3724         spin_unlock(&last_ptr->refill_lock);
3725         return 1;
3726 }
3727 
3728 /*
3729  * Return >0 to inform the caller that we found nothing.
3730  * Return 0 when we found a free extent and set ffe_ctl->found_offset.
3731  */
3732 static int find_free_extent_unclustered(struct btrfs_block_group *bg,
3733                                         struct find_free_extent_ctl *ffe_ctl)
3734 {
3735         struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
3736         u64 offset;
3737 
3738         /*
3739          * We are doing an unclustered allocation, set the fragmented flag so
3740          * we don't bother trying to set up a cluster again until we get more
3741          * space.
3742          */
3743         if (unlikely(last_ptr)) {
3744                 spin_lock(&last_ptr->lock);
3745                 last_ptr->fragmented = 1;
3746                 spin_unlock(&last_ptr->lock);
3747         }
3748         if (ffe_ctl->cached) {
3749                 struct btrfs_free_space_ctl *free_space_ctl;
3750 
3751                 free_space_ctl = bg->free_space_ctl;
3752                 spin_lock(&free_space_ctl->tree_lock);
3753                 if (free_space_ctl->free_space <
3754                     ffe_ctl->num_bytes + ffe_ctl->empty_cluster +
3755                     ffe_ctl->empty_size) {
3756                         ffe_ctl->total_free_space = max_t(u64,
3757                                         ffe_ctl->total_free_space,
3758                                         free_space_ctl->free_space);
3759                         spin_unlock(&free_space_ctl->tree_lock);
3760                         return 1;
3761                 }
3762                 spin_unlock(&free_space_ctl->tree_lock);
3763         }
3764 
3765         offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
3766                         ffe_ctl->num_bytes, ffe_ctl->empty_size,
3767                         &ffe_ctl->max_extent_size);
3768         if (!offset)
3769                 return 1;
3770         ffe_ctl->found_offset = offset;
3771         return 0;
3772 }
3773 
3774 static int do_allocation_clustered(struct btrfs_block_group *block_group,
3775                                    struct find_free_extent_ctl *ffe_ctl,
3776                                    struct btrfs_block_group **bg_ret)
3777 {
3778         int ret;
3779 
3780         /* We want to try to use the cluster allocator, so let's look there */
3781         if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) {
3782                 ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret);
3783                 if (ret >= 0)
3784                         return ret;
3785                 /* ret == -ENOENT case falls through */
3786         }
3787 
3788         return find_free_extent_unclustered(block_group, ffe_ctl);
3789 }
3790 
3791 /*
3792  * Tree-log block group locking
3793  * ============================
3794  *
3795  * fs_info::treelog_bg_lock protects fs_info::treelog_bg, which holds the
3796  * starting address of the block group that is reserved only for tree-log
3797  * metadata.
3798  *
3799  * Lock nesting
3800  * ============
3801  *
3802  * space_info::lock
3803  *   block_group::lock
3804  *     fs_info::treelog_bg_lock
3805  */
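     /*
      * A minimal sketch of the nesting above, mirroring do_allocation_zoned()
      * below (fs_info::relocation_bg_lock nests innermost when it is taken as
      * well):
      *
      *   spin_lock(&space_info->lock);
      *   spin_lock(&block_group->lock);
      *   spin_lock(&fs_info->treelog_bg_lock);
      *   spin_lock(&fs_info->relocation_bg_lock);
      *   ...
      *   spin_unlock(&fs_info->relocation_bg_lock);
      *   spin_unlock(&fs_info->treelog_bg_lock);
      *   spin_unlock(&block_group->lock);
      *   spin_unlock(&space_info->lock);
      */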
3806 
3807 /*
3808  * Simple allocator for a sequential-only block group. It only allows sequential
3809  * allocation. No need to play with trees. This function also reserves the
3810  * bytes as in btrfs_add_reserved_bytes.
3811  */
3812 static int do_allocation_zoned(struct btrfs_block_group *block_group,
3813                                struct find_free_extent_ctl *ffe_ctl,
3814                                struct btrfs_block_group **bg_ret)
3815 {
3816         struct btrfs_fs_info *fs_info = block_group->fs_info;
3817         struct btrfs_space_info *space_info = block_group->space_info;
3818         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3819         u64 start = block_group->start;
3820         u64 num_bytes = ffe_ctl->num_bytes;
3821         u64 avail;
3822         u64 bytenr = block_group->start;
3823         u64 log_bytenr;
3824         u64 data_reloc_bytenr;
3825         int ret = 0;
3826         bool skip = false;
3827 
3828         ASSERT(btrfs_is_zoned(block_group->fs_info));
3829 
3830         /*
3831          * Do not allow non-tree-log blocks in the dedicated tree-log block
3832          * group, and vice versa.
3833          */
3834         spin_lock(&fs_info->treelog_bg_lock);
3835         log_bytenr = fs_info->treelog_bg;
3836         if (log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) ||
3837                            (!ffe_ctl->for_treelog && bytenr == log_bytenr)))
3838                 skip = true;
3839         spin_unlock(&fs_info->treelog_bg_lock);
3840         if (skip)
3841                 return 1;
3842 
3843         /*
3844          * Do not allow non-relocation blocks in the dedicated relocation block
3845          * group, and vice versa.
3846          */
3847         spin_lock(&fs_info->relocation_bg_lock);
3848         data_reloc_bytenr = fs_info->data_reloc_bg;
3849         if (data_reloc_bytenr &&
3850             ((ffe_ctl->for_data_reloc && bytenr != data_reloc_bytenr) ||
3851              (!ffe_ctl->for_data_reloc && bytenr == data_reloc_bytenr)))
3852                 skip = true;
3853         spin_unlock(&fs_info->relocation_bg_lock);
3854         if (skip)
3855                 return 1;
3856 
3857         /* Check RO and no space case before trying to activate it */
3858         spin_lock(&block_group->lock);
3859         if (block_group->ro || btrfs_zoned_bg_is_full(block_group)) {
3860                 ret = 1;
3861                 /*
3862                  * May need to clear fs_info->{treelog,data_reloc}_bg.
3863                  * Return the error after taking the locks.
3864                  */
3865         }
3866         spin_unlock(&block_group->lock);
3867 
3868         /* Metadata block group is activated at write time. */
3869         if (!ret && (block_group->flags & BTRFS_BLOCK_GROUP_DATA) &&
3870             !btrfs_zone_activate(block_group)) {
3871                 ret = 1;
3872                 /*
3873                  * May need to clear fs_info->{treelog,data_reloc}_bg.
3874                  * Return the error after taking the locks.
3875                  */
3876         }
3877 
3878         spin_lock(&space_info->lock);
3879         spin_lock(&block_group->lock);
3880         spin_lock(&fs_info->treelog_bg_lock);
3881         spin_lock(&fs_info->relocation_bg_lock);
3882 
3883         if (ret)
3884                 goto out;
3885 
3886         ASSERT(!ffe_ctl->for_treelog ||
3887                block_group->start == fs_info->treelog_bg ||
3888                fs_info->treelog_bg == 0);
3889         ASSERT(!ffe_ctl->for_data_reloc ||
3890                block_group->start == fs_info->data_reloc_bg ||
3891                fs_info->data_reloc_bg == 0);
3892 
3893         if (block_group->ro ||
3894             (!ffe_ctl->for_data_reloc &&
3895              test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags))) {
3896                 ret = 1;
3897                 goto out;
3898         }
3899 
3900          * Do not allow a block group that is currently in use to become the
3901          * dedicated tree-log block group.
3902          * block group.
3903          */
3904         if (ffe_ctl->for_treelog && !fs_info->treelog_bg &&
3905             (block_group->used || block_group->reserved)) {
3906                 ret = 1;
3907                 goto out;
3908         }
3909 
3910          * Do not allow a block group that is currently in use to become the
3911          * dedicated data relocation block group.
3912          * dedicated block group.
3913          */
3914         if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg &&
3915             (block_group->used || block_group->reserved)) {
3916                 ret = 1;
3917                 goto out;
3918         }
3919 
3920         WARN_ON_ONCE(block_group->alloc_offset > block_group->zone_capacity);
3921         avail = block_group->zone_capacity - block_group->alloc_offset;
3922         if (avail < num_bytes) {
3923                 if (ffe_ctl->max_extent_size < avail) {
3924                         /*
3925                          * With sequential allocator, free space is always
3926                          * contiguous
3927                          */
3928                         ffe_ctl->max_extent_size = avail;
3929                         ffe_ctl->total_free_space = avail;
3930                 }
3931                 ret = 1;
3932                 goto out;
3933         }
3934 
3935         if (ffe_ctl->for_treelog && !fs_info->treelog_bg)
3936                 fs_info->treelog_bg = block_group->start;
3937 
3938         if (ffe_ctl->for_data_reloc) {
3939                 if (!fs_info->data_reloc_bg)
3940                         fs_info->data_reloc_bg = block_group->start;
3941                 /*
3942                  * Do not allow allocations from this block group, unless it is
3943                  * for data relocation. Compared to increasing the ->ro, setting
3944                  * the ->zoned_data_reloc_ongoing flag still allows nocow
3945                  * writers to come in. See btrfs_inc_nocow_writers().
3946                  *
3947                  * We need to disable allocations here to avoid allocating a
3948                  * regular (non-relocation) data extent. With a mix of relocation
3949                  * extents and regular extents, we can dispatch WRITE commands
3950                  * (for relocation extents) and ZONE APPEND commands (for
3951                  * regular extents) at the same time to the same zone, which
3952                  * easily breaks the write pointer.
3953                  *
3954                  * Also, this flag prevents this block group from being zone finished.
3955                  */
3956                 set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags);
3957         }
3958 
3959         ffe_ctl->found_offset = start + block_group->alloc_offset;
3960         block_group->alloc_offset += num_bytes;
3961         spin_lock(&ctl->tree_lock);
3962         ctl->free_space -= num_bytes;
3963         spin_unlock(&ctl->tree_lock);
3964 
3965         /*
3966          * We do not check whether found_offset is aligned to the stripesize.
3967          * The address is rewritten anyway when using zone append writes.
3968          */
3969 
3970         ffe_ctl->search_start = ffe_ctl->found_offset;
3971 
3972 out:
3973         if (ret && ffe_ctl->for_treelog)
3974                 fs_info->treelog_bg = 0;
3975         if (ret && ffe_ctl->for_data_reloc)
3976                 fs_info->data_reloc_bg = 0;
3977         spin_unlock(&fs_info->relocation_bg_lock);
3978         spin_unlock(&fs_info->treelog_bg_lock);
3979         spin_unlock(&block_group->lock);
3980         spin_unlock(&space_info->lock);
3981         return ret;
3982 }
3983 
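/*
 * Dispatch to the allocation strategy selected for this filesystem:
 * clustered allocation on regular devices, sequential allocation on
 * zoned devices.
 */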
3984 static int do_allocation(struct btrfs_block_group *block_group,
3985                          struct find_free_extent_ctl *ffe_ctl,
3986                          struct btrfs_block_group **bg_ret)
3987 {
3988         switch (ffe_ctl->policy) {
3989         case BTRFS_EXTENT_ALLOC_CLUSTERED:
3990                 return do_allocation_clustered(block_group, ffe_ctl, bg_ret);
3991         case BTRFS_EXTENT_ALLOC_ZONED:
3992                 return do_allocation_zoned(block_group, ffe_ctl, bg_ret);
3993         default:
3994                 BUG();
3995         }
3996 }
3997 
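/*
 * Drop our reference on a block group we could not allocate from and reset
 * any per-policy retry state before moving on to the next candidate.
 */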
3998 static void release_block_group(struct btrfs_block_group *block_group,
3999                                 struct find_free_extent_ctl *ffe_ctl,
4000                                 int delalloc)
4001 {
4002         switch (ffe_ctl->policy) {
4003         case BTRFS_EXTENT_ALLOC_CLUSTERED:
4004                 ffe_ctl->retry_uncached = false;
4005                 break;
4006         case BTRFS_EXTENT_ALLOC_ZONED:
4007                 /* Nothing to do */
4008                 break;
4009         default:
4010                 BUG();
4011         }
4012 
4013         BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
4014                ffe_ctl->index);
4015         btrfs_release_block_group(block_group, delalloc);
4016 }
4017 
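/*
 * A clustered allocation succeeded. If the cluster itself was not used,
 * remember where we found the extent so the next search can start there.
 */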
4018 static void found_extent_clustered(struct find_free_extent_ctl *ffe_ctl,
4019                                    struct btrfs_key *ins)
4020 {
4021         struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
4022 
4023         if (!ffe_ctl->use_cluster && last_ptr) {
4024                 spin_lock(&last_ptr->lock);
4025                 last_ptr->window_start = ins->objectid;
4026                 spin_unlock(&last_ptr->lock);
4027         }
4028 }
4029 
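/* Per-policy bookkeeping once a suitable extent has been found. */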
4030 static void found_extent(struct find_free_extent_ctl *ffe_ctl,
4031                          struct btrfs_key *ins)
4032 {
4033         switch (ffe_ctl->policy) {
4034         case BTRFS_EXTENT_ALLOC_CLUSTERED:
4035                 found_extent_clustered(ffe_ctl, ins);
4036                 break;
4037         case BTRFS_EXTENT_ALLOC_ZONED:
4038                 /* Nothing to do */
4039                 break;
4040         default:
4041                 BUG();
4042         }
4043 }
4044 
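/*
 * Decide whether allocating a new chunk can help on a zoned filesystem,
 * where the number of simultaneously active zones is limited.
 */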
4045 static int can_allocate_chunk_zoned(struct btrfs_fs_info *fs_info,
4046                                     struct find_free_extent_ctl *ffe_ctl)
4047 {
4048         /* A block group does not need to be active for METADATA allocations. */
4049         if (!(ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA))
4050                 return 0;
4051 
4052         /* If we can activate new zone, just allocate a chunk and use it */
4053         if (btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags))
4054                 return 0;
4055 
4056         /*
4057          * We already reached the max active zones. Try to finish one block
4058          * group to make room for a new block group. This is only possible
4059          * for a data block group because btrfs_zone_finish() may need to wait
4060          * for a running transaction, which could deadlock for a metadata
4061          * allocation.
4062          */
4063         if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) {
4064                 int ret = btrfs_zone_finish_one_bg(fs_info);
4065 
4066                 if (ret == 1)
4067                         return 0;
4068                 else if (ret < 0)
4069                         return ret;
4070         }
4071 
4072         /*
4073          * If we have enough free space left in an already active block group
4074          * and we can't activate any other zone now, do not allow allocating a
4075          * new chunk and let find_free_extent() retry with a smaller size.
4076          */
4077         if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size)
4078                 return -ENOSPC;
4079 
4080         /*
4081          * Not even min_alloc_size is left in any block group. Since we cannot
4082          * activate a new block group, allocating one may not help. Tell the
4083          * caller to try again and hope it makes progress by writing out some
4084          * parts of the region. That is only possible for data block groups,
4085          * where a part of the region can be written.
4086          */
4087         if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA)
4088                 return -EAGAIN;
4089 
4090         /*
4091          * We cannot activate a new block group and there is not enough space
4092          * left in any block group. Allocating a new block group may not help,
4093          * but there is nothing else we can do, so let's go with it.
4094          */
4095         return 0;
4096 }
4097 
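/* Ask the current allocation policy whether a new chunk may be created. */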
4098 static int can_allocate_chunk(struct btrfs_fs_info *fs_info,
4099                               struct find_free_extent_ctl *ffe_ctl)
4100 {
4101         switch (ffe_ctl->policy) {
4102         case BTRFS_EXTENT_ALLOC_CLUSTERED:
4103                 return 0;
4104         case BTRFS_EXTENT_ALLOC_ZONED:
4105                 return can_allocate_chunk_zoned(fs_info, ffe_ctl);
4106         default:
4107                 BUG();
4108         }
4109 }
4110 
4111 /*
4112  * Return >0 means the caller needs to re-search for a free extent.
4113  * Return 0 means we have the needed free extent.
4114  * Return <0 means we failed to locate any free extent.
4115  */
4116 static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
4117                                         struct btrfs_key *ins,
4118                                         struct find_free_extent_ctl *ffe_ctl,
4119                                         bool full_search)
4120 {
4121         struct btrfs_root *root = fs_info->chunk_root;
4122         int ret;
4123 
4124         if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
4125             ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
4126                 ffe_ctl->orig_have_caching_bg = true;
4127 
4128         if (ins->objectid) {
4129                 found_extent(ffe_ctl, ins);
4130                 return 0;
4131         }
4132 
4133         if (ffe_ctl->loop >= LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg)
4134                 return 1;
4135 
4136         ffe_ctl->index++;
4137         if (ffe_ctl->index < BTRFS_NR_RAID_TYPES)
4138                 return 1;
4139 
4140         /* See the comments for btrfs_loop_type for an explanation of the phases. */
4141         if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
4142                 ffe_ctl->index = 0;
4143                 /*
4144                  * We want to skip the LOOP_CACHING_WAIT step if we don't have
4145                  * any uncached bgs and we've already done a full search
4146                  * through all of them.
4147                  */
4148                 if (ffe_ctl->loop == LOOP_CACHING_NOWAIT &&
4149                     (!ffe_ctl->orig_have_caching_bg && full_search))
4150                         ffe_ctl->loop++;
4151                 ffe_ctl->loop++;
4152 
4153                 if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
4154                         struct btrfs_trans_handle *trans;
4155                         int exist = 0;
4156 
4157                         /* Check if the allocation policy allows creating a new chunk */
4158                         ret = can_allocate_chunk(fs_info, ffe_ctl);
4159                         if (ret)
4160                                 return ret;
4161 
4162                         trans = current->journal_info;
4163                         if (trans)
4164                                 exist = 1;
4165                         else
4166                                 trans = btrfs_join_transaction(root);
4167 
4168                         if (IS_ERR(trans)) {
4169                                 ret = PTR_ERR(trans);
4170                                 return ret;
4171                         }
4172 
4173                         ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
4174                                                 CHUNK_ALLOC_FORCE_FOR_EXTENT);
4175 
4176                         /* Do not bail out on ENOSPC since we can do more. */
4177                         if (ret == -ENOSPC) {
4178                                 ret = 0;
4179                                 ffe_ctl->loop++;
4180                         } else if (ret < 0)
4182                                 btrfs_abort_transaction(trans, ret);
4183                         else
4184                                 ret = 0;
4185                         if (!exist)
4186                                 btrfs_end_transaction(trans);
4187                         if (ret)
4188                                 return ret;
4189                 }
4190 
4191                 if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
4192                         if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_CLUSTERED)
4193                                 return -ENOSPC;
4194 
4195                         /*
4196                          * Don't loop again if we already have no empty_size and
4197                          * no empty_cluster.
4198                          */
4199                         if (ffe_ctl->empty_size == 0 &&
4200                             ffe_ctl->empty_cluster == 0)
4201                                 return -ENOSPC;
4202                         ffe_ctl->empty_size = 0;
4203                         ffe_ctl->empty_cluster = 0;
4204                 }
4205                 return 1;
4206         }
4207         return -ENOSPC;
4208 }
4209 
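/*
 * Return true if this block group is acceptable for the allocation's size
 * class: either size classes do not apply (zoned policy, or a block group
 * that does not use them), the search loop has progressed far enough to
 * relax the constraint, or the size classes match.
 */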
4210 static bool find_free_extent_check_size_class(struct find_free_extent_ctl *ffe_ctl,
4211                                               struct btrfs_block_group *bg)
4212 {
4213         if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED)
4214                 return true;
4215         if (!btrfs_block_group_should_use_size_class(bg))
4216                 return true;
4217         if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS)
4218                 return true;
4219         if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS &&
4220             bg->size_class == BTRFS_BG_SZ_NONE)
4221                 return true;
4222         return ffe_ctl->size_class == bg->size_class;
4223 }
4224 
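/*
 * Set up clustered allocation: bail out early or disable clustering when
 * free space is too fragmented, and pick up the last cluster's position as
 * a search hint.
 */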
4225 static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
4226                                         struct find_free_extent_ctl *ffe_ctl,
4227                                         struct btrfs_space_info *space_info,
4228                                         struct btrfs_key *ins)
4229 {
4230         /*
4231          * If our free space is heavily fragmented we may not be able to make
4232          * big contiguous allocations, so instead of doing the expensive search
4233          * for free space, simply return ENOSPC with our max_extent_size so we
4234          * can go ahead and search for a more manageable chunk.
4235          *
4236          * If our max_extent_size is large enough for our allocation, simply
4237          * disable clustering, since we will likely not be able to find enough
4238          * space to create a cluster and would only induce latency trying.
4239          */
4240         if (space_info->max_extent_size) {
4241                 spin_lock(&space_info->lock);
4242                 if (space_info->max_extent_size &&
4243                     ffe_ctl->num_bytes > space_info->max_extent_size) {
4244                         ins->offset = space_info->max_extent_size;
4245                         spin_unlock(&space_info->lock);
4246                         return -ENOSPC;
4247                 } else if (space_info->max_extent_size) {
4248                         ffe_ctl->use_cluster = false;
4249                 }
4250                 spin_unlock(&space_info->lock);
4251         }
4252 
4253         ffe_ctl->last_ptr = fetch_cluster_info(fs_info, space_info,
4254                                                &ffe_ctl->empty_cluster);
4255         if (ffe_ctl->last_ptr) {
4256                 struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
4257 
4258                 spin_lock(&last_ptr->lock);
4259                 if (last_ptr->block_group)
4260                         ffe_ctl->hint_byte = last_ptr->window_start;
4261                 if (last_ptr->fragmented) {
4262                         /*
4263                          * We still set window_start so we can keep track of the
4264                          * last place we found an allocation to try and save
4265                          * some time.
4266                          */
4267                         ffe_ctl->hint_byte = last_ptr->window_start;
4268                         ffe_ctl->use_cluster = false;
4269                 }
4270                 spin_unlock(&last_ptr->lock);
4271         }
4272 
4273         return 0;
4274 }
4275 
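/*
 * Seed the search hint for zoned allocation: prefer the dedicated tree-log
 * or data relocation block group if one exists, otherwise an already active
 * data block group with enough room for the allocation.
 */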
4276 static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info,
4277                                     struct find_free_extent_ctl *ffe_ctl)
4278 {
4279         if (ffe_ctl->for_treelog) {
4280                 spin_lock(&fs_info->treelog_bg_lock);
4281                 if (fs_info->treelog_bg)
4282                         ffe_ctl->hint_byte = fs_info->treelog_bg;
4283                 spin_unlock(&fs_info->treelog_bg_lock);
4284         } else if (ffe_ctl->for_data_reloc) {
4285                 spin_lock(&fs_info->relocation_bg_lock);
4286                 if (fs_info->data_reloc_bg)
4287                         ffe_ctl->hint_byte = fs_info->data_reloc_bg;
4288                 spin_unlock(&fs_info->relocation_bg_lock);
4289         } else if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) {
4290                 struct btrfs_block_group *block_group;
4291 
4292                 spin_lock(&fs_info->zone_active_bgs_lock);
4293                 list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) {
4294                         /*
4295                          * No lock is OK here because avail is monotonically
4296                          * decreasing, and this is just a hint.
4297                          */
4298                         u64 avail = block_group->zone_capacity - block_group->alloc_offset;
4299 
4300                         if (block_group_bits(block_group, ffe_ctl->flags) &&
4301                             avail >= ffe_ctl->num_bytes) {
4302                                 ffe_ctl->hint_byte = block_group->start;
4303                                 break;
4304                         }
4305                 }
4306                 spin_unlock(&fs_info->zone_active_bgs_lock);
4307         }
4308 
4309         return 0;
4310 }
4311 
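/* Per-policy setup performed before the block group search starts. */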
4312 static int prepare_allocation(struct btrfs_fs_info *fs_info,
4313                               struct find_free_extent_ctl *ffe_ctl,
4314                               struct btrfs_space_info *space_info,
4315                               struct btrfs_key *ins)
4316 {
4317         switch (ffe_ctl->policy) {
4318         case BTRFS_EXTENT_ALLOC_CLUSTERED:
4319                 return prepare_allocation_clustered(fs_info, ffe_ctl,
4320                                                     space_info, ins);
4321         case BTRFS_EXTENT_ALLOC_ZONED:
4322                 return prepare_allocation_zoned(fs_info, ffe_ctl);
4323         default:
4324                 BUG();
4325         }
4326 }
4327 
4328 /*
4329  * Walks the btree of allocated extents and finds a hole of a given size.
4330  * The key ins is changed to record the hole:
4331  * ins->objectid == start position
4332  * ins->type == BTRFS_EXTENT_ITEM_KEY
4333  * ins->offset == the size of the hole.
4334  * Any available blocks before search_start are skipped.
4335  *
4336  * If there is no suitable free space, we record the maximum size of the
4337  * currently available free space extent.
4338  *
4339  * The overall logic and call chain:
4340  *
4341  * find_free_extent()
4342  * |- Iterate through all block groups
4343  * |  |- Get a valid block group
4344  * |  |- Try to do clustered allocation in that block group
4345  * |  |- Try to do unclustered allocation in that block group
4346  * |  |- Check if the result is valid
4347  * |  |  |- If valid, then exit
4348  * |  |- Jump to next block group
4349  * |
4350  * |- Push harder to find free extents
4351  *    |- If not found, re-iterate all block groups
4352  */
4353 static noinline int find_free_extent(struct btrfs_root *root,
4354                                      struct btrfs_key *ins,
4355                                      struct find_free_extent_ctl *ffe_ctl)
4356 {
4357         struct btrfs_fs_info *fs_info = root->fs_info;
4358         int ret = 0;
4359         int cache_block_group_error = 0;
4360         struct btrfs_block_group *block_group = NULL;
4361         struct btrfs_space_info *space_info;
4362         bool full_search = false;
4363 
4364         WARN_ON(ffe_ctl->num_bytes < fs_info->sectorsize);
4365 
4366         ffe_ctl->search_start = 0;
4367         /* For clustered allocation */
4368         ffe_ctl->empty_cluster = 0;
4369         ffe_ctl->last_ptr = NULL;
4370         ffe_ctl->use_cluster = true;
4371         ffe_ctl->have_caching_bg = false;
4372         ffe_ctl->orig_have_caching_bg = false;
4373         ffe_ctl->index = btrfs_bg_flags_to_raid_index(ffe_ctl->flags);
4374         ffe_ctl->loop = 0;
4375         ffe_ctl->retry_uncached = false;
4376         ffe_ctl->cached = 0;
4377         ffe_ctl->max_extent_size = 0;
4378         ffe_ctl->total_free_space = 0;
4379         ffe_ctl->found_offset = 0;
4380         ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED;
4381         ffe_ctl->size_class = btrfs_calc_block_group_size_class(ffe_ctl->num_bytes);
4382 
4383         if (btrfs_is_zoned(fs_info))
4384                 ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED;
4385 
4386         ins->type = BTRFS_EXTENT_ITEM_KEY;
4387         ins->objectid = 0;
4388         ins->offset = 0;
4389 
4390         trace_find_free_extent(root, ffe_ctl);
4391 
4392         space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags);
4393         if (!space_info) {
4394                 btrfs_err(fs_info, "No space info for %llu", ffe_ctl->flags);
4395                 return -ENOSPC;
4396         }
4397 
4398         ret = prepare_allocation(fs_info, ffe_ctl, space_info, ins);
4399         if (ret < 0)
4400                 return ret;
4401 
4402         ffe_ctl->search_start = max(ffe_ctl->search_start,
4403                                     first_logical_byte(fs_info));
4404         ffe_ctl->search_start = max(ffe_ctl->search_start, ffe_ctl->hint_byte);
4405         if (ffe_ctl->search_start == ffe_ctl->hint_byte) {
4406                 block_group = btrfs_lookup_block_group(fs_info,
4407                                                        ffe_ctl->search_start);
4408                 /*
4409                  * We don't want to use the block group if it doesn't match our
4410                  * allocation bits or if it's not cached.
4411                  *
4412                  * However, if we are re-searching with an ideal block group
4413                  * picked out, then we don't care whether the block group is cached.
4414                  */
4415                 if (block_group && block_group_bits(block_group, ffe_ctl->flags) &&
4416                     block_group->cached != BTRFS_CACHE_NO) {
4417                         down_read(&space_info->groups_sem);
4418                         if (list_empty(&block_group->list) ||
4419                             block_group->ro) {
4420                                 /*
4421                                  * Someone is removing this block group;
4422                                  * we can't jump to the have_block_group
4423                                  * label because our list pointers are
4424                                  * not valid.
4425                                  */
4426                                 btrfs_put_block_group(block_group);
4427                                 up_read(&space_info->groups_sem);
4428                         } else {
4429                                 ffe_ctl->index = btrfs_bg_flags_to_raid_index(
4430                                                         block_group->flags);
4431                                 btrfs_lock_block_group(block_group,
4432                                                        ffe_ctl->delalloc);
4433                                 ffe_ctl->hinted = true;
4434                                 goto have_block_group;
4435                         }
4436                 } else if (block_group) {
4437                         btrfs_put_block_group(block_group);
4438                 }
4439         }
4440 search:
4441         trace_find_free_extent_search_loop(root, ffe_ctl);
4442         ffe_ctl->have_caching_bg = false;
4443         if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) ||
4444             ffe_ctl->index == 0)
4445                 full_search = true;
4446         down_read(&space_info->groups_sem);
4447         list_for_each_entry(block_group,
4448                             &space_info->block_groups[ffe_ctl->index], list) {
4449                 struct btrfs_block_group *bg_ret;
4450 
4451                 ffe_ctl->hinted = false;
4452                 /* If the block group is read-only, we can skip it entirely. */
4453                 if (unlikely(block_group->ro)) {
4454                         if (ffe_ctl->for_treelog)
4455                                 btrfs_clear_treelog_bg(block_group);
4456                         if (ffe_ctl->for_data_reloc)
4457                                 btrfs_clear_data_reloc_bg(block_group);
4458                         continue;
4459                 }
4460 
4461                 btrfs_grab_block_group(block_group, ffe_ctl->delalloc);
4462                 ffe_ctl->search_start = block_group->start;
4463 
4464                 /*
4465                  * this can happen if we end up cycling through all the
4466                  * raid types, but we want to make sure we only allocate
4467                  * for the proper type.
4468                  */
4469                 if (!block_group_bits(block_group, ffe_ctl->flags)) {
4470                         u64 extra = BTRFS_BLOCK_GROUP_DUP |
4471                                 BTRFS_BLOCK_GROUP_RAID1_MASK |
4472                                 BTRFS_BLOCK_GROUP_RAID56_MASK |
4473                                 BTRFS_BLOCK_GROUP_RAID10;
4474 
4475                         /*
4476                          * if they asked for extra copies and this block group
4477                          * doesn't provide them, bail.  This does allow us to
4478                          * fill raid0 from raid1.
4479                          */
4480                         if ((ffe_ctl->flags & extra) && !(block_group->flags & extra))
4481                                 goto loop;
4482 
4483                         /*
4484                          * This block group has different flags than we want.
4485                          * It's possible that we have MIXED_GROUP flag but no
4486                          * block group is mixed.  Just skip such block group.
4487                          */
4488                         btrfs_release_block_group(block_group, ffe_ctl->delalloc);
4489                         continue;
4490                 }
4491 
4492 have_block_group:
4493                 trace_find_free_extent_have_block_group(root, ffe_ctl, block_group);
4494                 ffe_ctl->cached = btrfs_block_group_done(block_group);
4495                 if (unlikely(!ffe_ctl->cached)) {
4496                         ffe_ctl->have_caching_bg = true;
4497                         ret = btrfs_cache_block_group(block_group, false);
4498 
4499                         /*
4500                          * If we get ENOMEM here or something else, we want to
4501                          * try other block groups, because it may not be fatal.
4502                          * However, if we can't find anything else, we need to
4503                          * save our return value here so that we return the
4504                          * actual error that caused problems, not ENOSPC.
4505                          */
4506                         if (ret < 0) {
4507                                 if (!cache_block_group_error)
4508                                         cache_block_group_error = ret;
4509                                 ret = 0;
4510                                 goto loop;
4511                         }
4512                         ret = 0;
4513                 }
4514 
4515                 if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) {
4516                         if (!cache_block_group_error)
4517                                 cache_block_group_error = -EIO;
4518                         goto loop;
4519                 }
4520 
4521                 if (!find_free_extent_check_size_class(ffe_ctl, block_group))
4522                         goto loop;
4523 
4524                 bg_ret = NULL;
4525                 ret = do_allocation(block_group, ffe_ctl, &bg_ret);
4526                 if (ret > 0)
4527                         goto loop;
4528 
4529                 if (bg_ret && bg_ret != block_group) {
4530                         btrfs_release_block_group(block_group, ffe_ctl->delalloc);
4531                         block_group = bg_ret;
4532                 }
4533 
4534                 /* Checks */
4535                 ffe_ctl->search_start = round_up(ffe_ctl->found_offset,
4536                                                  fs_info->stripesize);
4537 
4538                 /* move on to the next group */
4539                 if (ffe_ctl->search_start + ffe_ctl->num_bytes >
4540                     block_group->start + block_group->length) {
4541                         btrfs_add_free_space_unused(block_group,
4542                                             ffe_ctl->found_offset,
4543                                             ffe_ctl->num_bytes);
4544                         goto loop;
4545                 }
4546 
4547                 if (ffe_ctl->found_offset < ffe_ctl->search_start)
4548                         btrfs_add_free_space_unused(block_group,
4549                                         ffe_ctl->found_offset,
4550                                         ffe_ctl->search_start - ffe_ctl->found_offset);
4551 
4552                 ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes,
4553                                                ffe_ctl->num_bytes,
4554                                                ffe_ctl->delalloc,
4555                                                ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS);
4556                 if (ret == -EAGAIN) {
4557                         btrfs_add_free_space_unused(block_group,
4558                                         ffe_ctl->found_offset,
4559                                         ffe_ctl->num_bytes);
4560                         goto loop;
4561                 }
4562                 btrfs_inc_block_group_reservations(block_group);
4563 
4564                 /* We are all good, let's return. */
4565                 ins->objectid = ffe_ctl->search_start;
4566                 ins->offset = ffe_ctl->num_bytes;
4567 
4568                 trace_btrfs_reserve_extent(block_group, ffe_ctl);
4569                 btrfs_release_block_group(block_group, ffe_ctl->delalloc);
4570                 break;
4571 loop:
4572                 if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
4573                     !ffe_ctl->retry_uncached) {
4574                         ffe_ctl->retry_uncached = true;
4575                         btrfs_wait_block_group_cache_progress(block_group,
4576                                                 ffe_ctl->num_bytes +
4577                                                 ffe_ctl->empty_cluster +
4578                                                 ffe_ctl->empty_size);
4579                         goto have_block_group;
4580                 }
4581                 release_block_group(block_group, ffe_ctl, ffe_ctl->delalloc);
4582                 cond_resched();
4583         }
4584         up_read(&space_info->groups_sem);
4585 
4586         ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, full_search);
4587         if (ret > 0)
4588                 goto search;
4589 
4590         if (ret == -ENOSPC && !cache_block_group_error) {
4591                 /*
4592                  * Use ffe_ctl->total_free_space as fallback if we can't find
4593                  * any contiguous hole.
4594                  */
4595                 if (!ffe_ctl->max_extent_size)
4596                         ffe_ctl->max_extent_size = ffe_ctl->total_free_space;
4597                 spin_lock(&space_info->lock);
4598                 space_info->max_extent_size = ffe_ctl->max_extent_size;
4599                 spin_unlock(&space_info->lock);
4600                 ins->offset = ffe_ctl->max_extent_size;
4601         } else if (ret == -ENOSPC) {
4602                 ret = cache_block_group_error;
4603         }
4604         return ret;
4605 }
4606 
4607 /*
4608  * Entry point to the extent allocator. Tries to find a hole that is at least
4609  * as big as @num_bytes.
4610  *
4611  * @root           -    The root that will contain this extent
4612  *
4613  * @ram_bytes      -    The amount of space in RAM that @num_bytes takes. This
4614  *                      is used for accounting purposes. This value differs
4615  *                      from @num_bytes only in the case of compressed extents.
4616  *
4617  * @num_bytes      -    Number of bytes to allocate on-disk.
4618  *
4619  * @min_alloc_size -    Indicates the minimum amount of space that the
4620  *                      allocator should try to satisfy. In some cases
4621  *                      @num_bytes may be larger than what is required and if
4622  *                      the filesystem is fragmented then allocation fails.
4623  *                      However, the presence of @min_alloc_size gives a
4624  *                      chance to try and satisfy the smaller allocation.
4625  *
4626  * @empty_size     -    A hint that you plan on doing more COW. This is the
4627  *                      size in bytes the allocator should try to find free
4628  *                      next to the block it returns.  This is just a hint and
4629  *                      may be ignored by the allocator.
4630  *
4631  * @hint_byte      -    Hint to the allocator to start searching above the byte
4632  *                      address passed. It might be ignored.
4633  *
4634  * @ins            -    This key is modified to record the found hole. It will
4635  *                      have the following values:
4636  *                      ins->objectid == start position
4637  *                      ins->type == BTRFS_EXTENT_ITEM_KEY
4638  *                      ins->offset == the size of the hole.
4639  *
4640  * @is_data        -    Boolean flag indicating whether an extent is
4641  *                      allocated for data (true) or metadata (false)
4642  *
4643  * @delalloc       -    Boolean flag indicating whether this allocation is for
4644  *                      delalloc or not. If 'true' data_rwsem of block groups
4645  *                      is going to be acquired.
4646  *
4648  * Returns 0 when an allocation succeeded or < 0 when an error occurred. In
4649  * case -ENOSPC is returned then @ins->offset will contain the size of the
4650  * largest available hole the allocator managed to find.
4651  */
4652 int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
4653                          u64 num_bytes, u64 min_alloc_size,
4654                          u64 empty_size, u64 hint_byte,
4655                          struct btrfs_key *ins, int is_data, int delalloc)
4656 {
4657         struct btrfs_fs_info *fs_info = root->fs_info;
4658         struct find_free_extent_ctl ffe_ctl = {};
4659         bool final_tried = num_bytes == min_alloc_size;
4660         u64 flags;
4661         int ret;
4662         bool for_treelog = (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID);
4663         bool for_data_reloc = (btrfs_is_data_reloc_root(root) && is_data);
4664 
4665         flags = get_alloc_profile_by_root(root, is_data);
4666 again:
4667         WARN_ON(num_bytes < fs_info->sectorsize);
4668 
4669         ffe_ctl.ram_bytes = ram_bytes;
4670         ffe_ctl.num_bytes = num_bytes;
4671         ffe_ctl.min_alloc_size = min_alloc_size;
4672         ffe_ctl.empty_size = empty_size;
4673         ffe_ctl.flags = flags;
4674         ffe_ctl.delalloc = delalloc;
4675         ffe_ctl.hint_byte = hint_byte;
4676         ffe_ctl.for_treelog = for_treelog;
4677         ffe_ctl.for_data_reloc = for_data_reloc;
4678 
4679         ret = find_free_extent(root, ins, &ffe_ctl);
4680         if (!ret && !is_data) {
4681                 btrfs_dec_block_group_reservations(fs_info, ins->objectid);
4682         } else if (ret == -ENOSPC) {
4683                 if (!final_tried && ins->offset) {
4684                         num_bytes = min(num_bytes >> 1, ins->offset);
4685                         num_bytes = round_down(num_bytes,
4686                                                fs_info->sectorsize);
4687                         num_bytes = max(num_bytes, min_alloc_size);
4688                         ram_bytes = num_bytes;
4689                         if (num_bytes == min_alloc_size)
4690                                 final_tried = true;
4691                         goto again;
4692                 } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
4693                         struct btrfs_space_info *sinfo;
4694 
4695                         sinfo = btrfs_find_space_info(fs_info, flags);
4696                         btrfs_err(fs_info,
4697         "allocation failed flags %llu, wanted %llu tree-log %d, relocation: %d",
4698                                   flags, num_bytes, for_treelog, for_data_reloc);
4699                         if (sinfo)
4700                                 btrfs_dump_space_info(fs_info, sinfo,
4701                                                       num_bytes, 1);
4702                 }
4703         }
4704 
4705         return ret;
4706 }
4707 
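/*
 * Release a previously reserved extent that will not be used after all:
 * return the range to the free space cache and drop the reservation from
 * the block group.
 */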
4708 int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
4709                                u64 start, u64 len, int delalloc)
4710 {
4711         struct btrfs_block_group *cache;
4712 
4713         cache = btrfs_lookup_block_group(fs_info, start);
4714         if (!cache) {
4715                 btrfs_err(fs_info, "Unable to find block group for %llu",
4716                           start);
4717                 return -ENOSPC;
4718         }
4719 
4720         btrfs_add_free_space(cache, start, len);
4721         btrfs_free_reserved_bytes(cache, len, delalloc);
4722         trace_btrfs_reserved_extent_free(fs_info, start, len);
4723 
4724         btrfs_put_block_group(cache);
4725         return 0;
4726 }
4727 
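/*
 * Pin the range covered by @eb so that the space is not reused before the
 * running transaction commits.
 */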
4728 int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans,
4729                               const struct extent_buffer *eb)
4730 {
4731         struct btrfs_block_group *cache;
4732         int ret = 0;
4733 
4734         cache = btrfs_lookup_block_group(trans->fs_info, eb->start);
4735         if (!cache) {
4736                 btrfs_err(trans->fs_info, "unable to find block group for %llu",
4737                           eb->start);
4738                 return -ENOSPC;
4739         }
4740 
4741         ret = pin_down_extent(trans, cache, eb->start, eb->len, 1);
4742         btrfs_put_block_group(cache);
4743         return ret;
4744 }
4745 
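/*
 * Common accounting for a newly inserted extent item: remove the range from
 * the free space tree and update the owning block group's usage counters.
 */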
4746 static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr,
4747                                  u64 num_bytes)
4748 {
4749         struct btrfs_fs_info *fs_info = trans->fs_info;
4750         int ret;
4751 
4752         ret = remove_from_free_space_tree(trans, bytenr, num_bytes);
4753         if (ret)
4754                 return ret;
4755 
4756         ret = btrfs_update_block_group(trans, bytenr, num_bytes, true);
4757         if (ret) {
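                /* This should not happen: assert in debug builds, otherwise log and bail. */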
4758                 ASSERT(!ret);
4759                 btrfs_err(fs_info, "update block group failed for %llu %llu",
4760                           bytenr, num_bytes);
4761                 return ret;
4762         }
4763 
4764         trace_btrfs_reserved_extent_alloc(fs_info, bytenr, num_bytes);
4765         return 0;
4766 }
4767 
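/*
 * Insert the extent item for a newly allocated data extent together with its
 * inline backreference (and an owner ref when simple quotas are enabled).
 */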
4768 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4769                                       u64 parent, u64 root_objectid,
4770                                       u64 flags, u64 owner, u64 offset,
4771                                       struct btrfs_key *ins, int ref_mod, u64 oref_root)
4772 {
4773         struct btrfs_fs_info *fs_info = trans->fs_info;
4774         struct btrfs_root *extent_root;
4775         int ret;
4776         struct btrfs_extent_item *extent_item;
4777         struct btrfs_extent_owner_ref *oref;
4778         struct btrfs_extent_inline_ref *iref;
4779         struct btrfs_path *path;
4780         struct extent_buffer *leaf;
4781         int type;
4782         u32 size;
4783         const bool simple_quota = (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE);
4784 
4785         if (parent > 0)
4786                 type = BTRFS_SHARED_DATA_REF_KEY;
4787         else
4788                 type = BTRFS_EXTENT_DATA_REF_KEY;
4789 
4790         size = sizeof(*extent_item);
4791         if (simple_quota)
4792                 size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY);
4793         size += btrfs_extent_inline_ref_size(type);
4794 
4795         path = btrfs_alloc_path();
4796         if (!path)
4797                 return -ENOMEM;
4798 
4799         extent_root = btrfs_extent_root(fs_info, ins->objectid);
4800         ret = btrfs_insert_empty_item(trans, extent_root, path, ins, size);
4801         if (ret) {
4802                 btrfs_free_path(path);
4803                 return ret;
4804         }
4805 
4806         leaf = path->nodes[0];
4807         extent_item = btrfs_item_ptr(leaf, path->slots[0],
4808                                      struct btrfs_extent_item);
4809         btrfs_set_extent_refs(leaf, extent_item, ref_mod);
4810         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
4811         btrfs_set_extent_flags(leaf, extent_item,
4812                                flags | BTRFS_EXTENT_FLAG_DATA);
4813 
4814         iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
4815         if (simple_quota) {
4816                 btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_EXTENT_OWNER_REF_KEY);
4817                 oref = (struct btrfs_extent_owner_ref *)(&iref->offset);
4818                 btrfs_set_extent_owner_ref_root_id(leaf, oref, oref_root);
4819                 iref = (struct btrfs_extent_inline_ref *)(oref + 1);
4820         }
4821         btrfs_set_extent_inline_ref_type(leaf, iref, type);
4822 
4823         if (parent > 0) {
4824                 struct btrfs_shared_data_ref *ref;
4825                 ref = (struct btrfs_shared_data_ref *)(iref + 1);
4826                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
4827                 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
4828         } else {
4829                 struct btrfs_extent_data_ref *ref;
4830                 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
4831                 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
4832                 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
4833                 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
4834                 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
4835         }
4836 
4837         btrfs_mark_buffer_dirty(trans, path->nodes[0]);
4838         btrfs_free_path(path);
4839 
4840         return alloc_reserved_extent(trans, ins->objectid, ins->offset);
4841 }
4842 
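/*
 * Insert the extent item (or the metadata item when the skinny-metadata
 * feature is enabled) for a newly allocated tree block, including its inline
 * backreference.
 */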
4843 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
4844                                      struct btrfs_delayed_ref_node *node,
4845                                      struct btrfs_delayed_extent_op *extent_op)
4846 {
4847         struct btrfs_fs_info *fs_info = trans->fs_info;
4848         struct btrfs_root *extent_root;
4849         int ret;
4850         struct btrfs_extent_item *extent_item;
4851         struct btrfs_key extent_key;
4852         struct btrfs_tree_block_info *block_info;
4853         struct btrfs_extent_inline_ref *iref;
4854         struct btrfs_path *path;
4855         struct extent_buffer *leaf;
4856         u32 size = sizeof(*extent_item) + sizeof(*iref);
4857         const u64 flags = (extent_op ? extent_op->flags_to_set : 0);
4858         /* The owner of a tree block is the level. */
4859         int level = btrfs_delayed_ref_owner(node);
4860         bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
4861 
4862         extent_key.objectid = node->bytenr;
4863         if (skinny_metadata) {
4864                 /* The owner of a tree block is the level. */
4865                 extent_key.offset = level;
4866                 extent_key.type = BTRFS_METADATA_ITEM_KEY;
4867         } else {
4868                 extent_key.offset = node->num_bytes;
4869                 extent_key.type = BTRFS_EXTENT_ITEM_KEY;
4870                 size += sizeof(*block_info);
4871         }
4872 
4873         path = btrfs_alloc_path();
4874         if (!path)
4875                 return -ENOMEM;
4876 
4877         extent_root = btrfs_extent_root(fs_info, extent_key.objectid);
4878         ret = btrfs_insert_empty_item(trans, extent_root, path, &extent_key,
4879                                       size);
4880         if (ret) {
4881                 btrfs_free_path(path);
4882                 return ret;
4883         }
4884 
4885         leaf = path->nodes[0];
4886         extent_item = btrfs_item_ptr(leaf, path->slots[0],
4887                                      struct btrfs_extent_item);
4888         btrfs_set_extent_refs(leaf, extent_item, 1);
4889         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
4890         btrfs_set_extent_flags(leaf, extent_item,
4891                                flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
4892 
4893         if (skinny_metadata) {
4894                 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
4895         } else {
4896                 block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
4897                 btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
4898                 btrfs_set_tree_block_level(leaf, block_info, level);
4899                 iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
4900         }
4901 
4902         if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
4903                 btrfs_set_extent_inline_ref_type(leaf, iref,
4904                                                  BTRFS_SHARED_BLOCK_REF_KEY);
4905                 btrfs_set_extent_inline_ref_offset(leaf, iref, node->parent);
4906         } else {
4907                 btrfs_set_extent_inline_ref_type(leaf, iref,
4908                                                  BTRFS_TREE_BLOCK_REF_KEY);
4909                 btrfs_set_extent_inline_ref_offset(leaf, iref, node->ref_root);
4910         }
4911 
4912         btrfs_mark_buffer_dirty(trans, leaf);
4913         btrfs_free_path(path);
4914 
4915         return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize);
4916 }
4917 
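/*
 * Queue a delayed ref that will insert the extent item for a newly allocated
 * file extent once the delayed refs are run.
 */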
4918 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4919                                      struct btrfs_root *root, u64 owner,
4920                                      u64 offset, u64 ram_bytes,
4921                                      struct btrfs_key *ins)
4922 {
4923         struct btrfs_ref generic_ref = {
4924                 .action = BTRFS_ADD_DELAYED_EXTENT,
4925                 .bytenr = ins->objectid,
4926                 .num_bytes = ins->offset,
4927                 .owning_root = btrfs_root_id(root),
4928                 .ref_root = btrfs_root_id(root),
4929         };
4930 
4931         ASSERT(generic_ref.ref_root != BTRFS_TREE_LOG_OBJECTID);
4932 
4933         if (btrfs_is_data_reloc_root(root) && is_fstree(root->relocation_src_root))
4934                 generic_ref.owning_root = root->relocation_src_root;
4935 
4936         btrfs_init_data_ref(&generic_ref, owner, offset, 0, false);
4937         btrfs_ref_tree_mod(root->fs_info, &generic_ref);
4938 
4939         return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes);
4940 }
4941 
4942 /*
4943  * This is used by the tree logging recovery code.  It records that
4944  * an extent has been allocated and makes sure to clear the free
4945  * space cache bits as well.
4946  */
4947 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
4948                                    u64 root_objectid, u64 owner, u64 offset,
4949                                    struct btrfs_key *ins)
4950 {
4951         struct btrfs_fs_info *fs_info = trans->fs_info;
4952         int ret;
4953         struct btrfs_block_group *block_group;
4954         struct btrfs_space_info *space_info;
4955         struct btrfs_squota_delta delta = {
4956                 .root = root_objectid,
4957                 .num_bytes = ins->offset,
4958                 .generation = trans->transid,
4959                 .is_data = true,
4960                 .is_inc = true,
4961         };
4962 
4963         /*
4964          * Mixed block groups are excluded before processing the log, so we
4965          * only need to do the exclude dance if this fs isn't mixed.
4966          */
4967         if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
4968                 ret = __exclude_logged_extent(fs_info, ins->objectid,
4969                                               ins->offset);
4970                 if (ret)
4971                         return ret;
4972         }
4973 
4974         block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
4975         if (!block_group)
4976                 return -EINVAL;
4977 
4978         space_info = block_group->space_info;
4979         spin_lock(&space_info->lock);
4980         spin_lock(&block_group->lock);
4981         space_info->bytes_reserved += ins->offset;
4982         block_group->reserved += ins->offset;
4983         spin_unlock(&block_group->lock);
4984         spin_unlock(&space_info->lock);
4985 
4986         ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
4987                                          offset, ins, 1, root_objectid);
4988         if (ret)
4989                 btrfs_pin_extent(trans, ins->objectid, ins->offset, 1);
4990         ret = btrfs_record_squota_delta(fs_info, &delta);
4991         btrfs_put_block_group(block_group);
4992         return ret;
4993 }
4994 
4995 #ifdef CONFIG_BTRFS_DEBUG
4996 /*
4997  * Extra safety check in case the extent tree is corrupted and the extent
4998  * allocator chooses to use a tree block which is already used and locked.
4999  */
5000 static bool check_eb_lock_owner(const struct extent_buffer *eb)
5001 {
5002         if (eb->lock_owner == current->pid) {
5003                 btrfs_err_rl(eb->fs_info,
5004 "tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
5005                              eb->start, btrfs_header_owner(eb), current->pid);
5006                 return true;
5007         }
5008         return false;
5009 }
5010 #else
5011 static bool check_eb_lock_owner(struct extent_buffer *eb)
5012 {
5013         return false;
5014 }
5015 #endif
5016 
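/*
 * Set up a freshly allocated tree block: take the tree lock with the proper
 * lockdep class, clear any stale state, initialize the header and record the
 * buffer in the transaction's (or log root's) dirty pages.
 */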
5017 static struct extent_buffer *
5018 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
5019                       u64 bytenr, int level, u64 owner,
5020                       enum btrfs_lock_nesting nest)
5021 {
5022         struct btrfs_fs_info *fs_info = root->fs_info;
5023         struct extent_buffer *buf;
5024         u64 lockdep_owner = owner;
5025 
5026         buf = btrfs_find_create_tree_block(fs_info, bytenr, owner, level);
5027         if (IS_ERR(buf))
5028                 return buf;
5029 
5030         if (check_eb_lock_owner(buf)) {
5031                 free_extent_buffer(buf);
5032                 return ERR_PTR(-EUCLEAN);
5033         }
5034 
5035         /*
5036          * The reloc trees are just snapshots, so we need them to appear to be
5037          * just like any other fs tree WRT lockdep.
5038          *
5039          * The exception however is in replace_path() in relocation, where we
5040          * hold the lock on the original fs root and then search for the reloc
5041          * root.  At that point we need to make sure any reloc root buffers are
5042          * set to the BTRFS_TREE_RELOC_OBJECTID lockdep class in order to make
5043          * lockdep happy.
5044          */
5045         if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID &&
5046             !test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state))
5047                 lockdep_owner = BTRFS_FS_TREE_OBJECTID;
5048 
5049         /* btrfs_clear_buffer_dirty() accesses generation field. */
5050         btrfs_set_header_generation(buf, trans->transid);
5051 
5052         /*
5053          * This needs to stay, because we could allocate a freed block from an
5054          * old tree into a new tree, so we need to make sure this new block is
5055          * set to the appropriate level and owner.
5056          */
5057         btrfs_set_buffer_lockdep_class(lockdep_owner, buf, level);
5058 
5059         btrfs_tree_lock_nested(buf, nest);
5060         btrfs_clear_buffer_dirty(trans, buf);
5061         clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
5062         clear_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &buf->bflags);
5063 
5064         set_extent_buffer_uptodate(buf);
5065 
5066         memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
5067         btrfs_set_header_level(buf, level);
5068         btrfs_set_header_bytenr(buf, buf->start);
5069         btrfs_set_header_generation(buf, trans->transid);
5070         btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
5071         btrfs_set_header_owner(buf, owner);
5072         write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
5073         write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
5074         if (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID) {
5075                 buf->log_index = root->log_transid % 2;
5076                 /*
5077                  * We allow two log transactions at a time, so use different
5078                  * EXTENT bits to differentiate dirty pages.
5079                  */
5080                 if (buf->log_index == 0)
5081                         set_extent_bit(&root->dirty_log_pages, buf->start,
5082                                        buf->start + buf->len - 1,
5083                                        EXTENT_DIRTY, NULL);
5084                 else
5085                         set_extent_bit(&root->dirty_log_pages, buf->start,
5086                                        buf->start + buf->len - 1,
5087                                        EXTENT_NEW, NULL);
5088         } else {
5089                 buf->log_index = -1;
5090                 set_extent_bit(&trans->transaction->dirty_pages, buf->start,
5091                                buf->start + buf->len - 1, EXTENT_DIRTY, NULL);
5092         }
5093         /* this returns a buffer locked for blocking */
5094         return buf;
5095 }
5096 
5097 /*
5098  * Finds a free extent and does all the dirty work required for allocation.
5099  * Returns the tree buffer or an ERR_PTR on error.
5100  */
5101 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
5102                                              struct btrfs_root *root,
5103                                              u64 parent, u64 root_objectid,
5104                                              const struct btrfs_disk_key *key,
5105                                              int level, u64 hint,
5106                                              u64 empty_size,
5107                                              u64 reloc_src_root,
5108                                              enum btrfs_lock_nesting nest)
5109 {
5110         struct btrfs_fs_info *fs_info = root->fs_info;
5111         struct btrfs_key ins;
5112         struct btrfs_block_rsv *block_rsv;
5113         struct extent_buffer *buf;
5114         u64 flags = 0;
5115         int ret;
5116         u32 blocksize = fs_info->nodesize;
5117         bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
5118         u64 owning_root;
5119 
5120 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
5121         if (btrfs_is_testing(fs_info)) {
5122                 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
5123                                             level, root_objectid, nest);
5124                 if (!IS_ERR(buf))
5125                         root->alloc_bytenr += blocksize;
5126                 return buf;
5127         }
5128 #endif
5129 
5130         block_rsv = btrfs_use_block_rsv(trans, root, blocksize);
5131         if (IS_ERR(block_rsv))
5132                 return ERR_CAST(block_rsv);
5133 
5134         ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
5135                                    empty_size, hint, &ins, 0, 0);
5136         if (ret)
5137                 goto out_unuse;
5138 
5139         buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
5140                                     root_objectid, nest);
5141         if (IS_ERR(buf)) {
5142                 ret = PTR_ERR(buf);
5143                 goto out_free_reserved;
5144         }
5145         owning_root = btrfs_header_owner(buf);
5146 
5147         if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
5148                 if (parent == 0)
5149                         parent = ins.objectid;
5150                 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
5151                 owning_root = reloc_src_root;
5152         } else
5153                 BUG_ON(parent > 0);
5154 
5155         if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
5156                 struct btrfs_delayed_extent_op *extent_op;
5157                 struct btrfs_ref generic_ref = {
5158                         .action = BTRFS_ADD_DELAYED_EXTENT,
5159                         .bytenr = ins.objectid,
5160                         .num_bytes = ins.offset,
5161                         .parent = parent,
5162                         .owning_root = owning_root,
5163                         .ref_root = root_objectid,
5164                 };
5165 
5166                 if (!skinny_metadata || flags != 0) {
5167                         extent_op = btrfs_alloc_delayed_extent_op();
5168                         if (!extent_op) {
5169                                 ret = -ENOMEM;
5170                                 goto out_free_buf;
5171                         }
5172                         if (key)
5173                                 memcpy(&extent_op->key, key, sizeof(extent_op->key));
5174                         else
5175                                 memset(&extent_op->key, 0, sizeof(extent_op->key));
5176                         extent_op->flags_to_set = flags;
5177                         extent_op->update_key = (skinny_metadata ? false : true);
5178                         extent_op->update_flags = (flags != 0);
5179                 } else {
5180                         extent_op = NULL;
5181                 }
5182 
5183                 btrfs_init_tree_ref(&generic_ref, level, btrfs_root_id(root), false);
5184                 btrfs_ref_tree_mod(fs_info, &generic_ref);
5185                 ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op);
5186                 if (ret) {
5187                         btrfs_free_delayed_extent_op(extent_op);
5188                         goto out_free_buf;
5189                 }
5190         }
5191         return buf;
5192 
5193 out_free_buf:
5194         btrfs_tree_unlock(buf);
5195         free_extent_buffer(buf);
5196 out_free_reserved:
5197         btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
5198 out_unuse:
5199         btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize);
5200         return ERR_PTR(ret);
5201 }
5202 
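     /*
      * State for the snapshot deletion walk.  refs[] and flags[] cache the
      * extent reference count and flags for each level of the current path,
      * drop_progress/drop_level record how far the deletion has gotten so it
      * can be resumed after a transaction restart, and update_progress tracks
      * the key below which blocks were already handled by a previous run and
      * can be skipped (see visit_node_for_delete()).
      */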
5203 struct walk_control {
5204         u64 refs[BTRFS_MAX_LEVEL];
5205         u64 flags[BTRFS_MAX_LEVEL];
5206         struct btrfs_key update_progress;
5207         struct btrfs_key drop_progress;
5208         int drop_level;
5209         int stage;
5210         int level;
5211         int shared_level;
5212         int update_ref;
5213         int keep_locks;
5214         int reada_slot;
5215         int reada_count;
5216         int restarted;
5217         /* Indicate that extent info needs to be looked up when walking the tree. */
5218         int lookup_info;
5219 };
5220 
5221 /*
5222  * This is our normal stage.  We are traversing blocks the current snapshot owns,
5223  * dropping our references to any children that we are able to, and
5224  * then freeing the block once we've processed all of the children.
5225  */
5226 #define DROP_REFERENCE  1
5227 
5228 /*
5229  * We enter this stage when we have to walk into a child block (meaning we can't
5230  * simply drop our reference to it from our current parent node) and there is
5231  * more than one reference on it.  If we are the owner of any of the children
5232  * blocks from the current parent node then we have to do the FULL_BACKREF dance
5233  * on them in order to drop our normal ref and add the shared ref.
5234  */
5235 #define UPDATE_BACKREF  2
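     /*
      * Roughly, the two stages interact as follows: we start in
      * DROP_REFERENCE; do_walk_down() switches us to UPDATE_BACKREF (recording
      * the level in wc->shared_level) when it finds a shared child block that
      * it must walk into, and walk_up_proc() switches back to DROP_REFERENCE
      * once we have walked back up to wc->shared_level.
      */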
5236 
5237 /*
5238  * Decide if we need to walk down into this node to adjust the references.
5239  *
5240  * @root:       the root we are currently deleting
5241  * @wc:         the walk control for this deletion
5242  * @eb:         the parent eb that we're currently visiting
5243  * @refs:       the number of refs for wc->level - 1
5244  * @flags:      the flags for wc->level - 1
5245  * @slot:       the slot in the eb that we're currently checking
5246  *
5247  * This is meant to be called when we're evaluating if a node we point to at
5248  * wc->level should be read and walked into, or if we can simply delete our
5249  * reference to it.  We return true if we should walk into the node, false if we
5250  * can skip it.
5251  *
5252  * We have assertions in here to make sure this is called correctly.  We assume
5253  * that sanity checking on the blocks read to this point has been done, so any
5254  * corrupted file systems must have been caught before calling this function.
5255  */
5256 static bool visit_node_for_delete(struct btrfs_root *root, struct walk_control *wc,
5257                                   struct extent_buffer *eb, u64 refs, u64 flags, int slot)
5258 {
5259         struct btrfs_key key;
5260         u64 generation;
5261         int level = wc->level;
5262 
5263         ASSERT(level > 0);
5264         ASSERT(wc->refs[level - 1] > 0);
5265 
5266         /*
5267          * In the UPDATE_BACKREF stage we only want to skip the node if we
5268          * already have FULL_BACKREF set, otherwise we need to read it.
5269          */
5270         if (wc->stage == UPDATE_BACKREF) {
5271                 if (level == 1 && flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5272                         return false;
5273                 return true;
5274         }
5275 
5276         /*
5277          * We're the last ref on this block, we must walk into it and process
5278          * any refs it's pointing at.
5279          */
5280         if (wc->refs[level - 1] == 1)
5281                 return true;
5282 
5283         /*
5284          * If we're already FULL_BACKREF then we know we can just drop our
5285          * current reference.
5286          */
5287         if (level == 1 && flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5288                 return false;
5289 
5290         /*
5291          * This block is older than our creation generation, we can drop our
5292          * reference to it.
5293          */
5294         generation = btrfs_node_ptr_generation(eb, slot);
5295         if (!wc->update_ref || generation <= root->root_key.offset)
5296                 return false;
5297 
5298         /*
5299          * This block was processed from a previous snapshot deletion run, we
5300          * can skip it.
5301          */
5302         btrfs_node_key_to_cpu(eb, &key, slot);
5303         if (btrfs_comp_cpu_keys(&key, &wc->update_progress) < 0)
5304                 return false;
5305 
5306         /* All other cases we need to wander into the node. */
5307         return true;
5308 }
5309 
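     /*
      * Read ahead the child blocks of path->nodes[wc->level] that we are
      * likely to walk into next.  The readahead window (wc->reada_count) is
      * shrunk to two thirds when the previous window wasn't fully consumed
      * and grown by half (capped at BTRFS_NODEPTRS_PER_BLOCK()) when it was,
      * so the speculative I/O adapts to how far each walk actually gets.
      * Errors are ignored here, readahead is purely an optimization.
      */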
5310 static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
5311                                      struct btrfs_root *root,
5312                                      struct walk_control *wc,
5313                                      struct btrfs_path *path)
5314 {
5315         struct btrfs_fs_info *fs_info = root->fs_info;
5316         u64 bytenr;
5317         u64 generation;
5318         u64 refs;
5319         u64 flags;
5320         u32 nritems;
5321         struct extent_buffer *eb;
5322         int ret;
5323         int slot;
5324         int nread = 0;
5325 
5326         if (path->slots[wc->level] < wc->reada_slot) {
5327                 wc->reada_count = wc->reada_count * 2 / 3;
5328                 wc->reada_count = max(wc->reada_count, 2);
5329         } else {
5330                 wc->reada_count = wc->reada_count * 3 / 2;
5331                 wc->reada_count = min_t(int, wc->reada_count,
5332                                         BTRFS_NODEPTRS_PER_BLOCK(fs_info));
5333         }
5334 
5335         eb = path->nodes[wc->level];
5336         nritems = btrfs_header_nritems(eb);
5337 
5338         for (slot = path->slots[wc->level]; slot < nritems; slot++) {
5339                 if (nread >= wc->reada_count)
5340                         break;
5341 
5342                 cond_resched();
5343                 bytenr = btrfs_node_blockptr(eb, slot);
5344                 generation = btrfs_node_ptr_generation(eb, slot);
5345 
5346                 if (slot == path->slots[wc->level])
5347                         goto reada;
5348 
5349                 if (wc->stage == UPDATE_BACKREF &&
5350                     generation <= root->root_key.offset)
5351                         continue;
5352 
5353                 /* We don't lock the tree block, it's OK to be racy here */
5354                 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
5355                                                wc->level - 1, 1, &refs,
5356                                                &flags, NULL);
5357                 /* We don't care about errors in readahead. */
5358                 if (ret < 0)
5359                         continue;
5360 
5361                 /*
5362                  * This could be racy; it's conceivable that we raced and ended
5363                  * up with a bogus refs count.  If that's the case just skip it;
5364                  * if we are actually corrupt we will notice when we look up
5365                  * everything again with our locks.
5366                  */
5367                 if (refs == 0)
5368                         continue;
5369 
5370                 /* If we don't need to visit this node don't reada. */
5371                 if (!visit_node_for_delete(root, wc, eb, refs, flags, slot))
5372                         continue;
5373 reada:
5374                 btrfs_readahead_node_child(eb, slot);
5375                 nread++;
5376         }
5377         wc->reada_slot = slot;
5378 }
5379 
5380 /*
5381  * helper to process tree block while walking down the tree.
5382  *
5383  * when wc->stage == UPDATE_BACKREF, this function updates
5384  * back refs for pointers in the block.
5385  *
5386  * NOTE: return value 1 means we should stop walking down.
5387  */
5388 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
5389                                    struct btrfs_root *root,
5390                                    struct btrfs_path *path,
5391                                    struct walk_control *wc)
5392 {
5393         struct btrfs_fs_info *fs_info = root->fs_info;
5394         int level = wc->level;
5395         struct extent_buffer *eb = path->nodes[level];
5396         u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
5397         int ret;
5398 
5399         if (wc->stage == UPDATE_BACKREF && btrfs_header_owner(eb) != btrfs_root_id(root))
5400                 return 1;
5401 
5402         /*
5403          * when the reference count of a tree block is 1, it won't increase
5404          * again. once the full backref flag is set, we never clear it.
5405          */
5406         if (wc->lookup_info &&
5407             ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
5408              (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
5409                 ASSERT(path->locks[level]);
5410                 ret = btrfs_lookup_extent_info(trans, fs_info,
5411                                                eb->start, level, 1,
5412                                                &wc->refs[level],
5413                                                &wc->flags[level],
5414                                                NULL);
5415                 if (ret)
5416                         return ret;
5417                 if (unlikely(wc->refs[level] == 0)) {
5418                         btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0",
5419                                   eb->start);
5420                         return -EUCLEAN;
5421                 }
5422         }
5423 
5424         if (wc->stage == DROP_REFERENCE) {
5425                 if (wc->refs[level] > 1)
5426                         return 1;
5427 
5428                 if (path->locks[level] && !wc->keep_locks) {
5429                         btrfs_tree_unlock_rw(eb, path->locks[level]);
5430                         path->locks[level] = 0;
5431                 }
5432                 return 0;
5433         }
5434 
5435         /* wc->stage == UPDATE_BACKREF */
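             /*
              * Convert this block to FULL_BACKREF: add refs keyed on the
              * parent block for everything it points to (btrfs_inc_ref() with
              * full_backref set), drop the refs keyed on our root
              * (btrfs_dec_ref() without it), and then set the flag on the
              * extent item so the conversion is only done once.
              */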
5436         if (!(wc->flags[level] & flag)) {
5437                 ASSERT(path->locks[level]);
5438                 ret = btrfs_inc_ref(trans, root, eb, 1);
5439                 if (ret) {
5440                         btrfs_abort_transaction(trans, ret);
5441                         return ret;
5442                 }
5443                 ret = btrfs_dec_ref(trans, root, eb, 0);
5444                 if (ret) {
5445                         btrfs_abort_transaction(trans, ret);
5446                         return ret;
5447                 }
5448                 ret = btrfs_set_disk_extent_flags(trans, eb, flag);
5449                 if (ret) {
5450                         btrfs_abort_transaction(trans, ret);
5451                         return ret;
5452                 }
5453                 wc->flags[level] |= flag;
5454         }
5455 
5456         /*
5457          * the block is shared by multiple trees, so it's not good to
5458          * keep the tree lock
5459          */
5460         if (path->locks[level] && level > 0) {
5461                 btrfs_tree_unlock_rw(eb, path->locks[level]);
5462                 path->locks[level] = 0;
5463         }
5464         return 0;
5465 }
5466 
5467 /*
5468  * This is used to verify a ref exists for this root to deal with a bug where we
5469  * would have a drop_progress key that hadn't been updated properly.
5470  */
5471 static int check_ref_exists(struct btrfs_trans_handle *trans,
5472                             struct btrfs_root *root, u64 bytenr, u64 parent,
5473                             int level)
5474 {
5475         struct btrfs_path *path;
5476         struct btrfs_extent_inline_ref *iref;
5477         int ret;
5478 
5479         path = btrfs_alloc_path();
5480         if (!path)
5481                 return -ENOMEM;
5482 
5483         ret = lookup_extent_backref(trans, path, &iref, bytenr,
5484                                     root->fs_info->nodesize, parent,
5485                                     btrfs_root_id(root), level, 0);
5486         btrfs_free_path(path);
5487         if (ret == -ENOENT)
5488                 return 0;
5489         if (ret < 0)
5490                 return ret;
5491         return 1;
5492 }
5493 
5494 /*
5495  * We may not have an uptodate block, so if we are going to walk down into this
5496  * block we need to drop the lock, read it off the disk, re-lock it and
5497  * return to continue dropping the snapshot.
5498  */
5499 static int check_next_block_uptodate(struct btrfs_trans_handle *trans,
5500                                      struct btrfs_root *root,
5501                                      struct btrfs_path *path,
5502                                      struct walk_control *wc,
5503                                      struct extent_buffer *next)
5504 {
5505         struct btrfs_tree_parent_check check = { 0 };
5506         u64 generation;
5507         int level = wc->level;
5508         int ret;
5509 
5510         btrfs_assert_tree_write_locked(next);
5511 
5512         generation = btrfs_node_ptr_generation(path->nodes[level], path->slots[level]);
5513 
5514         if (btrfs_buffer_uptodate(next, generation, 0))
5515                 return 0;
5516 
5517         check.level = level - 1;
5518         check.transid = generation;
5519         check.owner_root = btrfs_root_id(root);
5520         check.has_first_key = true;
5521         btrfs_node_key_to_cpu(path->nodes[level], &check.first_key, path->slots[level]);
5522 
5523         btrfs_tree_unlock(next);
5524         if (level == 1)
5525                 reada_walk_down(trans, root, wc, path);
5526         ret = btrfs_read_extent_buffer(next, &check);
5527         if (ret) {
5528                 free_extent_buffer(next);
5529                 return ret;
5530         }
5531         btrfs_tree_lock(next);
5532         wc->lookup_info = 1;
5533         return 0;
5534 }
5535 
5536 /*
5537  * If we determine that we don't have to visit wc->level - 1 then we need to
5538  * determine if we can drop our reference.
5539  *
5540  * If we are UPDATE_BACKREF then we will not, we need to update our backrefs.
5541  *
5542  * If we are DROP_REFERENCE this will figure out if we need to drop our current
5543  * reference, skipping it if we dropped it from a previous incomplete drop, or
5544  * dropping it if we still have a reference to it.
5545  */
5546 static int maybe_drop_reference(struct btrfs_trans_handle *trans, struct btrfs_root *root,
5547                                 struct btrfs_path *path, struct walk_control *wc,
5548                                 struct extent_buffer *next, u64 owner_root)
5549 {
5550         struct btrfs_ref ref = {
5551                 .action = BTRFS_DROP_DELAYED_REF,
5552                 .bytenr = next->start,
5553                 .num_bytes = root->fs_info->nodesize,
5554                 .owning_root = owner_root,
5555                 .ref_root = btrfs_root_id(root),
5556         };
5557         int level = wc->level;
5558         int ret;
5559 
5560         /* We are UPDATE_BACKREF, we're not dropping anything. */
5561         if (wc->stage == UPDATE_BACKREF)
5562                 return 0;
5563 
5564         if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
5565                 ref.parent = path->nodes[level]->start;
5566         } else {
5567                 ASSERT(btrfs_root_id(root) == btrfs_header_owner(path->nodes[level]));
5568                 if (btrfs_root_id(root) != btrfs_header_owner(path->nodes[level])) {
5569                         btrfs_err(root->fs_info, "mismatched block owner");
5570                         return -EIO;
5571                 }
5572         }
5573 
5574         /*
5575          * If we had a drop_progress we need to verify the refs are set as
5576          * expected.  If we find our ref then we know that from here on out
5577          * everything should be correct, and we can clear the
5578          * ->restarted flag.
5579          */
5580         if (wc->restarted) {
5581                 ret = check_ref_exists(trans, root, next->start, ref.parent,
5582                                        level - 1);
5583                 if (ret <= 0)
5584                         return ret;
5585                 ret = 0;
5586                 wc->restarted = 0;
5587         }
5588 
5589         /*
5590          * Reloc tree doesn't contribute to qgroup numbers, and we have already
5591          * accounted them at merge time (replace_path), thus we could skip
5592          * expensive subtree trace here.
5593          */
5594         if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID &&
5595             wc->refs[level - 1] > 1) {
5596                 u64 generation = btrfs_node_ptr_generation(path->nodes[level],
5597                                                            path->slots[level]);
5598 
5599                 ret = btrfs_qgroup_trace_subtree(trans, next, generation, level - 1);
5600                 if (ret) {
5601                         btrfs_err_rl(root->fs_info,
5602 "error %d accounting shared subtree, quota is out of sync, rescan required",
5603                                      ret);
5604                 }
5605         }
5606 
5607         /*
5608          * We need to update the next key in our walk control so we can update
5609          * the drop_progress key accordingly.  We don't care if find_next_key
5610          * doesn't find a key because that means we're at the end and are going
5611          * to clean up now.
5612          */
5613         wc->drop_level = level;
5614         find_next_key(path, level, &wc->drop_progress);
5615 
5616         btrfs_init_tree_ref(&ref, level - 1, 0, false);
5617         return btrfs_free_extent(trans, &ref);
5618 }
5619 
5620 /*
5621  * helper to process tree block pointer.
5622  *
5623  * when wc->stage == DROP_REFERENCE, this function checks the
5624  * reference count of the block pointed to. if the block
5625  * is shared and we need to update back refs for the subtree
5626  * rooted at the block, this function changes wc->stage to
5627  * UPDATE_BACKREF. if the block is shared and there is no
5628  * need to update back refs, this function drops the reference
5629  * to the block.
5630  *
5631  * NOTE: return value 1 means we should stop walking down.
5632  */
5633 static noinline int do_walk_down(struct btrfs_trans_handle *trans,
5634                                  struct btrfs_root *root,
5635                                  struct btrfs_path *path,
5636                                  struct walk_control *wc)
5637 {
5638         struct btrfs_fs_info *fs_info = root->fs_info;
5639         u64 bytenr;
5640         u64 generation;
5641         u64 owner_root = 0;
5642         struct extent_buffer *next;
5643         int level = wc->level;
5644         int ret = 0;
5645 
5646         generation = btrfs_node_ptr_generation(path->nodes[level],
5647                                                path->slots[level]);
5648         /*
5649          * if the lower level block was created before the snapshot
5650          * was created, we know there is no need to update back refs
5651          * for the subtree
5652          */
5653         if (wc->stage == UPDATE_BACKREF &&
5654             generation <= root->root_key.offset) {
5655                 wc->lookup_info = 1;
5656                 return 1;
5657         }
5658 
5659         bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
5660 
5661         next = btrfs_find_create_tree_block(fs_info, bytenr, btrfs_root_id(root),
5662                                             level - 1);
5663         if (IS_ERR(next))
5664                 return PTR_ERR(next);
5665 
5666         btrfs_tree_lock(next);
5667 
5668         ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
5669                                        &wc->refs[level - 1],
5670                                        &wc->flags[level - 1],
5671                                        &owner_root);
5672         if (ret < 0)
5673                 goto out_unlock;
5674 
5675         if (unlikely(wc->refs[level - 1] == 0)) {
5676                 btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0",
5677                           bytenr);
5678                 ret = -EUCLEAN;
5679                 goto out_unlock;
5680         }
5681         wc->lookup_info = 0;
5682 
5683         /* If we don't have to walk into this node skip it. */
5684         if (!visit_node_for_delete(root, wc, path->nodes[level],
5685                                    wc->refs[level - 1], wc->flags[level - 1],
5686                                    path->slots[level]))
5687                 goto skip;
5688 
5689         /*
5690          * We have to walk down into this node, and if we're currently at the
5691          * DROP_REFERENCE stage and this block is shared then we need to switch
5692          * to the UPDATE_BACKREF stage in order to convert to FULL_BACKREF.
5693          */
5694         if (wc->stage == DROP_REFERENCE && wc->refs[level - 1] > 1) {
5695                 wc->stage = UPDATE_BACKREF;
5696                 wc->shared_level = level - 1;
5697         }
5698 
5699         ret = check_next_block_uptodate(trans, root, path, wc, next);
5700         if (ret)
5701                 return ret;
5702 
5703         level--;
5704         ASSERT(level == btrfs_header_level(next));
5705         if (level != btrfs_header_level(next)) {
5706                 btrfs_err(root->fs_info, "mismatched level");
5707                 ret = -EIO;
5708                 goto out_unlock;
5709         }
5710         path->nodes[level] = next;
5711         path->slots[level] = 0;
5712         path->locks[level] = BTRFS_WRITE_LOCK;
5713         wc->level = level;
5714         if (wc->level == 1)
5715                 wc->reada_slot = 0;
5716         return 0;
5717 skip:
5718         ret = maybe_drop_reference(trans, root, path, wc, next, owner_root);
5719         if (ret)
5720                 goto out_unlock;
5721         wc->refs[level - 1] = 0;
5722         wc->flags[level - 1] = 0;
5723         wc->lookup_info = 1;
5724         ret = 1;
5725 
5726 out_unlock:
5727         btrfs_tree_unlock(next);
5728         free_extent_buffer(next);
5729 
5730         return ret;
5731 }
5732 
5733 /*
5734  * helper to process tree block while walking up the tree.
5735  *
5736  * when wc->stage == DROP_REFERENCE, this function drops
5737  * reference count on the block.
5738  *
5739  * when wc->stage == UPDATE_BACKREF, this function changes
5740  * wc->stage back to DROP_REFERENCE if we changed wc->stage
5741  * to UPDATE_BACKREF previously while processing the block.
5742  *
5743  * NOTE: return value 1 means we should stop walking up.
5744  */
5745 static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
5746                                  struct btrfs_root *root,
5747                                  struct btrfs_path *path,
5748                                  struct walk_control *wc)
5749 {
5750         struct btrfs_fs_info *fs_info = root->fs_info;
5751         int ret = 0;
5752         int level = wc->level;
5753         struct extent_buffer *eb = path->nodes[level];
5754         u64 parent = 0;
5755 
5756         if (wc->stage == UPDATE_BACKREF) {
5757                 ASSERT(wc->shared_level >= level);
5758                 if (level < wc->shared_level)
5759                         goto out;
5760 
5761                 ret = find_next_key(path, level + 1, &wc->update_progress);
5762                 if (ret > 0)
5763                         wc->update_ref = 0;
5764 
5765                 wc->stage = DROP_REFERENCE;
5766                 wc->shared_level = -1;
5767                 path->slots[level] = 0;
5768 
5769                 /*
5770                  * check reference count again if the block isn't locked.
5771                  * we should start walking down the tree again if reference
5772                  * count is one.
5773                  */
5774                 if (!path->locks[level]) {
5775                         ASSERT(level > 0);
5776                         btrfs_tree_lock(eb);
5777                         path->locks[level] = BTRFS_WRITE_LOCK;
5778 
5779                         ret = btrfs_lookup_extent_info(trans, fs_info,
5780                                                        eb->start, level, 1,
5781                                                        &wc->refs[level],
5782                                                        &wc->flags[level],
5783                                                        NULL);
5784                         if (ret < 0) {
5785                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
5786                                 path->locks[level] = 0;
5787                                 return ret;
5788                         }
5789                         if (unlikely(wc->refs[level] == 0)) {
5790                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
5791                                 btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0",
5792                                           eb->start);
5793                                 return -EUCLEAN;
5794                         }
5795                         if (wc->refs[level] == 1) {
5796                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
5797                                 path->locks[level] = 0;
5798                                 return 1;
5799                         }
5800                 }
5801         }
5802 
5803         /* wc->stage == DROP_REFERENCE */
5804         ASSERT(path->locks[level] || wc->refs[level] == 1);
5805 
5806         if (wc->refs[level] == 1) {
5807                 if (level == 0) {
5808                         if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5809                                 ret = btrfs_dec_ref(trans, root, eb, 1);
5810                         else
5811                                 ret = btrfs_dec_ref(trans, root, eb, 0);
5812                         if (ret) {
5813                                 btrfs_abort_transaction(trans, ret);
5814                                 return ret;
5815                         }
5816                         if (is_fstree(btrfs_root_id(root))) {
5817                                 ret = btrfs_qgroup_trace_leaf_items(trans, eb);
5818                                 if (ret) {
5819                                         btrfs_err_rl(fs_info,
5820         "error %d accounting leaf items, quota is out of sync, rescan required",
5821                                              ret);
5822                                 }
5823                         }
5824                 }
5825                 /* Make block locked assertion in btrfs_clear_buffer_dirty happy. */
5826                 if (!path->locks[level]) {
5827                         btrfs_tree_lock(eb);
5828                         path->locks[level] = BTRFS_WRITE_LOCK;
5829                 }
5830                 btrfs_clear_buffer_dirty(trans, eb);
5831         }
5832 
5833         if (eb == root->node) {
5834                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5835                         parent = eb->start;
5836                 else if (btrfs_root_id(root) != btrfs_header_owner(eb))
5837                         goto owner_mismatch;
5838         } else {
5839                 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5840                         parent = path->nodes[level + 1]->start;
5841                 else if (btrfs_root_id(root) !=
5842                          btrfs_header_owner(path->nodes[level + 1]))
5843                         goto owner_mismatch;
5844         }
5845 
5846         ret = btrfs_free_tree_block(trans, btrfs_root_id(root), eb, parent,
5847                                     wc->refs[level] == 1);
5848         if (ret < 0)
5849                 btrfs_abort_transaction(trans, ret);
5850 out:
5851         wc->refs[level] = 0;
5852         wc->flags[level] = 0;
5853         return ret;
5854 
5855 owner_mismatch:
5856         btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
5857                      btrfs_header_owner(eb), btrfs_root_id(root));
5858         return -EUCLEAN;
5859 }
5860 
5861 /*
5862  * walk_down_tree consists of two steps.
5863  *
5864  * walk_down_proc().  Look up the reference count and flags of our current
5865  * wc->level.  At this point path->nodes[wc->level] should be populated and
5866  * uptodate, and in most cases should already be locked.  If we are in
5867  * DROP_REFERENCE and our refcount is > 1 then we've entered a shared node and
5868  * we can walk back up the tree.  If we are UPDATE_BACKREF we have to set
5869  * FULL_BACKREF on this node if it's not already set, and then do the
5870  * FULL_BACKREF conversion dance, which is to drop the root reference and add
5871  * the shared reference to all of this nodes children.
5872  *
5873  * do_walk_down().  This is where we actually start iterating on the children of
5874  * our current path->nodes[wc->level].  For DROP_REFERENCE that means dropping
5875  * our reference to the children that return false from visit_node_for_delete(),
5876  * which has various conditions where we know we can just drop our reference
5877  * without visiting the node.  For UPDATE_BACKREF we will skip any children that
5878  * visit_node_for_delete() returns false for, only walking down when necessary.
5879  * The bulk of the work for UPDATE_BACKREF occurs in the walk_up_tree() part of
5880  * snapshot deletion.
5881  */
5882 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
5883                                    struct btrfs_root *root,
5884                                    struct btrfs_path *path,
5885                                    struct walk_control *wc)
5886 {
5887         int level = wc->level;
5888         int ret = 0;
5889 
5890         wc->lookup_info = 1;
5891         while (level >= 0) {
5892                 ret = walk_down_proc(trans, root, path, wc);
5893                 if (ret)
5894                         break;
5895 
5896                 if (level == 0)
5897                         break;
5898 
5899                 if (path->slots[level] >=
5900                     btrfs_header_nritems(path->nodes[level]))
5901                         break;
5902 
5903                 ret = do_walk_down(trans, root, path, wc);
5904                 if (ret > 0) {
5905                         path->slots[level]++;
5906                         continue;
5907                 } else if (ret < 0)
5908                         break;
5909                 level = wc->level;
5910         }
5911         return (ret == 1) ? 0 : ret;
5912 }
5913 
5914 /*
5915  * walk_up_tree() is responsible for making sure we visit every slot on our
5916  * current node, and if we're at the end of that node then we call
5917  * walk_up_proc() on our current node which will do one of a few things based on
5918  * our stage.
5919  *
5920  * UPDATE_BACKREF.  If our wc->level is currently less than our wc->shared_level
5921  * then we need to walk back up the tree, and then go back down into the
5922  * other slots via walk_down_tree to update any other children from our original
5923  * wc->shared_level.  Once we're at or above our wc->shared_level we can switch
5924  * back to DROP_REFERENCE, look up the current node's refs and flags, and carry on.
5925  *
5926  * DROP_REFERENCE. If our refs == 1 then we're going to free this tree block.
5927  * If we're at level 0 then we need to call btrfs_dec_ref() on all of the data extents
5928  * in our current leaf.  After that we call btrfs_free_tree_block() on the
5929  * current node and walk up to the next node to walk down the next slot.
5930  */
5931 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
5932                                  struct btrfs_root *root,
5933                                  struct btrfs_path *path,
5934                                  struct walk_control *wc, int max_level)
5935 {
5936         int level = wc->level;
5937         int ret;
5938 
5939         path->slots[level] = btrfs_header_nritems(path->nodes[level]);
5940         while (level < max_level && path->nodes[level]) {
5941                 wc->level = level;
5942                 if (path->slots[level] + 1 <
5943                     btrfs_header_nritems(path->nodes[level])) {
5944                         path->slots[level]++;
5945                         return 0;
5946                 } else {
5947                         ret = walk_up_proc(trans, root, path, wc);
5948                         if (ret > 0)
5949                                 return 0;
5950                         if (ret < 0)
5951                                 return ret;
5952 
5953                         if (path->locks[level]) {
5954                                 btrfs_tree_unlock_rw(path->nodes[level],
5955                                                      path->locks[level]);
5956                                 path->locks[level] = 0;
5957                         }
5958                         free_extent_buffer(path->nodes[level]);
5959                         path->nodes[level] = NULL;
5960                         level++;
5961                 }
5962         }
5963         return 1;
5964 }
5965 
5966 /*
5967  * drop a subvolume tree.
5968  *
5969  * this function traverses the tree freeing any blocks that are only
5970  * referenced by the tree.
5971  *
5972  * when a shared tree block is found, this function decreases its
5973  * reference count by one. if update_ref is true, this function
5974  * also makes sure backrefs for the shared block and all lower level
5975  * blocks are properly updated.
5976  *
5977  * If called with for_reloc == 0, may exit early with -EAGAIN
5978  */
5979 int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
5980 {
5981         const bool is_reloc_root = (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID);
5982         struct btrfs_fs_info *fs_info = root->fs_info;
5983         struct btrfs_path *path;
5984         struct btrfs_trans_handle *trans;
5985         struct btrfs_root *tree_root = fs_info->tree_root;
5986         struct btrfs_root_item *root_item = &root->root_item;
5987         struct walk_control *wc;
5988         struct btrfs_key key;
5989         const u64 rootid = btrfs_root_id(root);
5990         int ret = 0;
5991         int level;
5992         bool root_dropped = false;
5993         bool unfinished_drop = false;
5994 
5995         btrfs_debug(fs_info, "Drop subvolume %llu", btrfs_root_id(root));
5996 
5997         path = btrfs_alloc_path();
5998         if (!path) {
5999                 ret = -ENOMEM;
6000                 goto out;
6001         }
6002 
6003         wc = kzalloc(sizeof(*wc), GFP_NOFS);
6004         if (!wc) {
6005                 btrfs_free_path(path);
6006                 ret = -ENOMEM;
6007                 goto out;
6008         }
6009 
6010         /*
6011          * Use join to avoid potential EINTR from transaction start. See
6012          * wait_reserve_ticket and the whole reservation callchain.
6013          */
6014         if (for_reloc)
6015                 trans = btrfs_join_transaction(tree_root);
6016         else
6017                 trans = btrfs_start_transaction(tree_root, 0);
6018         if (IS_ERR(trans)) {
6019                 ret = PTR_ERR(trans);
6020                 goto out_free;
6021         }
6022 
6023         ret = btrfs_run_delayed_items(trans);
6024         if (ret)
6025                 goto out_end_trans;
6026 
6027         /*
6028          * This will help us catch people modifying the fs tree while we're
6029          * dropping it.  It is unsafe to mess with the fs tree while it's being
6030          * dropped as we unlock the root node and parent nodes as we walk down
6031          * the tree, assuming nothing will change.  If something does change
6032          * then we'll have stale information and drop references to blocks we've
6033          * already dropped.
6034          */
6035         set_bit(BTRFS_ROOT_DELETING, &root->state);
6036         unfinished_drop = test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state);
6037 
6038         if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
6039                 level = btrfs_header_level(root->node);
6040                 path->nodes[level] = btrfs_lock_root_node(root);
6041                 path->slots[level] = 0;
6042                 path->locks[level] = BTRFS_WRITE_LOCK;
6043                 memset(&wc->update_progress, 0,
6044                        sizeof(wc->update_progress));
6045         } else {
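                     /*
                      * We are resuming a drop that was interrupted earlier:
                      * re-position the path at the saved drop_progress key,
                      * then walk down from the root re-taking locks and
                      * looking up refs/flags for each level above the saved
                      * drop level before continuing.
                      */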
6046                 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
6047                 memcpy(&wc->update_progress, &key,
6048                        sizeof(wc->update_progress));
6049 
6050                 level = btrfs_root_drop_level(root_item);
6051                 BUG_ON(level == 0);
6052                 path->lowest_level = level;
6053                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6054                 path->lowest_level = 0;
6055                 if (ret < 0)
6056                         goto out_end_trans;
6057 
6058                 WARN_ON(ret > 0);
6059                 ret = 0;
6060 
6061                 /*
6062                  * unlock our path, this is safe because only this
6063                  * function is allowed to delete this snapshot
6064                  */
6065                 btrfs_unlock_up_safe(path, 0);
6066 
6067                 level = btrfs_header_level(root->node);
6068                 while (1) {
6069                         btrfs_tree_lock(path->nodes[level]);
6070                         path->locks[level] = BTRFS_WRITE_LOCK;
6071 
6072                         /*
6073                          * btrfs_lookup_extent_info() returns 0 for success,
6074                          * or < 0 for error.
6075                          */
6076                         ret = btrfs_lookup_extent_info(trans, fs_info,
6077                                                 path->nodes[level]->start,
6078                                                 level, 1, &wc->refs[level],
6079                                                 &wc->flags[level], NULL);
6080                         if (ret < 0)
6081                                 goto out_end_trans;
6082 
6083                         BUG_ON(wc->refs[level] == 0);
6084 
6085                         if (level == btrfs_root_drop_level(root_item))
6086                                 break;
6087 
6088                         btrfs_tree_unlock(path->nodes[level]);
6089                         path->locks[level] = 0;
6090                         WARN_ON(wc->refs[level] != 1);
6091                         level--;
6092                 }
6093         }
6094 
6095         wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
6096         wc->level = level;
6097         wc->shared_level = -1;
6098         wc->stage = DROP_REFERENCE;
6099         wc->update_ref = update_ref;
6100         wc->keep_locks = 0;
6101         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
6102 
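             /*
              * Main deletion loop: walk down as far as we can, then walk back
              * up freeing blocks.  Periodically record drop_progress and
              * drop_level in the root item and restart the transaction so a
              * crash or a requested pause can resume from where we left off.
              */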
6103         while (1) {
6104 
6105                 ret = walk_down_tree(trans, root, path, wc);
6106                 if (ret < 0) {
6107                         btrfs_abort_transaction(trans, ret);
6108                         break;
6109                 }
6110 
6111                 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
6112                 if (ret < 0) {
6113                         btrfs_abort_transaction(trans, ret);
6114                         break;
6115                 }
6116 
6117                 if (ret > 0) {
6118                         BUG_ON(wc->stage != DROP_REFERENCE);
6119                         ret = 0;
6120                         break;
6121                 }
6122 
6123                 if (wc->stage == DROP_REFERENCE) {
6124                         wc->drop_level = wc->level;
6125                         btrfs_node_key_to_cpu(path->nodes[wc->drop_level],
6126                                               &wc->drop_progress,
6127                                               path->slots[wc->drop_level]);
6128                 }
6129                 btrfs_cpu_key_to_disk(&root_item->drop_progress,
6130                                       &wc->drop_progress);
6131                 btrfs_set_root_drop_level(root_item, wc->drop_level);
6132 
6133                 BUG_ON(wc->level == 0);
6134                 if (btrfs_should_end_transaction(trans) ||
6135                     (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
6136                         ret = btrfs_update_root(trans, tree_root,
6137                                                 &root->root_key,
6138                                                 root_item);
6139                         if (ret) {
6140                                 btrfs_abort_transaction(trans, ret);
6141                                 goto out_end_trans;
6142                         }
6143 
6144                         if (!is_reloc_root)
6145                                 btrfs_set_last_root_drop_gen(fs_info, trans->transid);
6146 
6147                         btrfs_end_transaction_throttle(trans);
6148                         if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
6149                                 btrfs_debug(fs_info,
6150                                             "drop snapshot early exit");
6151                                 ret = -EAGAIN;
6152                                 goto out_free;
6153                         }
6154 
6155                         /*
6156                          * Use join to avoid potential EINTR from transaction
6157                          * start. See wait_reserve_ticket and the whole
6158                          * reservation callchain.
6159                          */
6160                         if (for_reloc)
6161                                 trans = btrfs_join_transaction(tree_root);
6162                         else
6163                                 trans = btrfs_start_transaction(tree_root, 0);
6164                         if (IS_ERR(trans)) {
6165                                 ret = PTR_ERR(trans);
6166                                 goto out_free;
6167                         }
6168                 }
6169         }
6170         btrfs_release_path(path);
6171         if (ret)
6172                 goto out_end_trans;
6173 
6174         ret = btrfs_del_root(trans, &root->root_key);
6175         if (ret) {
6176                 btrfs_abort_transaction(trans, ret);
6177                 goto out_end_trans;
6178         }
6179 
6180         if (!is_reloc_root) {
6181                 ret = btrfs_find_root(tree_root, &root->root_key, path,
6182                                       NULL, NULL);
6183                 if (ret < 0) {
6184                         btrfs_abort_transaction(trans, ret);
6185                         goto out_end_trans;
6186                 } else if (ret > 0) {
6187                         ret = 0;
6188                         /*
6189                          * If we fail to delete the orphan item this time
6190                          * around, it'll get picked up the next time.
6191                          *
6192                          * The most common failure here is just -ENOENT.
6193                          */
6194                         btrfs_del_orphan_item(trans, tree_root, btrfs_root_id(root));
6195                 }
6196         }
6197 
6198         /*
6199          * This subvolume is going to be completely dropped, and won't be
6200          * recorded as a dirty root, thus its pertrans meta rsv will not be freed
6201          * at commit transaction time.  So free it here manually.
6202          */
6203         btrfs_qgroup_convert_reserved_meta(root, INT_MAX);
6204         btrfs_qgroup_free_meta_all_pertrans(root);
6205 
6206         if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state))
6207                 btrfs_add_dropped_root(trans, root);
6208         else
6209                 btrfs_put_root(root);
6210         root_dropped = true;
6211 out_end_trans:
6212         if (!is_reloc_root)
6213                 btrfs_set_last_root_drop_gen(fs_info, trans->transid);
6214 
6215         btrfs_end_transaction_throttle(trans);
6216 out_free:
6217         kfree(wc);
6218         btrfs_free_path(path);
6219 out:
6220         if (!ret && root_dropped) {
6221                 ret = btrfs_qgroup_cleanup_dropped_subvolume(fs_info, rootid);
6222                 if (ret < 0)
6223                         btrfs_warn_rl(fs_info,
6224                                       "failed to cleanup qgroup 0/%llu: %d",
6225                                       rootid, ret);
6226                 ret = 0;
6227         }
6228         /*
6229          * We were an unfinished drop root, check to see if there are any
6230          * pending, and if not clear and wake up any waiters.
6231          */
6232         if (!ret && unfinished_drop)
6233                 btrfs_maybe_wake_unfinished_drop(fs_info);
6234 
6235         /*
6236          * So if we need to stop dropping the snapshot for whatever reason we
6237          * need to make sure to add it back to the dead root list so that we
6238          * keep trying to do the work later.  This also cleans up the root if we
6239          * don't have it in the radix (like when we recover after a power fail
6240          * or unmount) so we don't leak memory.
6241          */
6242         if (!for_reloc && !root_dropped)
6243                 btrfs_add_dead_root(root);
6244         return ret;
6245 }
6246 
6247 /*
6248  * drop subtree rooted at tree block 'node'.
6249  *
6250  * NOTE: this function will unlock and release tree block 'node'
6251  * only used by relocation code
6252  */
6253 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
6254                         struct btrfs_root *root,
6255                         struct extent_buffer *node,
6256                         struct extent_buffer *parent)
6257 {
6258         struct btrfs_fs_info *fs_info = root->fs_info;
6259         struct btrfs_path *path;
6260         struct walk_control *wc;
6261         int level;
6262         int parent_level;
6263         int ret = 0;
6264 
6265         BUG_ON(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID);
6266 
6267         path = btrfs_alloc_path();
6268         if (!path)
6269                 return -ENOMEM;
6270 
6271         wc = kzalloc(sizeof(*wc), GFP_NOFS);
6272         if (!wc) {
6273                 btrfs_free_path(path);
6274                 return -ENOMEM;
6275         }
6276 
6277         btrfs_assert_tree_write_locked(parent);
6278         parent_level = btrfs_header_level(parent);
6279         atomic_inc(&parent->refs);
6280         path->nodes[parent_level] = parent;
6281         path->slots[parent_level] = btrfs_header_nritems(parent);
6282 
6283         btrfs_assert_tree_write_locked(node);
6284         level = btrfs_header_level(node);
6285         path->nodes[level] = node;
6286         path->slots[level] = 0;
6287         path->locks[level] = BTRFS_WRITE_LOCK;
6288 
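             /*
              * The parent itself is not freed here: it is set up with a single
              * ref and FULL_BACKREF, and its slot already points past its last
              * item, so the walk below only descends through 'node' and the
              * blocks under it.
              */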
6289         wc->refs[parent_level] = 1;
6290         wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
6291         wc->level = level;
6292         wc->shared_level = -1;
6293         wc->stage = DROP_REFERENCE;
6294         wc->update_ref = 0;
6295         wc->keep_locks = 1;
6296         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
6297 
6298         while (1) {
6299                 ret = walk_down_tree(trans, root, path, wc);
6300                 if (ret < 0)
6301                         break;
6302 
6303                 ret = walk_up_tree(trans, root, path, wc, parent_level);
6304                 if (ret) {
6305                         if (ret > 0)
6306                                 ret = 0;
6307                         break;
6308                 }
6309         }
6310 
6311         kfree(wc);
6312         btrfs_free_path(path);
6313         return ret;
6314 }
6315 
6316 /*
6317  * Unpin the extent range in an error context and don't add the space back.
6318  * Errors are not propagated further.
6319  */
6320 void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end)
6321 {
6322         unpin_extent_range(fs_info, start, end, false);
6323 }
6324 
6325 /*
6326  * It used to be that old block groups would be left around forever.
6327  * Iterating over them would be enough to trim unused space.  Since we
6328  * now automatically remove them, we also need to iterate over unallocated
6329  * space.
6330  *
6331  * We don't want a transaction for this since the discard may take a
6332  * substantial amount of time.  We don't require that a transaction be
6333  * running, but we do need to take a running transaction into account
6334  * to ensure that we're not discarding chunks that were released or
6335  * allocated in the current transaction.
6336  *
6337  * Holding the chunks lock will prevent other threads from allocating
6338  * or releasing chunks, but it won't prevent a running transaction
6339  * from committing and releasing the memory that the pending chunks
6340  * list head uses.  For that, we need to take a reference to the
6341  * transaction and hold the commit root sem.  We only need to hold
6342  * it while performing the free space search since we have already
6343  * held back allocations.
6344  */
6345 static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
6346 {
6347         u64 start = BTRFS_DEVICE_RANGE_RESERVED, len = 0, end = 0;
6348         int ret;
6349 
6350         *trimmed = 0;
6351 
6352         /* Discard not supported = nothing to do. */
6353         if (!bdev_max_discard_sectors(device->bdev))
6354                 return 0;
6355 
6356         /* Not writable = nothing to do. */
6357         if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
6358                 return 0;
6359 
6360         /* No free space = nothing to do. */
6361         if (device->total_bytes <= device->bytes_used)
6362                 return 0;
6363 
6364         ret = 0;
6365 
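             /*
              * Walk device->alloc_state and discard each gap that is neither
              * allocated to a chunk nor already trimmed, marking the range
              * CHUNK_TRIMMED afterwards so later passes skip it.
              */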
6366         while (1) {
6367                 struct btrfs_fs_info *fs_info = device->fs_info;
6368                 u64 bytes;
6369 
6370                 ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
6371                 if (ret)
6372                         break;
6373 
6374                 find_first_clear_extent_bit(&device->alloc_state, start,
6375                                             &start, &end,
6376                                             CHUNK_TRIMMED | CHUNK_ALLOCATED);
6377 
6378                 /* Check if there are any CHUNK_* bits left */
6379                 if (start > device->total_bytes) {
6380                         WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
6381                         btrfs_warn_in_rcu(fs_info,
6382 "ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu",
6383                                           start, end - start + 1,
6384                                           btrfs_dev_name(device),
6385                                           device->total_bytes);
6386                         mutex_unlock(&fs_info->chunk_mutex);
6387                         ret = 0;
6388                         break;
6389                 }
6390 
6391                 /* Ensure we skip the reserved space on each device. */
6392                 start = max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED);
6393 
6394                 /*
6395                  * If find_first_clear_extent_bit() finds a range that spans the
6396                  * end of the device it will set end to -1; in this case it's up
6397                  * to the caller to trim the value to the size of the device.
6398                  */
6399                 end = min(end, device->total_bytes - 1);
6400 
6401                 len = end - start + 1;
6402 
6403                 /* We didn't find any extents */
6404                 if (!len) {
6405                         mutex_unlock(&fs_info->chunk_mutex);
6406                         ret = 0;
6407                         break;
6408                 }
6409 
6410                 ret = btrfs_issue_discard(device->bdev, start, len,
6411                                           &bytes);
6412                 if (!ret)
6413                         set_extent_bit(&device->alloc_state, start,
6414                                        start + bytes - 1, CHUNK_TRIMMED, NULL);
6415                 mutex_unlock(&fs_info->chunk_mutex);
6416 
6417                 if (ret)
6418                         break;
6419 
6420                 start += len;
6421                 *trimmed += bytes;
6422 
6423                 if (fatal_signal_pending(current)) {
6424                         ret = -ERESTARTSYS;
6425                         break;
6426                 }
6427 
6428                 cond_resched();
6429         }
6430 
6431         return ret;
6432 }
6433 
6434 /*
6435  * Trim the whole filesystem by:
6436  * 1) trimming the free space in each block group
6437  * 2) trimming the unallocated space on each device
6438  *
6439  * This will also continue trimming even if a block group or device encounters
6440  * an error.  The return value will be the last error, or 0 if nothing bad
6441  * happens.
6442  */
6443 int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
6444 {
6445         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6446         struct btrfs_block_group *cache = NULL;
6447         struct btrfs_device *device;
6448         u64 group_trimmed;
6449         u64 range_end = U64_MAX;
6450         u64 start;
6451         u64 end;
6452         u64 trimmed = 0;
6453         u64 bg_failed = 0;
6454         u64 dev_failed = 0;
6455         int bg_ret = 0;
6456         int dev_ret = 0;
6457         int ret = 0;
6458 
6459         if (range->start == U64_MAX)
6460                 return -EINVAL;
6461 
6462         /*
6463          * Check range overflow if range->len is set.
6464          * The default range->len is U64_MAX.
6465          */
6466         if (range->len != U64_MAX &&
6467             check_add_overflow(range->start, range->len, &range_end))
6468                 return -EINVAL;
6469 
6470         cache = btrfs_lookup_first_block_group(fs_info, range->start);
6471         for (; cache; cache = btrfs_next_block_group(cache)) {
6472                 if (cache->start >= range_end) {
6473                         btrfs_put_block_group(cache);
6474                         break;
6475                 }
6476 
6477                 start = max(range->start, cache->start);
6478                 end = min(range_end, cache->start + cache->length);
6479 
6480                 if (end - start >= range->minlen) {
6481                         if (!btrfs_block_group_done(cache)) {
6482                                 ret = btrfs_cache_block_group(cache, true);
6483                                 if (ret) {
6484                                         bg_failed++;
6485                                         bg_ret = ret;
6486                                         continue;
6487                                 }
6488                         }
6489                         ret = btrfs_trim_block_group(cache,
6490                                                      &group_trimmed,
6491                                                      start,
6492                                                      end,
6493                                                      range->minlen);
6494 
6495                         trimmed += group_trimmed;
6496                         if (ret) {
6497                                 bg_failed++;
6498                                 bg_ret = ret;
6499                                 continue;
6500                         }
6501                 }
6502         }
6503 
6504         if (bg_failed)
6505                 btrfs_warn(fs_info,
6506                         "failed to trim %llu block group(s), last error %d",
6507                         bg_failed, bg_ret);
6508 
6509         mutex_lock(&fs_devices->device_list_mutex);
6510         list_for_each_entry(device, &fs_devices->devices, dev_list) {
6511                 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
6512                         continue;
6513 
6514                 ret = btrfs_trim_free_extents(device, &group_trimmed);
6515                 if (ret) {
6516                         dev_failed++;
6517                         dev_ret = ret;
6518                         break;
6519                 }
6520 
6521                 trimmed += group_trimmed;
6522         }
6523         mutex_unlock(&fs_devices->device_list_mutex);
6524 
6525         if (dev_failed)
6526                 btrfs_warn(fs_info,
6527                         "failed to trim %llu device(s), last error %d",
6528                         dev_failed, dev_ret);
6529         range->len = trimmed;
6530         if (bg_ret)
6531                 return bg_ret;
6532         return dev_ret;
6533 }
6534 
