TOMOYO Linux Cross Reference
Linux/fs/bcachefs/snapshot.c

  1 // SPDX-License-Identifier: GPL-2.0
  2 
  3 #include "bcachefs.h"
  4 #include "bkey_buf.h"
  5 #include "btree_key_cache.h"
  6 #include "btree_update.h"
  7 #include "buckets.h"
  8 #include "errcode.h"
  9 #include "error.h"
 10 #include "fs.h"
 11 #include "recovery_passes.h"
 12 #include "snapshot.h"
 13 
 14 #include <linux/random.h>
 15 
 16 /*
 17  * Snapshot trees:
 18  *
 19  * Keys in BTREE_ID_snapshot_trees identify a whole tree of snapshot nodes; they
 20  * exist to provide a stable identifier for the whole lifetime of a snapshot
 21  * tree.
 22  */
 23 
 24 void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c,
 25                                 struct bkey_s_c k)
 26 {
 27         struct bkey_s_c_snapshot_tree t = bkey_s_c_to_snapshot_tree(k);
 28 
 29         prt_printf(out, "subvol %u root snapshot %u",
 30                    le32_to_cpu(t.v->master_subvol),
 31                    le32_to_cpu(t.v->root_snapshot));
 32 }
 33 
 34 int bch2_snapshot_tree_validate(struct bch_fs *c, struct bkey_s_c k,
 35                                enum bch_validate_flags flags)
 36 {
 37         int ret = 0;
 38 
 39         bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
 40                          bkey_lt(k.k->p, POS(0, 1)),
 41                          c, snapshot_tree_pos_bad,
 42                          "bad pos");
 43 fsck_err:
 44         return ret;
 45 }
 46 
 47 int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id,
 48                               struct bch_snapshot_tree *s)
 49 {
 50         int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id),
 51                                           BTREE_ITER_with_updates, snapshot_tree, s);
 52 
 53         if (bch2_err_matches(ret, ENOENT))
 54                 ret = -BCH_ERR_ENOENT_snapshot_tree;
 55         return ret;
 56 }
 57 
 58 struct bkey_i_snapshot_tree *
 59 __bch2_snapshot_tree_create(struct btree_trans *trans)
 60 {
 61         struct btree_iter iter;
 62         int ret = bch2_bkey_get_empty_slot(trans, &iter,
 63                         BTREE_ID_snapshot_trees, POS(0, U32_MAX));
 64         struct bkey_i_snapshot_tree *s_t;
 65 
 66         if (ret == -BCH_ERR_ENOSPC_btree_slot)
 67                 ret = -BCH_ERR_ENOSPC_snapshot_tree;
 68         if (ret)
 69                 return ERR_PTR(ret);
 70 
 71         s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree);
 72         ret = PTR_ERR_OR_ZERO(s_t);
 73         bch2_trans_iter_exit(trans, &iter);
 74         return ret ? ERR_PTR(ret) : s_t;
 75 }
 76 
 77 static int bch2_snapshot_tree_create(struct btree_trans *trans,
 78                                 u32 root_id, u32 subvol_id, u32 *tree_id)
 79 {
 80         struct bkey_i_snapshot_tree *n_tree =
 81                 __bch2_snapshot_tree_create(trans);
 82 
 83         if (IS_ERR(n_tree))
 84                 return PTR_ERR(n_tree);
 85 
 86         n_tree->v.master_subvol = cpu_to_le32(subvol_id);
 87         n_tree->v.root_snapshot = cpu_to_le32(root_id);
 88         *tree_id = n_tree->k.p.offset;
 89         return 0;
 90 }
 91 
 92 /* Snapshot nodes: */
 93 
 94 static bool __bch2_snapshot_is_ancestor_early(struct snapshot_table *t, u32 id, u32 ancestor)
 95 {
 96         while (id && id < ancestor) {
 97                 const struct snapshot_t *s = __snapshot_t(t, id);
 98                 id = s ? s->parent : 0;
 99         }
100         return id == ancestor;
101 }
102 
103 static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
104 {
105         rcu_read_lock();
106         bool ret = __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor);
107         rcu_read_unlock();
108 
109         return ret;
110 }
111 
112 static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor)
113 {
114         const struct snapshot_t *s = __snapshot_t(t, id);
115         if (!s)
116                 return 0;
117 
118         if (s->skip[2] <= ancestor)
119                 return s->skip[2];
120         if (s->skip[1] <= ancestor)
121                 return s->skip[1];
122         if (s->skip[0] <= ancestor)
123                 return s->skip[0];
124         return s->parent;
125 }
126 
127 static bool test_ancestor_bitmap(struct snapshot_table *t, u32 id, u32 ancestor)
128 {
129         const struct snapshot_t *s = __snapshot_t(t, id);
130         if (!s)
131                 return false;
132 
133         return test_bit(ancestor - id - 1, s->is_ancestor);
134 }
135 
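     /*
      * Returns true if @ancestor is @id or one of its ancestors.
      *
      * Once BCH_RECOVERY_PASS_check_snapshots has validated the in-memory
      * tables we can take the fast path: get_ancestor_below() follows the skip
      * pointers to jump to the highest ancestor of @id that is still
      * <= @ancestor, and once we are within IS_ANCESTOR_BITMAP of @ancestor
      * the per-node is_ancestor bitmap answers the query directly. Before that
      * pass completes we fall back to walking parent pointers one at a time.
      */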
136 bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
137 {
138         bool ret;
139 
140         rcu_read_lock();
141         struct snapshot_table *t = rcu_dereference(c->snapshots);
142 
143         if (unlikely(c->recovery_pass_done < BCH_RECOVERY_PASS_check_snapshots)) {
144                 ret = __bch2_snapshot_is_ancestor_early(t, id, ancestor);
145                 goto out;
146         }
147 
148         while (id && id < ancestor - IS_ANCESTOR_BITMAP)
149                 id = get_ancestor_below(t, id, ancestor);
150 
151         ret = id && id < ancestor
152                 ? test_ancestor_bitmap(t, id, ancestor)
153                 : id == ancestor;
154 
155         EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor));
156 out:
157         rcu_read_unlock();
158 
159         return ret;
160 }
161 
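     /*
      * Slow path for snapshot_t_mut(): the in-memory snapshot table is an
      * RCU-protected flat array indexed by U32_MAX - id, so it grows as lower
      * snapshot IDs are allocated. Reallocate it, copy the old contents,
      * publish the new table with rcu_assign_pointer() and free the old one
      * after a grace period. Callers must hold snapshot_table_lock; readers
      * only need rcu_read_lock().
      */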
162 static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id)
163 {
164         size_t idx = U32_MAX - id;
165         struct snapshot_table *new, *old;
166 
167         size_t new_bytes = kmalloc_size_roundup(struct_size(new, s, idx + 1));
168         size_t new_size = (new_bytes - sizeof(*new)) / sizeof(new->s[0]);
169 
170         if (unlikely(new_bytes > INT_MAX))
171                 return NULL;
172 
173         new = kvzalloc(new_bytes, GFP_KERNEL);
174         if (!new)
175                 return NULL;
176 
177         new->nr = new_size;
178 
179         old = rcu_dereference_protected(c->snapshots, true);
180         if (old)
181                 memcpy(new->s, old->s, sizeof(old->s[0]) * old->nr);
182 
183         rcu_assign_pointer(c->snapshots, new);
184         kvfree_rcu(old, rcu);
185 
186         return &rcu_dereference_protected(c->snapshots,
187                                 lockdep_is_held(&c->snapshot_table_lock))->s[idx];
188 }
189 
190 static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id)
191 {
192         size_t idx = U32_MAX - id;
193         struct snapshot_table *table =
194                 rcu_dereference_protected(c->snapshots,
195                                 lockdep_is_held(&c->snapshot_table_lock));
196 
197         lockdep_assert_held(&c->snapshot_table_lock);
198 
199         if (likely(table && idx < table->nr))
200                 return &table->s[idx];
201 
202         return __snapshot_t_mut(c, id);
203 }
204 
205 void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
206                            struct bkey_s_c k)
207 {
208         struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
209 
210         prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u",
211                BCH_SNAPSHOT_SUBVOL(s.v),
212                BCH_SNAPSHOT_DELETED(s.v),
213                le32_to_cpu(s.v->parent),
214                le32_to_cpu(s.v->children[0]),
215                le32_to_cpu(s.v->children[1]),
216                le32_to_cpu(s.v->subvol),
217                le32_to_cpu(s.v->tree));
218 
219         if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, depth))
220                 prt_printf(out, " depth %u skiplist %u %u %u",
221                            le32_to_cpu(s.v->depth),
222                            le32_to_cpu(s.v->skip[0]),
223                            le32_to_cpu(s.v->skip[1]),
224                            le32_to_cpu(s.v->skip[2]));
225 }
226 
227 int bch2_snapshot_validate(struct bch_fs *c, struct bkey_s_c k,
228                           enum bch_validate_flags flags)
229 {
230         struct bkey_s_c_snapshot s;
231         u32 i, id;
232         int ret = 0;
233 
234         bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
235                          bkey_lt(k.k->p, POS(0, 1)),
236                          c, snapshot_pos_bad,
237                          "bad pos");
238 
239         s = bkey_s_c_to_snapshot(k);
240 
241         id = le32_to_cpu(s.v->parent);
242         bkey_fsck_err_on(id && id <= k.k->p.offset,
243                          c, snapshot_parent_bad,
244                          "bad parent node (%u <= %llu)",
245                          id, k.k->p.offset);
246 
247         bkey_fsck_err_on(le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]),
248                          c, snapshot_children_not_normalized,
249                          "children not normalized");
250 
251         bkey_fsck_err_on(s.v->children[0] && s.v->children[0] == s.v->children[1],
252                          c, snapshot_child_duplicate,
253                          "duplicate child nodes");
254 
255         for (i = 0; i < 2; i++) {
256                 id = le32_to_cpu(s.v->children[i]);
257 
258                 bkey_fsck_err_on(id >= k.k->p.offset,
259                                  c, snapshot_child_bad,
260                                  "bad child node (%u >= %llu)",
261                                  id, k.k->p.offset);
262         }
263 
264         if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) {
265                 bkey_fsck_err_on(le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) ||
266                                  le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2]),
267                                  c, snapshot_skiplist_not_normalized,
268                                  "skiplist not normalized");
269 
270                 for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) {
271                         id = le32_to_cpu(s.v->skip[i]);
272 
273                         bkey_fsck_err_on(id && id < le32_to_cpu(s.v->parent),
274                                          c, snapshot_skiplist_bad,
275                                          "bad skiplist node %u", id);
276                 }
277         }
278 fsck_err:
279         return ret;
280 }
281 
282 static void __set_is_ancestor_bitmap(struct bch_fs *c, u32 id)
283 {
284         struct snapshot_t *t = snapshot_t_mut(c, id);
285         u32 parent = id;
286 
287         while ((parent = bch2_snapshot_parent_early(c, parent)) &&
288                parent - id - 1 < IS_ANCESTOR_BITMAP)
289                 __set_bit(parent - id - 1, t->is_ancestor);
290 }
291 
292 static void set_is_ancestor_bitmap(struct bch_fs *c, u32 id)
293 {
294         mutex_lock(&c->snapshot_table_lock);
295         __set_is_ancestor_bitmap(c, id);
296         mutex_unlock(&c->snapshot_table_lock);
297 }
298 
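     /*
      * Keep the in-memory snapshot_t entry in sync with a snapshot key:
      * parent/children/subvol/tree, the depth and skip list fields when the
      * value is large enough to contain them, and the is_ancestor bitmap.
      * If the key is flagged deleted, schedule asynchronous cleanup of dead
      * snapshots.
      */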
299 static int __bch2_mark_snapshot(struct btree_trans *trans,
300                        enum btree_id btree, unsigned level,
301                        struct bkey_s_c old, struct bkey_s_c new,
302                        enum btree_iter_update_trigger_flags flags)
303 {
304         struct bch_fs *c = trans->c;
305         struct snapshot_t *t;
306         u32 id = new.k->p.offset;
307         int ret = 0;
308 
309         mutex_lock(&c->snapshot_table_lock);
310 
311         t = snapshot_t_mut(c, id);
312         if (!t) {
313                 ret = -BCH_ERR_ENOMEM_mark_snapshot;
314                 goto err;
315         }
316 
317         if (new.k->type == KEY_TYPE_snapshot) {
318                 struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);
319 
320                 t->parent       = le32_to_cpu(s.v->parent);
321                 t->children[0]  = le32_to_cpu(s.v->children[0]);
322                 t->children[1]  = le32_to_cpu(s.v->children[1]);
323                 t->subvol       = BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0;
324                 t->tree         = le32_to_cpu(s.v->tree);
325 
326                 if (bkey_val_bytes(s.k) > offsetof(struct bch_snapshot, depth)) {
327                         t->depth        = le32_to_cpu(s.v->depth);
328                         t->skip[0]      = le32_to_cpu(s.v->skip[0]);
329                         t->skip[1]      = le32_to_cpu(s.v->skip[1]);
330                         t->skip[2]      = le32_to_cpu(s.v->skip[2]);
331                 } else {
332                         t->depth        = 0;
333                         t->skip[0]      = 0;
334                         t->skip[1]      = 0;
335                         t->skip[2]      = 0;
336                 }
337 
338                 __set_is_ancestor_bitmap(c, id);
339 
340                 if (BCH_SNAPSHOT_DELETED(s.v)) {
341                         set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
342                         if (c->curr_recovery_pass > BCH_RECOVERY_PASS_delete_dead_snapshots)
343                                 bch2_delete_dead_snapshots_async(c);
344                 }
345         } else {
346                 memset(t, 0, sizeof(*t));
347         }
348 err:
349         mutex_unlock(&c->snapshot_table_lock);
350         return ret;
351 }
352 
353 int bch2_mark_snapshot(struct btree_trans *trans,
354                        enum btree_id btree, unsigned level,
355                        struct bkey_s_c old, struct bkey_s new,
356                        enum btree_iter_update_trigger_flags flags)
357 {
358         return __bch2_mark_snapshot(trans, btree, level, old, new.s_c, flags);
359 }
360 
361 int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
362                          struct bch_snapshot *s)
363 {
364         return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id),
365                                        BTREE_ITER_with_updates, snapshot, s);
366 }
367 
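     /*
      * Returns 1 if snapshot @id exists and is not marked deleted, 0 if @id is
      * zero or the node is flagged deleted, or a negative error code (including
      * ENOENT if the node is missing entirely).
      */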
368 static int bch2_snapshot_live(struct btree_trans *trans, u32 id)
369 {
370         struct bch_snapshot v;
371         int ret;
372 
373         if (!id)
374                 return 0;
375 
376         ret = bch2_snapshot_lookup(trans, id, &v);
377         if (bch2_err_matches(ret, ENOENT))
378                 bch_err(trans->c, "snapshot node %u not found", id);
379         if (ret)
380                 return ret;
381 
382         return !BCH_SNAPSHOT_DELETED(&v);
383 }
384 
385 /*
386  * If @k is a snapshot with just one live child, it's part of a linear chain,
 387  * which we consider to be an equivalence class; after snapshot deletion
 388  * cleanup, there should be only a single key at a given position within
 389  * this equivalence class.
390  *
391  * This sets the equivalence class of @k to be the child's equivalence class, if
392  * it's part of such a linear chain: this correctly sets equivalence classes on
393  * startup if we run leaf to root (i.e. in natural key order).
394  */
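     /*
      * Illustrative example (IDs hypothetical, all nodes assumed live): for a
      * chain 9 -> 5 -> 1 where 9 is the root and 1 the only leaf, processing in
      * increasing key order sets equiv(1) = 1, then equiv(5) = equiv(1) = 1,
      * then equiv(9) = equiv(5) = 1 - the whole chain collapses onto the
      * leaf-most node.
      */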
395 static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k)
396 {
397         struct bch_fs *c = trans->c;
398         unsigned i, nr_live = 0, live_idx = 0;
399         struct bkey_s_c_snapshot snap;
400         u32 id = k.k->p.offset, child[2];
401 
402         if (k.k->type != KEY_TYPE_snapshot)
403                 return 0;
404 
405         snap = bkey_s_c_to_snapshot(k);
406 
407         child[0] = le32_to_cpu(snap.v->children[0]);
408         child[1] = le32_to_cpu(snap.v->children[1]);
409 
410         for (i = 0; i < 2; i++) {
411                 int ret = bch2_snapshot_live(trans, child[i]);
412 
413                 if (ret < 0)
414                         return ret;
415 
416                 if (ret)
417                         live_idx = i;
418                 nr_live += ret;
419         }
420 
421         mutex_lock(&c->snapshot_table_lock);
422 
423         snapshot_t_mut(c, id)->equiv = nr_live == 1
424                 ? snapshot_t_mut(c, child[live_idx])->equiv
425                 : id;
426 
427         mutex_unlock(&c->snapshot_table_lock);
428 
429         return 0;
430 }
431 
432 /* fsck: */
433 
434 static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child)
435 {
436         return snapshot_t(c, id)->children[child];
437 }
438 
439 static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id)
440 {
441         return bch2_snapshot_child(c, id, 0);
442 }
443 
444 static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id)
445 {
446         return bch2_snapshot_child(c, id, 1);
447 }
448 
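     /*
      * Depth-first (pre-order) walk over a snapshot tree: descend to the left
      * child if there is one, otherwise climb until we find a parent whose
      * right child we haven't visited yet. Returns 0 once every node has been
      * visited. Used by bch2_snapshot_tree_oldest_subvol() to scan the whole
      * tree.
      */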
449 static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id)
450 {
451         u32 n, parent;
452 
453         n = bch2_snapshot_left_child(c, id);
454         if (n)
455                 return n;
456 
457         while ((parent = bch2_snapshot_parent(c, id))) {
458                 n = bch2_snapshot_right_child(c, parent);
459                 if (n && n != id)
460                         return n;
461                 id = parent;
462         }
463 
464         return 0;
465 }
466 
467 static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root)
468 {
469         u32 id = snapshot_root;
470         u32 subvol = 0, s;
471 
472         while (id) {
473                 s = snapshot_t(c, id)->subvol;
474 
475                 if (s && (!subvol || s < subvol))
476                         subvol = s;
477 
478                 id = bch2_snapshot_tree_next(c, id);
479         }
480 
481         return subvol;
482 }
483 
484 static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans,
485                                             u32 snapshot_root, u32 *subvol_id)
486 {
487         struct bch_fs *c = trans->c;
488         struct btree_iter iter;
489         struct bkey_s_c k;
490         bool found = false;
491         int ret;
492 
493         for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
494                                      0, k, ret) {
495                 if (k.k->type != KEY_TYPE_subvolume)
496                         continue;
497 
498                 struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
499                 if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root))
500                         continue;
501                 if (!BCH_SUBVOLUME_SNAP(s.v)) {
502                         *subvol_id = s.k->p.offset;
503                         found = true;
504                         break;
505                 }
506         }
507 
508         bch2_trans_iter_exit(trans, &iter);
509 
510         if (!ret && !found) {
511                 struct bkey_i_subvolume *u;
512 
513                 *subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root);
514 
515                 u = bch2_bkey_get_mut_typed(trans, &iter,
516                                             BTREE_ID_subvolumes, POS(0, *subvol_id),
517                                             0, subvolume);
518                 ret = PTR_ERR_OR_ZERO(u);
519                 if (ret)
520                         return ret;
521 
522                 SET_BCH_SUBVOLUME_SNAP(&u->v, false);
523         }
524 
525         return ret;
526 }
527 
528 static int check_snapshot_tree(struct btree_trans *trans,
529                                struct btree_iter *iter,
530                                struct bkey_s_c k)
531 {
532         struct bch_fs *c = trans->c;
533         struct bkey_s_c_snapshot_tree st;
534         struct bch_snapshot s;
535         struct bch_subvolume subvol;
536         struct printbuf buf = PRINTBUF;
537         u32 root_id;
538         int ret;
539 
540         if (k.k->type != KEY_TYPE_snapshot_tree)
541                 return 0;
542 
543         st = bkey_s_c_to_snapshot_tree(k);
544         root_id = le32_to_cpu(st.v->root_snapshot);
545 
546         ret = bch2_snapshot_lookup(trans, root_id, &s);
547         if (ret && !bch2_err_matches(ret, ENOENT))
548                 goto err;
549 
550         if (fsck_err_on(ret ||
551                         root_id != bch2_snapshot_root(c, root_id) ||
552                         st.k->p.offset != le32_to_cpu(s.tree),
553                         trans, snapshot_tree_to_missing_snapshot,
554                         "snapshot tree points to missing/incorrect snapshot:\n  %s",
555                         (bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
556                 ret = bch2_btree_delete_at(trans, iter, 0);
557                 goto err;
558         }
559 
560         ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol),
561                                  false, 0, &subvol);
562         if (ret && !bch2_err_matches(ret, ENOENT))
563                 goto err;
564 
565         if (fsck_err_on(ret,
566                         trans, snapshot_tree_to_missing_subvol,
567                         "snapshot tree points to missing subvolume:\n  %s",
568                         (printbuf_reset(&buf),
569                          bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
570             fsck_err_on(!bch2_snapshot_is_ancestor(c,
571                                                 le32_to_cpu(subvol.snapshot),
572                                                 root_id),
573                         trans, snapshot_tree_to_wrong_subvol,
574                         "snapshot tree points to subvolume that does not point to snapshot in this tree:\n  %s",
575                         (printbuf_reset(&buf),
576                          bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
577             fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol),
578                         trans, snapshot_tree_to_snapshot_subvol,
579                         "snapshot tree points to snapshot subvolume:\n  %s",
580                         (printbuf_reset(&buf),
581                          bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
582                 struct bkey_i_snapshot_tree *u;
583                 u32 subvol_id;
584 
585                 ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id);
586                 bch_err_fn(c, ret);
587 
588                 if (bch2_err_matches(ret, ENOENT)) { /* nothing to be done here */
589                         ret = 0;
590                         goto err;
591                 }
592 
593                 if (ret)
594                         goto err;
595 
596                 u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree);
597                 ret = PTR_ERR_OR_ZERO(u);
598                 if (ret)
599                         goto err;
600 
601                 u->v.master_subvol = cpu_to_le32(subvol_id);
602                 st = snapshot_tree_i_to_s_c(u);
603         }
604 err:
605 fsck_err:
606         printbuf_exit(&buf);
607         return ret;
608 }
609 
610 /*
611  * For each snapshot_tree, make sure it points to the root of a snapshot tree
612  * and that snapshot entry points back to it, or delete it.
613  *
614  * And, make sure it points to a subvolume within that snapshot tree, or correct
615  * it to point to the oldest subvolume within that snapshot tree.
616  */
617 int bch2_check_snapshot_trees(struct bch_fs *c)
618 {
619         int ret = bch2_trans_run(c,
620                 for_each_btree_key_commit(trans, iter,
621                         BTREE_ID_snapshot_trees, POS_MIN,
622                         BTREE_ITER_prefetch, k,
623                         NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
624                 check_snapshot_tree(trans, &iter, k)));
625         bch_err_fn(c, ret);
626         return ret;
627 }
628 
629 /*
 630  * Look up the snapshot tree for @tree_id, find its root, and
 631  * make sure @snap_id is a descendant:
632  */
633 static int snapshot_tree_ptr_good(struct btree_trans *trans,
634                                   u32 snap_id, u32 tree_id)
635 {
636         struct bch_snapshot_tree s_t;
637         int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
638 
639         if (bch2_err_matches(ret, ENOENT))
640                 return 0;
641         if (ret)
642                 return ret;
643 
644         return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot));
645 }
646 
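     /*
      * Pick a skip list target: a randomly chosen ancestor of @id (possibly
      * @id itself). New nodes fill their three skip pointers with such random
      * targets (see create_snapids()), skip-list style, so that
      * get_ancestor_below() can jump over long parent chains.
      */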
647 u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id)
648 {
649         const struct snapshot_t *s;
650 
651         if (!id)
652                 return 0;
653 
654         rcu_read_lock();
655         s = snapshot_t(c, id);
656         if (s->parent)
657                 id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth));
658         rcu_read_unlock();
659 
660         return id;
661 }
662 
663 static int snapshot_skiplist_good(struct btree_trans *trans, u32 id, struct bch_snapshot s)
664 {
665         unsigned i;
666 
667         for (i = 0; i < 3; i++)
668                 if (!s.parent) {
669                         if (s.skip[i])
670                                 return false;
671                 } else {
672                         if (!bch2_snapshot_is_ancestor_early(trans->c, id, le32_to_cpu(s.skip[i])))
673                                 return false;
674                 }
675 
676         return true;
677 }
678 
679 /*
680  * snapshot_tree pointer was incorrect: look up root snapshot node, make sure
681  * its snapshot_tree pointer is correct (allocate new one if necessary), then
682  * update this node's pointer to root node's pointer:
683  */
684 static int snapshot_tree_ptr_repair(struct btree_trans *trans,
685                                     struct btree_iter *iter,
686                                     struct bkey_s_c k,
687                                     struct bch_snapshot *s)
688 {
689         struct bch_fs *c = trans->c;
690         struct btree_iter root_iter;
691         struct bch_snapshot_tree s_t;
692         struct bkey_s_c_snapshot root;
693         struct bkey_i_snapshot *u;
694         u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id;
695         int ret;
696 
697         root = bch2_bkey_get_iter_typed(trans, &root_iter,
698                                BTREE_ID_snapshots, POS(0, root_id),
699                                BTREE_ITER_with_updates, snapshot);
700         ret = bkey_err(root);
701         if (ret)
702                 goto err;
703 
704         tree_id = le32_to_cpu(root.v->tree);
705 
706         ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
707         if (ret && !bch2_err_matches(ret, ENOENT))
708                 return ret;
709 
710         if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) {
711                 u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot);
712                 ret =   PTR_ERR_OR_ZERO(u) ?:
713                         bch2_snapshot_tree_create(trans, root_id,
714                                 bch2_snapshot_tree_oldest_subvol(c, root_id),
715                                 &tree_id);
716                 if (ret)
717                         goto err;
718 
719                 u->v.tree = cpu_to_le32(tree_id);
720                 if (k.k->p.offset == root_id)
721                         *s = u->v;
722         }
723 
724         if (k.k->p.offset != root_id) {
725                 u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
726                 ret = PTR_ERR_OR_ZERO(u);
727                 if (ret)
728                         goto err;
729 
730                 u->v.tree = cpu_to_le32(tree_id);
731                 *s = u->v;
732         }
733 err:
734         bch2_trans_iter_exit(trans, &root_iter);
735         return ret;
736 }
737 
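     /*
      * fsck for a single snapshot node: check that parent/child pointers are
      * mutually consistent, that the subvol field and BCH_SNAPSHOT_SUBVOL flag
      * agree with the subvolume btree, and that the tree, depth and skip list
      * fields are correct - repairing them where possible.
      */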
738 static int check_snapshot(struct btree_trans *trans,
739                           struct btree_iter *iter,
740                           struct bkey_s_c k)
741 {
742         struct bch_fs *c = trans->c;
743         struct bch_snapshot s;
744         struct bch_subvolume subvol;
745         struct bch_snapshot v;
746         struct bkey_i_snapshot *u;
747         u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset);
748         u32 real_depth;
749         struct printbuf buf = PRINTBUF;
750         u32 i, id;
751         int ret = 0;
752 
753         if (k.k->type != KEY_TYPE_snapshot)
754                 return 0;
755 
756         memset(&s, 0, sizeof(s));
757         memcpy(&s, k.v, min(sizeof(s), bkey_val_bytes(k.k)));
758 
759         id = le32_to_cpu(s.parent);
760         if (id) {
761                 ret = bch2_snapshot_lookup(trans, id, &v);
762                 if (bch2_err_matches(ret, ENOENT))
763                         bch_err(c, "snapshot with nonexistent parent:\n  %s",
764                                 (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
765                 if (ret)
766                         goto err;
767 
768                 if (le32_to_cpu(v.children[0]) != k.k->p.offset &&
769                     le32_to_cpu(v.children[1]) != k.k->p.offset) {
770                         bch_err(c, "snapshot parent %u missing pointer to child %llu",
771                                 id, k.k->p.offset);
772                         ret = -EINVAL;
773                         goto err;
774                 }
775         }
776 
777         for (i = 0; i < 2 && s.children[i]; i++) {
778                 id = le32_to_cpu(s.children[i]);
779 
780                 ret = bch2_snapshot_lookup(trans, id, &v);
781                 if (bch2_err_matches(ret, ENOENT))
782                         bch_err(c, "snapshot node %llu has nonexistent child %u",
783                                 k.k->p.offset, id);
784                 if (ret)
785                         goto err;
786 
787                 if (le32_to_cpu(v.parent) != k.k->p.offset) {
788                         bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)",
789                                 id, le32_to_cpu(v.parent), k.k->p.offset);
790                         ret = -EINVAL;
791                         goto err;
792                 }
793         }
794 
795         bool should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
796                 !BCH_SNAPSHOT_DELETED(&s);
797 
798         if (should_have_subvol) {
799                 id = le32_to_cpu(s.subvol);
800                 ret = bch2_subvolume_get(trans, id, 0, false, &subvol);
801                 if (bch2_err_matches(ret, ENOENT))
802                         bch_err(c, "snapshot points to nonexistent subvolume:\n  %s",
803                                 (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
804                 if (ret)
805                         goto err;
806 
807                 if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) {
808                         bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
809                                 k.k->p.offset);
810                         ret = -EINVAL;
811                         goto err;
812                 }
813         } else {
814                 if (fsck_err_on(s.subvol,
815                                 trans, snapshot_should_not_have_subvol,
816                                 "snapshot should not point to subvol:\n  %s",
817                                 (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
818                         u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
819                         ret = PTR_ERR_OR_ZERO(u);
820                         if (ret)
821                                 goto err;
822 
823                         u->v.subvol = 0;
824                         s = u->v;
825                 }
826         }
827 
828         ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree));
829         if (ret < 0)
830                 goto err;
831 
832         if (fsck_err_on(!ret,
833                         trans, snapshot_to_bad_snapshot_tree,
834                         "snapshot points to missing/incorrect tree:\n  %s",
835                         (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
836                 ret = snapshot_tree_ptr_repair(trans, iter, k, &s);
837                 if (ret)
838                         goto err;
839         }
840         ret = 0;
841 
842         real_depth = bch2_snapshot_depth(c, parent_id);
843 
844         if (fsck_err_on(le32_to_cpu(s.depth) != real_depth,
845                         trans, snapshot_bad_depth,
846                         "snapshot with incorrect depth field, should be %u:\n  %s",
847                         real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
848                 u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
849                 ret = PTR_ERR_OR_ZERO(u);
850                 if (ret)
851                         goto err;
852 
853                 u->v.depth = cpu_to_le32(real_depth);
854                 s = u->v;
855         }
856 
857         ret = snapshot_skiplist_good(trans, k.k->p.offset, s);
858         if (ret < 0)
859                 goto err;
860 
861         if (fsck_err_on(!ret,
862                         trans, snapshot_bad_skiplist,
863                         "snapshot with bad skiplist field:\n  %s",
864                         (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
865                 u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
866                 ret = PTR_ERR_OR_ZERO(u);
867                 if (ret)
868                         goto err;
869 
870                 for (i = 0; i < ARRAY_SIZE(u->v.skip); i++)
871                         u->v.skip[i] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent_id));
872 
873                 bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_le32);
874                 s = u->v;
875         }
876         ret = 0;
877 err:
878 fsck_err:
879         printbuf_exit(&buf);
880         return ret;
881 }
882 
883 int bch2_check_snapshots(struct bch_fs *c)
884 {
885         /*
886          * We iterate backwards as checking/fixing the depth field requires that
887          * the parent's depth already be correct:
888          */
889         int ret = bch2_trans_run(c,
890                 for_each_btree_key_reverse_commit(trans, iter,
891                                 BTREE_ID_snapshots, POS_MAX,
892                                 BTREE_ITER_prefetch, k,
893                                 NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
894                         check_snapshot(trans, &iter, k)));
895         bch_err_fn(c, ret);
896         return ret;
897 }
898 
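     /*
      * Recreate a snapshot node that is referenced but missing from the
      * snapshots btree: allocate a fresh snapshot tree for it, insert a
      * minimal snapshot key and update the in-memory tables. Called from
      * bch2_reconstruct_snapshots().
      */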
899 static int check_snapshot_exists(struct btree_trans *trans, u32 id)
900 {
901         struct bch_fs *c = trans->c;
902 
903         if (bch2_snapshot_equiv(c, id))
904                 return 0;
905 
906         /* 0 is an invalid tree ID */
907         u32 tree_id = 0;
908         int ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id);
909         if (ret)
910                 return ret;
911 
912         struct bkey_i_snapshot *snapshot = bch2_trans_kmalloc(trans, sizeof(*snapshot));
913         ret = PTR_ERR_OR_ZERO(snapshot);
914         if (ret)
915                 return ret;
916 
917         bkey_snapshot_init(&snapshot->k_i);
918         snapshot->k.p           = POS(0, id);
919         snapshot->v.tree        = cpu_to_le32(tree_id);
920         snapshot->v.btime.lo    = cpu_to_le64(bch2_current_time(c));
921 
922         return  bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?:
923                 bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
924                                    bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0) ?:
925                 bch2_snapshot_set_equiv(trans, bkey_i_to_s_c(&snapshot->k_i));
926 }
927 
928 /* Figure out which snapshot nodes belong in the same tree: */
929 struct snapshot_tree_reconstruct {
930         enum btree_id                   btree;
931         struct bpos                     cur_pos;
932         snapshot_id_list                cur_ids;
933         DARRAY(snapshot_id_list)        trees;
934 };
935 
936 static void snapshot_tree_reconstruct_exit(struct snapshot_tree_reconstruct *r)
937 {
938         darray_for_each(r->trees, i)
939                 darray_exit(i);
940         darray_exit(&r->trees);
941         darray_exit(&r->cur_ids);
942 }
943 
944 static inline bool same_snapshot(struct snapshot_tree_reconstruct *r, struct bpos pos)
945 {
946         return r->btree == BTREE_ID_inodes
947                 ? r->cur_pos.offset == pos.offset
948                 : r->cur_pos.inode == pos.inode;
949 }
950 
951 static inline bool snapshot_id_lists_have_common(snapshot_id_list *l, snapshot_id_list *r)
952 {
953         darray_for_each(*l, i)
954                 if (snapshot_list_has_id(r, *i))
955                         return true;
956         return false;
957 }
958 
959 static void snapshot_id_list_to_text(struct printbuf *out, snapshot_id_list *s)
960 {
961         bool first = true;
962         darray_for_each(*s, i) {
963                 if (!first)
964                         prt_char(out, ' ');
965                 first = false;
966                 prt_printf(out, "%u", *i);
967         }
968 }
969 
970 static int snapshot_tree_reconstruct_next(struct bch_fs *c, struct snapshot_tree_reconstruct *r)
971 {
972         if (r->cur_ids.nr) {
973                 darray_for_each(r->trees, i)
974                         if (snapshot_id_lists_have_common(i, &r->cur_ids)) {
975                                 int ret = snapshot_list_merge(c, i, &r->cur_ids);
976                                 if (ret)
977                                         return ret;
978                                 goto out;
979                         }
980                 darray_push(&r->trees, r->cur_ids);
981                 darray_init(&r->cur_ids);
982         }
983 out:
984         r->cur_ids.nr = 0;
985         return 0;
986 }
987 
988 static int get_snapshot_trees(struct bch_fs *c, struct snapshot_tree_reconstruct *r, struct bpos pos)
989 {
990         if (!same_snapshot(r, pos))
991                 snapshot_tree_reconstruct_next(c, r);
992         r->cur_pos = pos;
993         return snapshot_list_add_nodup(c, &r->cur_ids, pos.snapshot);
994 }
995 
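     /*
      * Walk every btree that uses snapshots, group the snapshot IDs seen at
      * each key position into candidate trees (IDs seen at the same position
      * are taken to belong to the same tree), then offer to recreate any
      * snapshot node that no longer exists. Reconstruction is only implemented
      * for single-node trees.
      */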
996 int bch2_reconstruct_snapshots(struct bch_fs *c)
997 {
998         struct btree_trans *trans = bch2_trans_get(c);
999         struct printbuf buf = PRINTBUF;
1000         struct snapshot_tree_reconstruct r = {};
1001         int ret = 0;
1002 
1003         for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
1004                 if (btree_type_has_snapshots(btree)) {
1005                         r.btree = btree;
1006 
1007                         ret = for_each_btree_key(trans, iter, btree, POS_MIN,
1008                                         BTREE_ITER_all_snapshots|BTREE_ITER_prefetch, k, ({
1009                                 get_snapshot_trees(c, &r, k.k->p);
1010                         }));
1011                         if (ret)
1012                                 goto err;
1013 
1014                         snapshot_tree_reconstruct_next(c, &r);
1015                 }
1016         }
1017 
1018         darray_for_each(r.trees, t) {
1019                 printbuf_reset(&buf);
1020                 snapshot_id_list_to_text(&buf, t);
1021 
1022                 darray_for_each(*t, id) {
1023                         if (fsck_err_on(!bch2_snapshot_equiv(c, *id),
1024                                         trans, snapshot_node_missing,
1025                                         "snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) {
1026                                 if (t->nr > 1) {
1027                                         bch_err(c, "cannot reconstruct snapshot trees with multiple nodes");
1028                                         ret = -BCH_ERR_fsck_repair_unimplemented;
1029                                         goto err;
1030                                 }
1031 
1032                                 ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
1033                                                 check_snapshot_exists(trans, *id));
1034                                 if (ret)
1035                                         goto err;
1036                         }
1037                 }
1038         }
1039 fsck_err:
1040 err:
1041         bch2_trans_put(trans);
1042         snapshot_tree_reconstruct_exit(&r);
1043         printbuf_exit(&buf);
1044         bch_err_fn(c, ret);
1045         return ret;
1046 }
1047 
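     /*
      * Check that @k's snapshot field refers to a snapshot node that still
      * exists; if not, offer to delete the key. Returns 1 if the key was
      * deleted, 0 if it was left alone, or a negative error code.
      */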
1048 int bch2_check_key_has_snapshot(struct btree_trans *trans,
1049                                 struct btree_iter *iter,
1050                                 struct bkey_s_c k)
1051 {
1052         struct bch_fs *c = trans->c;
1053         struct printbuf buf = PRINTBUF;
1054         int ret = 0;
1055 
1056         if (fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot),
1057                         trans, bkey_in_missing_snapshot,
1058                         "key in missing snapshot %s, delete?",
1059                         (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1060                 ret = bch2_btree_delete_at(trans, iter,
1061                                             BTREE_UPDATE_internal_snapshot_node) ?: 1;
1062 fsck_err:
1063         printbuf_exit(&buf);
1064         return ret;
1065 }
1066 
1067 /*
1068  * Mark a snapshot as deleted, for future cleanup:
1069  */
1070 int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
1071 {
1072         struct btree_iter iter;
1073         struct bkey_i_snapshot *s;
1074         int ret = 0;
1075 
1076         s = bch2_bkey_get_mut_typed(trans, &iter,
1077                                     BTREE_ID_snapshots, POS(0, id),
1078                                     0, snapshot);
1079         ret = PTR_ERR_OR_ZERO(s);
1080         if (unlikely(ret)) {
1081                 bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
1082                                         trans->c, "missing snapshot %u", id);
1083                 return ret;
1084         }
1085 
1086         /* already deleted? */
1087         if (BCH_SNAPSHOT_DELETED(&s->v))
1088                 goto err;
1089 
1090         SET_BCH_SNAPSHOT_DELETED(&s->v, true);
1091         SET_BCH_SNAPSHOT_SUBVOL(&s->v, false);
1092         s->v.subvol = 0;
1093 err:
1094         bch2_trans_iter_exit(trans, &iter);
1095         return ret;
1096 }
1097 
1098 static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s)
1099 {
1100         if (le32_to_cpu(s->children[0]) < le32_to_cpu(s->children[1]))
1101                 swap(s->children[0], s->children[1]);
1102 }
1103 
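     /*
      * Physically remove a snapshot node (which must have at most one child)
      * from the tree: splice it out by pointing the parent's child slot at our
      * child and the child's parent pointer at our parent. If we were the
      * root, point the snapshot_tree entry at the new root, or delete it if
      * the tree is now empty.
      */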
1104 static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
1105 {
1106         struct bch_fs *c = trans->c;
1107         struct btree_iter iter, p_iter = (struct btree_iter) { NULL };
1108         struct btree_iter c_iter = (struct btree_iter) { NULL };
1109         struct btree_iter tree_iter = (struct btree_iter) { NULL };
1110         struct bkey_s_c_snapshot s;
1111         u32 parent_id, child_id;
1112         unsigned i;
1113         int ret = 0;
1114 
1115         s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id),
1116                                      BTREE_ITER_intent, snapshot);
1117         ret = bkey_err(s);
1118         bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
1119                                 "missing snapshot %u", id);
1120 
1121         if (ret)
1122                 goto err;
1123 
1124         BUG_ON(s.v->children[1]);
1125 
1126         parent_id = le32_to_cpu(s.v->parent);
1127         child_id = le32_to_cpu(s.v->children[0]);
1128 
1129         if (parent_id) {
1130                 struct bkey_i_snapshot *parent;
1131 
1132                 parent = bch2_bkey_get_mut_typed(trans, &p_iter,
1133                                      BTREE_ID_snapshots, POS(0, parent_id),
1134                                      0, snapshot);
1135                 ret = PTR_ERR_OR_ZERO(parent);
1136                 bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
1137                                         "missing snapshot %u", parent_id);
1138                 if (unlikely(ret))
1139                         goto err;
1140 
1141                 /* find entry in parent->children for node being deleted */
1142                 for (i = 0; i < 2; i++)
1143                         if (le32_to_cpu(parent->v.children[i]) == id)
1144                                 break;
1145 
1146                 if (bch2_fs_inconsistent_on(i == 2, c,
1147                                         "snapshot %u missing child pointer to %u",
1148                                         parent_id, id))
1149                         goto err;
1150 
1151                 parent->v.children[i] = cpu_to_le32(child_id);
1152 
1153                 normalize_snapshot_child_pointers(&parent->v);
1154         }
1155 
1156         if (child_id) {
1157                 struct bkey_i_snapshot *child;
1158 
1159                 child = bch2_bkey_get_mut_typed(trans, &c_iter,
1160                                      BTREE_ID_snapshots, POS(0, child_id),
1161                                      0, snapshot);
1162                 ret = PTR_ERR_OR_ZERO(child);
1163                 bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
1164                                         "missing snapshot %u", child_id);
1165                 if (unlikely(ret))
1166                         goto err;
1167 
1168                 child->v.parent = cpu_to_le32(parent_id);
1169 
1170                 if (!child->v.parent) {
1171                         child->v.skip[0] = 0;
1172                         child->v.skip[1] = 0;
1173                         child->v.skip[2] = 0;
1174                 }
1175         }
1176 
1177         if (!parent_id) {
1178                 /*
1179                  * We're deleting the root of a snapshot tree: update the
1180                  * snapshot_tree entry to point to the new root, or delete it if
1181                  * this is the last snapshot ID in this tree:
1182                  */
1183                 struct bkey_i_snapshot_tree *s_t;
1184 
1185                 BUG_ON(s.v->children[1]);
1186 
1187                 s_t = bch2_bkey_get_mut_typed(trans, &tree_iter,
1188                                 BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)),
1189                                 0, snapshot_tree);
1190                 ret = PTR_ERR_OR_ZERO(s_t);
1191                 if (ret)
1192                         goto err;
1193 
1194                 if (s.v->children[0]) {
1195                         s_t->v.root_snapshot = s.v->children[0];
1196                 } else {
1197                         s_t->k.type = KEY_TYPE_deleted;
1198                         set_bkey_val_u64s(&s_t->k, 0);
1199                 }
1200         }
1201 
1202         ret = bch2_btree_delete_at(trans, &iter, 0);
1203 err:
1204         bch2_trans_iter_exit(trans, &tree_iter);
1205         bch2_trans_iter_exit(trans, &p_iter);
1206         bch2_trans_iter_exit(trans, &c_iter);
1207         bch2_trans_iter_exit(trans, &iter);
1208         return ret;
1209 }
1210 
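     /*
      * Allocate @nr_snapids new snapshot IDs from the empty slots just below
      * the lowest existing snapshot key - IDs are allocated downwards, so a
      * parent always has a higher ID than its children (cf.
      * bch2_snapshot_validate()) - and initialize each new node's parent,
      * subvol, tree, depth and skip list fields, updating the in-memory table
      * as we go.
      */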
1211 static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
1212                           u32 *new_snapids,
1213                           u32 *snapshot_subvols,
1214                           unsigned nr_snapids)
1215 {
1216         struct bch_fs *c = trans->c;
1217         struct btree_iter iter;
1218         struct bkey_i_snapshot *n;
1219         struct bkey_s_c k;
1220         unsigned i, j;
1221         u32 depth = bch2_snapshot_depth(c, parent);
1222         int ret;
1223 
1224         bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
1225                              POS_MIN, BTREE_ITER_intent);
1226         k = bch2_btree_iter_peek(&iter);
1227         ret = bkey_err(k);
1228         if (ret)
1229                 goto err;
1230 
1231         for (i = 0; i < nr_snapids; i++) {
1232                 k = bch2_btree_iter_prev_slot(&iter);
1233                 ret = bkey_err(k);
1234                 if (ret)
1235                         goto err;
1236 
1237                 if (!k.k || !k.k->p.offset) {
1238                         ret = -BCH_ERR_ENOSPC_snapshot_create;
1239                         goto err;
1240                 }
1241 
1242                 n = bch2_bkey_alloc(trans, &iter, 0, snapshot);
1243                 ret = PTR_ERR_OR_ZERO(n);
1244                 if (ret)
1245                         goto err;
1246 
1247                 n->v.flags      = 0;
1248                 n->v.parent     = cpu_to_le32(parent);
1249                 n->v.subvol     = cpu_to_le32(snapshot_subvols[i]);
1250                 n->v.tree       = cpu_to_le32(tree);
1251                 n->v.depth      = cpu_to_le32(depth);
1252                 n->v.btime.lo   = cpu_to_le64(bch2_current_time(c));
1253                 n->v.btime.hi   = 0;
1254 
1255                 for (j = 0; j < ARRAY_SIZE(n->v.skip); j++)
1256                         n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent));
1257 
1258                 bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32);
1259                 SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);
1260 
1261                 ret = __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
1262                                          bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0);
1263                 if (ret)
1264                         goto err;
1265 
1266                 new_snapids[i]  = iter.pos.offset;
1267 
1268                 mutex_lock(&c->snapshot_table_lock);
1269                 snapshot_t_mut(c, new_snapids[i])->equiv = new_snapids[i];
1270                 mutex_unlock(&c->snapshot_table_lock);
1271         }
1272 err:
1273         bch2_trans_iter_exit(trans, &iter);
1274         return ret;
1275 }
1276 
1277 /*
1278  * Create new snapshot IDs as children of an existing snapshot ID:
1279  */
1280 static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent,
1281                               u32 *new_snapids,
1282                               u32 *snapshot_subvols,
1283                               unsigned nr_snapids)
1284 {
1285         struct btree_iter iter;
1286         struct bkey_i_snapshot *n_parent;
1287         int ret = 0;
1288 
1289         n_parent = bch2_bkey_get_mut_typed(trans, &iter,
1290                         BTREE_ID_snapshots, POS(0, parent),
1291                         0, snapshot);
1292         ret = PTR_ERR_OR_ZERO(n_parent);
1293         if (unlikely(ret)) {
1294                 if (bch2_err_matches(ret, ENOENT))
1295                         bch_err(trans->c, "snapshot %u not found", parent);
1296                 return ret;
1297         }
1298 
1299         if (n_parent->v.children[0] || n_parent->v.children[1]) {
1300                 bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children");
1301                 ret = -EINVAL;
1302                 goto err;
1303         }
1304 
1305         ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree),
1306                              new_snapids, snapshot_subvols, nr_snapids);
1307         if (ret)
1308                 goto err;
1309 
1310         n_parent->v.children[0] = cpu_to_le32(new_snapids[0]);
1311         n_parent->v.children[1] = cpu_to_le32(new_snapids[1]);
1312         n_parent->v.subvol = 0;
1313         SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false);
1314 err:
1315         bch2_trans_iter_exit(trans, &iter);
1316         return ret;
1317 }
1318 
1319 /*
1320  * Create a snapshot node that is the root of a new tree:
1321  */
1322 static int bch2_snapshot_node_create_tree(struct btree_trans *trans,
1323                               u32 *new_snapids,
1324                               u32 *snapshot_subvols,
1325                               unsigned nr_snapids)
1326 {
1327         struct bkey_i_snapshot_tree *n_tree;
1328         int ret;
1329 
1330         n_tree = __bch2_snapshot_tree_create(trans);
1331         ret =   PTR_ERR_OR_ZERO(n_tree) ?:
1332                 create_snapids(trans, 0, n_tree->k.p.offset,
1333                              new_snapids, snapshot_subvols, nr_snapids);
1334         if (ret)
1335                 return ret;
1336 
1337         n_tree->v.master_subvol = cpu_to_le32(snapshot_subvols[0]);
1338         n_tree->v.root_snapshot = cpu_to_le32(new_snapids[0]);
1339         return 0;
1340 }
1341 
1342 int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
1343                               u32 *new_snapids,
1344                               u32 *snapshot_subvols,
1345                               unsigned nr_snapids)
1346 {
1347         BUG_ON((parent == 0) != (nr_snapids == 1));
1348         BUG_ON((parent != 0) != (nr_snapids == 2));
1349 
1350         return parent
1351                 ? bch2_snapshot_node_create_children(trans, parent,
1352                                 new_snapids, snapshot_subvols, nr_snapids)
1353                 : bch2_snapshot_node_create_tree(trans,
1354                                 new_snapids, snapshot_subvols, nr_snapids);
1355 
1356 }
1357 
1358 /*
1359  * If we have an unlinked inode in an internal snapshot node, and the inode
1360  * really has been deleted in all child snapshots, how does this get cleaned up?
1361  *
1362  * First, there is the problem of how keys that have been overwritten in all
1363  * child snapshots get deleted (unimplemented?), but inodes may perhaps be
1364  * special?
1365  *
1366  * Also: an unlinked inode in an internal snapshot appears to not be getting
1367  * deleted correctly if the inode doesn't exist in leaf snapshots.
1368  *
1369  * Solution:
1370  *
1371  * For a key in an interior snapshot node that needs work requiring it to be
1372  * mutated: iterate over all descendant leaf nodes and copy that key to the
1373  * leaf snapshot nodes, where we can mutate it.
1374  */
1375 
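     /*
      * Process one key during dead snapshot deletion: drop it if its snapshot
      * is on the @deleted list, or if we already kept a key from the same
      * equivalence class at this position; otherwise, if it lives in an
      * interior node of a collapsing chain, rewrite it into the equivalence
      * class's leaf-most snapshot ID.
      */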
1376 static int delete_dead_snapshots_process_key(struct btree_trans *trans,
1377                                struct btree_iter *iter,
1378                                struct bkey_s_c k,
1379                                snapshot_id_list *deleted,
1380                                snapshot_id_list *equiv_seen,
1381                                struct bpos *last_pos)
1382 {
1383         int ret = bch2_check_key_has_snapshot(trans, iter, k);
1384         if (ret)
1385                 return ret < 0 ? ret : 0;
1386 
1387         struct bch_fs *c = trans->c;
1388         u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
1389         if (!equiv) /* key for invalid snapshot node, but we chose not to delete */
1390                 return 0;
1391 
1392         if (!bkey_eq(k.k->p, *last_pos))
1393                 equiv_seen->nr = 0;
1394 
1395         if (snapshot_list_has_id(deleted, k.k->p.snapshot))
1396                 return bch2_btree_delete_at(trans, iter,
1397                                             BTREE_UPDATE_internal_snapshot_node);
1398 
1399         if (!bpos_eq(*last_pos, k.k->p) &&
1400             snapshot_list_has_id(equiv_seen, equiv))
1401                 return bch2_btree_delete_at(trans, iter,
1402                                             BTREE_UPDATE_internal_snapshot_node);
1403 
1404         *last_pos = k.k->p;
1405 
1406         ret = snapshot_list_add_nodup(c, equiv_seen, equiv);
1407         if (ret)
1408                 return ret;
1409 
1410         /*
1411          * When we have a linear chain of snapshot nodes, we consider
1412          * those to form an equivalence class: we're going to collapse
1413          * them all down to a single node, and keep the leaf-most node -
1414          * which has the same id as the equivalence class id.
1415          *
1416          * If there are multiple keys in different snapshots at the same
1417          * position, we only keep the one in the newest snapshot - the
1418          * others were deleted above, having been overwritten and made
1419          * redundant - and the key we keep may still need to be moved to
1420          * the equivalence class ID if it isn't there already (a toy
1421          * model of these decisions follows this function).
1422          */
1423         if (equiv != k.k->p.snapshot) {
1424                 struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
1425                 int ret = PTR_ERR_OR_ZERO(new);
1426                 if (ret)
1427                         return ret;
1428 
1429                 new->k.p.snapshot = equiv;
1430 
1431                 struct btree_iter new_iter;
1432                 bch2_trans_iter_init(trans, &new_iter, iter->btree_id, new->k.p,
1433                                      BTREE_ITER_all_snapshots|
1434                                      BTREE_ITER_cached|
1435                                      BTREE_ITER_intent);
1436 
1437                 ret =   bch2_btree_iter_traverse(&new_iter) ?:
1438                         bch2_trans_update(trans, &new_iter, new,
1439                                         BTREE_UPDATE_internal_snapshot_node) ?:
1440                         bch2_btree_delete_at(trans, iter,
1441                                         BTREE_UPDATE_internal_snapshot_node);
1442                 bch2_trans_iter_exit(trans, &new_iter);
1443                 if (ret)
1444                         return ret;
1445         }
1446 
1447         return 0;
1448 }
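
/*
 * Illustrative userspace sketch (not part of this file): a toy model of the
 * keep/move/delete decisions made by delete_dead_snapshots_process_key()
 * above.  Snapshot chain 8 -> 4 -> 2 (2 is the leaf) forms one equivalence
 * class with id 2; keys at a given position are visited in increasing
 * snapshot id order, i.e. newest first.  The real code tracks a list of seen
 * equivalence classes per position and also deletes keys in explicitly
 * deleted snapshots; this toy collapses that to a single class.  All names
 * here are hypothetical.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_key { unsigned pos, snapshot; };

/* equivalence class id (leaf-most node) for each snapshot in the toy chain */
static unsigned toy_equiv(unsigned snap)
{
	return snap == 8 || snap == 4 || snap == 2 ? 2 : snap;
}

int main(void)
{
	/* btree order: ascending position, then ascending snapshot id */
	static const struct toy_key keys[] = {
		{ 100, 2 }, { 100, 4 }, { 100, 8 },
		{ 200, 4 }, { 200, 8 },
	};
	unsigned last_pos = 0;
	bool seen_this_pos = false;

	for (unsigned i = 0; i < sizeof(keys) / sizeof(keys[0]); i++) {
		const struct toy_key *k = &keys[i];
		unsigned equiv = toy_equiv(k->snapshot);

		if (k->pos != last_pos) {
			last_pos = k->pos;
			seen_this_pos = false;
		}

		if (seen_this_pos) {
			/* an older, overwritten version within the class */
			printf("pos %u snap %u: redundant, delete\n",
			       k->pos, k->snapshot);
			continue;
		}
		seen_this_pos = true;

		if (equiv != k->snapshot)
			printf("pos %u snap %u: keep, move to equiv snapshot %u\n",
			       k->pos, k->snapshot, equiv);
		else
			printf("pos %u snap %u: keep in place\n",
			       k->pos, k->snapshot);
	}
	return 0;
}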
1449 
1450 static int bch2_snapshot_needs_delete(struct btree_trans *trans, struct bkey_s_c k)
1451 {
1452         struct bkey_s_c_snapshot snap;
1453         u32 children[2];
1454         int ret;
1455 
1456         if (k.k->type != KEY_TYPE_snapshot)
1457                 return 0;
1458 
1459         snap = bkey_s_c_to_snapshot(k);
1460         if (BCH_SNAPSHOT_DELETED(snap.v) ||
1461             BCH_SNAPSHOT_SUBVOL(snap.v))
1462                 return 0;
1463 
1464         children[0] = le32_to_cpu(snap.v->children[0]);
1465         children[1] = le32_to_cpu(snap.v->children[1]);
1466 
1467         ret   = bch2_snapshot_live(trans, children[0]) ?:
1468                 bch2_snapshot_live(trans, children[1]);
1469         if (ret < 0)
1470                 return ret;
1471         return !ret;
1472 }
1473 
1474 /*
1475  * If a given snapshot doesn't have a subvolume pointing to it, and it
1476  * doesn't have any live child snapshot nodes, it's now redundant and we
1477  * can mark it as deleted.
1478  */
1479 static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct bkey_s_c k)
1480 {
1481         int ret = bch2_snapshot_needs_delete(trans, k);
1482 
1483         return ret <= 0
1484                 ? ret
1485                 : bch2_snapshot_node_set_deleted(trans, k.k->p.offset);
1486 }
1487 
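/*
 * Walk up the snapshot tree from @id: first skip past any node that is in
 * @skip, then take @n more parent steps, again not counting ancestors that
 * are in @skip.  Used below to repoint skiplist entries that reference
 * soon-to-be-deleted interior nodes.
 */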
1488 static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n,
1489                                                 snapshot_id_list *skip)
1490 {
1491         rcu_read_lock();
1492         while (snapshot_list_has_id(skip, id))
1493                 id = __bch2_snapshot_parent(c, id);
1494 
1495         while (n--) {
1496                 do {
1497                         id = __bch2_snapshot_parent(c, id);
1498                 } while (snapshot_list_has_id(skip, id));
1499         }
1500         rcu_read_unlock();
1501 
1502         return id;
1503 }
1504 
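/*
 * @deleted here is the list of interior nodes about to be removed: for each
 * surviving node, reduce its depth by the number of deleted ancestors, and
 * repoint any skiplist entries that reference a deleted node at a surviving
 * ancestor instead.
 */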
1505 static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
1506                                               struct btree_iter *iter, struct bkey_s_c k,
1507                                               snapshot_id_list *deleted)
1508 {
1509         struct bch_fs *c = trans->c;
1510         u32 nr_deleted_ancestors = 0;
1511         struct bkey_i_snapshot *s;
1512         int ret;
1513 
1514         if (k.k->type != KEY_TYPE_snapshot)
1515                 return 0;
1516 
1517         if (snapshot_list_has_id(deleted, k.k->p.offset))
1518                 return 0;
1519 
1520         s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot);
1521         ret = PTR_ERR_OR_ZERO(s);
1522         if (ret)
1523                 return ret;
1524 
1525         darray_for_each(*deleted, i)
1526                 nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, *i);
1527 
1528         if (!nr_deleted_ancestors)
1529                 return 0;
1530 
1531         le32_add_cpu(&s->v.depth, -nr_deleted_ancestors);
1532 
1533         if (!s->v.depth) {
1534                 s->v.skip[0] = 0;
1535                 s->v.skip[1] = 0;
1536                 s->v.skip[2] = 0;
1537         } else {
1538                 u32 depth = le32_to_cpu(s->v.depth);
1539                 u32 parent = bch2_snapshot_parent(c, s->k.p.offset);
1540 
1541                 for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) {
1542                         u32 id = le32_to_cpu(s->v.skip[j]);
1543 
1544                         if (snapshot_list_has_id(deleted, id)) {
1545                                 id = bch2_snapshot_nth_parent_skip(c,
1546                                                         parent,
1547                                                         depth > 1
1548                                                         ? get_random_u32_below(depth - 1)
1549                                                         : 0,
1550                                                         deleted);
1551                                 s->v.skip[j] = cpu_to_le32(id);
1552                         }
1553                 }
1554 
1555                 bubble_sort(s->v.skip, ARRAY_SIZE(s->v.skip), cmp_le32);
1556         }
1557 
1558         return bch2_trans_update(trans, iter, &s->k_i, 0);
1559 }
1560 
1561 int bch2_delete_dead_snapshots(struct bch_fs *c)
1562 {
1563         struct btree_trans *trans;
1564         snapshot_id_list deleted = { 0 };
1565         snapshot_id_list deleted_interior = { 0 };
1566         int ret = 0;
1567 
1568         if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags))
1569                 return 0;
1570 
1571         trans = bch2_trans_get(c);
1572 
1573         /*
1574          * For every snapshot node: If we have no live children and it's not
1575          * pointed to by a subvolume, delete it:
1576          */
1577         ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots,
1578                         POS_MIN, 0, k,
1579                         NULL, NULL, 0,
1580                 bch2_delete_redundant_snapshot(trans, k));
1581         bch_err_msg(c, ret, "deleting redundant snapshots");
1582         if (ret)
1583                 goto err;
1584 
1585         ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots,
1586                                  POS_MIN, 0, k,
1587                 bch2_snapshot_set_equiv(trans, k));
1588         bch_err_msg(c, ret, "in bch2_snapshot_set_equiv");
1589         if (ret)
1590                 goto err;
1591 
1592         ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots,
1593                                  POS_MIN, 0, k, ({
1594                 if (k.k->type != KEY_TYPE_snapshot)
1595                         continue;
1596 
1597                 BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v)
1598                         ? snapshot_list_add(c, &deleted, k.k->p.offset)
1599                         : 0;
1600         }));
1601         bch_err_msg(c, ret, "walking snapshots");
1602         if (ret)
1603                 goto err;
1604 
1605         for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
1606                 struct bpos last_pos = POS_MIN;
1607                 snapshot_id_list equiv_seen = { 0 };
1608                 struct disk_reservation res = { 0 };
1609 
1610                 if (!btree_type_has_snapshots(btree))
1611                         continue;
1612 
1613                 ret = for_each_btree_key_commit(trans, iter,
1614                                 btree, POS_MIN,
1615                                 BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
1616                                 &res, NULL, BCH_TRANS_COMMIT_no_enospc,
1617                         delete_dead_snapshots_process_key(trans, &iter, k, &deleted,
1618                                                           &equiv_seen, &last_pos));
1619 
1620                 bch2_disk_reservation_put(c, &res);
1621                 darray_exit(&equiv_seen);
1622 
1623                 bch_err_msg(c, ret, "deleting keys from dying snapshots");
1624                 if (ret)
1625                         goto err;
1626         }
1627 
1628         bch2_trans_unlock(trans);
1629         down_write(&c->snapshot_create_lock);
1630 
1631         ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots,
1632                                  POS_MIN, 0, k, ({
1633                 u32 snapshot = k.k->p.offset;
1634                 u32 equiv = bch2_snapshot_equiv(c, snapshot);
1635 
1636                 equiv != snapshot
1637                         ? snapshot_list_add(c, &deleted_interior, snapshot)
1638                         : 0;
1639         }));
1640 
1641         bch_err_msg(c, ret, "walking snapshots");
1642         if (ret)
1643                 goto err_create_lock;
1644 
1645         /*
1646          * Fixing children of deleted snapshots can't be done completely
1647          * atomically; if we crash between here and when we delete the
1648          * interior nodes, some depth fields will be off:
1649          */
1650         ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN,
1651                                   BTREE_ITER_intent, k,
1652                                   NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
1653                 bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &deleted_interior));
1654         if (ret)
1655                 goto err_create_lock;
1656 
1657         darray_for_each(deleted, i) {
1658                 ret = commit_do(trans, NULL, NULL, 0,
1659                         bch2_snapshot_node_delete(trans, *i));
1660                 bch_err_msg(c, ret, "deleting snapshot %u", *i);
1661                 if (ret)
1662                         goto err_create_lock;
1663         }
1664 
1665         darray_for_each(deleted_interior, i) {
1666                 ret = commit_do(trans, NULL, NULL, 0,
1667                         bch2_snapshot_node_delete(trans, *i));
1668                 bch_err_msg(c, ret, "deleting snapshot %u", *i);
1669                 if (ret)
1670                         goto err_create_lock;
1671         }
1672 err_create_lock:
1673         up_write(&c->snapshot_create_lock);
1674 err:
1675         darray_exit(&deleted_interior);
1676         darray_exit(&deleted);
1677         bch2_trans_put(trans);
1678         bch_err_fn(c, ret);
1679         return ret;
1680 }
1681 
1682 void bch2_delete_dead_snapshots_work(struct work_struct *work)
1683 {
1684         struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
1685 
1686         set_worker_desc("bcachefs-delete-dead-snapshots/%s", c->name);
1687 
1688         bch2_delete_dead_snapshots(c);
1689         bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
1690 }
1691 
1692 void bch2_delete_dead_snapshots_async(struct bch_fs *c)
1693 {
1694         if (bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) &&
1695             !queue_work(c->write_ref_wq, &c->snapshot_delete_work))
1696                 bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
1697 }
1698 
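/*
 * Returns 1 if a key exists at the same position as @pos in a snapshot that
 * is a descendant of @pos.snapshot - i.e. the key at @pos has been
 * overwritten in at least one child snapshot - 0 if not, or a negative
 * error code.
 */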
1699 int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
1700                                        enum btree_id id,
1701                                        struct bpos pos)
1702 {
1703         struct bch_fs *c = trans->c;
1704         struct btree_iter iter;
1705         struct bkey_s_c k;
1706         int ret;
1707 
1708         bch2_trans_iter_init(trans, &iter, id, pos,
1709                              BTREE_ITER_not_extents|
1710                              BTREE_ITER_all_snapshots);
1711         while (1) {
1712                 k = bch2_btree_iter_prev(&iter);
1713                 ret = bkey_err(k);
1714                 if (ret)
1715                         break;
1716 
1717                 if (!k.k)
1718                         break;
1719 
1720                 if (!bkey_eq(pos, k.k->p))
1721                         break;
1722 
1723                 if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) {
1724                         ret = 1;
1725                         break;
1726                 }
1727         }
1728         bch2_trans_iter_exit(trans, &iter);
1729 
1730         return ret;
1731 }
1732 
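/*
 * A node's descendants always have smaller ids than the node itself, and
 * children are stored so that children[1], when present, is the smaller
 * child; the leaf-most descendant is therefore found by repeatedly taking
 * the smallest child.
 */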
1733 static u32 bch2_snapshot_smallest_child(struct bch_fs *c, u32 id)
1734 {
1735         const struct snapshot_t *s = snapshot_t(c, id);
1736 
1737         return s->children[1] ?: s->children[0];
1738 }
1739 
1740 static u32 bch2_snapshot_smallest_descendent(struct bch_fs *c, u32 id)
1741 {
1742         u32 child;
1743 
1744         while ((child = bch2_snapshot_smallest_child(c, id)))
1745                 id = child;
1746         return id;
1747 }
1748 
1749 static int bch2_propagate_key_to_snapshot_leaf(struct btree_trans *trans,
1750                                                enum btree_id btree,
1751                                                struct bkey_s_c interior_k,
1752                                                u32 leaf_id, struct bpos *new_min_pos)
1753 {
1754         struct btree_iter iter;
1755         struct bpos pos = interior_k.k->p;
1756         struct bkey_s_c k;
1757         struct bkey_i *new;
1758         int ret;
1759 
1760         pos.snapshot = leaf_id;
1761 
1762         bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent);
1763         k = bch2_btree_iter_peek_slot(&iter);
1764         ret = bkey_err(k);
1765         if (ret)
1766                 goto out;
1767 
1768         /* key already overwritten in this snapshot? */
1769         if (k.k->p.snapshot != interior_k.k->p.snapshot)
1770                 goto out;
1771 
1772         if (bpos_eq(*new_min_pos, POS_MIN)) {
1773                 *new_min_pos = k.k->p;
1774                 new_min_pos->snapshot = leaf_id;
1775         }
1776 
1777         new = bch2_bkey_make_mut_noupdate(trans, interior_k);
1778         ret = PTR_ERR_OR_ZERO(new);
1779         if (ret)
1780                 goto out;
1781 
1782         new->k.p.snapshot = leaf_id;
1783         ret = bch2_trans_update(trans, &iter, new, 0);
1784 out:
1785         bch2_trans_iter_exit(trans, &iter);
1786         return ret;
1787 }
1788 
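/*
 * Copy @k to every leaf snapshot that is a descendant of k.k->p.snapshot, so
 * that the per-leaf copies can then be mutated independently of the interior
 * node's key (see the comment above delete_dead_snapshots_process_key()).
 * *new_min_pos is set to the position of the first (lowest) copy created, or
 * left at POS_MIN if no copies were needed.
 */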
1789 int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *trans,
1790                                           enum btree_id btree,
1791                                           struct bkey_s_c k,
1792                                           struct bpos *new_min_pos)
1793 {
1794         struct bch_fs *c = trans->c;
1795         struct bkey_buf sk;
1796         u32 restart_count = trans->restart_count;
1797         int ret = 0;
1798 
1799         bch2_bkey_buf_init(&sk);
1800         bch2_bkey_buf_reassemble(&sk, c, k);
1801         k = bkey_i_to_s_c(sk.k);
1802 
1803         *new_min_pos = POS_MIN;
1804 
1805         for (u32 id = bch2_snapshot_smallest_descendent(c, k.k->p.snapshot);
1806              id < k.k->p.snapshot;
1807              id++) {
1808                 if (!bch2_snapshot_is_ancestor(c, id, k.k->p.snapshot) ||
1809                     !bch2_snapshot_is_leaf(c, id))
1810                         continue;
1811 again:
1812                 ret =   btree_trans_too_many_iters(trans) ?:
1813                         bch2_propagate_key_to_snapshot_leaf(trans, btree, k, id, new_min_pos) ?:
1814                         bch2_trans_commit(trans, NULL, NULL, 0);
1815                 if (ret && bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
1816                         bch2_trans_begin(trans);
1817                         goto again;
1818                 }
1819 
1820                 if (ret)
1821                         break;
1822         }
1823 
1824         bch2_bkey_buf_exit(&sk, c);
1825 
1826         return ret ?: trans_was_restarted(trans, restart_count);
1827 }
1828 
1829 static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k)
1830 {
1831         struct bch_fs *c = trans->c;
1832         struct bkey_s_c_snapshot snap;
1833         int ret = 0;
1834 
1835         if (k.k->type != KEY_TYPE_snapshot)
1836                 return 0;
1837 
1838         snap = bkey_s_c_to_snapshot(k);
1839         if (BCH_SNAPSHOT_DELETED(snap.v) ||
1840             bch2_snapshot_equiv(c, k.k->p.offset) != k.k->p.offset ||
1841             (ret = bch2_snapshot_needs_delete(trans, k)) > 0) {
1842                 set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
1843                 return 0;
1844         }
1845 
1846         return ret;
1847 }
1848 
1849 int bch2_snapshots_read(struct bch_fs *c)
1850 {
1851         int ret = bch2_trans_run(c,
1852                 for_each_btree_key(trans, iter, BTREE_ID_snapshots,
1853                                    POS_MIN, 0, k,
1854                         __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
1855                         bch2_snapshot_set_equiv(trans, k) ?:
1856                         bch2_check_snapshot_needs_deletion(trans, k)) ?:
1857                 for_each_btree_key(trans, iter, BTREE_ID_snapshots,
1858                                    POS_MIN, 0, k,
1859                            (set_is_ancestor_bitmap(c, k.k->p.offset), 0)));
1860         bch_err_fn(c, ret);
1861 
1862         /*
1863          * It's important that we check whether we need to reconstruct snapshots
1864          * before going RW, so we mark that pass as required in the superblock -
1865          * otherwise, we could end up deleting keys whose snapshot nodes are
1866          * missing.
1867          */
1868         BUG_ON(!test_bit(BCH_FS_new_fs, &c->flags) &&
1869                test_bit(BCH_FS_may_go_rw, &c->flags));
1870 
1871         if (bch2_err_matches(ret, EIO) ||
1872             (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots)))
1873                 ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_reconstruct_snapshots);
1874 
1875         return ret;
1876 }
1877 
1878 void bch2_fs_snapshots_exit(struct bch_fs *c)
1879 {
1880         kvfree(rcu_dereference_protected(c->snapshots, true));
1881 }
1882 
