~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

TOMOYO Linux Cross Reference
Linux/fs/bcachefs/fsck.c

Version: ~ [ linux-6.11-rc3 ] ~ [ linux-6.10.4 ] ~ [ linux-6.9.12 ] ~ [ linux-6.8.12 ] ~ [ linux-6.7.12 ] ~ [ linux-6.6.45 ] ~ [ linux-6.5.13 ] ~ [ linux-6.4.16 ] ~ [ linux-6.3.13 ] ~ [ linux-6.2.16 ] ~ [ linux-6.1.104 ] ~ [ linux-6.0.19 ] ~ [ linux-5.19.17 ] ~ [ linux-5.18.19 ] ~ [ linux-5.17.15 ] ~ [ linux-5.16.20 ] ~ [ linux-5.15.164 ] ~ [ linux-5.14.21 ] ~ [ linux-5.13.19 ] ~ [ linux-5.12.19 ] ~ [ linux-5.11.22 ] ~ [ linux-5.10.223 ] ~ [ linux-5.9.16 ] ~ [ linux-5.8.18 ] ~ [ linux-5.7.19 ] ~ [ linux-5.6.19 ] ~ [ linux-5.5.19 ] ~ [ linux-5.4.281 ] ~ [ linux-5.3.18 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.319 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.336 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.337 ] ~ [ linux-4.4.302 ] ~ [ linux-3.10.108 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.9 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 // SPDX-License-Identifier: GPL-2.0
  2 
  3 #include "bcachefs.h"
  4 #include "bkey_buf.h"
  5 #include "btree_cache.h"
  6 #include "btree_update.h"
  7 #include "buckets.h"
  8 #include "darray.h"
  9 #include "dirent.h"
 10 #include "error.h"
 11 #include "fs-common.h"
 12 #include "fsck.h"
 13 #include "inode.h"
 14 #include "keylist.h"
 15 #include "recovery_passes.h"
 16 #include "snapshot.h"
 17 #include "super.h"
 18 #include "xattr.h"
 19 
 20 #include <linux/bsearch.h>
 21 #include <linux/dcache.h> /* struct qstr */
 22 
 23 /*
 24  * XXX: this is handling transaction restarts without returning
 25  * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore:
 26  */
 27 static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
 28                                     u32 snapshot)
 29 {
 30         u64 sectors = 0;
 31 
 32         int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
 33                                 SPOS(inum, 0, snapshot),
 34                                 POS(inum, U64_MAX),
 35                                 0, k, ({
 36                 if (bkey_extent_is_allocation(k.k))
 37                         sectors += k.k->size;
 38                 0;
 39         }));
 40 
 41         return ret ?: sectors;
 42 }
 43 
 44 static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
 45                                     u32 snapshot)
 46 {
 47         u64 subdirs = 0;
 48 
 49         int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_dirents,
 50                                     SPOS(inum, 0, snapshot),
 51                                     POS(inum, U64_MAX),
 52                                     0, k, ({
 53                 if (k.k->type == KEY_TYPE_dirent &&
 54                     bkey_s_c_to_dirent(k).v->d_type == DT_DIR)
 55                         subdirs++;
 56                 0;
 57         }));
 58 
 59         return ret ?: subdirs;
 60 }
 61 
 62 static int subvol_lookup(struct btree_trans *trans, u32 subvol,
 63                          u32 *snapshot, u64 *inum)
 64 {
 65         struct bch_subvolume s;
 66         int ret = bch2_subvolume_get(trans, subvol, false, 0, &s);
 67 
 68         *snapshot = le32_to_cpu(s.snapshot);
 69         *inum = le64_to_cpu(s.inode);
 70         return ret;
 71 }
 72 
 73 static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
 74                               struct bch_inode_unpacked *inode)
 75 {
 76         struct btree_iter iter;
 77         struct bkey_s_c k;
 78         int ret;
 79 
 80         for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr),
 81                                      BTREE_ITER_all_snapshots, k, ret) {
 82                 if (k.k->p.offset != inode_nr)
 83                         break;
 84                 if (!bkey_is_inode(k.k))
 85                         continue;
 86                 ret = bch2_inode_unpack(k, inode);
 87                 goto found;
 88         }
 89         ret = -BCH_ERR_ENOENT_inode;
 90 found:
 91         bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr);
 92         bch2_trans_iter_exit(trans, &iter);
 93         return ret;
 94 }
 95 
 96 static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
 97                         struct bch_inode_unpacked *inode,
 98                         u32 *snapshot)
 99 {
100         struct btree_iter iter;
101         struct bkey_s_c k;
102         int ret;
103 
104         k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
105                                SPOS(0, inode_nr, *snapshot), 0);
106         ret = bkey_err(k);
107         if (ret)
108                 goto err;
109 
110         ret = bkey_is_inode(k.k)
111                 ? bch2_inode_unpack(k, inode)
112                 : -BCH_ERR_ENOENT_inode;
113         if (!ret)
114                 *snapshot = iter.pos.snapshot;
115 err:
116         bch2_trans_iter_exit(trans, &iter);
117         return ret;
118 }
119 
120 static int lookup_dirent_in_snapshot(struct btree_trans *trans,
121                            struct bch_hash_info hash_info,
122                            subvol_inum dir, struct qstr *name,
123                            u64 *target, unsigned *type, u32 snapshot)
124 {
125         struct btree_iter iter;
126         struct bkey_s_c k = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc,
127                                                          &hash_info, dir, name, 0, snapshot);
128         int ret = bkey_err(k);
129         if (ret)
130                 return ret;
131 
132         struct bkey_s_c_dirent d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter));
133         *target = le64_to_cpu(d.v->d_inum);
134         *type = d.v->d_type;
135         bch2_trans_iter_exit(trans, &iter);
136         return 0;
137 }
138 
139 static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
140 {
141         struct bch_fs *c = trans->c;
142         struct btree_iter iter;
143         struct bch_inode_unpacked dir_inode;
144         struct bch_hash_info dir_hash_info;
145         int ret;
146 
147         ret = lookup_first_inode(trans, pos.inode, &dir_inode);
148         if (ret)
149                 goto err;
150 
151         dir_hash_info = bch2_hash_info_init(c, &dir_inode);
152 
153         bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent);
154 
155         ret =   bch2_btree_iter_traverse(&iter) ?:
156                 bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
157                                     &dir_hash_info, &iter,
158                                     BTREE_UPDATE_internal_snapshot_node);
159         bch2_trans_iter_exit(trans, &iter);
160 err:
161         bch_err_fn(c, ret);
162         return ret;
163 }
164 
165 /* Get lost+found, create if it doesn't exist: */
166 static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
167                             struct bch_inode_unpacked *lostfound,
168                             u64 reattaching_inum)
169 {
170         struct bch_fs *c = trans->c;
171         struct qstr lostfound_str = QSTR("lost+found");
172         u64 inum = 0;
173         unsigned d_type = 0;
174         int ret;
175 
176         struct bch_snapshot_tree st;
177         ret = bch2_snapshot_tree_lookup(trans,
178                         bch2_snapshot_tree(c, snapshot), &st);
179         if (ret)
180                 return ret;
181 
182         subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) };
183 
184         struct bch_subvolume subvol;
185         ret = bch2_subvolume_get(trans, le32_to_cpu(st.master_subvol),
186                                  false, 0, &subvol);
187         bch_err_msg(c, ret, "looking up root subvol %u for snapshot %u",
188                     le32_to_cpu(st.master_subvol), snapshot);
189         if (ret)
190                 return ret;
191 
192         if (!subvol.inode) {
193                 struct btree_iter iter;
194                 struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter,
195                                 BTREE_ID_subvolumes, POS(0, le32_to_cpu(st.master_subvol)),
196                                 0, subvolume);
197                 ret = PTR_ERR_OR_ZERO(subvol);
198                 if (ret)
199                         return ret;
200 
201                 subvol->v.inode = cpu_to_le64(reattaching_inum);
202                 bch2_trans_iter_exit(trans, &iter);
203         }
204 
205         root_inum.inum = le64_to_cpu(subvol.inode);
206 
207         struct bch_inode_unpacked root_inode;
208         struct bch_hash_info root_hash_info;
209         u32 root_inode_snapshot = snapshot;
210         ret = lookup_inode(trans, root_inum.inum, &root_inode, &root_inode_snapshot);
211         bch_err_msg(c, ret, "looking up root inode %llu for subvol %u",
212                     root_inum.inum, le32_to_cpu(st.master_subvol));
213         if (ret)
214                 return ret;
215 
216         root_hash_info = bch2_hash_info_init(c, &root_inode);
217 
218         ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum,
219                               &lostfound_str, &inum, &d_type, snapshot);
220         if (bch2_err_matches(ret, ENOENT))
221                 goto create_lostfound;
222 
223         bch_err_fn(c, ret);
224         if (ret)
225                 return ret;
226 
227         if (d_type != DT_DIR) {
228                 bch_err(c, "error looking up lost+found: not a directory");
229                 return -BCH_ERR_ENOENT_not_directory;
230         }
231 
232         /*
233          * The bch2_check_dirents pass has already run, dangling dirents
234          * shouldn't exist here:
235          */
236         ret = lookup_inode(trans, inum, lostfound, &snapshot);
237         bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)",
238                     inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot));
239         return ret;
240 
241 create_lostfound:
242         /*
243          * XXX: we could have a nicer log message here  if we had a nice way to
244          * walk backpointers to print a path
245          */
246         bch_notice(c, "creating lost+found in snapshot %u", le32_to_cpu(st.root_snapshot));
247 
248         u64 now = bch2_current_time(c);
249         struct btree_iter lostfound_iter = { NULL };
250         u64 cpu = raw_smp_processor_id();
251 
252         bch2_inode_init_early(c, lostfound);
253         bch2_inode_init_late(lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode);
254         lostfound->bi_dir = root_inode.bi_inum;
255 
256         root_inode.bi_nlink++;
257 
258         ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu);
259         if (ret)
260                 goto err;
261 
262         bch2_btree_iter_set_snapshot(&lostfound_iter, snapshot);
263         ret = bch2_btree_iter_traverse(&lostfound_iter);
264         if (ret)
265                 goto err;
266 
267         ret =   bch2_dirent_create_snapshot(trans,
268                                 0, root_inode.bi_inum, snapshot, &root_hash_info,
269                                 mode_to_type(lostfound->bi_mode),
270                                 &lostfound_str,
271                                 lostfound->bi_inum,
272                                 &lostfound->bi_dir_offset,
273                                 STR_HASH_must_create) ?:
274                 bch2_inode_write_flags(trans, &lostfound_iter, lostfound,
275                                        BTREE_UPDATE_internal_snapshot_node);
276 err:
277         bch_err_msg(c, ret, "creating lost+found");
278         bch2_trans_iter_exit(trans, &lostfound_iter);
279         return ret;
280 }
281 
282 static int reattach_inode(struct btree_trans *trans,
283                           struct bch_inode_unpacked *inode,
284                           u32 inode_snapshot)
285 {
286         struct bch_fs *c = trans->c;
287         struct bch_hash_info dir_hash;
288         struct bch_inode_unpacked lostfound;
289         char name_buf[20];
290         struct qstr name;
291         u64 dir_offset = 0;
292         u32 dirent_snapshot = inode_snapshot;
293         int ret;
294 
295         if (inode->bi_subvol) {
296                 inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL;
297 
298                 u64 root_inum;
299                 ret = subvol_lookup(trans, inode->bi_parent_subvol,
300                                     &dirent_snapshot, &root_inum);
301                 if (ret)
302                         return ret;
303 
304                 snprintf(name_buf, sizeof(name_buf), "subvol-%u", inode->bi_subvol);
305         } else {
306                 snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
307         }
308 
309         ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum);
310         if (ret)
311                 return ret;
312 
313         if (S_ISDIR(inode->bi_mode)) {
314                 lostfound.bi_nlink++;
315 
316                 ret = __bch2_fsck_write_inode(trans, &lostfound, U32_MAX);
317                 if (ret)
318                         return ret;
319         }
320 
321         dir_hash = bch2_hash_info_init(c, &lostfound);
322 
323         name = (struct qstr) QSTR(name_buf);
324 
325         ret = bch2_dirent_create_snapshot(trans,
326                                 inode->bi_parent_subvol, lostfound.bi_inum,
327                                 dirent_snapshot,
328                                 &dir_hash,
329                                 inode_d_type(inode),
330                                 &name,
331                                 inode->bi_subvol ?: inode->bi_inum,
332                                 &dir_offset,
333                                 STR_HASH_must_create);
334         if (ret) {
335                 bch_err_msg(c, ret, "error creating dirent");
336                 return ret;
337         }
338 
339         inode->bi_dir           = lostfound.bi_inum;
340         inode->bi_dir_offset    = dir_offset;
341 
342         return __bch2_fsck_write_inode(trans, inode, inode_snapshot);
343 }
344 
345 static int remove_backpointer(struct btree_trans *trans,
346                               struct bch_inode_unpacked *inode)
347 {
348         struct btree_iter iter;
349         struct bkey_s_c_dirent d;
350         int ret;
351 
352         d = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents,
353                                      POS(inode->bi_dir, inode->bi_dir_offset), 0,
354                                      dirent);
355         ret =   bkey_err(d) ?:
356                 __remove_dirent(trans, d.k->p);
357         bch2_trans_iter_exit(trans, &iter);
358         return ret;
359 }
360 
361 static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume s)
362 {
363         struct bch_fs *c = trans->c;
364 
365         struct bch_inode_unpacked inode;
366         int ret = bch2_inode_find_by_inum_trans(trans,
367                                 (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
368                                 &inode);
369         if (ret)
370                 return ret;
371 
372         ret = remove_backpointer(trans, &inode);
373         bch_err_msg(c, ret, "removing dirent");
374         if (ret)
375                 return ret;
376 
377         ret = reattach_inode(trans, &inode, le32_to_cpu(s.v->snapshot));
378         bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
379         return ret;
380 }
381 
382 static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 subvolid, u64 inum)
383 {
384         struct bch_fs *c = trans->c;
385 
386         if (!bch2_snapshot_is_leaf(c, snapshotid)) {
387                 bch_err(c, "need to reconstruct subvol, but have interior node snapshot");
388                 return -BCH_ERR_fsck_repair_unimplemented;
389         }
390 
391         /*
392          * If inum isn't set, that means we're being called from check_dirents,
393          * not check_inodes - the root of this subvolume doesn't exist or we
394          * would have found it there:
395          */
396         if (!inum) {
397                 struct btree_iter inode_iter = {};
398                 struct bch_inode_unpacked new_inode;
399                 u64 cpu = raw_smp_processor_id();
400 
401                 bch2_inode_init_early(c, &new_inode);
402                 bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL);
403 
404                 new_inode.bi_subvol = subvolid;
405 
406                 int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?:
407                           bch2_btree_iter_traverse(&inode_iter) ?:
408                           bch2_inode_write(trans, &inode_iter, &new_inode);
409                 bch2_trans_iter_exit(trans, &inode_iter);
410                 if (ret)
411                         return ret;
412 
413                 inum = new_inode.bi_inum;
414         }
415 
416         bch_info(c, "reconstructing subvol %u with root inode %llu", subvolid, inum);
417 
418         struct bkey_i_subvolume *new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol));
419         int ret = PTR_ERR_OR_ZERO(new_subvol);
420         if (ret)
421                 return ret;
422 
423         bkey_subvolume_init(&new_subvol->k_i);
424         new_subvol->k.p.offset  = subvolid;
425         new_subvol->v.snapshot  = cpu_to_le32(snapshotid);
426         new_subvol->v.inode     = cpu_to_le64(inum);
427         ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &new_subvol->k_i, 0);
428         if (ret)
429                 return ret;
430 
431         struct btree_iter iter;
432         struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter,
433                         BTREE_ID_snapshots, POS(0, snapshotid),
434                         0, snapshot);
435         ret = PTR_ERR_OR_ZERO(s);
436         bch_err_msg(c, ret, "getting snapshot %u", snapshotid);
437         if (ret)
438                 return ret;
439 
440         u32 snapshot_tree = le32_to_cpu(s->v.tree);
441 
442         s->v.subvol = cpu_to_le32(subvolid);
443         SET_BCH_SNAPSHOT_SUBVOL(&s->v, true);
444         bch2_trans_iter_exit(trans, &iter);
445 
446         struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter,
447                         BTREE_ID_snapshot_trees, POS(0, snapshot_tree),
448                         0, snapshot_tree);
449         ret = PTR_ERR_OR_ZERO(st);
450         bch_err_msg(c, ret, "getting snapshot tree %u", snapshot_tree);
451         if (ret)
452                 return ret;
453 
454         if (!st->v.master_subvol)
455                 st->v.master_subvol = cpu_to_le32(subvolid);
456 
457         bch2_trans_iter_exit(trans, &iter);
458         return 0;
459 }
460 
461 static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 snapshot, u64 inum)
462 {
463         struct bch_fs *c = trans->c;
464         unsigned i_mode = S_IFREG;
465         u64 i_size = 0;
466 
467         switch (btree) {
468         case BTREE_ID_extents: {
469                 struct btree_iter iter = {};
470 
471                 bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0);
472                 struct bkey_s_c k = bch2_btree_iter_peek_prev(&iter);
473                 bch2_trans_iter_exit(trans, &iter);
474                 int ret = bkey_err(k);
475                 if (ret)
476                         return ret;
477 
478                 i_size = k.k->p.offset << 9;
479                 break;
480         }
481         case BTREE_ID_dirents:
482                 i_mode = S_IFDIR;
483                 break;
484         case BTREE_ID_xattrs:
485                 break;
486         default:
487                 BUG();
488         }
489 
490         struct bch_inode_unpacked new_inode;
491         bch2_inode_init_early(c, &new_inode);
492         bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL);
493         new_inode.bi_size = i_size;
494         new_inode.bi_inum = inum;
495 
496         return __bch2_fsck_write_inode(trans, &new_inode, snapshot);
497 }
498 
499 struct snapshots_seen {
500         struct bpos                     pos;
501         snapshot_id_list                ids;
502 };
503 
504 static inline void snapshots_seen_exit(struct snapshots_seen *s)
505 {
506         darray_exit(&s->ids);
507 }
508 
509 static inline void snapshots_seen_init(struct snapshots_seen *s)
510 {
511         memset(s, 0, sizeof(*s));
512 }
513 
514 static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id)
515 {
516         u32 *i;
517         __darray_for_each(s->ids, i) {
518                 if (*i == id)
519                         return 0;
520                 if (*i > id)
521                         break;
522         }
523 
524         int ret = darray_insert_item(&s->ids, i - s->ids.data, id);
525         if (ret)
526                 bch_err(c, "error reallocating snapshots_seen table (size %zu)",
527                         s->ids.size);
528         return ret;
529 }
530 
531 static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
532                                  enum btree_id btree_id, struct bpos pos)
533 {
534         if (!bkey_eq(s->pos, pos))
535                 s->ids.nr = 0;
536         s->pos = pos;
537 
538         return snapshot_list_add_nodup(c, &s->ids, pos.snapshot);
539 }
540 
541 /**
542  * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor,
543  * and @ancestor hasn't been overwritten in @seen
544  *
545  * @c:          filesystem handle
546  * @seen:       list of snapshot ids already seen at current position
547  * @id:         descendent snapshot id
548  * @ancestor:   ancestor snapshot id
549  *
550  * Returns:     whether key in @ancestor snapshot is visible in @id snapshot
551  */
552 static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
553                                     u32 id, u32 ancestor)
554 {
555         ssize_t i;
556 
557         EBUG_ON(id > ancestor);
558 
559         /* @ancestor should be the snapshot most recently added to @seen */
560         EBUG_ON(ancestor != seen->pos.snapshot);
561         EBUG_ON(ancestor != darray_last(seen->ids));
562 
563         if (id == ancestor)
564                 return true;
565 
566         if (!bch2_snapshot_is_ancestor(c, id, ancestor))
567                 return false;
568 
569         /*
570          * We know that @id is a descendant of @ancestor, we're checking if
571          * we've seen a key that overwrote @ancestor - i.e. also a descendent of
572          * @ascestor and with @id as a descendent.
573          *
574          * But we already know that we're scanning IDs between @id and @ancestor
575          * numerically, since snapshot ID lists are kept sorted, so if we find
576          * an id that's an ancestor of @id we're done:
577          */
578 
579         for (i = seen->ids.nr - 2;
580              i >= 0 && seen->ids.data[i] >= id;
581              --i)
582                 if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i]))
583                         return false;
584 
585         return true;
586 }
587 
588 /**
589  * ref_visible - given a key with snapshot id @src that points to a key with
590  * snapshot id @dst, test whether there is some snapshot in which @dst is
591  * visible.
592  *
593  * @c:          filesystem handle
594  * @s:          list of snapshot IDs already seen at @src
595  * @src:        snapshot ID of src key
596  * @dst:        snapshot ID of dst key
597  * Returns:     true if there is some snapshot in which @dst is visible
598  *
599  * Assumes we're visiting @src keys in natural key order
600  */
601 static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s,
602                         u32 src, u32 dst)
603 {
604         return dst <= src
605                 ? key_visible_in_snapshot(c, s, dst, src)
606                 : bch2_snapshot_is_ancestor(c, src, dst);
607 }
608 
609 static int ref_visible2(struct bch_fs *c,
610                         u32 src, struct snapshots_seen *src_seen,
611                         u32 dst, struct snapshots_seen *dst_seen)
612 {
613         if (dst > src) {
614                 swap(dst, src);
615                 swap(dst_seen, src_seen);
616         }
617         return key_visible_in_snapshot(c, src_seen, dst, src);
618 }
619 
620 #define for_each_visible_inode(_c, _s, _w, _snapshot, _i)                               \
621         for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr &&        \
622              (_i)->snapshot <= (_snapshot); _i++)                                       \
623                 if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))
624 
625 struct inode_walker_entry {
626         struct bch_inode_unpacked inode;
627         u32                     snapshot;
628         bool                    seen_this_pos;
629         u64                     count;
630 };
631 
632 struct inode_walker {
633         bool                            first_this_inode;
634         bool                            recalculate_sums;
635         struct bpos                     last_pos;
636 
637         DARRAY(struct inode_walker_entry) inodes;
638 };
639 
640 static void inode_walker_exit(struct inode_walker *w)
641 {
642         darray_exit(&w->inodes);
643 }
644 
645 static struct inode_walker inode_walker_init(void)
646 {
647         return (struct inode_walker) { 0, };
648 }
649 
650 static int add_inode(struct bch_fs *c, struct inode_walker *w,
651                      struct bkey_s_c inode)
652 {
653         struct bch_inode_unpacked u;
654 
655         BUG_ON(bch2_inode_unpack(inode, &u));
656 
657         return darray_push(&w->inodes, ((struct inode_walker_entry) {
658                 .inode          = u,
659                 .snapshot       = inode.k->p.snapshot,
660         }));
661 }
662 
663 static int get_inodes_all_snapshots(struct btree_trans *trans,
664                                     struct inode_walker *w, u64 inum)
665 {
666         struct bch_fs *c = trans->c;
667         struct btree_iter iter;
668         struct bkey_s_c k;
669         int ret;
670 
671         w->recalculate_sums = false;
672         w->inodes.nr = 0;
673 
674         for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
675                                      BTREE_ITER_all_snapshots, k, ret) {
676                 if (k.k->p.offset != inum)
677                         break;
678 
679                 if (bkey_is_inode(k.k))
680                         add_inode(c, w, k);
681         }
682         bch2_trans_iter_exit(trans, &iter);
683 
684         if (ret)
685                 return ret;
686 
687         w->first_this_inode = true;
688         return 0;
689 }
690 
691 static struct inode_walker_entry *
692 lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k)
693 {
694         bool is_whiteout = k.k->type == KEY_TYPE_whiteout;
695 
696         struct inode_walker_entry *i;
697         __darray_for_each(w->inodes, i)
698                 if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->snapshot))
699                         goto found;
700 
701         return NULL;
702 found:
703         BUG_ON(k.k->p.snapshot > i->snapshot);
704 
705         if (k.k->p.snapshot != i->snapshot && !is_whiteout) {
706                 struct inode_walker_entry new = *i;
707 
708                 new.snapshot = k.k->p.snapshot;
709                 new.count = 0;
710 
711                 struct printbuf buf = PRINTBUF;
712                 bch2_bkey_val_to_text(&buf, c, k);
713 
714                 bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
715                          "unexpected because we should always update the inode when we update a key in that inode\n"
716                          "%s",
717                          w->last_pos.inode, k.k->p.snapshot, i->snapshot, buf.buf);
718                 printbuf_exit(&buf);
719 
720                 while (i > w->inodes.data && i[-1].snapshot > k.k->p.snapshot)
721                         --i;
722 
723                 size_t pos = i - w->inodes.data;
724                 int ret = darray_insert_item(&w->inodes, pos, new);
725                 if (ret)
726                         return ERR_PTR(ret);
727 
728                 i = w->inodes.data + pos;
729         }
730 
731         return i;
732 }
733 
734 static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
735                                              struct inode_walker *w,
736                                              struct bkey_s_c k)
737 {
738         if (w->last_pos.inode != k.k->p.inode) {
739                 int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode);
740                 if (ret)
741                         return ERR_PTR(ret);
742         } else if (bkey_cmp(w->last_pos, k.k->p)) {
743                 darray_for_each(w->inodes, i)
744                         i->seen_this_pos = false;
745         }
746 
747         w->last_pos = k.k->p;
748 
749         return lookup_inode_for_snapshot(trans->c, w, k);
750 }
751 
752 static int get_visible_inodes(struct btree_trans *trans,
753                               struct inode_walker *w,
754                               struct snapshots_seen *s,
755                               u64 inum)
756 {
757         struct bch_fs *c = trans->c;
758         struct btree_iter iter;
759         struct bkey_s_c k;
760         int ret;
761 
762         w->inodes.nr = 0;
763 
764         for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
765                            BTREE_ITER_all_snapshots, k, ret) {
766                 if (k.k->p.offset != inum)
767                         break;
768 
769                 if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot))
770                         continue;
771 
772                 if (bkey_is_inode(k.k))
773                         add_inode(c, w, k);
774 
775                 if (k.k->p.snapshot >= s->pos.snapshot)
776                         break;
777         }
778         bch2_trans_iter_exit(trans, &iter);
779 
780         return ret;
781 }
782 
783 static int hash_redo_key(struct btree_trans *trans,
784                          const struct bch_hash_desc desc,
785                          struct bch_hash_info *hash_info,
786                          struct btree_iter *k_iter, struct bkey_s_c k)
787 {
788         struct bkey_i *delete;
789         struct bkey_i *tmp;
790 
791         delete = bch2_trans_kmalloc(trans, sizeof(*delete));
792         if (IS_ERR(delete))
793                 return PTR_ERR(delete);
794 
795         tmp = bch2_bkey_make_mut_noupdate(trans, k);
796         if (IS_ERR(tmp))
797                 return PTR_ERR(tmp);
798 
799         bkey_init(&delete->k);
800         delete->k.p = k_iter->pos;
801         return  bch2_btree_iter_traverse(k_iter) ?:
802                 bch2_trans_update(trans, k_iter, delete, 0) ?:
803                 bch2_hash_set_in_snapshot(trans, desc, hash_info,
804                                        (subvol_inum) { 0, k.k->p.inode },
805                                        k.k->p.snapshot, tmp,
806                                        STR_HASH_must_create|
807                                        BTREE_UPDATE_internal_snapshot_node) ?:
808                 bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
809 }
810 
811 static int hash_check_key(struct btree_trans *trans,
812                           const struct bch_hash_desc desc,
813                           struct bch_hash_info *hash_info,
814                           struct btree_iter *k_iter, struct bkey_s_c hash_k)
815 {
816         struct bch_fs *c = trans->c;
817         struct btree_iter iter = { NULL };
818         struct printbuf buf = PRINTBUF;
819         struct bkey_s_c k;
820         u64 hash;
821         int ret = 0;
822 
823         if (hash_k.k->type != desc.key_type)
824                 return 0;
825 
826         hash = desc.hash_bkey(hash_info, hash_k);
827 
828         if (likely(hash == hash_k.k->p.offset))
829                 return 0;
830 
831         if (hash_k.k->p.offset < hash)
832                 goto bad_hash;
833 
834         for_each_btree_key_norestart(trans, iter, desc.btree_id,
835                                      SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot),
836                                      BTREE_ITER_slots, k, ret) {
837                 if (bkey_eq(k.k->p, hash_k.k->p))
838                         break;
839 
840                 if (fsck_err_on(k.k->type == desc.key_type &&
841                                 !desc.cmp_bkey(k, hash_k),
842                                 trans, hash_table_key_duplicate,
843                                 "duplicate hash table keys:\n%s",
844                                 (printbuf_reset(&buf),
845                                  bch2_bkey_val_to_text(&buf, c, hash_k),
846                                  buf.buf))) {
847                         ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1;
848                         break;
849                 }
850 
851                 if (bkey_deleted(k.k)) {
852                         bch2_trans_iter_exit(trans, &iter);
853                         goto bad_hash;
854                 }
855         }
856 out:
857         bch2_trans_iter_exit(trans, &iter);
858         printbuf_exit(&buf);
859         return ret;
860 bad_hash:
861         if (fsck_err(trans, hash_table_key_wrong_offset,
862                      "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s",
863                      bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash,
864                      (printbuf_reset(&buf),
865                       bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) {
866                 ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k);
867                 bch_err_fn(c, ret);
868                 if (ret)
869                         return ret;
870                 ret = -BCH_ERR_transaction_restart_nested;
871         }
872 fsck_err:
873         goto out;
874 }
875 
876 static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
877                                                 struct btree_iter *iter,
878                                                 struct bpos pos)
879 {
880         return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
881 }
882 
883 static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans,
884                                                struct btree_iter *iter,
885                                                struct bch_inode_unpacked *inode,
886                                                u32 *snapshot)
887 {
888         if (inode->bi_subvol) {
889                 u64 inum;
890                 int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum);
891                 if (ret)
892                         return ((struct bkey_s_c_dirent) { .k = ERR_PTR(ret) });
893         }
894 
895         return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot));
896 }
897 
898 static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
899                                    struct bkey_s_c_dirent d)
900 {
901         return  inode->bi_dir           == d.k->p.inode &&
902                 inode->bi_dir_offset    == d.k->p.offset;
903 }
904 
905 static bool dirent_points_to_inode(struct bkey_s_c_dirent d,
906                                    struct bch_inode_unpacked *inode)
907 {
908         return d.v->d_type == DT_SUBVOL
909                 ? le32_to_cpu(d.v->d_child_subvol)      == inode->bi_subvol
910                 : le64_to_cpu(d.v->d_inum)              == inode->bi_inum;
911 }
912 
913 static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
914 {
915         struct btree_iter iter;
916         struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0);
917         int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set;
918         bch2_trans_iter_exit(trans, &iter);
919         return ret;
920 }
921 
922 static int check_inode_dirent_inode(struct btree_trans *trans, struct bkey_s_c inode_k,
923                                     struct bch_inode_unpacked *inode,
924                                     u32 inode_snapshot, bool *write_inode)
925 {
926         struct bch_fs *c = trans->c;
927         struct printbuf buf = PRINTBUF;
928 
929         struct btree_iter dirent_iter = {};
930         struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot);
931         int ret = bkey_err(d);
932         if (ret && !bch2_err_matches(ret, ENOENT))
933                 return ret;
934 
935         if (fsck_err_on(ret,
936                         trans, inode_points_to_missing_dirent,
937                         "inode points to missing dirent\n%s",
938                         (bch2_bkey_val_to_text(&buf, c, inode_k), buf.buf)) ||
939             fsck_err_on(!ret && !dirent_points_to_inode(d, inode),
940                         trans, inode_points_to_wrong_dirent,
941                         "inode points to dirent that does not point back:\n%s",
942                         (bch2_bkey_val_to_text(&buf, c, inode_k),
943                          prt_newline(&buf),
944                          bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
945                 /*
946                  * We just clear the backpointer fields for now. If we find a
947                  * dirent that points to this inode in check_dirents(), we'll
948                  * update it then; then when we get to check_path() if the
949                  * backpointer is still 0 we'll reattach it.
950                  */
951                 inode->bi_dir = 0;
952                 inode->bi_dir_offset = 0;
953                 inode->bi_flags &= ~BCH_INODE_backptr_untrusted;
954                 *write_inode = true;
955         }
956 
957         ret = 0;
958 fsck_err:
959         bch2_trans_iter_exit(trans, &dirent_iter);
960         printbuf_exit(&buf);
961         bch_err_fn(c, ret);
962         return ret;
963 }
964 
965 static int check_inode(struct btree_trans *trans,
966                        struct btree_iter *iter,
967                        struct bkey_s_c k,
968                        struct bch_inode_unpacked *prev,
969                        struct snapshots_seen *s,
970                        bool full)
971 {
972         struct bch_fs *c = trans->c;
973         struct bch_inode_unpacked u;
974         bool do_update = false;
975         int ret;
976 
977         ret = bch2_check_key_has_snapshot(trans, iter, k);
978         if (ret < 0)
979                 goto err;
980         if (ret)
981                 return 0;
982 
983         ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
984         if (ret)
985                 goto err;
986 
987         if (!bkey_is_inode(k.k))
988                 return 0;
989 
990         BUG_ON(bch2_inode_unpack(k, &u));
991 
992         if (!full &&
993             !(u.bi_flags & (BCH_INODE_i_size_dirty|
994                             BCH_INODE_i_sectors_dirty|
995                             BCH_INODE_unlinked)))
996                 return 0;
997 
998         if (prev->bi_inum != u.bi_inum)
999                 *prev = u;
1000 
1001         if (fsck_err_on(prev->bi_hash_seed      != u.bi_hash_seed ||
1002                         inode_d_type(prev)      != inode_d_type(&u),
1003                         trans, inode_snapshot_mismatch,
1004                         "inodes in different snapshots don't match")) {
1005                 bch_err(c, "repair not implemented yet");
1006                 return -BCH_ERR_fsck_repair_unimplemented;
1007         }
1008 
1009         if ((u.bi_flags & (BCH_INODE_i_size_dirty|BCH_INODE_unlinked)) &&
1010             bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) {
1011                 struct bpos new_min_pos;
1012 
1013                 ret = bch2_propagate_key_to_snapshot_leaves(trans, iter->btree_id, k, &new_min_pos);
1014                 if (ret)
1015                         goto err;
1016 
1017                 u.bi_flags &= ~BCH_INODE_i_size_dirty|BCH_INODE_unlinked;
1018 
1019                 ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
1020 
1021                 bch_err_msg(c, ret, "in fsck updating inode");
1022                 if (ret)
1023                         return ret;
1024 
1025                 if (!bpos_eq(new_min_pos, POS_MIN))
1026                         bch2_btree_iter_set_pos(iter, bpos_predecessor(new_min_pos));
1027                 return 0;
1028         }
1029 
1030         if (u.bi_flags & BCH_INODE_unlinked) {
1031                 ret = check_inode_deleted_list(trans, k.k->p);
1032                 if (ret < 0)
1033                         return ret;
1034 
1035                 fsck_err_on(!ret,
1036                             trans, unlinked_inode_not_on_deleted_list,
1037                             "inode %llu:%u unlinked, but not on deleted list",
1038                             u.bi_inum, k.k->p.snapshot);
1039                 ret = 0;
1040         }
1041 
1042         if (u.bi_flags & BCH_INODE_unlinked &&
1043             (!c->sb.clean ||
1044              fsck_err(trans, inode_unlinked_but_clean,
1045                       "filesystem marked clean, but inode %llu unlinked",
1046                       u.bi_inum))) {
1047                 ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
1048                 bch_err_msg(c, ret, "in fsck deleting inode");
1049                 return ret;
1050         }
1051 
1052         if (u.bi_flags & BCH_INODE_i_size_dirty &&
1053             (!c->sb.clean ||
1054              fsck_err(trans, inode_i_size_dirty_but_clean,
1055                       "filesystem marked clean, but inode %llu has i_size dirty",
1056                       u.bi_inum))) {
1057                 bch_verbose(c, "truncating inode %llu", u.bi_inum);
1058 
1059                 /*
1060                  * XXX: need to truncate partial blocks too here - or ideally
1061                  * just switch units to bytes and that issue goes away
1062                  */
1063                 ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
1064                                 SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9,
1065                                      iter->pos.snapshot),
1066                                 POS(u.bi_inum, U64_MAX),
1067                                 0, NULL);
1068                 bch_err_msg(c, ret, "in fsck truncating inode");
1069                 if (ret)
1070                         return ret;
1071 
1072                 /*
1073                  * We truncated without our normal sector accounting hook, just
1074                  * make sure we recalculate it:
1075                  */
1076                 u.bi_flags |= BCH_INODE_i_sectors_dirty;
1077 
1078                 u.bi_flags &= ~BCH_INODE_i_size_dirty;
1079                 do_update = true;
1080         }
1081 
1082         if (u.bi_flags & BCH_INODE_i_sectors_dirty &&
1083             (!c->sb.clean ||
1084              fsck_err(trans, inode_i_sectors_dirty_but_clean,
1085                       "filesystem marked clean, but inode %llu has i_sectors dirty",
1086                       u.bi_inum))) {
1087                 s64 sectors;
1088 
1089                 bch_verbose(c, "recounting sectors for inode %llu",
1090                             u.bi_inum);
1091 
1092                 sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot);
1093                 if (sectors < 0) {
1094                         bch_err_msg(c, sectors, "in fsck recounting inode sectors");
1095                         return sectors;
1096                 }
1097 
1098                 u.bi_sectors = sectors;
1099                 u.bi_flags &= ~BCH_INODE_i_sectors_dirty;
1100                 do_update = true;
1101         }
1102 
1103         if (u.bi_flags & BCH_INODE_backptr_untrusted) {
1104                 u.bi_dir = 0;
1105                 u.bi_dir_offset = 0;
1106                 u.bi_flags &= ~BCH_INODE_backptr_untrusted;
1107                 do_update = true;
1108         }
1109 
1110         if (u.bi_dir || u.bi_dir_offset) {
1111                 ret = check_inode_dirent_inode(trans, k, &u, k.k->p.snapshot, &do_update);
1112                 if (ret)
1113                         goto err;
1114         }
1115 
1116         if (fsck_err_on(u.bi_parent_subvol &&
1117                         (u.bi_subvol == 0 ||
1118                          u.bi_subvol == BCACHEFS_ROOT_SUBVOL),
1119                         trans, inode_bi_parent_nonzero,
1120                         "inode %llu:%u has subvol %u but nonzero parent subvol %u",
1121                         u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) {
1122                 u.bi_parent_subvol = 0;
1123                 do_update = true;
1124         }
1125 
1126         if (u.bi_subvol) {
1127                 struct bch_subvolume s;
1128 
1129                 ret = bch2_subvolume_get(trans, u.bi_subvol, false, 0, &s);
1130                 if (ret && !bch2_err_matches(ret, ENOENT))
1131                         goto err;
1132 
1133                 if (ret && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
1134                         ret = reconstruct_subvol(trans, k.k->p.snapshot, u.bi_subvol, u.bi_inum);
1135                         goto do_update;
1136                 }
1137 
1138                 if (fsck_err_on(ret,
1139                                 trans, inode_bi_subvol_missing,
1140                                 "inode %llu:%u bi_subvol points to missing subvolume %u",
1141                                 u.bi_inum, k.k->p.snapshot, u.bi_subvol) ||
1142                     fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum ||
1143                                 !bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot),
1144                                                            k.k->p.snapshot),
1145                                 trans, inode_bi_subvol_wrong,
1146                                 "inode %llu:%u points to subvol %u, but subvol points to %llu:%u",
1147                                 u.bi_inum, k.k->p.snapshot, u.bi_subvol,
1148                                 le64_to_cpu(s.inode),
1149                                 le32_to_cpu(s.snapshot))) {
1150                         u.bi_subvol = 0;
1151                         u.bi_parent_subvol = 0;
1152                         do_update = true;
1153                 }
1154         }
1155 do_update:
1156         if (do_update) {
1157                 ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
1158                 bch_err_msg(c, ret, "in fsck updating inode");
1159                 if (ret)
1160                         return ret;
1161         }
1162 err:
1163 fsck_err:
1164         bch_err_fn(c, ret);
1165         return ret;
1166 }
1167 
1168 int bch2_check_inodes(struct bch_fs *c)
1169 {
1170         bool full = c->opts.fsck;
1171         struct bch_inode_unpacked prev = { 0 };
1172         struct snapshots_seen s;
1173 
1174         snapshots_seen_init(&s);
1175 
1176         int ret = bch2_trans_run(c,
1177                 for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
1178                                 POS_MIN,
1179                                 BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
1180                                 NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
1181                         check_inode(trans, &iter, k, &prev, &s, full)));
1182 
1183         snapshots_seen_exit(&s);
1184         bch_err_fn(c, ret);
1185         return ret;
1186 }
1187 
1188 static inline bool btree_matches_i_mode(enum btree_id btree, unsigned mode)
1189 {
1190         switch (btree) {
1191         case BTREE_ID_extents:
1192                 return S_ISREG(mode) || S_ISLNK(mode);
1193         case BTREE_ID_dirents:
1194                 return S_ISDIR(mode);
1195         case BTREE_ID_xattrs:
1196                 return true;
1197         default:
1198                 BUG();
1199         }
1200 }
1201 
1202 static int check_key_has_inode(struct btree_trans *trans,
1203                                struct btree_iter *iter,
1204                                struct inode_walker *inode,
1205                                struct inode_walker_entry *i,
1206                                struct bkey_s_c k)
1207 {
1208         struct bch_fs *c = trans->c;
1209         struct printbuf buf = PRINTBUF;
1210         int ret = PTR_ERR_OR_ZERO(i);
1211         if (ret)
1212                 return ret;
1213 
1214         if (k.k->type == KEY_TYPE_whiteout)
1215                 goto out;
1216 
1217         if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
1218                 ret =   reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?:
1219                         bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
1220                 if (ret)
1221                         goto err;
1222 
1223                 inode->last_pos.inode--;
1224                 ret = -BCH_ERR_transaction_restart_nested;
1225                 goto err;
1226         }
1227 
1228         if (fsck_err_on(!i,
1229                         trans, key_in_missing_inode,
1230                         "key in missing inode:\n  %s",
1231                         (printbuf_reset(&buf),
1232                          bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1233                 goto delete;
1234 
1235         if (fsck_err_on(i && !btree_matches_i_mode(iter->btree_id, i->inode.bi_mode),
1236                         trans, key_in_wrong_inode_type,
1237                         "key for wrong inode mode %o:\n  %s",
1238                         i->inode.bi_mode,
1239                         (printbuf_reset(&buf),
1240                          bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1241                 goto delete;
1242 out:
1243 err:
1244 fsck_err:
1245         printbuf_exit(&buf);
1246         bch_err_fn(c, ret);
1247         return ret;
1248 delete:
1249         ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node);
1250         goto out;
1251 }
1252 
1253 static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w)
1254 {
1255         struct bch_fs *c = trans->c;
1256         int ret = 0;
1257         s64 count2;
1258 
1259         darray_for_each(w->inodes, i) {
1260                 if (i->inode.bi_sectors == i->count)
1261                         continue;
1262 
1263                 count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot);
1264 
1265                 if (w->recalculate_sums)
1266                         i->count = count2;
1267 
1268                 if (i->count != count2) {
1269                         bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
1270                                             w->last_pos.inode, i->snapshot, i->count, count2);
1271                         return -BCH_ERR_internal_fsck_err;
1272                 }
1273 
1274                 if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
1275                                 trans, inode_i_sectors_wrong,
1276                                 "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
1277                                 w->last_pos.inode, i->snapshot,
1278                                 i->inode.bi_sectors, i->count)) {
1279                         i->inode.bi_sectors = i->count;
1280                         ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot);
1281                         if (ret)
1282                                 break;
1283                 }
1284         }
1285 fsck_err:
1286         bch_err_fn(c, ret);
1287         return ret;
1288 }
1289 
1290 static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
1291 {
1292         u32 restart_count = trans->restart_count;
1293         return check_i_sectors_notnested(trans, w) ?:
1294                 trans_was_restarted(trans, restart_count);
1295 }
1296 
1297 struct extent_end {
1298         u32                     snapshot;
1299         u64                     offset;
1300         struct snapshots_seen   seen;
1301 };
1302 
1303 struct extent_ends {
1304         struct bpos                     last_pos;
1305         DARRAY(struct extent_end)       e;
1306 };
1307 
1308 static void extent_ends_reset(struct extent_ends *extent_ends)
1309 {
1310         darray_for_each(extent_ends->e, i)
1311                 snapshots_seen_exit(&i->seen);
1312         extent_ends->e.nr = 0;
1313 }
1314 
1315 static void extent_ends_exit(struct extent_ends *extent_ends)
1316 {
1317         extent_ends_reset(extent_ends);
1318         darray_exit(&extent_ends->e);
1319 }
1320 
1321 static void extent_ends_init(struct extent_ends *extent_ends)
1322 {
1323         memset(extent_ends, 0, sizeof(*extent_ends));
1324 }
1325 
1326 static int extent_ends_at(struct bch_fs *c,
1327                           struct extent_ends *extent_ends,
1328                           struct snapshots_seen *seen,
1329                           struct bkey_s_c k)
1330 {
1331         struct extent_end *i, n = (struct extent_end) {
1332                 .offset         = k.k->p.offset,
1333                 .snapshot       = k.k->p.snapshot,
1334                 .seen           = *seen,
1335         };
1336 
1337         n.seen.ids.data = kmemdup(seen->ids.data,
1338                               sizeof(seen->ids.data[0]) * seen->ids.size,
1339                               GFP_KERNEL);
1340         if (!n.seen.ids.data)
1341                 return -BCH_ERR_ENOMEM_fsck_extent_ends_at;
1342 
1343         __darray_for_each(extent_ends->e, i) {
1344                 if (i->snapshot == k.k->p.snapshot) {
1345                         snapshots_seen_exit(&i->seen);
1346                         *i = n;
1347                         return 0;
1348                 }
1349 
1350                 if (i->snapshot >= k.k->p.snapshot)
1351                         break;
1352         }
1353 
1354         return darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n);
1355 }
1356 
1357 static int overlapping_extents_found(struct btree_trans *trans,
1358                                      enum btree_id btree,
1359                                      struct bpos pos1, struct snapshots_seen *pos1_seen,
1360                                      struct bkey pos2,
1361                                      bool *fixed,
1362                                      struct extent_end *extent_end)
1363 {
1364         struct bch_fs *c = trans->c;
1365         struct printbuf buf = PRINTBUF;
1366         struct btree_iter iter1, iter2 = { NULL };
1367         struct bkey_s_c k1, k2;
1368         int ret;
1369 
1370         BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2)));
1371 
1372         bch2_trans_iter_init(trans, &iter1, btree, pos1,
1373                              BTREE_ITER_all_snapshots|
1374                              BTREE_ITER_not_extents);
1375         k1 = bch2_btree_iter_peek_upto(&iter1, POS(pos1.inode, U64_MAX));
1376         ret = bkey_err(k1);
1377         if (ret)
1378                 goto err;
1379 
1380         prt_str(&buf, "\n  ");
1381         bch2_bkey_val_to_text(&buf, c, k1);
1382 
1383         if (!bpos_eq(pos1, k1.k->p)) {
1384                 prt_str(&buf, "\n  wanted\n  ");
1385                 bch2_bpos_to_text(&buf, pos1);
1386                 prt_str(&buf, "\n  ");
1387                 bch2_bkey_to_text(&buf, &pos2);
1388 
1389                 bch_err(c, "%s: error finding first overlapping extent when repairing, got%s",
1390                         __func__, buf.buf);
1391                 ret = -BCH_ERR_internal_fsck_err;
1392                 goto err;
1393         }
1394 
1395         bch2_trans_copy_iter(&iter2, &iter1);
1396 
1397         while (1) {
1398                 bch2_btree_iter_advance(&iter2);
1399 
1400                 k2 = bch2_btree_iter_peek_upto(&iter2, POS(pos1.inode, U64_MAX));
1401                 ret = bkey_err(k2);
1402                 if (ret)
1403                         goto err;
1404 
1405                 if (bpos_ge(k2.k->p, pos2.p))
1406                         break;
1407         }
1408 
1409         prt_str(&buf, "\n  ");
1410         bch2_bkey_val_to_text(&buf, c, k2);
1411 
1412         if (bpos_gt(k2.k->p, pos2.p) ||
1413             pos2.size != k2.k->size) {
1414                 bch_err(c, "%s: error finding seconding overlapping extent when repairing%s",
1415                         __func__, buf.buf);
1416                 ret = -BCH_ERR_internal_fsck_err;
1417                 goto err;
1418         }
1419 
1420         prt_printf(&buf, "\n  overwriting %s extent",
1421                    pos1.snapshot >= pos2.p.snapshot ? "first" : "second");
1422 
1423         if (fsck_err(trans, extent_overlapping,
1424                      "overlapping extents%s", buf.buf)) {
1425                 struct btree_iter *old_iter = &iter1;
1426                 struct disk_reservation res = { 0 };
1427 
1428                 if (pos1.snapshot < pos2.p.snapshot) {
1429                         old_iter = &iter2;
1430                         swap(k1, k2);
1431                 }
1432 
1433                 trans->extra_disk_res += bch2_bkey_sectors_compressed(k2);
1434 
1435                 ret =   bch2_trans_update_extent_overwrite(trans, old_iter,
1436                                 BTREE_UPDATE_internal_snapshot_node,
1437                                 k1, k2) ?:
1438                         bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc);
1439                 bch2_disk_reservation_put(c, &res);
1440 
1441                 if (ret)
1442                         goto err;
1443 
1444                 *fixed = true;
1445 
1446                 if (pos1.snapshot == pos2.p.snapshot) {
1447                         /*
1448                          * We overwrote the first extent, and did the overwrite
1449                          * in the same snapshot:
1450                          */
1451                         extent_end->offset = bkey_start_offset(&pos2);
1452                 } else if (pos1.snapshot > pos2.p.snapshot) {
1453                         /*
1454                          * We overwrote the first extent in pos2's snapshot:
1455                          */
1456                         ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot);
1457                 } else {
1458                         /*
1459                          * We overwrote the second extent - restart
1460                          * check_extent() from the top:
1461                          */
1462                         ret = -BCH_ERR_transaction_restart_nested;
1463                 }
1464         }
1465 fsck_err:
1466 err:
1467         bch2_trans_iter_exit(trans, &iter2);
1468         bch2_trans_iter_exit(trans, &iter1);
1469         printbuf_exit(&buf);
1470         return ret;
1471 }
1472 
1473 static int check_overlapping_extents(struct btree_trans *trans,
1474                               struct snapshots_seen *seen,
1475                               struct extent_ends *extent_ends,
1476                               struct bkey_s_c k,
1477                               struct btree_iter *iter,
1478                               bool *fixed)
1479 {
1480         struct bch_fs *c = trans->c;
1481         int ret = 0;
1482 
1483         /* transaction restart, running again */
1484         if (bpos_eq(extent_ends->last_pos, k.k->p))
1485                 return 0;
1486 
1487         if (extent_ends->last_pos.inode != k.k->p.inode)
1488                 extent_ends_reset(extent_ends);
1489 
1490         darray_for_each(extent_ends->e, i) {
1491                 if (i->offset <= bkey_start_offset(k.k))
1492                         continue;
1493 
1494                 if (!ref_visible2(c,
1495                                   k.k->p.snapshot, seen,
1496                                   i->snapshot, &i->seen))
1497                         continue;
1498 
1499                 ret = overlapping_extents_found(trans, iter->btree_id,
1500                                                 SPOS(iter->pos.inode,
1501                                                      i->offset,
1502                                                      i->snapshot),
1503                                                 &i->seen,
1504                                                 *k.k, fixed, i);
1505                 if (ret)
1506                         goto err;
1507         }
1508 
1509         extent_ends->last_pos = k.k->p;
1510 err:
1511         return ret;
1512 }
1513 
1514 static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter,
1515                                 struct bkey_s_c k)
1516 {
1517         struct bch_fs *c = trans->c;
1518         struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1519         struct bch_extent_crc_unpacked crc;
1520         const union bch_extent_entry *i;
1521         unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9;
1522 
1523         bkey_for_each_crc(k.k, ptrs, crc, i)
1524                 if (crc_is_encoded(crc) &&
1525                     crc.uncompressed_size > encoded_extent_max_sectors) {
1526                         struct printbuf buf = PRINTBUF;
1527 
1528                         bch2_bkey_val_to_text(&buf, c, k);
1529                         bch_err(c, "overbig encoded extent, please report this:\n  %s", buf.buf);
1530                         printbuf_exit(&buf);
1531                 }
1532 
1533         return 0;
1534 }
1535 
1536 static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
1537                         struct bkey_s_c k,
1538                         struct inode_walker *inode,
1539                         struct snapshots_seen *s,
1540                         struct extent_ends *extent_ends)
1541 {
1542         struct bch_fs *c = trans->c;
1543         struct inode_walker_entry *i;
1544         struct printbuf buf = PRINTBUF;
1545         int ret = 0;
1546 
1547         ret = bch2_check_key_has_snapshot(trans, iter, k);
1548         if (ret) {
1549                 ret = ret < 0 ? ret : 0;
1550                 goto out;
1551         }
1552 
1553         if (inode->last_pos.inode != k.k->p.inode) {
1554                 ret = check_i_sectors(trans, inode);
1555                 if (ret)
1556                         goto err;
1557         }
1558 
1559         ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
1560         if (ret)
1561                 goto err;
1562 
1563         i = walk_inode(trans, inode, k);
1564         ret = PTR_ERR_OR_ZERO(i);
1565         if (ret)
1566                 goto err;
1567 
1568         ret = check_key_has_inode(trans, iter, inode, i, k);
1569         if (ret)
1570                 goto err;
1571 
1572         if (k.k->type != KEY_TYPE_whiteout) {
1573                 ret = check_overlapping_extents(trans, s, extent_ends, k, iter,
1574                                                 &inode->recalculate_sums);
1575                 if (ret)
1576                         goto err;
1577         }
1578 
1579         /*
1580          * Check inodes in reverse order, from oldest snapshots to newest,
1581          * starting from the inode that matches this extent's snapshot. If we
1582          * didn't have one, iterate over all inodes:
1583          */
1584         if (!i)
1585                 i = &darray_last(inode->inodes);
1586 
1587         for (;
1588              inode->inodes.data && i >= inode->inodes.data;
1589              --i) {
1590                 if (i->snapshot > k.k->p.snapshot ||
1591                     !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
1592                         continue;
1593 
1594                 if (k.k->type != KEY_TYPE_whiteout) {
1595                         if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) &&
1596                                         k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
1597                                         !bkey_extent_is_reservation(k),
1598                                         trans, extent_past_end_of_inode,
1599                                         "extent type past end of inode %llu:%u, i_size %llu\n  %s",
1600                                         i->inode.bi_inum, i->snapshot, i->inode.bi_size,
1601                                         (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1602                                 struct btree_iter iter2;
1603 
1604                                 bch2_trans_copy_iter(&iter2, iter);
1605                                 bch2_btree_iter_set_snapshot(&iter2, i->snapshot);
1606                                 ret =   bch2_btree_iter_traverse(&iter2) ?:
1607                                         bch2_btree_delete_at(trans, &iter2,
1608                                                 BTREE_UPDATE_internal_snapshot_node);
1609                                 bch2_trans_iter_exit(trans, &iter2);
1610                                 if (ret)
1611                                         goto err;
1612 
1613                                 iter->k.type = KEY_TYPE_whiteout;
1614                         }
1615 
1616                         if (bkey_extent_is_allocation(k.k))
1617                                 i->count += k.k->size;
1618                 }
1619 
1620                 i->seen_this_pos = true;
1621         }
1622 
1623         if (k.k->type != KEY_TYPE_whiteout) {
1624                 ret = extent_ends_at(c, extent_ends, s, k);
1625                 if (ret)
1626                         goto err;
1627         }
1628 out:
1629 err:
1630 fsck_err:
1631         printbuf_exit(&buf);
1632         bch_err_fn(c, ret);
1633         return ret;
1634 }
1635 
1636 /*
1637  * Walk extents: verify that extents have a corresponding S_ISREG inode, and
1638  * that i_size an i_sectors are consistent
1639  */
1640 int bch2_check_extents(struct bch_fs *c)
1641 {
1642         struct inode_walker w = inode_walker_init();
1643         struct snapshots_seen s;
1644         struct extent_ends extent_ends;
1645         struct disk_reservation res = { 0 };
1646 
1647         snapshots_seen_init(&s);
1648         extent_ends_init(&extent_ends);
1649 
1650         int ret = bch2_trans_run(c,
1651                 for_each_btree_key_commit(trans, iter, BTREE_ID_extents,
1652                                 POS(BCACHEFS_ROOT_INO, 0),
1653                                 BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
1654                                 &res, NULL,
1655                                 BCH_TRANS_COMMIT_no_enospc, ({
1656                         bch2_disk_reservation_put(c, &res);
1657                         check_extent(trans, &iter, k, &w, &s, &extent_ends) ?:
1658                         check_extent_overbig(trans, &iter, k);
1659                 })) ?:
1660                 check_i_sectors_notnested(trans, &w));
1661 
1662         bch2_disk_reservation_put(c, &res);
1663         extent_ends_exit(&extent_ends);
1664         inode_walker_exit(&w);
1665         snapshots_seen_exit(&s);
1666 
1667         bch_err_fn(c, ret);
1668         return ret;
1669 }
1670 
1671 int bch2_check_indirect_extents(struct bch_fs *c)
1672 {
1673         struct disk_reservation res = { 0 };
1674 
1675         int ret = bch2_trans_run(c,
1676                 for_each_btree_key_commit(trans, iter, BTREE_ID_reflink,
1677                                 POS_MIN,
1678                                 BTREE_ITER_prefetch, k,
1679                                 &res, NULL,
1680                                 BCH_TRANS_COMMIT_no_enospc, ({
1681                         bch2_disk_reservation_put(c, &res);
1682                         check_extent_overbig(trans, &iter, k);
1683                 })));
1684 
1685         bch2_disk_reservation_put(c, &res);
1686         bch_err_fn(c, ret);
1687         return ret;
1688 }
1689 
1690 static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_walker *w)
1691 {
1692         struct bch_fs *c = trans->c;
1693         int ret = 0;
1694         s64 count2;
1695 
1696         darray_for_each(w->inodes, i) {
1697                 if (i->inode.bi_nlink == i->count)
1698                         continue;
1699 
1700                 count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->snapshot);
1701                 if (count2 < 0)
1702                         return count2;
1703 
1704                 if (i->count != count2) {
1705                         bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu",
1706                                             w->last_pos.inode, i->snapshot, i->count, count2);
1707                         i->count = count2;
1708                         if (i->inode.bi_nlink == i->count)
1709                                 continue;
1710                 }
1711 
1712                 if (fsck_err_on(i->inode.bi_nlink != i->count,
1713                                 trans, inode_dir_wrong_nlink,
1714                                 "directory %llu:%u with wrong i_nlink: got %u, should be %llu",
1715                                 w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
1716                         i->inode.bi_nlink = i->count;
1717                         ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot);
1718                         if (ret)
1719                                 break;
1720                 }
1721         }
1722 fsck_err:
1723         bch_err_fn(c, ret);
1724         return ret;
1725 }
1726 
1727 static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
1728 {
1729         u32 restart_count = trans->restart_count;
1730         return check_subdir_count_notnested(trans, w) ?:
1731                 trans_was_restarted(trans, restart_count);
1732 }
1733 
1734 noinline_for_stack
1735 static int check_dirent_inode_dirent(struct btree_trans *trans,
1736                                    struct btree_iter *iter,
1737                                    struct bkey_s_c_dirent d,
1738                                    struct bch_inode_unpacked *target,
1739                                    u32 target_snapshot)
1740 {
1741         struct bch_fs *c = trans->c;
1742         struct printbuf buf = PRINTBUF;
1743         int ret = 0;
1744 
1745         if (inode_points_to_dirent(target, d))
1746                 return 0;
1747 
1748         if (bch2_inode_should_have_bp(target) &&
1749             !fsck_err(trans, inode_wrong_backpointer,
1750                       "dirent points to inode that does not point back:\n  %s",
1751                       (bch2_bkey_val_to_text(&buf, c, d.s_c),
1752                        prt_printf(&buf, "\n  "),
1753                        bch2_inode_unpacked_to_text(&buf, target),
1754                        buf.buf)))
1755                 goto out_noiter;
1756 
1757         if (!target->bi_dir &&
1758             !target->bi_dir_offset) {
1759                 target->bi_dir          = d.k->p.inode;
1760                 target->bi_dir_offset   = d.k->p.offset;
1761                 return __bch2_fsck_write_inode(trans, target, target_snapshot);
1762         }
1763 
1764         struct btree_iter bp_iter = { NULL };
1765         struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter,
1766                               SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot));
1767         ret = bkey_err(bp_dirent);
1768         if (ret && !bch2_err_matches(ret, ENOENT))
1769                 goto err;
1770 
1771         bool backpointer_exists = !ret;
1772         ret = 0;
1773 
1774         if (fsck_err_on(!backpointer_exists,
1775                         trans, inode_wrong_backpointer,
1776                         "inode %llu:%u has wrong backpointer:\n"
1777                         "got       %llu:%llu\n"
1778                         "should be %llu:%llu",
1779                         target->bi_inum, target_snapshot,
1780                         target->bi_dir,
1781                         target->bi_dir_offset,
1782                         d.k->p.inode,
1783                         d.k->p.offset)) {
1784                 target->bi_dir          = d.k->p.inode;
1785                 target->bi_dir_offset   = d.k->p.offset;
1786                 ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
1787                 goto out;
1788         }
1789 
1790         bch2_bkey_val_to_text(&buf, c, d.s_c);
1791         prt_newline(&buf);
1792         if (backpointer_exists)
1793                 bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);
1794 
1795         if (fsck_err_on(backpointer_exists &&
1796                         (S_ISDIR(target->bi_mode) ||
1797                          target->bi_subvol),
1798                         trans, inode_dir_multiple_links,
1799                         "%s %llu:%u with multiple links\n%s",
1800                         S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
1801                         target->bi_inum, target_snapshot, buf.buf)) {
1802                 ret = __remove_dirent(trans, d.k->p);
1803                 goto out;
1804         }
1805 
1806         /*
1807          * hardlinked file with nlink 0:
1808          * We're just adjusting nlink here so check_nlinks() will pick
1809          * it up, it ignores inodes with nlink 0
1810          */
1811         if (fsck_err_on(backpointer_exists && !target->bi_nlink,
1812                         trans, inode_multiple_links_but_nlink_0,
1813                         "inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
1814                         target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
1815                 target->bi_nlink++;
1816                 target->bi_flags &= ~BCH_INODE_unlinked;
1817                 ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
1818                 if (ret)
1819                         goto err;
1820         }
1821 out:
1822 err:
1823 fsck_err:
1824         bch2_trans_iter_exit(trans, &bp_iter);
1825 out_noiter:
1826         printbuf_exit(&buf);
1827         bch_err_fn(c, ret);
1828         return ret;
1829 }
1830 
1831 noinline_for_stack
1832 static int check_dirent_target(struct btree_trans *trans,
1833                                struct btree_iter *iter,
1834                                struct bkey_s_c_dirent d,
1835                                struct bch_inode_unpacked *target,
1836                                u32 target_snapshot)
1837 {
1838         struct bch_fs *c = trans->c;
1839         struct bkey_i_dirent *n;
1840         struct printbuf buf = PRINTBUF;
1841         int ret = 0;
1842 
1843         ret = check_dirent_inode_dirent(trans, iter, d, target, target_snapshot);
1844         if (ret)
1845                 goto err;
1846 
1847         if (fsck_err_on(d.v->d_type != inode_d_type(target),
1848                         trans, dirent_d_type_wrong,
1849                         "incorrect d_type: got %s, should be %s:\n%s",
1850                         bch2_d_type_str(d.v->d_type),
1851                         bch2_d_type_str(inode_d_type(target)),
1852                         (printbuf_reset(&buf),
1853                          bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
1854                 n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
1855                 ret = PTR_ERR_OR_ZERO(n);
1856                 if (ret)
1857                         goto err;
1858 
1859                 bkey_reassemble(&n->k_i, d.s_c);
1860                 n->v.d_type = inode_d_type(target);
1861                 if (n->v.d_type == DT_SUBVOL) {
1862                         n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
1863                         n->v.d_child_subvol = cpu_to_le32(target->bi_subvol);
1864                 } else {
1865                         n->v.d_inum = cpu_to_le64(target->bi_inum);
1866                 }
1867 
1868                 ret = bch2_trans_update(trans, iter, &n->k_i, 0);
1869                 if (ret)
1870                         goto err;
1871 
1872                 d = dirent_i_to_s_c(n);
1873         }
1874 err:
1875 fsck_err:
1876         printbuf_exit(&buf);
1877         bch_err_fn(c, ret);
1878         return ret;
1879 }
1880 
1881 /* find a subvolume that's a descendent of @snapshot: */
1882 static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid)
1883 {
1884         struct btree_iter iter;
1885         struct bkey_s_c k;
1886         int ret;
1887 
1888         for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) {
1889                 if (k.k->type != KEY_TYPE_subvolume)
1890                         continue;
1891 
1892                 struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
1893                 if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) {
1894                         bch2_trans_iter_exit(trans, &iter);
1895                         *subvolid = k.k->p.offset;
1896                         goto found;
1897                 }
1898         }
1899         if (!ret)
1900                 ret = -ENOENT;
1901 found:
1902         bch2_trans_iter_exit(trans, &iter);
1903         return ret;
1904 }
1905 
1906 noinline_for_stack
1907 static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter,
1908                                   struct bkey_s_c_dirent d)
1909 {
1910         struct bch_fs *c = trans->c;
1911         struct btree_iter subvol_iter = {};
1912         struct bch_inode_unpacked subvol_root;
1913         u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol);
1914         u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
1915         u32 parent_snapshot;
1916         u32 new_parent_subvol = 0;
1917         u64 parent_inum;
1918         struct printbuf buf = PRINTBUF;
1919         int ret = 0;
1920 
1921         ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum);
1922         if (ret && !bch2_err_matches(ret, ENOENT))
1923                 return ret;
1924 
1925         if (ret ||
1926             (!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot))) {
1927                 int ret2 = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol);
1928                 if (ret2 && !bch2_err_matches(ret, ENOENT))
1929                         return ret2;
1930         }
1931 
1932         if (ret &&
1933             !new_parent_subvol &&
1934             (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
1935                 /*
1936                  * Couldn't find a subvol for dirent's snapshot - but we lost
1937                  * subvols, so we need to reconstruct:
1938                  */
1939                 ret = reconstruct_subvol(trans, d.k->p.snapshot, parent_subvol, 0);
1940                 if (ret)
1941                         return ret;
1942 
1943                 parent_snapshot = d.k->p.snapshot;
1944         }
1945 
1946         if (fsck_err_on(ret,
1947                         trans, dirent_to_missing_parent_subvol,
1948                         "dirent parent_subvol points to missing subvolume\n%s",
1949                         (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) ||
1950             fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot),
1951                         trans, dirent_not_visible_in_parent_subvol,
1952                         "dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s",
1953                         parent_snapshot,
1954                         (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
1955                 if (!new_parent_subvol) {
1956                         bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot);
1957                         return -BCH_ERR_fsck_repair_unimplemented;
1958                 }
1959 
1960                 struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent);
1961                 ret = PTR_ERR_OR_ZERO(new_dirent);
1962                 if (ret)
1963                         goto err;
1964 
1965                 new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol);
1966         }
1967 
1968         struct bkey_s_c_subvolume s =
1969                 bch2_bkey_get_iter_typed(trans, &subvol_iter,
1970                                          BTREE_ID_subvolumes, POS(0, target_subvol),
1971                                          0, subvolume);
1972         ret = bkey_err(s.s_c);
1973         if (ret && !bch2_err_matches(ret, ENOENT))
1974                 return ret;
1975 
1976         if (ret) {
1977                 if (fsck_err(trans, dirent_to_missing_subvol,
1978                              "dirent points to missing subvolume\n%s",
1979                              (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)))
1980                         return __remove_dirent(trans, d.k->p);
1981                 ret = 0;
1982                 goto out;
1983         }
1984 
1985         if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol,
1986                         trans, subvol_fs_path_parent_wrong,
1987                         "subvol with wrong fs_path_parent, should be be %u\n%s",
1988                         parent_subvol,
1989                         (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
1990                 struct bkey_i_subvolume *n =
1991                         bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume);
1992                 ret = PTR_ERR_OR_ZERO(n);
1993                 if (ret)
1994                         goto err;
1995 
1996                 n->v.fs_path_parent = cpu_to_le32(parent_subvol);
1997         }
1998 
1999         u64 target_inum = le64_to_cpu(s.v->inode);
2000         u32 target_snapshot = le32_to_cpu(s.v->snapshot);
2001 
2002         ret = lookup_inode(trans, target_inum, &subvol_root, &target_snapshot);
2003         if (ret && !bch2_err_matches(ret, ENOENT))
2004                 goto err;
2005 
2006         if (ret) {
2007                 bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum);
2008                 ret = -BCH_ERR_fsck_repair_unimplemented;
2009                 ret = 0;
2010                 goto err;
2011         }
2012 
2013         if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol,
2014                         trans, inode_bi_parent_wrong,
2015                         "subvol root %llu has wrong bi_parent_subvol: got %u, should be %u",
2016                         target_inum,
2017                         subvol_root.bi_parent_subvol, parent_subvol)) {
2018                 subvol_root.bi_parent_subvol = parent_subvol;
2019                 ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot);
2020                 if (ret)
2021                         goto err;
2022         }
2023 
2024         ret = check_dirent_target(trans, iter, d, &subvol_root,
2025                                   target_snapshot);
2026         if (ret)
2027                 goto err;
2028 out:
2029 err:
2030 fsck_err:
2031         bch2_trans_iter_exit(trans, &subvol_iter);
2032         printbuf_exit(&buf);
2033         return ret;
2034 }
2035 
2036 static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
2037                         struct bkey_s_c k,
2038                         struct bch_hash_info *hash_info,
2039                         struct inode_walker *dir,
2040                         struct inode_walker *target,
2041                         struct snapshots_seen *s)
2042 {
2043         struct bch_fs *c = trans->c;
2044         struct inode_walker_entry *i;
2045         struct printbuf buf = PRINTBUF;
2046         int ret = 0;
2047 
2048         ret = bch2_check_key_has_snapshot(trans, iter, k);
2049         if (ret) {
2050                 ret = ret < 0 ? ret : 0;
2051                 goto out;
2052         }
2053 
2054         ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
2055         if (ret)
2056                 goto err;
2057 
2058         if (k.k->type == KEY_TYPE_whiteout)
2059                 goto out;
2060 
2061         if (dir->last_pos.inode != k.k->p.inode) {
2062                 ret = check_subdir_count(trans, dir);
2063                 if (ret)
2064                         goto err;
2065         }
2066 
2067         i = walk_inode(trans, dir, k);
2068         ret = PTR_ERR_OR_ZERO(i);
2069         if (ret < 0)
2070                 goto err;
2071 
2072         ret = check_key_has_inode(trans, iter, dir, i, k);
2073         if (ret)
2074                 goto err;
2075 
2076         if (!i)
2077                 goto out;
2078 
2079         if (dir->first_this_inode)
2080                 *hash_info = bch2_hash_info_init(c, &i->inode);
2081         dir->first_this_inode = false;
2082 
2083         ret = hash_check_key(trans, bch2_dirent_hash_desc, hash_info, iter, k);
2084         if (ret < 0)
2085                 goto err;
2086         if (ret) {
2087                 /* dirent has been deleted */
2088                 ret = 0;
2089                 goto out;
2090         }
2091 
2092         if (k.k->type != KEY_TYPE_dirent)
2093                 goto out;
2094 
2095         struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
2096 
2097         if (d.v->d_type == DT_SUBVOL) {
2098                 ret = check_dirent_to_subvol(trans, iter, d);
2099                 if (ret)
2100                         goto err;
2101         } else {
2102                 ret = get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
2103                 if (ret)
2104                         goto err;
2105 
2106                 if (fsck_err_on(!target->inodes.nr,
2107                                 trans, dirent_to_missing_inode,
2108                                 "dirent points to missing inode:\n%s",
2109                                 (printbuf_reset(&buf),
2110                                  bch2_bkey_val_to_text(&buf, c, k),
2111                                  buf.buf))) {
2112                         ret = __remove_dirent(trans, d.k->p);
2113                         if (ret)
2114                                 goto err;
2115                 }
2116 
2117                 darray_for_each(target->inodes, i) {
2118                         ret = check_dirent_target(trans, iter, d,
2119                                                   &i->inode, i->snapshot);
2120                         if (ret)
2121                                 goto err;
2122                 }
2123 
2124                 if (d.v->d_type == DT_DIR)
2125                         for_each_visible_inode(c, s, dir, d.k->p.snapshot, i)
2126                                 i->count++;
2127         }
2128 out:
2129 err:
2130 fsck_err:
2131         printbuf_exit(&buf);
2132         bch_err_fn(c, ret);
2133         return ret;
2134 }
2135 
2136 /*
2137  * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
2138  * validate d_type
2139  */
2140 int bch2_check_dirents(struct bch_fs *c)
2141 {
2142         struct inode_walker dir = inode_walker_init();
2143         struct inode_walker target = inode_walker_init();
2144         struct snapshots_seen s;
2145         struct bch_hash_info hash_info;
2146 
2147         snapshots_seen_init(&s);
2148 
2149         int ret = bch2_trans_run(c,
2150                 for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
2151                                 POS(BCACHEFS_ROOT_INO, 0),
2152                                 BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
2153                                 k,
2154                                 NULL, NULL,
2155                                 BCH_TRANS_COMMIT_no_enospc,
2156                         check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?:
2157                 check_subdir_count_notnested(trans, &dir));
2158 
2159         snapshots_seen_exit(&s);
2160         inode_walker_exit(&dir);
2161         inode_walker_exit(&target);
2162         bch_err_fn(c, ret);
2163         return ret;
2164 }
2165 
2166 static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
2167                        struct bkey_s_c k,
2168                        struct bch_hash_info *hash_info,
2169                        struct inode_walker *inode)
2170 {
2171         struct bch_fs *c = trans->c;
2172         struct inode_walker_entry *i;
2173         int ret;
2174 
2175         ret = bch2_check_key_has_snapshot(trans, iter, k);
2176         if (ret < 0)
2177                 return ret;
2178         if (ret)
2179                 return 0;
2180 
2181         i = walk_inode(trans, inode, k);
2182         ret = PTR_ERR_OR_ZERO(i);
2183         if (ret)
2184                 return ret;
2185 
2186         ret = check_key_has_inode(trans, iter, inode, i, k);
2187         if (ret)
2188                 return ret;
2189 
2190         if (!i)
2191                 return 0;
2192 
2193         if (inode->first_this_inode)
2194                 *hash_info = bch2_hash_info_init(c, &i->inode);
2195         inode->first_this_inode = false;
2196 
2197         ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k);
2198         bch_err_fn(c, ret);
2199         return ret;
2200 }
2201 
2202 /*
2203  * Walk xattrs: verify that they all have a corresponding inode
2204  */
2205 int bch2_check_xattrs(struct bch_fs *c)
2206 {
2207         struct inode_walker inode = inode_walker_init();
2208         struct bch_hash_info hash_info;
2209         int ret = 0;
2210 
2211         ret = bch2_trans_run(c,
2212                 for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
2213                         POS(BCACHEFS_ROOT_INO, 0),
2214                         BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
2215                         k,
2216                         NULL, NULL,
2217                         BCH_TRANS_COMMIT_no_enospc,
2218                 check_xattr(trans, &iter, k, &hash_info, &inode)));
2219         bch_err_fn(c, ret);
2220         return ret;
2221 }
2222 
2223 static int check_root_trans(struct btree_trans *trans)
2224 {
2225         struct bch_fs *c = trans->c;
2226         struct bch_inode_unpacked root_inode;
2227         u32 snapshot;
2228         u64 inum;
2229         int ret;
2230 
2231         ret = subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
2232         if (ret && !bch2_err_matches(ret, ENOENT))
2233                 return ret;
2234 
2235         if (mustfix_fsck_err_on(ret, trans, root_subvol_missing,
2236                                 "root subvol missing")) {
2237                 struct bkey_i_subvolume *root_subvol =
2238                         bch2_trans_kmalloc(trans, sizeof(*root_subvol));
2239                 ret = PTR_ERR_OR_ZERO(root_subvol);
2240                 if (ret)
2241                         goto err;
2242 
2243                 snapshot        = U32_MAX;
2244                 inum            = BCACHEFS_ROOT_INO;
2245 
2246                 bkey_subvolume_init(&root_subvol->k_i);
2247                 root_subvol->k.p.offset = BCACHEFS_ROOT_SUBVOL;
2248                 root_subvol->v.flags    = 0;
2249                 root_subvol->v.snapshot = cpu_to_le32(snapshot);
2250                 root_subvol->v.inode    = cpu_to_le64(inum);
2251                 ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol->k_i, 0);
2252                 bch_err_msg(c, ret, "writing root subvol");
2253                 if (ret)
2254                         goto err;
2255         }
2256 
2257         ret = lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot);
2258         if (ret && !bch2_err_matches(ret, ENOENT))
2259                 return ret;
2260 
2261         if (mustfix_fsck_err_on(ret,
2262                                 trans, root_dir_missing,
2263                                 "root directory missing") ||
2264             mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode),
2265                                 trans, root_inode_not_dir,
2266                                 "root inode not a directory")) {
2267                 bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
2268                                 0, NULL);
2269                 root_inode.bi_inum = inum;
2270 
2271                 ret = __bch2_fsck_write_inode(trans, &root_inode, snapshot);
2272                 bch_err_msg(c, ret, "writing root inode");
2273         }
2274 err:
2275 fsck_err:
2276         return ret;
2277 }
2278 
2279 /* Get root directory, create if it doesn't exist: */
2280 int bch2_check_root(struct bch_fs *c)
2281 {
2282         int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2283                 check_root_trans(trans));
2284         bch_err_fn(c, ret);
2285         return ret;
2286 }
2287 
2288 typedef DARRAY(u32) darray_u32;
2289 
2290 static bool darray_u32_has(darray_u32 *d, u32 v)
2291 {
2292         darray_for_each(*d, i)
2293                 if (*i == v)
2294                         return true;
2295         return false;
2296 }
2297 
2298 /*
2299  * We've checked that inode backpointers point to valid dirents; here, it's
2300  * sufficient to check that the subvolume root has a dirent:
2301  */
2302 static int subvol_has_dirent(struct btree_trans *trans, struct bkey_s_c_subvolume s)
2303 {
2304         struct bch_inode_unpacked inode;
2305         int ret = bch2_inode_find_by_inum_trans(trans,
2306                                 (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
2307                                 &inode);
2308         if (ret)
2309                 return ret;
2310 
2311         return inode.bi_dir != 0;
2312 }
2313 
2314 static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k)
2315 {
2316         struct bch_fs *c = trans->c;
2317         struct btree_iter parent_iter = {};
2318         darray_u32 subvol_path = {};
2319         struct printbuf buf = PRINTBUF;
2320         int ret = 0;
2321 
2322         if (k.k->type != KEY_TYPE_subvolume)
2323                 return 0;
2324 
2325         while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) {
2326                 ret = darray_push(&subvol_path, k.k->p.offset);
2327                 if (ret)
2328                         goto err;
2329 
2330                 struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
2331 
2332                 ret = subvol_has_dirent(trans, s);
2333                 if (ret < 0)
2334                         break;
2335 
2336                 if (fsck_err_on(!ret,
2337                                 trans, subvol_unreachable,
2338                                 "unreachable subvolume %s",
2339                                 (bch2_bkey_val_to_text(&buf, c, s.s_c),
2340                                  buf.buf))) {
2341                         ret = reattach_subvol(trans, s);
2342                         break;
2343                 }
2344 
2345                 u32 parent = le32_to_cpu(s.v->fs_path_parent);
2346 
2347                 if (darray_u32_has(&subvol_path, parent)) {
2348                         if (fsck_err(c, subvol_loop, "subvolume loop"))
2349                                 ret = reattach_subvol(trans, s);
2350                         break;
2351                 }
2352 
2353                 bch2_trans_iter_exit(trans, &parent_iter);
2354                 bch2_trans_iter_init(trans, &parent_iter,
2355                                      BTREE_ID_subvolumes, POS(0, parent), 0);
2356                 k = bch2_btree_iter_peek_slot(&parent_iter);
2357                 ret = bkey_err(k);
2358                 if (ret)
2359                         goto err;
2360 
2361                 if (fsck_err_on(k.k->type != KEY_TYPE_subvolume,
2362                                 trans, subvol_unreachable,
2363                                 "unreachable subvolume %s",
2364                                 (bch2_bkey_val_to_text(&buf, c, s.s_c),
2365                                  buf.buf))) {
2366                         ret = reattach_subvol(trans, s);
2367                         break;
2368                 }
2369         }
2370 fsck_err:
2371 err:
2372         printbuf_exit(&buf);
2373         darray_exit(&subvol_path);
2374         bch2_trans_iter_exit(trans, &parent_iter);
2375         return ret;
2376 }
2377 
2378 int bch2_check_subvolume_structure(struct bch_fs *c)
2379 {
2380         int ret = bch2_trans_run(c,
2381                 for_each_btree_key_commit(trans, iter,
2382                                 BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k,
2383                                 NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2384                         check_subvol_path(trans, &iter, k)));
2385         bch_err_fn(c, ret);
2386         return ret;
2387 }
2388 
2389 struct pathbuf_entry {
2390         u64     inum;
2391         u32     snapshot;
2392 };
2393 
2394 typedef DARRAY(struct pathbuf_entry) pathbuf;
2395 
2396 static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
2397 {
2398         darray_for_each(*p, i)
2399                 if (i->inum     == inum &&
2400                     i->snapshot == snapshot)
2401                         return true;
2402         return false;
2403 }
2404 
2405 /*
2406  * Check that a given inode is reachable from its subvolume root - we already
2407  * verified subvolume connectivity:
2408  *
2409  * XXX: we should also be verifying that inodes are in the right subvolumes
2410  */
2411 static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c inode_k)
2412 {
2413         struct bch_fs *c = trans->c;
2414         struct btree_iter inode_iter = {};
2415         struct bch_inode_unpacked inode;
2416         struct printbuf buf = PRINTBUF;
2417         u32 snapshot = inode_k.k->p.snapshot;
2418         int ret = 0;
2419 
2420         p->nr = 0;
2421 
2422         BUG_ON(bch2_inode_unpack(inode_k, &inode));
2423 
2424         while (!inode.bi_subvol) {
2425                 struct btree_iter dirent_iter;
2426                 struct bkey_s_c_dirent d;
2427                 u32 parent_snapshot = snapshot;
2428 
2429                 d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot);
2430                 ret = bkey_err(d.s_c);
2431                 if (ret && !bch2_err_matches(ret, ENOENT))
2432                         break;
2433 
2434                 if (!ret && !dirent_points_to_inode(d, &inode)) {
2435                         bch2_trans_iter_exit(trans, &dirent_iter);
2436                         ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
2437                 }
2438 
2439                 if (bch2_err_matches(ret, ENOENT)) {
2440                         ret = 0;
2441                         if (fsck_err(trans, inode_unreachable,
2442                                      "unreachable inode\n%s",
2443                                      (printbuf_reset(&buf),
2444                                       bch2_bkey_val_to_text(&buf, c, inode_k),
2445                                       buf.buf)))
2446                                 ret = reattach_inode(trans, &inode, snapshot);
2447                         goto out;
2448                 }
2449 
2450                 bch2_trans_iter_exit(trans, &dirent_iter);
2451 
2452                 if (!S_ISDIR(inode.bi_mode))
2453                         break;
2454 
2455                 ret = darray_push(p, ((struct pathbuf_entry) {
2456                         .inum           = inode.bi_inum,
2457                         .snapshot       = snapshot,
2458                 }));
2459                 if (ret)
2460                         return ret;
2461 
2462                 snapshot = parent_snapshot;
2463 
2464                 bch2_trans_iter_exit(trans, &inode_iter);
2465                 inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes,
2466                                              SPOS(0, inode.bi_dir, snapshot), 0);
2467                 ret = bkey_err(inode_k) ?:
2468                         !bkey_is_inode(inode_k.k) ? -BCH_ERR_ENOENT_inode
2469                         : bch2_inode_unpack(inode_k, &inode);
2470                 if (ret) {
2471                         /* Should have been caught in dirents pass */
2472                         if (!bch2_err_matches(ret, BCH_ERR_transaction_restart))
2473                                 bch_err(c, "error looking up parent directory: %i", ret);
2474                         break;
2475                 }
2476 
2477                 snapshot = inode_k.k->p.snapshot;
2478 
2479                 if (path_is_dup(p, inode.bi_inum, snapshot)) {
2480                         /* XXX print path */
2481                         bch_err(c, "directory structure loop");
2482 
2483                         darray_for_each(*p, i)
2484                                 pr_err("%llu:%u", i->inum, i->snapshot);
2485                         pr_err("%llu:%u", inode.bi_inum, snapshot);
2486 
2487                         if (fsck_err(trans, dir_loop, "directory structure loop")) {
2488                                 ret = remove_backpointer(trans, &inode);
2489                                 bch_err_msg(c, ret, "removing dirent");
2490                                 if (ret)
2491                                         break;
2492 
2493                                 ret = reattach_inode(trans, &inode, snapshot);
2494                                 bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
2495                         }
2496                         break;
2497                 }
2498         }
2499 out:
2500 fsck_err:
2501         bch2_trans_iter_exit(trans, &inode_iter);
2502         printbuf_exit(&buf);
2503         bch_err_fn(c, ret);
2504         return ret;
2505 }
2506 
2507 /*
2508  * Check for unreachable inodes, as well as loops in the directory structure:
2509  * After bch2_check_dirents(), if an inode backpointer doesn't exist that means it's
2510  * unreachable:
2511  */
2512 int bch2_check_directory_structure(struct bch_fs *c)
2513 {
2514         pathbuf path = { 0, };
2515         int ret;
2516 
2517         ret = bch2_trans_run(c,
2518                 for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN,
2519                                           BTREE_ITER_intent|
2520                                           BTREE_ITER_prefetch|
2521                                           BTREE_ITER_all_snapshots, k,
2522                                           NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
2523                         if (!bkey_is_inode(k.k))
2524                                 continue;
2525 
2526                         if (bch2_inode_flags(k) & BCH_INODE_unlinked)
2527                                 continue;
2528 
2529                         check_path(trans, &path, k);
2530                 })));
2531         darray_exit(&path);
2532 
2533         bch_err_fn(c, ret);
2534         return ret;
2535 }
2536 
2537 struct nlink_table {
2538         size_t          nr;
2539         size_t          size;
2540 
2541         struct nlink {
2542                 u64     inum;
2543                 u32     snapshot;
2544                 u32     count;
2545         }               *d;
2546 };
2547 
2548 static int add_nlink(struct bch_fs *c, struct nlink_table *t,
2549                      u64 inum, u32 snapshot)
2550 {
2551         if (t->nr == t->size) {
2552                 size_t new_size = max_t(size_t, 128UL, t->size * 2);
2553                 void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL);
2554 
2555                 if (!d) {
2556                         bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
2557                                 new_size);
2558                         return -BCH_ERR_ENOMEM_fsck_add_nlink;
2559                 }
2560 
2561                 if (t->d)
2562                         memcpy(d, t->d, t->size * sizeof(t->d[0]));
2563                 kvfree(t->d);
2564 
2565                 t->d = d;
2566                 t->size = new_size;
2567         }
2568 
2569 
2570         t->d[t->nr++] = (struct nlink) {
2571                 .inum           = inum,
2572                 .snapshot       = snapshot,
2573         };
2574 
2575         return 0;
2576 }
2577 
2578 static int nlink_cmp(const void *_l, const void *_r)
2579 {
2580         const struct nlink *l = _l;
2581         const struct nlink *r = _r;
2582 
2583         return cmp_int(l->inum, r->inum);
2584 }
2585 
2586 static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
2587                      struct nlink_table *links,
2588                      u64 range_start, u64 range_end, u64 inum, u32 snapshot)
2589 {
2590         struct nlink *link, key = {
2591                 .inum = inum, .snapshot = U32_MAX,
2592         };
2593 
2594         if (inum < range_start || inum >= range_end)
2595                 return;
2596 
2597         link = __inline_bsearch(&key, links->d, links->nr,
2598                                 sizeof(links->d[0]), nlink_cmp);
2599         if (!link)
2600                 return;
2601 
2602         while (link > links->d && link[0].inum == link[-1].inum)
2603                 --link;
2604 
2605         for (; link < links->d + links->nr && link->inum == inum; link++)
2606                 if (ref_visible(c, s, snapshot, link->snapshot)) {
2607                         link->count++;
2608                         if (link->snapshot >= snapshot)
2609                                 break;
2610                 }
2611 }
2612 
2613 noinline_for_stack
2614 static int check_nlinks_find_hardlinks(struct bch_fs *c,
2615                                        struct nlink_table *t,
2616                                        u64 start, u64 *end)
2617 {
2618         int ret = bch2_trans_run(c,
2619                 for_each_btree_key(trans, iter, BTREE_ID_inodes,
2620                                    POS(0, start),
2621                                    BTREE_ITER_intent|
2622                                    BTREE_ITER_prefetch|
2623                                    BTREE_ITER_all_snapshots, k, ({
2624                         if (!bkey_is_inode(k.k))
2625                                 continue;
2626 
2627                         /* Should never fail, checked by bch2_inode_invalid: */
2628                         struct bch_inode_unpacked u;
2629                         BUG_ON(bch2_inode_unpack(k, &u));
2630 
2631                         /*
2632                          * Backpointer and directory structure checks are sufficient for
2633                          * directories, since they can't have hardlinks:
2634                          */
2635                         if (S_ISDIR(u.bi_mode))
2636                                 continue;
2637 
2638                         if (!u.bi_nlink)
2639                                 continue;
2640 
2641                         ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
2642                         if (ret) {
2643                                 *end = k.k->p.offset;
2644                                 ret = 0;
2645                                 break;
2646                         }
2647                         0;
2648                 })));
2649 
2650         bch_err_fn(c, ret);
2651         return ret;
2652 }
2653 
2654 noinline_for_stack
2655 static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
2656                                      u64 range_start, u64 range_end)
2657 {
2658         struct snapshots_seen s;
2659 
2660         snapshots_seen_init(&s);
2661 
2662         int ret = bch2_trans_run(c,
2663                 for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN,
2664                                    BTREE_ITER_intent|
2665                                    BTREE_ITER_prefetch|
2666                                    BTREE_ITER_all_snapshots, k, ({
2667                         ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p);
2668                         if (ret)
2669                                 break;
2670 
2671                         if (k.k->type == KEY_TYPE_dirent) {
2672                                 struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
2673 
2674                                 if (d.v->d_type != DT_DIR &&
2675                                     d.v->d_type != DT_SUBVOL)
2676                                         inc_link(c, &s, links, range_start, range_end,
2677                                                  le64_to_cpu(d.v->d_inum), d.k->p.snapshot);
2678                         }
2679                         0;
2680                 })));
2681 
2682         snapshots_seen_exit(&s);
2683 
2684         bch_err_fn(c, ret);
2685         return ret;
2686 }
2687 
2688 static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter,
2689                                      struct bkey_s_c k,
2690                                      struct nlink_table *links,
2691                                      size_t *idx, u64 range_end)
2692 {
2693         struct bch_inode_unpacked u;
2694         struct nlink *link = &links->d[*idx];
2695         int ret = 0;
2696 
2697         if (k.k->p.offset >= range_end)
2698                 return 1;
2699 
2700         if (!bkey_is_inode(k.k))
2701                 return 0;
2702 
2703         BUG_ON(bch2_inode_unpack(k, &u));
2704 
2705         if (S_ISDIR(u.bi_mode))
2706                 return 0;
2707 
2708         if (!u.bi_nlink)
2709                 return 0;
2710 
2711         while ((cmp_int(link->inum, k.k->p.offset) ?:
2712                 cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
2713                 BUG_ON(*idx == links->nr);
2714                 link = &links->d[++*idx];
2715         }
2716 
2717         if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count,
2718                         trans, inode_wrong_nlink,
2719                         "inode %llu type %s has wrong i_nlink (%u, should be %u)",
2720                         u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
2721                         bch2_inode_nlink_get(&u), link->count)) {
2722                 bch2_inode_nlink_set(&u, link->count);
2723                 ret = __bch2_fsck_write_inode(trans, &u, k.k->p.snapshot);
2724         }
2725 fsck_err:
2726         return ret;
2727 }
2728 
2729 noinline_for_stack
2730 static int check_nlinks_update_hardlinks(struct bch_fs *c,
2731                                struct nlink_table *links,
2732                                u64 range_start, u64 range_end)
2733 {
2734         size_t idx = 0;
2735 
2736         int ret = bch2_trans_run(c,
2737                 for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
2738                                 POS(0, range_start),
2739                                 BTREE_ITER_intent|BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
2740                                 NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2741                         check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)));
2742         if (ret < 0) {
2743                 bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret));
2744                 return ret;
2745         }
2746 
2747         return 0;
2748 }
2749 
2750 int bch2_check_nlinks(struct bch_fs *c)
2751 {
2752         struct nlink_table links = { 0 };
2753         u64 this_iter_range_start, next_iter_range_start = 0;
2754         int ret = 0;
2755 
2756         do {
2757                 this_iter_range_start = next_iter_range_start;
2758                 next_iter_range_start = U64_MAX;
2759 
2760                 ret = check_nlinks_find_hardlinks(c, &links,
2761                                                   this_iter_range_start,
2762                                                   &next_iter_range_start);
2763 
2764                 ret = check_nlinks_walk_dirents(c, &links,
2765                                           this_iter_range_start,
2766                                           next_iter_range_start);
2767                 if (ret)
2768                         break;
2769 
2770                 ret = check_nlinks_update_hardlinks(c, &links,
2771                                          this_iter_range_start,
2772                                          next_iter_range_start);
2773                 if (ret)
2774                         break;
2775 
2776                 links.nr = 0;
2777         } while (next_iter_range_start != U64_MAX);
2778 
2779         kvfree(links.d);
2780         bch_err_fn(c, ret);
2781         return ret;
2782 }
2783 
2784 static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
2785                              struct bkey_s_c k)
2786 {
2787         struct bkey_s_c_reflink_p p;
2788         struct bkey_i_reflink_p *u;
2789 
2790         if (k.k->type != KEY_TYPE_reflink_p)
2791                 return 0;
2792 
2793         p = bkey_s_c_to_reflink_p(k);
2794 
2795         if (!p.v->front_pad && !p.v->back_pad)
2796                 return 0;
2797 
2798         u = bch2_trans_kmalloc(trans, sizeof(*u));
2799         int ret = PTR_ERR_OR_ZERO(u);
2800         if (ret)
2801                 return ret;
2802 
2803         bkey_reassemble(&u->k_i, k);
2804         u->v.front_pad  = 0;
2805         u->v.back_pad   = 0;
2806 
2807         return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_norun);
2808 }
2809 
2810 int bch2_fix_reflink_p(struct bch_fs *c)
2811 {
2812         if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix)
2813                 return 0;
2814 
2815         int ret = bch2_trans_run(c,
2816                 for_each_btree_key_commit(trans, iter,
2817                                 BTREE_ID_extents, POS_MIN,
2818                                 BTREE_ITER_intent|BTREE_ITER_prefetch|
2819                                 BTREE_ITER_all_snapshots, k,
2820                                 NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2821                         fix_reflink_p_key(trans, &iter, k)));
2822         bch_err_fn(c, ret);
2823         return ret;
2824 }
2825 

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

kernel.org | git.kernel.org | LWN.net | Project Home | SVN repository | Mail admin

Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.

sflogo.php