~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

TOMOYO Linux Cross Reference
Linux/fs/bcachefs/fsck.c

Version: ~ [ linux-6.11.5 ] ~ [ linux-6.10.14 ] ~ [ linux-6.9.12 ] ~ [ linux-6.8.12 ] ~ [ linux-6.7.12 ] ~ [ linux-6.6.58 ] ~ [ linux-6.5.13 ] ~ [ linux-6.4.16 ] ~ [ linux-6.3.13 ] ~ [ linux-6.2.16 ] ~ [ linux-6.1.114 ] ~ [ linux-6.0.19 ] ~ [ linux-5.19.17 ] ~ [ linux-5.18.19 ] ~ [ linux-5.17.15 ] ~ [ linux-5.16.20 ] ~ [ linux-5.15.169 ] ~ [ linux-5.14.21 ] ~ [ linux-5.13.19 ] ~ [ linux-5.12.19 ] ~ [ linux-5.11.22 ] ~ [ linux-5.10.228 ] ~ [ linux-5.9.16 ] ~ [ linux-5.8.18 ] ~ [ linux-5.7.19 ] ~ [ linux-5.6.19 ] ~ [ linux-5.5.19 ] ~ [ linux-5.4.284 ] ~ [ linux-5.3.18 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.322 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.336 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.337 ] ~ [ linux-4.4.302 ] ~ [ linux-3.10.108 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.9 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 // SPDX-License-Identifier: GPL-2.0
  2 
  3 #include "bcachefs.h"
  4 #include "bkey_buf.h"
  5 #include "btree_cache.h"
  6 #include "btree_update.h"
  7 #include "buckets.h"
  8 #include "darray.h"
  9 #include "dirent.h"
 10 #include "error.h"
 11 #include "fs.h"
 12 #include "fs-common.h"
 13 #include "fsck.h"
 14 #include "inode.h"
 15 #include "keylist.h"
 16 #include "recovery_passes.h"
 17 #include "snapshot.h"
 18 #include "super.h"
 19 #include "xattr.h"
 20 
 21 #include <linux/bsearch.h>
 22 #include <linux/dcache.h> /* struct qstr */
 23 
 24 /*
 25  * XXX: this is handling transaction restarts without returning
 26  * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore:
 27  */
 28 static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
 29                                     u32 snapshot)
 30 {
 31         u64 sectors = 0;
 32 
 33         int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
 34                                 SPOS(inum, 0, snapshot),
 35                                 POS(inum, U64_MAX),
 36                                 0, k, ({
 37                 if (bkey_extent_is_allocation(k.k))
 38                         sectors += k.k->size;
 39                 0;
 40         }));
 41 
 42         return ret ?: sectors;
 43 }
 44 
 45 static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
 46                                     u32 snapshot)
 47 {
 48         u64 subdirs = 0;
 49 
 50         int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_dirents,
 51                                     SPOS(inum, 0, snapshot),
 52                                     POS(inum, U64_MAX),
 53                                     0, k, ({
 54                 if (k.k->type == KEY_TYPE_dirent &&
 55                     bkey_s_c_to_dirent(k).v->d_type == DT_DIR)
 56                         subdirs++;
 57                 0;
 58         }));
 59 
 60         return ret ?: subdirs;
 61 }
 62 
 63 static int subvol_lookup(struct btree_trans *trans, u32 subvol,
 64                          u32 *snapshot, u64 *inum)
 65 {
 66         struct bch_subvolume s;
 67         int ret = bch2_subvolume_get(trans, subvol, false, 0, &s);
 68 
 69         *snapshot = le32_to_cpu(s.snapshot);
 70         *inum = le64_to_cpu(s.inode);
 71         return ret;
 72 }
 73 
 74 static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
 75                               struct bch_inode_unpacked *inode)
 76 {
 77         struct btree_iter iter;
 78         struct bkey_s_c k;
 79         int ret;
 80 
 81         for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr),
 82                                      BTREE_ITER_all_snapshots, k, ret) {
 83                 if (k.k->p.offset != inode_nr)
 84                         break;
 85                 if (!bkey_is_inode(k.k))
 86                         continue;
 87                 ret = bch2_inode_unpack(k, inode);
 88                 goto found;
 89         }
 90         ret = -BCH_ERR_ENOENT_inode;
 91 found:
 92         bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr);
 93         bch2_trans_iter_exit(trans, &iter);
 94         return ret;
 95 }
 96 
 97 static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
 98                         struct bch_inode_unpacked *inode,
 99                         u32 *snapshot)
100 {
101         struct btree_iter iter;
102         struct bkey_s_c k;
103         int ret;
104 
105         k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
106                                SPOS(0, inode_nr, *snapshot), 0);
107         ret = bkey_err(k);
108         if (ret)
109                 goto err;
110 
111         ret = bkey_is_inode(k.k)
112                 ? bch2_inode_unpack(k, inode)
113                 : -BCH_ERR_ENOENT_inode;
114         if (!ret)
115                 *snapshot = iter.pos.snapshot;
116 err:
117         bch2_trans_iter_exit(trans, &iter);
118         return ret;
119 }
120 
121 static int lookup_dirent_in_snapshot(struct btree_trans *trans,
122                            struct bch_hash_info hash_info,
123                            subvol_inum dir, struct qstr *name,
124                            u64 *target, unsigned *type, u32 snapshot)
125 {
126         struct btree_iter iter;
127         struct bkey_s_c k = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc,
128                                                          &hash_info, dir, name, 0, snapshot);
129         int ret = bkey_err(k);
130         if (ret)
131                 return ret;
132 
133         struct bkey_s_c_dirent d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter));
134         *target = le64_to_cpu(d.v->d_inum);
135         *type = d.v->d_type;
136         bch2_trans_iter_exit(trans, &iter);
137         return 0;
138 }
139 
140 static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
141 {
142         struct bch_fs *c = trans->c;
143         struct btree_iter iter;
144         struct bch_inode_unpacked dir_inode;
145         struct bch_hash_info dir_hash_info;
146         int ret;
147 
148         ret = lookup_first_inode(trans, pos.inode, &dir_inode);
149         if (ret)
150                 goto err;
151 
152         dir_hash_info = bch2_hash_info_init(c, &dir_inode);
153 
154         bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent);
155 
156         ret =   bch2_btree_iter_traverse(&iter) ?:
157                 bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
158                                     &dir_hash_info, &iter,
159                                     BTREE_UPDATE_internal_snapshot_node);
160         bch2_trans_iter_exit(trans, &iter);
161 err:
162         bch_err_fn(c, ret);
163         return ret;
164 }
165 
166 /* Get lost+found, create if it doesn't exist: */
167 static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
168                             struct bch_inode_unpacked *lostfound,
169                             u64 reattaching_inum)
170 {
171         struct bch_fs *c = trans->c;
172         struct qstr lostfound_str = QSTR("lost+found");
173         u64 inum = 0;
174         unsigned d_type = 0;
175         int ret;
176 
177         struct bch_snapshot_tree st;
178         ret = bch2_snapshot_tree_lookup(trans,
179                         bch2_snapshot_tree(c, snapshot), &st);
180         if (ret)
181                 return ret;
182 
183         subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) };
184 
185         struct bch_subvolume subvol;
186         ret = bch2_subvolume_get(trans, le32_to_cpu(st.master_subvol),
187                                  false, 0, &subvol);
188         bch_err_msg(c, ret, "looking up root subvol %u for snapshot %u",
189                     le32_to_cpu(st.master_subvol), snapshot);
190         if (ret)
191                 return ret;
192 
193         if (!subvol.inode) {
194                 struct btree_iter iter;
195                 struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter,
196                                 BTREE_ID_subvolumes, POS(0, le32_to_cpu(st.master_subvol)),
197                                 0, subvolume);
198                 ret = PTR_ERR_OR_ZERO(subvol);
199                 if (ret)
200                         return ret;
201 
202                 subvol->v.inode = cpu_to_le64(reattaching_inum);
203                 bch2_trans_iter_exit(trans, &iter);
204         }
205 
206         root_inum.inum = le64_to_cpu(subvol.inode);
207 
208         struct bch_inode_unpacked root_inode;
209         struct bch_hash_info root_hash_info;
210         u32 root_inode_snapshot = snapshot;
211         ret = lookup_inode(trans, root_inum.inum, &root_inode, &root_inode_snapshot);
212         bch_err_msg(c, ret, "looking up root inode %llu for subvol %u",
213                     root_inum.inum, le32_to_cpu(st.master_subvol));
214         if (ret)
215                 return ret;
216 
217         root_hash_info = bch2_hash_info_init(c, &root_inode);
218 
219         ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum,
220                               &lostfound_str, &inum, &d_type, snapshot);
221         if (bch2_err_matches(ret, ENOENT))
222                 goto create_lostfound;
223 
224         bch_err_fn(c, ret);
225         if (ret)
226                 return ret;
227 
228         if (d_type != DT_DIR) {
229                 bch_err(c, "error looking up lost+found: not a directory");
230                 return -BCH_ERR_ENOENT_not_directory;
231         }
232 
233         /*
234          * The bch2_check_dirents pass has already run, dangling dirents
235          * shouldn't exist here:
236          */
237         ret = lookup_inode(trans, inum, lostfound, &snapshot);
238         bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)",
239                     inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot));
240         return ret;
241 
242 create_lostfound:
243         /*
244          * XXX: we could have a nicer log message here  if we had a nice way to
245          * walk backpointers to print a path
246          */
247         bch_notice(c, "creating lost+found in snapshot %u", le32_to_cpu(st.root_snapshot));
248 
249         u64 now = bch2_current_time(c);
250         struct btree_iter lostfound_iter = { NULL };
251         u64 cpu = raw_smp_processor_id();
252 
253         bch2_inode_init_early(c, lostfound);
254         bch2_inode_init_late(lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode);
255         lostfound->bi_dir = root_inode.bi_inum;
256 
257         root_inode.bi_nlink++;
258 
259         ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu);
260         if (ret)
261                 goto err;
262 
263         bch2_btree_iter_set_snapshot(&lostfound_iter, snapshot);
264         ret = bch2_btree_iter_traverse(&lostfound_iter);
265         if (ret)
266                 goto err;
267 
268         ret =   bch2_dirent_create_snapshot(trans,
269                                 0, root_inode.bi_inum, snapshot, &root_hash_info,
270                                 mode_to_type(lostfound->bi_mode),
271                                 &lostfound_str,
272                                 lostfound->bi_inum,
273                                 &lostfound->bi_dir_offset,
274                                 STR_HASH_must_create) ?:
275                 bch2_inode_write_flags(trans, &lostfound_iter, lostfound,
276                                        BTREE_UPDATE_internal_snapshot_node);
277 err:
278         bch_err_msg(c, ret, "creating lost+found");
279         bch2_trans_iter_exit(trans, &lostfound_iter);
280         return ret;
281 }
282 
283 static int reattach_inode(struct btree_trans *trans,
284                           struct bch_inode_unpacked *inode,
285                           u32 inode_snapshot)
286 {
287         struct bch_fs *c = trans->c;
288         struct bch_hash_info dir_hash;
289         struct bch_inode_unpacked lostfound;
290         char name_buf[20];
291         struct qstr name;
292         u64 dir_offset = 0;
293         u32 dirent_snapshot = inode_snapshot;
294         int ret;
295 
296         if (inode->bi_subvol) {
297                 inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL;
298 
299                 u64 root_inum;
300                 ret = subvol_lookup(trans, inode->bi_parent_subvol,
301                                     &dirent_snapshot, &root_inum);
302                 if (ret)
303                         return ret;
304 
305                 snprintf(name_buf, sizeof(name_buf), "subvol-%u", inode->bi_subvol);
306         } else {
307                 snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
308         }
309 
310         ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum);
311         if (ret)
312                 return ret;
313 
314         if (S_ISDIR(inode->bi_mode)) {
315                 lostfound.bi_nlink++;
316 
317                 ret = __bch2_fsck_write_inode(trans, &lostfound, U32_MAX);
318                 if (ret)
319                         return ret;
320         }
321 
322         dir_hash = bch2_hash_info_init(c, &lostfound);
323 
324         name = (struct qstr) QSTR(name_buf);
325 
326         ret = bch2_dirent_create_snapshot(trans,
327                                 inode->bi_parent_subvol, lostfound.bi_inum,
328                                 dirent_snapshot,
329                                 &dir_hash,
330                                 inode_d_type(inode),
331                                 &name,
332                                 inode->bi_subvol ?: inode->bi_inum,
333                                 &dir_offset,
334                                 STR_HASH_must_create);
335         if (ret) {
336                 bch_err_msg(c, ret, "error creating dirent");
337                 return ret;
338         }
339 
340         inode->bi_dir           = lostfound.bi_inum;
341         inode->bi_dir_offset    = dir_offset;
342 
343         return __bch2_fsck_write_inode(trans, inode, inode_snapshot);
344 }
345 
346 static int remove_backpointer(struct btree_trans *trans,
347                               struct bch_inode_unpacked *inode)
348 {
349         struct btree_iter iter;
350         struct bkey_s_c_dirent d;
351         int ret;
352 
353         d = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents,
354                                      POS(inode->bi_dir, inode->bi_dir_offset), 0,
355                                      dirent);
356         ret =   bkey_err(d) ?:
357                 __remove_dirent(trans, d.k->p);
358         bch2_trans_iter_exit(trans, &iter);
359         return ret;
360 }
361 
362 static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume s)
363 {
364         struct bch_fs *c = trans->c;
365 
366         struct bch_inode_unpacked inode;
367         int ret = bch2_inode_find_by_inum_trans(trans,
368                                 (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
369                                 &inode);
370         if (ret)
371                 return ret;
372 
373         ret = remove_backpointer(trans, &inode);
374         bch_err_msg(c, ret, "removing dirent");
375         if (ret)
376                 return ret;
377 
378         ret = reattach_inode(trans, &inode, le32_to_cpu(s.v->snapshot));
379         bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
380         return ret;
381 }
382 
383 static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 subvolid, u64 inum)
384 {
385         struct bch_fs *c = trans->c;
386 
387         if (!bch2_snapshot_is_leaf(c, snapshotid)) {
388                 bch_err(c, "need to reconstruct subvol, but have interior node snapshot");
389                 return -BCH_ERR_fsck_repair_unimplemented;
390         }
391 
392         /*
393          * If inum isn't set, that means we're being called from check_dirents,
394          * not check_inodes - the root of this subvolume doesn't exist or we
395          * would have found it there:
396          */
397         if (!inum) {
398                 struct btree_iter inode_iter = {};
399                 struct bch_inode_unpacked new_inode;
400                 u64 cpu = raw_smp_processor_id();
401 
402                 bch2_inode_init_early(c, &new_inode);
403                 bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL);
404 
405                 new_inode.bi_subvol = subvolid;
406 
407                 int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?:
408                           bch2_btree_iter_traverse(&inode_iter) ?:
409                           bch2_inode_write(trans, &inode_iter, &new_inode);
410                 bch2_trans_iter_exit(trans, &inode_iter);
411                 if (ret)
412                         return ret;
413 
414                 inum = new_inode.bi_inum;
415         }
416 
417         bch_info(c, "reconstructing subvol %u with root inode %llu", subvolid, inum);
418 
419         struct bkey_i_subvolume *new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol));
420         int ret = PTR_ERR_OR_ZERO(new_subvol);
421         if (ret)
422                 return ret;
423 
424         bkey_subvolume_init(&new_subvol->k_i);
425         new_subvol->k.p.offset  = subvolid;
426         new_subvol->v.snapshot  = cpu_to_le32(snapshotid);
427         new_subvol->v.inode     = cpu_to_le64(inum);
428         ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &new_subvol->k_i, 0);
429         if (ret)
430                 return ret;
431 
432         struct btree_iter iter;
433         struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter,
434                         BTREE_ID_snapshots, POS(0, snapshotid),
435                         0, snapshot);
436         ret = PTR_ERR_OR_ZERO(s);
437         bch_err_msg(c, ret, "getting snapshot %u", snapshotid);
438         if (ret)
439                 return ret;
440 
441         u32 snapshot_tree = le32_to_cpu(s->v.tree);
442 
443         s->v.subvol = cpu_to_le32(subvolid);
444         SET_BCH_SNAPSHOT_SUBVOL(&s->v, true);
445         bch2_trans_iter_exit(trans, &iter);
446 
447         struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter,
448                         BTREE_ID_snapshot_trees, POS(0, snapshot_tree),
449                         0, snapshot_tree);
450         ret = PTR_ERR_OR_ZERO(st);
451         bch_err_msg(c, ret, "getting snapshot tree %u", snapshot_tree);
452         if (ret)
453                 return ret;
454 
455         if (!st->v.master_subvol)
456                 st->v.master_subvol = cpu_to_le32(subvolid);
457 
458         bch2_trans_iter_exit(trans, &iter);
459         return 0;
460 }
461 
462 static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 snapshot, u64 inum)
463 {
464         struct bch_fs *c = trans->c;
465         unsigned i_mode = S_IFREG;
466         u64 i_size = 0;
467 
468         switch (btree) {
469         case BTREE_ID_extents: {
470                 struct btree_iter iter = {};
471 
472                 bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0);
473                 struct bkey_s_c k = bch2_btree_iter_peek_prev(&iter);
474                 bch2_trans_iter_exit(trans, &iter);
475                 int ret = bkey_err(k);
476                 if (ret)
477                         return ret;
478 
479                 i_size = k.k->p.offset << 9;
480                 break;
481         }
482         case BTREE_ID_dirents:
483                 i_mode = S_IFDIR;
484                 break;
485         case BTREE_ID_xattrs:
486                 break;
487         default:
488                 BUG();
489         }
490 
491         struct bch_inode_unpacked new_inode;
492         bch2_inode_init_early(c, &new_inode);
493         bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL);
494         new_inode.bi_size = i_size;
495         new_inode.bi_inum = inum;
496 
497         return __bch2_fsck_write_inode(trans, &new_inode, snapshot);
498 }
499 
500 struct snapshots_seen {
501         struct bpos                     pos;
502         snapshot_id_list                ids;
503 };
504 
505 static inline void snapshots_seen_exit(struct snapshots_seen *s)
506 {
507         darray_exit(&s->ids);
508 }
509 
510 static inline void snapshots_seen_init(struct snapshots_seen *s)
511 {
512         memset(s, 0, sizeof(*s));
513 }
514 
515 static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id)
516 {
517         u32 *i;
518         __darray_for_each(s->ids, i) {
519                 if (*i == id)
520                         return 0;
521                 if (*i > id)
522                         break;
523         }
524 
525         int ret = darray_insert_item(&s->ids, i - s->ids.data, id);
526         if (ret)
527                 bch_err(c, "error reallocating snapshots_seen table (size %zu)",
528                         s->ids.size);
529         return ret;
530 }
531 
532 static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
533                                  enum btree_id btree_id, struct bpos pos)
534 {
535         if (!bkey_eq(s->pos, pos))
536                 s->ids.nr = 0;
537         s->pos = pos;
538 
539         return snapshot_list_add_nodup(c, &s->ids, pos.snapshot);
540 }
541 
542 /**
543  * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor,
544  * and @ancestor hasn't been overwritten in @seen
545  *
546  * @c:          filesystem handle
547  * @seen:       list of snapshot ids already seen at current position
548  * @id:         descendent snapshot id
549  * @ancestor:   ancestor snapshot id
550  *
551  * Returns:     whether key in @ancestor snapshot is visible in @id snapshot
552  */
553 static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
554                                     u32 id, u32 ancestor)
555 {
556         ssize_t i;
557 
558         EBUG_ON(id > ancestor);
559 
560         /* @ancestor should be the snapshot most recently added to @seen */
561         EBUG_ON(ancestor != seen->pos.snapshot);
562         EBUG_ON(ancestor != darray_last(seen->ids));
563 
564         if (id == ancestor)
565                 return true;
566 
567         if (!bch2_snapshot_is_ancestor(c, id, ancestor))
568                 return false;
569 
570         /*
571          * We know that @id is a descendant of @ancestor, we're checking if
572          * we've seen a key that overwrote @ancestor - i.e. also a descendent of
573          * @ascestor and with @id as a descendent.
574          *
575          * But we already know that we're scanning IDs between @id and @ancestor
576          * numerically, since snapshot ID lists are kept sorted, so if we find
577          * an id that's an ancestor of @id we're done:
578          */
579 
580         for (i = seen->ids.nr - 2;
581              i >= 0 && seen->ids.data[i] >= id;
582              --i)
583                 if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i]))
584                         return false;
585 
586         return true;
587 }
588 
589 /**
590  * ref_visible - given a key with snapshot id @src that points to a key with
591  * snapshot id @dst, test whether there is some snapshot in which @dst is
592  * visible.
593  *
594  * @c:          filesystem handle
595  * @s:          list of snapshot IDs already seen at @src
596  * @src:        snapshot ID of src key
597  * @dst:        snapshot ID of dst key
598  * Returns:     true if there is some snapshot in which @dst is visible
599  *
600  * Assumes we're visiting @src keys in natural key order
601  */
602 static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s,
603                         u32 src, u32 dst)
604 {
605         return dst <= src
606                 ? key_visible_in_snapshot(c, s, dst, src)
607                 : bch2_snapshot_is_ancestor(c, src, dst);
608 }
609 
610 static int ref_visible2(struct bch_fs *c,
611                         u32 src, struct snapshots_seen *src_seen,
612                         u32 dst, struct snapshots_seen *dst_seen)
613 {
614         if (dst > src) {
615                 swap(dst, src);
616                 swap(dst_seen, src_seen);
617         }
618         return key_visible_in_snapshot(c, src_seen, dst, src);
619 }
620 
621 #define for_each_visible_inode(_c, _s, _w, _snapshot, _i)                               \
622         for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr &&        \
623              (_i)->snapshot <= (_snapshot); _i++)                                       \
624                 if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))
625 
626 struct inode_walker_entry {
627         struct bch_inode_unpacked inode;
628         u32                     snapshot;
629         bool                    seen_this_pos;
630         u64                     count;
631 };
632 
633 struct inode_walker {
634         bool                            first_this_inode;
635         bool                            recalculate_sums;
636         struct bpos                     last_pos;
637 
638         DARRAY(struct inode_walker_entry) inodes;
639 };
640 
641 static void inode_walker_exit(struct inode_walker *w)
642 {
643         darray_exit(&w->inodes);
644 }
645 
646 static struct inode_walker inode_walker_init(void)
647 {
648         return (struct inode_walker) { 0, };
649 }
650 
651 static int add_inode(struct bch_fs *c, struct inode_walker *w,
652                      struct bkey_s_c inode)
653 {
654         struct bch_inode_unpacked u;
655 
656         BUG_ON(bch2_inode_unpack(inode, &u));
657 
658         return darray_push(&w->inodes, ((struct inode_walker_entry) {
659                 .inode          = u,
660                 .snapshot       = inode.k->p.snapshot,
661         }));
662 }
663 
664 static int get_inodes_all_snapshots(struct btree_trans *trans,
665                                     struct inode_walker *w, u64 inum)
666 {
667         struct bch_fs *c = trans->c;
668         struct btree_iter iter;
669         struct bkey_s_c k;
670         int ret;
671 
672         w->recalculate_sums = false;
673         w->inodes.nr = 0;
674 
675         for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
676                                      BTREE_ITER_all_snapshots, k, ret) {
677                 if (k.k->p.offset != inum)
678                         break;
679 
680                 if (bkey_is_inode(k.k))
681                         add_inode(c, w, k);
682         }
683         bch2_trans_iter_exit(trans, &iter);
684 
685         if (ret)
686                 return ret;
687 
688         w->first_this_inode = true;
689         return 0;
690 }
691 
692 static struct inode_walker_entry *
693 lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k)
694 {
695         bool is_whiteout = k.k->type == KEY_TYPE_whiteout;
696 
697         struct inode_walker_entry *i;
698         __darray_for_each(w->inodes, i)
699                 if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->snapshot))
700                         goto found;
701 
702         return NULL;
703 found:
704         BUG_ON(k.k->p.snapshot > i->snapshot);
705 
706         if (k.k->p.snapshot != i->snapshot && !is_whiteout) {
707                 struct inode_walker_entry new = *i;
708 
709                 new.snapshot = k.k->p.snapshot;
710                 new.count = 0;
711 
712                 struct printbuf buf = PRINTBUF;
713                 bch2_bkey_val_to_text(&buf, c, k);
714 
715                 bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
716                          "unexpected because we should always update the inode when we update a key in that inode\n"
717                          "%s",
718                          w->last_pos.inode, k.k->p.snapshot, i->snapshot, buf.buf);
719                 printbuf_exit(&buf);
720 
721                 while (i > w->inodes.data && i[-1].snapshot > k.k->p.snapshot)
722                         --i;
723 
724                 size_t pos = i - w->inodes.data;
725                 int ret = darray_insert_item(&w->inodes, pos, new);
726                 if (ret)
727                         return ERR_PTR(ret);
728 
729                 i = w->inodes.data + pos;
730         }
731 
732         return i;
733 }
734 
735 static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
736                                              struct inode_walker *w,
737                                              struct bkey_s_c k)
738 {
739         if (w->last_pos.inode != k.k->p.inode) {
740                 int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode);
741                 if (ret)
742                         return ERR_PTR(ret);
743         } else if (bkey_cmp(w->last_pos, k.k->p)) {
744                 darray_for_each(w->inodes, i)
745                         i->seen_this_pos = false;
746         }
747 
748         w->last_pos = k.k->p;
749 
750         return lookup_inode_for_snapshot(trans->c, w, k);
751 }
752 
753 static int get_visible_inodes(struct btree_trans *trans,
754                               struct inode_walker *w,
755                               struct snapshots_seen *s,
756                               u64 inum)
757 {
758         struct bch_fs *c = trans->c;
759         struct btree_iter iter;
760         struct bkey_s_c k;
761         int ret;
762 
763         w->inodes.nr = 0;
764 
765         for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
766                            BTREE_ITER_all_snapshots, k, ret) {
767                 if (k.k->p.offset != inum)
768                         break;
769 
770                 if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot))
771                         continue;
772 
773                 if (bkey_is_inode(k.k))
774                         add_inode(c, w, k);
775 
776                 if (k.k->p.snapshot >= s->pos.snapshot)
777                         break;
778         }
779         bch2_trans_iter_exit(trans, &iter);
780 
781         return ret;
782 }
783 
784 static int hash_redo_key(struct btree_trans *trans,
785                          const struct bch_hash_desc desc,
786                          struct bch_hash_info *hash_info,
787                          struct btree_iter *k_iter, struct bkey_s_c k)
788 {
789         struct bkey_i *delete;
790         struct bkey_i *tmp;
791 
792         delete = bch2_trans_kmalloc(trans, sizeof(*delete));
793         if (IS_ERR(delete))
794                 return PTR_ERR(delete);
795 
796         tmp = bch2_bkey_make_mut_noupdate(trans, k);
797         if (IS_ERR(tmp))
798                 return PTR_ERR(tmp);
799 
800         bkey_init(&delete->k);
801         delete->k.p = k_iter->pos;
802         return  bch2_btree_iter_traverse(k_iter) ?:
803                 bch2_trans_update(trans, k_iter, delete, 0) ?:
804                 bch2_hash_set_in_snapshot(trans, desc, hash_info,
805                                        (subvol_inum) { 0, k.k->p.inode },
806                                        k.k->p.snapshot, tmp,
807                                        STR_HASH_must_create|
808                                        BTREE_UPDATE_internal_snapshot_node) ?:
809                 bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
810 }
811 
812 static int hash_check_key(struct btree_trans *trans,
813                           const struct bch_hash_desc desc,
814                           struct bch_hash_info *hash_info,
815                           struct btree_iter *k_iter, struct bkey_s_c hash_k)
816 {
817         struct bch_fs *c = trans->c;
818         struct btree_iter iter = { NULL };
819         struct printbuf buf = PRINTBUF;
820         struct bkey_s_c k;
821         u64 hash;
822         int ret = 0;
823 
824         if (hash_k.k->type != desc.key_type)
825                 return 0;
826 
827         hash = desc.hash_bkey(hash_info, hash_k);
828 
829         if (likely(hash == hash_k.k->p.offset))
830                 return 0;
831 
832         if (hash_k.k->p.offset < hash)
833                 goto bad_hash;
834 
835         for_each_btree_key_norestart(trans, iter, desc.btree_id,
836                                      SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot),
837                                      BTREE_ITER_slots, k, ret) {
838                 if (bkey_eq(k.k->p, hash_k.k->p))
839                         break;
840 
841                 if (fsck_err_on(k.k->type == desc.key_type &&
842                                 !desc.cmp_bkey(k, hash_k),
843                                 trans, hash_table_key_duplicate,
844                                 "duplicate hash table keys:\n%s",
845                                 (printbuf_reset(&buf),
846                                  bch2_bkey_val_to_text(&buf, c, hash_k),
847                                  buf.buf))) {
848                         ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1;
849                         break;
850                 }
851 
852                 if (bkey_deleted(k.k)) {
853                         bch2_trans_iter_exit(trans, &iter);
854                         goto bad_hash;
855                 }
856         }
857 out:
858         bch2_trans_iter_exit(trans, &iter);
859         printbuf_exit(&buf);
860         return ret;
861 bad_hash:
862         if (fsck_err(trans, hash_table_key_wrong_offset,
863                      "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s",
864                      bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash,
865                      (printbuf_reset(&buf),
866                       bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) {
867                 ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k);
868                 bch_err_fn(c, ret);
869                 if (ret)
870                         return ret;
871                 ret = -BCH_ERR_transaction_restart_nested;
872         }
873 fsck_err:
874         goto out;
875 }
876 
877 static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
878                                                 struct btree_iter *iter,
879                                                 struct bpos pos)
880 {
881         return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
882 }
883 
884 static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans,
885                                                struct btree_iter *iter,
886                                                struct bch_inode_unpacked *inode,
887                                                u32 *snapshot)
888 {
889         if (inode->bi_subvol) {
890                 u64 inum;
891                 int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum);
892                 if (ret)
893                         return ((struct bkey_s_c_dirent) { .k = ERR_PTR(ret) });
894         }
895 
896         return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot));
897 }
898 
899 static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
900                                    struct bkey_s_c_dirent d)
901 {
902         return  inode->bi_dir           == d.k->p.inode &&
903                 inode->bi_dir_offset    == d.k->p.offset;
904 }
905 
906 static bool dirent_points_to_inode(struct bkey_s_c_dirent d,
907                                    struct bch_inode_unpacked *inode)
908 {
909         return d.v->d_type == DT_SUBVOL
910                 ? le32_to_cpu(d.v->d_child_subvol)      == inode->bi_subvol
911                 : le64_to_cpu(d.v->d_inum)              == inode->bi_inum;
912 }
913 
914 static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
915 {
916         struct btree_iter iter;
917         struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0);
918         int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set;
919         bch2_trans_iter_exit(trans, &iter);
920         return ret;
921 }
922 
923 static int check_inode_dirent_inode(struct btree_trans *trans, struct bkey_s_c inode_k,
924                                     struct bch_inode_unpacked *inode,
925                                     u32 inode_snapshot, bool *write_inode)
926 {
927         struct bch_fs *c = trans->c;
928         struct printbuf buf = PRINTBUF;
929 
930         struct btree_iter dirent_iter = {};
931         struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot);
932         int ret = bkey_err(d);
933         if (ret && !bch2_err_matches(ret, ENOENT))
934                 return ret;
935 
936         if (fsck_err_on(ret,
937                         trans, inode_points_to_missing_dirent,
938                         "inode points to missing dirent\n%s",
939                         (bch2_bkey_val_to_text(&buf, c, inode_k), buf.buf)) ||
940             fsck_err_on(!ret && !dirent_points_to_inode(d, inode),
941                         trans, inode_points_to_wrong_dirent,
942                         "inode points to dirent that does not point back:\n%s",
943                         (bch2_bkey_val_to_text(&buf, c, inode_k),
944                          prt_newline(&buf),
945                          bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
946                 /*
947                  * We just clear the backpointer fields for now. If we find a
948                  * dirent that points to this inode in check_dirents(), we'll
949                  * update it then; then when we get to check_path() if the
950                  * backpointer is still 0 we'll reattach it.
951                  */
952                 inode->bi_dir = 0;
953                 inode->bi_dir_offset = 0;
954                 inode->bi_flags &= ~BCH_INODE_backptr_untrusted;
955                 *write_inode = true;
956         }
957 
958         ret = 0;
959 fsck_err:
960         bch2_trans_iter_exit(trans, &dirent_iter);
961         printbuf_exit(&buf);
962         bch_err_fn(c, ret);
963         return ret;
964 }
965 
966 static bool bch2_inode_open(struct bch_fs *c, struct bpos p)
967 {
968         subvol_inum inum = {
969                 .subvol = snapshot_t(c, p.snapshot)->subvol,
970                 .inum   = p.offset,
971         };
972 
973         /* snapshot tree corruption, can't safely delete */
974         if (!inum.subvol) {
975                 bch_err_ratelimited(c, "%s(): snapshot %u has no subvol", __func__, p.snapshot);
976                 return true;
977         }
978 
979         return __bch2_inode_hash_find(c, inum) != NULL;
980 }
981 
982 static int check_inode(struct btree_trans *trans,
983                        struct btree_iter *iter,
984                        struct bkey_s_c k,
985                        struct bch_inode_unpacked *prev,
986                        struct snapshots_seen *s,
987                        bool full)
988 {
989         struct bch_fs *c = trans->c;
990         struct bch_inode_unpacked u;
991         bool do_update = false;
992         int ret;
993 
994         ret = bch2_check_key_has_snapshot(trans, iter, k);
995         if (ret < 0)
996                 goto err;
997         if (ret)
998                 return 0;
999 
1000         ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
1001         if (ret)
1002                 goto err;
1003 
1004         if (!bkey_is_inode(k.k))
1005                 return 0;
1006 
1007         BUG_ON(bch2_inode_unpack(k, &u));
1008 
1009         if (!full &&
1010             !(u.bi_flags & (BCH_INODE_i_size_dirty|
1011                             BCH_INODE_i_sectors_dirty|
1012                             BCH_INODE_unlinked)))
1013                 return 0;
1014 
1015         if (prev->bi_inum != u.bi_inum)
1016                 *prev = u;
1017 
1018         if (fsck_err_on(prev->bi_hash_seed      != u.bi_hash_seed ||
1019                         inode_d_type(prev)      != inode_d_type(&u),
1020                         trans, inode_snapshot_mismatch,
1021                         "inodes in different snapshots don't match")) {
1022                 bch_err(c, "repair not implemented yet");
1023                 return -BCH_ERR_fsck_repair_unimplemented;
1024         }
1025 
1026         if ((u.bi_flags & (BCH_INODE_i_size_dirty|BCH_INODE_unlinked)) &&
1027             bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) {
1028                 struct bpos new_min_pos;
1029 
1030                 ret = bch2_propagate_key_to_snapshot_leaves(trans, iter->btree_id, k, &new_min_pos);
1031                 if (ret)
1032                         goto err;
1033 
1034                 u.bi_flags &= ~BCH_INODE_i_size_dirty|BCH_INODE_unlinked;
1035 
1036                 ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
1037 
1038                 bch_err_msg(c, ret, "in fsck updating inode");
1039                 if (ret)
1040                         return ret;
1041 
1042                 if (!bpos_eq(new_min_pos, POS_MIN))
1043                         bch2_btree_iter_set_pos(iter, bpos_predecessor(new_min_pos));
1044                 return 0;
1045         }
1046 
1047         if (u.bi_flags & BCH_INODE_unlinked) {
1048                 ret = check_inode_deleted_list(trans, k.k->p);
1049                 if (ret < 0)
1050                         return ret;
1051 
1052                 fsck_err_on(!ret,
1053                             trans, unlinked_inode_not_on_deleted_list,
1054                             "inode %llu:%u unlinked, but not on deleted list",
1055                             u.bi_inum, k.k->p.snapshot);
1056                 ret = 0;
1057         }
1058 
1059         if (u.bi_flags & BCH_INODE_unlinked &&
1060             !bch2_inode_open(c, k.k->p) &&
1061             (!c->sb.clean ||
1062              fsck_err(trans, inode_unlinked_but_clean,
1063                       "filesystem marked clean, but inode %llu unlinked",
1064                       u.bi_inum))) {
1065                 ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
1066                 bch_err_msg(c, ret, "in fsck deleting inode");
1067                 return ret;
1068         }
1069 
1070         if (u.bi_flags & BCH_INODE_i_size_dirty &&
1071             (!c->sb.clean ||
1072              fsck_err(trans, inode_i_size_dirty_but_clean,
1073                       "filesystem marked clean, but inode %llu has i_size dirty",
1074                       u.bi_inum))) {
1075                 bch_verbose(c, "truncating inode %llu", u.bi_inum);
1076 
1077                 /*
1078                  * XXX: need to truncate partial blocks too here - or ideally
1079                  * just switch units to bytes and that issue goes away
1080                  */
1081                 ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
1082                                 SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9,
1083                                      iter->pos.snapshot),
1084                                 POS(u.bi_inum, U64_MAX),
1085                                 0, NULL);
1086                 bch_err_msg(c, ret, "in fsck truncating inode");
1087                 if (ret)
1088                         return ret;
1089 
1090                 /*
1091                  * We truncated without our normal sector accounting hook, just
1092                  * make sure we recalculate it:
1093                  */
1094                 u.bi_flags |= BCH_INODE_i_sectors_dirty;
1095 
1096                 u.bi_flags &= ~BCH_INODE_i_size_dirty;
1097                 do_update = true;
1098         }
1099 
1100         if (u.bi_flags & BCH_INODE_i_sectors_dirty &&
1101             (!c->sb.clean ||
1102              fsck_err(trans, inode_i_sectors_dirty_but_clean,
1103                       "filesystem marked clean, but inode %llu has i_sectors dirty",
1104                       u.bi_inum))) {
1105                 s64 sectors;
1106 
1107                 bch_verbose(c, "recounting sectors for inode %llu",
1108                             u.bi_inum);
1109 
1110                 sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot);
1111                 if (sectors < 0) {
1112                         bch_err_msg(c, sectors, "in fsck recounting inode sectors");
1113                         return sectors;
1114                 }
1115 
1116                 u.bi_sectors = sectors;
1117                 u.bi_flags &= ~BCH_INODE_i_sectors_dirty;
1118                 do_update = true;
1119         }
1120 
1121         if (u.bi_flags & BCH_INODE_backptr_untrusted) {
1122                 u.bi_dir = 0;
1123                 u.bi_dir_offset = 0;
1124                 u.bi_flags &= ~BCH_INODE_backptr_untrusted;
1125                 do_update = true;
1126         }
1127 
1128         if (u.bi_dir || u.bi_dir_offset) {
1129                 ret = check_inode_dirent_inode(trans, k, &u, k.k->p.snapshot, &do_update);
1130                 if (ret)
1131                         goto err;
1132         }
1133 
1134         if (fsck_err_on(u.bi_parent_subvol &&
1135                         (u.bi_subvol == 0 ||
1136                          u.bi_subvol == BCACHEFS_ROOT_SUBVOL),
1137                         trans, inode_bi_parent_nonzero,
1138                         "inode %llu:%u has subvol %u but nonzero parent subvol %u",
1139                         u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) {
1140                 u.bi_parent_subvol = 0;
1141                 do_update = true;
1142         }
1143 
1144         if (u.bi_subvol) {
1145                 struct bch_subvolume s;
1146 
1147                 ret = bch2_subvolume_get(trans, u.bi_subvol, false, 0, &s);
1148                 if (ret && !bch2_err_matches(ret, ENOENT))
1149                         goto err;
1150 
1151                 if (ret && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
1152                         ret = reconstruct_subvol(trans, k.k->p.snapshot, u.bi_subvol, u.bi_inum);
1153                         goto do_update;
1154                 }
1155 
1156                 if (fsck_err_on(ret,
1157                                 trans, inode_bi_subvol_missing,
1158                                 "inode %llu:%u bi_subvol points to missing subvolume %u",
1159                                 u.bi_inum, k.k->p.snapshot, u.bi_subvol) ||
1160                     fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum ||
1161                                 !bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot),
1162                                                            k.k->p.snapshot),
1163                                 trans, inode_bi_subvol_wrong,
1164                                 "inode %llu:%u points to subvol %u, but subvol points to %llu:%u",
1165                                 u.bi_inum, k.k->p.snapshot, u.bi_subvol,
1166                                 le64_to_cpu(s.inode),
1167                                 le32_to_cpu(s.snapshot))) {
1168                         u.bi_subvol = 0;
1169                         u.bi_parent_subvol = 0;
1170                         do_update = true;
1171                 }
1172         }
1173 do_update:
1174         if (do_update) {
1175                 ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
1176                 bch_err_msg(c, ret, "in fsck updating inode");
1177                 if (ret)
1178                         return ret;
1179         }
1180 err:
1181 fsck_err:
1182         bch_err_fn(c, ret);
1183         return ret;
1184 }
1185 
1186 int bch2_check_inodes(struct bch_fs *c)
1187 {
1188         bool full = c->opts.fsck;
1189         struct bch_inode_unpacked prev = { 0 };
1190         struct snapshots_seen s;
1191 
1192         snapshots_seen_init(&s);
1193 
1194         int ret = bch2_trans_run(c,
1195                 for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
1196                                 POS_MIN,
1197                                 BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
1198                                 NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
1199                         check_inode(trans, &iter, k, &prev, &s, full)));
1200 
1201         snapshots_seen_exit(&s);
1202         bch_err_fn(c, ret);
1203         return ret;
1204 }
1205 
1206 static inline bool btree_matches_i_mode(enum btree_id btree, unsigned mode)
1207 {
1208         switch (btree) {
1209         case BTREE_ID_extents:
1210                 return S_ISREG(mode) || S_ISLNK(mode);
1211         case BTREE_ID_dirents:
1212                 return S_ISDIR(mode);
1213         case BTREE_ID_xattrs:
1214                 return true;
1215         default:
1216                 BUG();
1217         }
1218 }
1219 
1220 static int check_key_has_inode(struct btree_trans *trans,
1221                                struct btree_iter *iter,
1222                                struct inode_walker *inode,
1223                                struct inode_walker_entry *i,
1224                                struct bkey_s_c k)
1225 {
1226         struct bch_fs *c = trans->c;
1227         struct printbuf buf = PRINTBUF;
1228         int ret = PTR_ERR_OR_ZERO(i);
1229         if (ret)
1230                 return ret;
1231 
1232         if (k.k->type == KEY_TYPE_whiteout)
1233                 goto out;
1234 
1235         if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
1236                 ret =   reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?:
1237                         bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
1238                 if (ret)
1239                         goto err;
1240 
1241                 inode->last_pos.inode--;
1242                 ret = -BCH_ERR_transaction_restart_nested;
1243                 goto err;
1244         }
1245 
1246         if (fsck_err_on(!i,
1247                         trans, key_in_missing_inode,
1248                         "key in missing inode:\n  %s",
1249                         (printbuf_reset(&buf),
1250                          bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1251                 goto delete;
1252 
1253         if (fsck_err_on(i && !btree_matches_i_mode(iter->btree_id, i->inode.bi_mode),
1254                         trans, key_in_wrong_inode_type,
1255                         "key for wrong inode mode %o:\n  %s",
1256                         i->inode.bi_mode,
1257                         (printbuf_reset(&buf),
1258                          bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1259                 goto delete;
1260 out:
1261 err:
1262 fsck_err:
1263         printbuf_exit(&buf);
1264         bch_err_fn(c, ret);
1265         return ret;
1266 delete:
1267         ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node);
1268         goto out;
1269 }
1270 
1271 static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w)
1272 {
1273         struct bch_fs *c = trans->c;
1274         int ret = 0;
1275         s64 count2;
1276 
1277         darray_for_each(w->inodes, i) {
1278                 if (i->inode.bi_sectors == i->count)
1279                         continue;
1280 
1281                 count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot);
1282 
1283                 if (w->recalculate_sums)
1284                         i->count = count2;
1285 
1286                 if (i->count != count2) {
1287                         bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
1288                                             w->last_pos.inode, i->snapshot, i->count, count2);
1289                         return -BCH_ERR_internal_fsck_err;
1290                 }
1291 
1292                 if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
1293                                 trans, inode_i_sectors_wrong,
1294                                 "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
1295                                 w->last_pos.inode, i->snapshot,
1296                                 i->inode.bi_sectors, i->count)) {
1297                         i->inode.bi_sectors = i->count;
1298                         ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot);
1299                         if (ret)
1300                                 break;
1301                 }
1302         }
1303 fsck_err:
1304         bch_err_fn(c, ret);
1305         return ret;
1306 }
1307 
1308 static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
1309 {
1310         u32 restart_count = trans->restart_count;
1311         return check_i_sectors_notnested(trans, w) ?:
1312                 trans_was_restarted(trans, restart_count);
1313 }
1314 
1315 struct extent_end {
1316         u32                     snapshot;
1317         u64                     offset;
1318         struct snapshots_seen   seen;
1319 };
1320 
1321 struct extent_ends {
1322         struct bpos                     last_pos;
1323         DARRAY(struct extent_end)       e;
1324 };
1325 
1326 static void extent_ends_reset(struct extent_ends *extent_ends)
1327 {
1328         darray_for_each(extent_ends->e, i)
1329                 snapshots_seen_exit(&i->seen);
1330         extent_ends->e.nr = 0;
1331 }
1332 
1333 static void extent_ends_exit(struct extent_ends *extent_ends)
1334 {
1335         extent_ends_reset(extent_ends);
1336         darray_exit(&extent_ends->e);
1337 }
1338 
1339 static void extent_ends_init(struct extent_ends *extent_ends)
1340 {
1341         memset(extent_ends, 0, sizeof(*extent_ends));
1342 }
1343 
1344 static int extent_ends_at(struct bch_fs *c,
1345                           struct extent_ends *extent_ends,
1346                           struct snapshots_seen *seen,
1347                           struct bkey_s_c k)
1348 {
1349         struct extent_end *i, n = (struct extent_end) {
1350                 .offset         = k.k->p.offset,
1351                 .snapshot       = k.k->p.snapshot,
1352                 .seen           = *seen,
1353         };
1354 
1355         n.seen.ids.data = kmemdup(seen->ids.data,
1356                               sizeof(seen->ids.data[0]) * seen->ids.size,
1357                               GFP_KERNEL);
1358         if (!n.seen.ids.data)
1359                 return -BCH_ERR_ENOMEM_fsck_extent_ends_at;
1360 
1361         __darray_for_each(extent_ends->e, i) {
1362                 if (i->snapshot == k.k->p.snapshot) {
1363                         snapshots_seen_exit(&i->seen);
1364                         *i = n;
1365                         return 0;
1366                 }
1367 
1368                 if (i->snapshot >= k.k->p.snapshot)
1369                         break;
1370         }
1371 
1372         return darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n);
1373 }
1374 
1375 static int overlapping_extents_found(struct btree_trans *trans,
1376                                      enum btree_id btree,
1377                                      struct bpos pos1, struct snapshots_seen *pos1_seen,
1378                                      struct bkey pos2,
1379                                      bool *fixed,
1380                                      struct extent_end *extent_end)
1381 {
1382         struct bch_fs *c = trans->c;
1383         struct printbuf buf = PRINTBUF;
1384         struct btree_iter iter1, iter2 = { NULL };
1385         struct bkey_s_c k1, k2;
1386         int ret;
1387 
1388         BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2)));
1389 
1390         bch2_trans_iter_init(trans, &iter1, btree, pos1,
1391                              BTREE_ITER_all_snapshots|
1392                              BTREE_ITER_not_extents);
1393         k1 = bch2_btree_iter_peek_upto(&iter1, POS(pos1.inode, U64_MAX));
1394         ret = bkey_err(k1);
1395         if (ret)
1396                 goto err;
1397 
1398         prt_str(&buf, "\n  ");
1399         bch2_bkey_val_to_text(&buf, c, k1);
1400 
1401         if (!bpos_eq(pos1, k1.k->p)) {
1402                 prt_str(&buf, "\n  wanted\n  ");
1403                 bch2_bpos_to_text(&buf, pos1);
1404                 prt_str(&buf, "\n  ");
1405                 bch2_bkey_to_text(&buf, &pos2);
1406 
1407                 bch_err(c, "%s: error finding first overlapping extent when repairing, got%s",
1408                         __func__, buf.buf);
1409                 ret = -BCH_ERR_internal_fsck_err;
1410                 goto err;
1411         }
1412 
1413         bch2_trans_copy_iter(&iter2, &iter1);
1414 
1415         while (1) {
1416                 bch2_btree_iter_advance(&iter2);
1417 
1418                 k2 = bch2_btree_iter_peek_upto(&iter2, POS(pos1.inode, U64_MAX));
1419                 ret = bkey_err(k2);
1420                 if (ret)
1421                         goto err;
1422 
1423                 if (bpos_ge(k2.k->p, pos2.p))
1424                         break;
1425         }
1426 
1427         prt_str(&buf, "\n  ");
1428         bch2_bkey_val_to_text(&buf, c, k2);
1429 
1430         if (bpos_gt(k2.k->p, pos2.p) ||
1431             pos2.size != k2.k->size) {
1432                 bch_err(c, "%s: error finding seconding overlapping extent when repairing%s",
1433                         __func__, buf.buf);
1434                 ret = -BCH_ERR_internal_fsck_err;
1435                 goto err;
1436         }
1437 
1438         prt_printf(&buf, "\n  overwriting %s extent",
1439                    pos1.snapshot >= pos2.p.snapshot ? "first" : "second");
1440 
1441         if (fsck_err(trans, extent_overlapping,
1442                      "overlapping extents%s", buf.buf)) {
1443                 struct btree_iter *old_iter = &iter1;
1444                 struct disk_reservation res = { 0 };
1445 
1446                 if (pos1.snapshot < pos2.p.snapshot) {
1447                         old_iter = &iter2;
1448                         swap(k1, k2);
1449                 }
1450 
1451                 trans->extra_disk_res += bch2_bkey_sectors_compressed(k2);
1452 
1453                 ret =   bch2_trans_update_extent_overwrite(trans, old_iter,
1454                                 BTREE_UPDATE_internal_snapshot_node,
1455                                 k1, k2) ?:
1456                         bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc);
1457                 bch2_disk_reservation_put(c, &res);
1458 
1459                 if (ret)
1460                         goto err;
1461 
1462                 *fixed = true;
1463 
1464                 if (pos1.snapshot == pos2.p.snapshot) {
1465                         /*
1466                          * We overwrote the first extent, and did the overwrite
1467                          * in the same snapshot:
1468                          */
1469                         extent_end->offset = bkey_start_offset(&pos2);
1470                 } else if (pos1.snapshot > pos2.p.snapshot) {
1471                         /*
1472                          * We overwrote the first extent in pos2's snapshot:
1473                          */
1474                         ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot);
1475                 } else {
1476                         /*
1477                          * We overwrote the second extent - restart
1478                          * check_extent() from the top:
1479                          */
1480                         ret = -BCH_ERR_transaction_restart_nested;
1481                 }
1482         }
1483 fsck_err:
1484 err:
1485         bch2_trans_iter_exit(trans, &iter2);
1486         bch2_trans_iter_exit(trans, &iter1);
1487         printbuf_exit(&buf);
1488         return ret;
1489 }
1490 
1491 static int check_overlapping_extents(struct btree_trans *trans,
1492                               struct snapshots_seen *seen,
1493                               struct extent_ends *extent_ends,
1494                               struct bkey_s_c k,
1495                               struct btree_iter *iter,
1496                               bool *fixed)
1497 {
1498         struct bch_fs *c = trans->c;
1499         int ret = 0;
1500 
1501         /* transaction restart, running again */
1502         if (bpos_eq(extent_ends->last_pos, k.k->p))
1503                 return 0;
1504 
1505         if (extent_ends->last_pos.inode != k.k->p.inode)
1506                 extent_ends_reset(extent_ends);
1507 
1508         darray_for_each(extent_ends->e, i) {
1509                 if (i->offset <= bkey_start_offset(k.k))
1510                         continue;
1511 
1512                 if (!ref_visible2(c,
1513                                   k.k->p.snapshot, seen,
1514                                   i->snapshot, &i->seen))
1515                         continue;
1516 
1517                 ret = overlapping_extents_found(trans, iter->btree_id,
1518                                                 SPOS(iter->pos.inode,
1519                                                      i->offset,
1520                                                      i->snapshot),
1521                                                 &i->seen,
1522                                                 *k.k, fixed, i);
1523                 if (ret)
1524                         goto err;
1525         }
1526 
1527         extent_ends->last_pos = k.k->p;
1528 err:
1529         return ret;
1530 }
1531 
1532 static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter,
1533                                 struct bkey_s_c k)
1534 {
1535         struct bch_fs *c = trans->c;
1536         struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1537         struct bch_extent_crc_unpacked crc;
1538         const union bch_extent_entry *i;
1539         unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9;
1540 
1541         bkey_for_each_crc(k.k, ptrs, crc, i)
1542                 if (crc_is_encoded(crc) &&
1543                     crc.uncompressed_size > encoded_extent_max_sectors) {
1544                         struct printbuf buf = PRINTBUF;
1545 
1546                         bch2_bkey_val_to_text(&buf, c, k);
1547                         bch_err(c, "overbig encoded extent, please report this:\n  %s", buf.buf);
1548                         printbuf_exit(&buf);
1549                 }
1550 
1551         return 0;
1552 }
1553 
1554 static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
1555                         struct bkey_s_c k,
1556                         struct inode_walker *inode,
1557                         struct snapshots_seen *s,
1558                         struct extent_ends *extent_ends)
1559 {
1560         struct bch_fs *c = trans->c;
1561         struct inode_walker_entry *i;
1562         struct printbuf buf = PRINTBUF;
1563         int ret = 0;
1564 
1565         ret = bch2_check_key_has_snapshot(trans, iter, k);
1566         if (ret) {
1567                 ret = ret < 0 ? ret : 0;
1568                 goto out;
1569         }
1570 
1571         if (inode->last_pos.inode != k.k->p.inode) {
1572                 ret = check_i_sectors(trans, inode);
1573                 if (ret)
1574                         goto err;
1575         }
1576 
1577         ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
1578         if (ret)
1579                 goto err;
1580 
1581         i = walk_inode(trans, inode, k);
1582         ret = PTR_ERR_OR_ZERO(i);
1583         if (ret)
1584                 goto err;
1585 
1586         ret = check_key_has_inode(trans, iter, inode, i, k);
1587         if (ret)
1588                 goto err;
1589 
1590         if (k.k->type != KEY_TYPE_whiteout) {
1591                 ret = check_overlapping_extents(trans, s, extent_ends, k, iter,
1592                                                 &inode->recalculate_sums);
1593                 if (ret)
1594                         goto err;
1595         }
1596 
1597         /*
1598          * Check inodes in reverse order, from oldest snapshots to newest,
1599          * starting from the inode that matches this extent's snapshot. If we
1600          * didn't have one, iterate over all inodes:
1601          */
1602         if (!i)
1603                 i = &darray_last(inode->inodes);
1604 
1605         for (;
1606              inode->inodes.data && i >= inode->inodes.data;
1607              --i) {
1608                 if (i->snapshot > k.k->p.snapshot ||
1609                     !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
1610                         continue;
1611 
1612                 if (k.k->type != KEY_TYPE_whiteout) {
1613                         if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) &&
1614                                         k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
1615                                         !bkey_extent_is_reservation(k),
1616                                         trans, extent_past_end_of_inode,
1617                                         "extent type past end of inode %llu:%u, i_size %llu\n  %s",
1618                                         i->inode.bi_inum, i->snapshot, i->inode.bi_size,
1619                                         (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1620                                 struct btree_iter iter2;
1621 
1622                                 bch2_trans_copy_iter(&iter2, iter);
1623                                 bch2_btree_iter_set_snapshot(&iter2, i->snapshot);
1624                                 ret =   bch2_btree_iter_traverse(&iter2) ?:
1625                                         bch2_btree_delete_at(trans, &iter2,
1626                                                 BTREE_UPDATE_internal_snapshot_node);
1627                                 bch2_trans_iter_exit(trans, &iter2);
1628                                 if (ret)
1629                                         goto err;
1630 
1631                                 iter->k.type = KEY_TYPE_whiteout;
1632                         }
1633 
1634                         if (bkey_extent_is_allocation(k.k))
1635                                 i->count += k.k->size;
1636                 }
1637 
1638                 i->seen_this_pos = true;
1639         }
1640 
1641         if (k.k->type != KEY_TYPE_whiteout) {
1642                 ret = extent_ends_at(c, extent_ends, s, k);
1643                 if (ret)
1644                         goto err;
1645         }
1646 out:
1647 err:
1648 fsck_err:
1649         printbuf_exit(&buf);
1650         bch_err_fn(c, ret);
1651         return ret;
1652 }
1653 
1654 /*
1655  * Walk extents: verify that extents have a corresponding S_ISREG inode, and
1656  * that i_size an i_sectors are consistent
1657  */
1658 int bch2_check_extents(struct bch_fs *c)
1659 {
1660         struct inode_walker w = inode_walker_init();
1661         struct snapshots_seen s;
1662         struct extent_ends extent_ends;
1663         struct disk_reservation res = { 0 };
1664 
1665         snapshots_seen_init(&s);
1666         extent_ends_init(&extent_ends);
1667 
1668         int ret = bch2_trans_run(c,
1669                 for_each_btree_key_commit(trans, iter, BTREE_ID_extents,
1670                                 POS(BCACHEFS_ROOT_INO, 0),
1671                                 BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
1672                                 &res, NULL,
1673                                 BCH_TRANS_COMMIT_no_enospc, ({
1674                         bch2_disk_reservation_put(c, &res);
1675                         check_extent(trans, &iter, k, &w, &s, &extent_ends) ?:
1676                         check_extent_overbig(trans, &iter, k);
1677                 })) ?:
1678                 check_i_sectors_notnested(trans, &w));
1679 
1680         bch2_disk_reservation_put(c, &res);
1681         extent_ends_exit(&extent_ends);
1682         inode_walker_exit(&w);
1683         snapshots_seen_exit(&s);
1684 
1685         bch_err_fn(c, ret);
1686         return ret;
1687 }
1688 
1689 int bch2_check_indirect_extents(struct bch_fs *c)
1690 {
1691         struct disk_reservation res = { 0 };
1692 
1693         int ret = bch2_trans_run(c,
1694                 for_each_btree_key_commit(trans, iter, BTREE_ID_reflink,
1695                                 POS_MIN,
1696                                 BTREE_ITER_prefetch, k,
1697                                 &res, NULL,
1698                                 BCH_TRANS_COMMIT_no_enospc, ({
1699                         bch2_disk_reservation_put(c, &res);
1700                         check_extent_overbig(trans, &iter, k);
1701                 })));
1702 
1703         bch2_disk_reservation_put(c, &res);
1704         bch_err_fn(c, ret);
1705         return ret;
1706 }
1707 
1708 static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_walker *w)
1709 {
1710         struct bch_fs *c = trans->c;
1711         int ret = 0;
1712         s64 count2;
1713 
1714         darray_for_each(w->inodes, i) {
1715                 if (i->inode.bi_nlink == i->count)
1716                         continue;
1717 
1718                 count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->snapshot);
1719                 if (count2 < 0)
1720                         return count2;
1721 
1722                 if (i->count != count2) {
1723                         bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu",
1724                                             w->last_pos.inode, i->snapshot, i->count, count2);
1725                         i->count = count2;
1726                         if (i->inode.bi_nlink == i->count)
1727                                 continue;
1728                 }
1729 
1730                 if (fsck_err_on(i->inode.bi_nlink != i->count,
1731                                 trans, inode_dir_wrong_nlink,
1732                                 "directory %llu:%u with wrong i_nlink: got %u, should be %llu",
1733                                 w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
1734                         i->inode.bi_nlink = i->count;
1735                         ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot);
1736                         if (ret)
1737                                 break;
1738                 }
1739         }
1740 fsck_err:
1741         bch_err_fn(c, ret);
1742         return ret;
1743 }
1744 
1745 static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
1746 {
1747         u32 restart_count = trans->restart_count;
1748         return check_subdir_count_notnested(trans, w) ?:
1749                 trans_was_restarted(trans, restart_count);
1750 }
1751 
1752 noinline_for_stack
1753 static int check_dirent_inode_dirent(struct btree_trans *trans,
1754                                    struct btree_iter *iter,
1755                                    struct bkey_s_c_dirent d,
1756                                    struct bch_inode_unpacked *target,
1757                                    u32 target_snapshot)
1758 {
1759         struct bch_fs *c = trans->c;
1760         struct printbuf buf = PRINTBUF;
1761         int ret = 0;
1762 
1763         if (inode_points_to_dirent(target, d))
1764                 return 0;
1765 
1766         if (bch2_inode_should_have_bp(target) &&
1767             !fsck_err(trans, inode_wrong_backpointer,
1768                       "dirent points to inode that does not point back:\n  %s",
1769                       (bch2_bkey_val_to_text(&buf, c, d.s_c),
1770                        prt_printf(&buf, "\n  "),
1771                        bch2_inode_unpacked_to_text(&buf, target),
1772                        buf.buf)))
1773                 goto out_noiter;
1774 
1775         if (!target->bi_dir &&
1776             !target->bi_dir_offset) {
1777                 target->bi_dir          = d.k->p.inode;
1778                 target->bi_dir_offset   = d.k->p.offset;
1779                 return __bch2_fsck_write_inode(trans, target, target_snapshot);
1780         }
1781 
1782         struct btree_iter bp_iter = { NULL };
1783         struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter,
1784                               SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot));
1785         ret = bkey_err(bp_dirent);
1786         if (ret && !bch2_err_matches(ret, ENOENT))
1787                 goto err;
1788 
1789         bool backpointer_exists = !ret;
1790         ret = 0;
1791 
1792         if (fsck_err_on(!backpointer_exists,
1793                         trans, inode_wrong_backpointer,
1794                         "inode %llu:%u has wrong backpointer:\n"
1795                         "got       %llu:%llu\n"
1796                         "should be %llu:%llu",
1797                         target->bi_inum, target_snapshot,
1798                         target->bi_dir,
1799                         target->bi_dir_offset,
1800                         d.k->p.inode,
1801                         d.k->p.offset)) {
1802                 target->bi_dir          = d.k->p.inode;
1803                 target->bi_dir_offset   = d.k->p.offset;
1804                 ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
1805                 goto out;
1806         }
1807 
1808         bch2_bkey_val_to_text(&buf, c, d.s_c);
1809         prt_newline(&buf);
1810         if (backpointer_exists)
1811                 bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);
1812 
1813         if (fsck_err_on(backpointer_exists &&
1814                         (S_ISDIR(target->bi_mode) ||
1815                          target->bi_subvol),
1816                         trans, inode_dir_multiple_links,
1817                         "%s %llu:%u with multiple links\n%s",
1818                         S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
1819                         target->bi_inum, target_snapshot, buf.buf)) {
1820                 ret = __remove_dirent(trans, d.k->p);
1821                 goto out;
1822         }
1823 
1824         /*
1825          * hardlinked file with nlink 0:
1826          * We're just adjusting nlink here so check_nlinks() will pick
1827          * it up, it ignores inodes with nlink 0
1828          */
1829         if (fsck_err_on(backpointer_exists && !target->bi_nlink,
1830                         trans, inode_multiple_links_but_nlink_0,
1831                         "inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
1832                         target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
1833                 target->bi_nlink++;
1834                 target->bi_flags &= ~BCH_INODE_unlinked;
1835                 ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
1836                 if (ret)
1837                         goto err;
1838         }
1839 out:
1840 err:
1841 fsck_err:
1842         bch2_trans_iter_exit(trans, &bp_iter);
1843 out_noiter:
1844         printbuf_exit(&buf);
1845         bch_err_fn(c, ret);
1846         return ret;
1847 }
1848 
1849 noinline_for_stack
1850 static int check_dirent_target(struct btree_trans *trans,
1851                                struct btree_iter *iter,
1852                                struct bkey_s_c_dirent d,
1853                                struct bch_inode_unpacked *target,
1854                                u32 target_snapshot)
1855 {
1856         struct bch_fs *c = trans->c;
1857         struct bkey_i_dirent *n;
1858         struct printbuf buf = PRINTBUF;
1859         int ret = 0;
1860 
1861         ret = check_dirent_inode_dirent(trans, iter, d, target, target_snapshot);
1862         if (ret)
1863                 goto err;
1864 
1865         if (fsck_err_on(d.v->d_type != inode_d_type(target),
1866                         trans, dirent_d_type_wrong,
1867                         "incorrect d_type: got %s, should be %s:\n%s",
1868                         bch2_d_type_str(d.v->d_type),
1869                         bch2_d_type_str(inode_d_type(target)),
1870                         (printbuf_reset(&buf),
1871                          bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
1872                 n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
1873                 ret = PTR_ERR_OR_ZERO(n);
1874                 if (ret)
1875                         goto err;
1876 
1877                 bkey_reassemble(&n->k_i, d.s_c);
1878                 n->v.d_type = inode_d_type(target);
1879                 if (n->v.d_type == DT_SUBVOL) {
1880                         n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
1881                         n->v.d_child_subvol = cpu_to_le32(target->bi_subvol);
1882                 } else {
1883                         n->v.d_inum = cpu_to_le64(target->bi_inum);
1884                 }
1885 
1886                 ret = bch2_trans_update(trans, iter, &n->k_i, 0);
1887                 if (ret)
1888                         goto err;
1889 
1890                 d = dirent_i_to_s_c(n);
1891         }
1892 err:
1893 fsck_err:
1894         printbuf_exit(&buf);
1895         bch_err_fn(c, ret);
1896         return ret;
1897 }
1898 
1899 /* find a subvolume that's a descendent of @snapshot: */
1900 static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid)
1901 {
1902         struct btree_iter iter;
1903         struct bkey_s_c k;
1904         int ret;
1905 
1906         for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) {
1907                 if (k.k->type != KEY_TYPE_subvolume)
1908                         continue;
1909 
1910                 struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
1911                 if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) {
1912                         bch2_trans_iter_exit(trans, &iter);
1913                         *subvolid = k.k->p.offset;
1914                         goto found;
1915                 }
1916         }
1917         if (!ret)
1918                 ret = -ENOENT;
1919 found:
1920         bch2_trans_iter_exit(trans, &iter);
1921         return ret;
1922 }
1923 
1924 noinline_for_stack
1925 static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter,
1926                                   struct bkey_s_c_dirent d)
1927 {
1928         struct bch_fs *c = trans->c;
1929         struct btree_iter subvol_iter = {};
1930         struct bch_inode_unpacked subvol_root;
1931         u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol);
1932         u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
1933         u32 parent_snapshot;
1934         u32 new_parent_subvol = 0;
1935         u64 parent_inum;
1936         struct printbuf buf = PRINTBUF;
1937         int ret = 0;
1938 
1939         ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum);
1940         if (ret && !bch2_err_matches(ret, ENOENT))
1941                 return ret;
1942 
1943         if (ret ||
1944             (!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot))) {
1945                 int ret2 = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol);
1946                 if (ret2 && !bch2_err_matches(ret, ENOENT))
1947                         return ret2;
1948         }
1949 
1950         if (ret &&
1951             !new_parent_subvol &&
1952             (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
1953                 /*
1954                  * Couldn't find a subvol for dirent's snapshot - but we lost
1955                  * subvols, so we need to reconstruct:
1956                  */
1957                 ret = reconstruct_subvol(trans, d.k->p.snapshot, parent_subvol, 0);
1958                 if (ret)
1959                         return ret;
1960 
1961                 parent_snapshot = d.k->p.snapshot;
1962         }
1963 
1964         if (fsck_err_on(ret,
1965                         trans, dirent_to_missing_parent_subvol,
1966                         "dirent parent_subvol points to missing subvolume\n%s",
1967                         (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) ||
1968             fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot),
1969                         trans, dirent_not_visible_in_parent_subvol,
1970                         "dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s",
1971                         parent_snapshot,
1972                         (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
1973                 if (!new_parent_subvol) {
1974                         bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot);
1975                         return -BCH_ERR_fsck_repair_unimplemented;
1976                 }
1977 
1978                 struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent);
1979                 ret = PTR_ERR_OR_ZERO(new_dirent);
1980                 if (ret)
1981                         goto err;
1982 
1983                 new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol);
1984         }
1985 
1986         struct bkey_s_c_subvolume s =
1987                 bch2_bkey_get_iter_typed(trans, &subvol_iter,
1988                                          BTREE_ID_subvolumes, POS(0, target_subvol),
1989                                          0, subvolume);
1990         ret = bkey_err(s.s_c);
1991         if (ret && !bch2_err_matches(ret, ENOENT))
1992                 return ret;
1993 
1994         if (ret) {
1995                 if (fsck_err(trans, dirent_to_missing_subvol,
1996                              "dirent points to missing subvolume\n%s",
1997                              (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)))
1998                         return __remove_dirent(trans, d.k->p);
1999                 ret = 0;
2000                 goto out;
2001         }
2002 
2003         if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol,
2004                         trans, subvol_fs_path_parent_wrong,
2005                         "subvol with wrong fs_path_parent, should be be %u\n%s",
2006                         parent_subvol,
2007                         (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
2008                 struct bkey_i_subvolume *n =
2009                         bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume);
2010                 ret = PTR_ERR_OR_ZERO(n);
2011                 if (ret)
2012                         goto err;
2013 
2014                 n->v.fs_path_parent = cpu_to_le32(parent_subvol);
2015         }
2016 
2017         u64 target_inum = le64_to_cpu(s.v->inode);
2018         u32 target_snapshot = le32_to_cpu(s.v->snapshot);
2019 
2020         ret = lookup_inode(trans, target_inum, &subvol_root, &target_snapshot);
2021         if (ret && !bch2_err_matches(ret, ENOENT))
2022                 goto err;
2023 
2024         if (ret) {
2025                 bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum);
2026                 ret = -BCH_ERR_fsck_repair_unimplemented;
2027                 goto err;
2028         }
2029 
2030         if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol,
2031                         trans, inode_bi_parent_wrong,
2032                         "subvol root %llu has wrong bi_parent_subvol: got %u, should be %u",
2033                         target_inum,
2034                         subvol_root.bi_parent_subvol, parent_subvol)) {
2035                 subvol_root.bi_parent_subvol = parent_subvol;
2036                 ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot);
2037                 if (ret)
2038                         goto err;
2039         }
2040 
2041         ret = check_dirent_target(trans, iter, d, &subvol_root,
2042                                   target_snapshot);
2043         if (ret)
2044                 goto err;
2045 out:
2046 err:
2047 fsck_err:
2048         bch2_trans_iter_exit(trans, &subvol_iter);
2049         printbuf_exit(&buf);
2050         return ret;
2051 }
2052 
2053 static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
2054                         struct bkey_s_c k,
2055                         struct bch_hash_info *hash_info,
2056                         struct inode_walker *dir,
2057                         struct inode_walker *target,
2058                         struct snapshots_seen *s)
2059 {
2060         struct bch_fs *c = trans->c;
2061         struct inode_walker_entry *i;
2062         struct printbuf buf = PRINTBUF;
2063         int ret = 0;
2064 
2065         ret = bch2_check_key_has_snapshot(trans, iter, k);
2066         if (ret) {
2067                 ret = ret < 0 ? ret : 0;
2068                 goto out;
2069         }
2070 
2071         ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
2072         if (ret)
2073                 goto err;
2074 
2075         if (k.k->type == KEY_TYPE_whiteout)
2076                 goto out;
2077 
2078         if (dir->last_pos.inode != k.k->p.inode) {
2079                 ret = check_subdir_count(trans, dir);
2080                 if (ret)
2081                         goto err;
2082         }
2083 
2084         i = walk_inode(trans, dir, k);
2085         ret = PTR_ERR_OR_ZERO(i);
2086         if (ret < 0)
2087                 goto err;
2088 
2089         ret = check_key_has_inode(trans, iter, dir, i, k);
2090         if (ret)
2091                 goto err;
2092 
2093         if (!i)
2094                 goto out;
2095 
2096         if (dir->first_this_inode)
2097                 *hash_info = bch2_hash_info_init(c, &i->inode);
2098         dir->first_this_inode = false;
2099 
2100         ret = hash_check_key(trans, bch2_dirent_hash_desc, hash_info, iter, k);
2101         if (ret < 0)
2102                 goto err;
2103         if (ret) {
2104                 /* dirent has been deleted */
2105                 ret = 0;
2106                 goto out;
2107         }
2108 
2109         if (k.k->type != KEY_TYPE_dirent)
2110                 goto out;
2111 
2112         struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
2113 
2114         if (d.v->d_type == DT_SUBVOL) {
2115                 ret = check_dirent_to_subvol(trans, iter, d);
2116                 if (ret)
2117                         goto err;
2118         } else {
2119                 ret = get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
2120                 if (ret)
2121                         goto err;
2122 
2123                 if (fsck_err_on(!target->inodes.nr,
2124                                 trans, dirent_to_missing_inode,
2125                                 "dirent points to missing inode:\n%s",
2126                                 (printbuf_reset(&buf),
2127                                  bch2_bkey_val_to_text(&buf, c, k),
2128                                  buf.buf))) {
2129                         ret = __remove_dirent(trans, d.k->p);
2130                         if (ret)
2131                                 goto err;
2132                 }
2133 
2134                 darray_for_each(target->inodes, i) {
2135                         ret = check_dirent_target(trans, iter, d,
2136                                                   &i->inode, i->snapshot);
2137                         if (ret)
2138                                 goto err;
2139                 }
2140 
2141                 if (d.v->d_type == DT_DIR)
2142                         for_each_visible_inode(c, s, dir, d.k->p.snapshot, i)
2143                                 i->count++;
2144         }
2145 out:
2146 err:
2147 fsck_err:
2148         printbuf_exit(&buf);
2149         bch_err_fn(c, ret);
2150         return ret;
2151 }
2152 
2153 /*
2154  * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
2155  * validate d_type
2156  */
2157 int bch2_check_dirents(struct bch_fs *c)
2158 {
2159         struct inode_walker dir = inode_walker_init();
2160         struct inode_walker target = inode_walker_init();
2161         struct snapshots_seen s;
2162         struct bch_hash_info hash_info;
2163 
2164         snapshots_seen_init(&s);
2165 
2166         int ret = bch2_trans_run(c,
2167                 for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
2168                                 POS(BCACHEFS_ROOT_INO, 0),
2169                                 BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
2170                                 k,
2171                                 NULL, NULL,
2172                                 BCH_TRANS_COMMIT_no_enospc,
2173                         check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?:
2174                 check_subdir_count_notnested(trans, &dir));
2175 
2176         snapshots_seen_exit(&s);
2177         inode_walker_exit(&dir);
2178         inode_walker_exit(&target);
2179         bch_err_fn(c, ret);
2180         return ret;
2181 }
2182 
2183 static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
2184                        struct bkey_s_c k,
2185                        struct bch_hash_info *hash_info,
2186                        struct inode_walker *inode)
2187 {
2188         struct bch_fs *c = trans->c;
2189         struct inode_walker_entry *i;
2190         int ret;
2191 
2192         ret = bch2_check_key_has_snapshot(trans, iter, k);
2193         if (ret < 0)
2194                 return ret;
2195         if (ret)
2196                 return 0;
2197 
2198         i = walk_inode(trans, inode, k);
2199         ret = PTR_ERR_OR_ZERO(i);
2200         if (ret)
2201                 return ret;
2202 
2203         ret = check_key_has_inode(trans, iter, inode, i, k);
2204         if (ret)
2205                 return ret;
2206 
2207         if (!i)
2208                 return 0;
2209 
2210         if (inode->first_this_inode)
2211                 *hash_info = bch2_hash_info_init(c, &i->inode);
2212         inode->first_this_inode = false;
2213 
2214         ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k);
2215         bch_err_fn(c, ret);
2216         return ret;
2217 }
2218 
2219 /*
2220  * Walk xattrs: verify that they all have a corresponding inode
2221  */
2222 int bch2_check_xattrs(struct bch_fs *c)
2223 {
2224         struct inode_walker inode = inode_walker_init();
2225         struct bch_hash_info hash_info;
2226         int ret = 0;
2227 
2228         ret = bch2_trans_run(c,
2229                 for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
2230                         POS(BCACHEFS_ROOT_INO, 0),
2231                         BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
2232                         k,
2233                         NULL, NULL,
2234                         BCH_TRANS_COMMIT_no_enospc,
2235                 check_xattr(trans, &iter, k, &hash_info, &inode)));
2236 
2237         inode_walker_exit(&inode);
2238         bch_err_fn(c, ret);
2239         return ret;
2240 }
2241 
2242 static int check_root_trans(struct btree_trans *trans)
2243 {
2244         struct bch_fs *c = trans->c;
2245         struct bch_inode_unpacked root_inode;
2246         u32 snapshot;
2247         u64 inum;
2248         int ret;
2249 
2250         ret = subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
2251         if (ret && !bch2_err_matches(ret, ENOENT))
2252                 return ret;
2253 
2254         if (mustfix_fsck_err_on(ret, trans, root_subvol_missing,
2255                                 "root subvol missing")) {
2256                 struct bkey_i_subvolume *root_subvol =
2257                         bch2_trans_kmalloc(trans, sizeof(*root_subvol));
2258                 ret = PTR_ERR_OR_ZERO(root_subvol);
2259                 if (ret)
2260                         goto err;
2261 
2262                 snapshot        = U32_MAX;
2263                 inum            = BCACHEFS_ROOT_INO;
2264 
2265                 bkey_subvolume_init(&root_subvol->k_i);
2266                 root_subvol->k.p.offset = BCACHEFS_ROOT_SUBVOL;
2267                 root_subvol->v.flags    = 0;
2268                 root_subvol->v.snapshot = cpu_to_le32(snapshot);
2269                 root_subvol->v.inode    = cpu_to_le64(inum);
2270                 ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol->k_i, 0);
2271                 bch_err_msg(c, ret, "writing root subvol");
2272                 if (ret)
2273                         goto err;
2274         }
2275 
2276         ret = lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot);
2277         if (ret && !bch2_err_matches(ret, ENOENT))
2278                 return ret;
2279 
2280         if (mustfix_fsck_err_on(ret,
2281                                 trans, root_dir_missing,
2282                                 "root directory missing") ||
2283             mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode),
2284                                 trans, root_inode_not_dir,
2285                                 "root inode not a directory")) {
2286                 bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
2287                                 0, NULL);
2288                 root_inode.bi_inum = inum;
2289 
2290                 ret = __bch2_fsck_write_inode(trans, &root_inode, snapshot);
2291                 bch_err_msg(c, ret, "writing root inode");
2292         }
2293 err:
2294 fsck_err:
2295         return ret;
2296 }
2297 
2298 /* Get root directory, create if it doesn't exist: */
2299 int bch2_check_root(struct bch_fs *c)
2300 {
2301         int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2302                 check_root_trans(trans));
2303         bch_err_fn(c, ret);
2304         return ret;
2305 }
2306 
2307 typedef DARRAY(u32) darray_u32;
2308 
2309 static bool darray_u32_has(darray_u32 *d, u32 v)
2310 {
2311         darray_for_each(*d, i)
2312                 if (*i == v)
2313                         return true;
2314         return false;
2315 }
2316 
2317 /*
2318  * We've checked that inode backpointers point to valid dirents; here, it's
2319  * sufficient to check that the subvolume root has a dirent:
2320  */
2321 static int subvol_has_dirent(struct btree_trans *trans, struct bkey_s_c_subvolume s)
2322 {
2323         struct bch_inode_unpacked inode;
2324         int ret = bch2_inode_find_by_inum_trans(trans,
2325                                 (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
2326                                 &inode);
2327         if (ret)
2328                 return ret;
2329 
2330         return inode.bi_dir != 0;
2331 }
2332 
2333 static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k)
2334 {
2335         struct bch_fs *c = trans->c;
2336         struct btree_iter parent_iter = {};
2337         darray_u32 subvol_path = {};
2338         struct printbuf buf = PRINTBUF;
2339         int ret = 0;
2340 
2341         if (k.k->type != KEY_TYPE_subvolume)
2342                 return 0;
2343 
2344         while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) {
2345                 ret = darray_push(&subvol_path, k.k->p.offset);
2346                 if (ret)
2347                         goto err;
2348 
2349                 struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
2350 
2351                 ret = subvol_has_dirent(trans, s);
2352                 if (ret < 0)
2353                         break;
2354 
2355                 if (fsck_err_on(!ret,
2356                                 trans, subvol_unreachable,
2357                                 "unreachable subvolume %s",
2358                                 (bch2_bkey_val_to_text(&buf, c, s.s_c),
2359                                  buf.buf))) {
2360                         ret = reattach_subvol(trans, s);
2361                         break;
2362                 }
2363 
2364                 u32 parent = le32_to_cpu(s.v->fs_path_parent);
2365 
2366                 if (darray_u32_has(&subvol_path, parent)) {
2367                         if (fsck_err(c, subvol_loop, "subvolume loop"))
2368                                 ret = reattach_subvol(trans, s);
2369                         break;
2370                 }
2371 
2372                 bch2_trans_iter_exit(trans, &parent_iter);
2373                 bch2_trans_iter_init(trans, &parent_iter,
2374                                      BTREE_ID_subvolumes, POS(0, parent), 0);
2375                 k = bch2_btree_iter_peek_slot(&parent_iter);
2376                 ret = bkey_err(k);
2377                 if (ret)
2378                         goto err;
2379 
2380                 if (fsck_err_on(k.k->type != KEY_TYPE_subvolume,
2381                                 trans, subvol_unreachable,
2382                                 "unreachable subvolume %s",
2383                                 (bch2_bkey_val_to_text(&buf, c, s.s_c),
2384                                  buf.buf))) {
2385                         ret = reattach_subvol(trans, s);
2386                         break;
2387                 }
2388         }
2389 fsck_err:
2390 err:
2391         printbuf_exit(&buf);
2392         darray_exit(&subvol_path);
2393         bch2_trans_iter_exit(trans, &parent_iter);
2394         return ret;
2395 }
2396 
2397 int bch2_check_subvolume_structure(struct bch_fs *c)
2398 {
2399         int ret = bch2_trans_run(c,
2400                 for_each_btree_key_commit(trans, iter,
2401                                 BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k,
2402                                 NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2403                         check_subvol_path(trans, &iter, k)));
2404         bch_err_fn(c, ret);
2405         return ret;
2406 }
2407 
2408 struct pathbuf_entry {
2409         u64     inum;
2410         u32     snapshot;
2411 };
2412 
2413 typedef DARRAY(struct pathbuf_entry) pathbuf;
2414 
2415 static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
2416 {
2417         darray_for_each(*p, i)
2418                 if (i->inum     == inum &&
2419                     i->snapshot == snapshot)
2420                         return true;
2421         return false;
2422 }
2423 
2424 /*
2425  * Check that a given inode is reachable from its subvolume root - we already
2426  * verified subvolume connectivity:
2427  *
2428  * XXX: we should also be verifying that inodes are in the right subvolumes
2429  */
2430 static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c inode_k)
2431 {
2432         struct bch_fs *c = trans->c;
2433         struct btree_iter inode_iter = {};
2434         struct bch_inode_unpacked inode;
2435         struct printbuf buf = PRINTBUF;
2436         u32 snapshot = inode_k.k->p.snapshot;
2437         int ret = 0;
2438 
2439         p->nr = 0;
2440 
2441         BUG_ON(bch2_inode_unpack(inode_k, &inode));
2442 
2443         while (!inode.bi_subvol) {
2444                 struct btree_iter dirent_iter;
2445                 struct bkey_s_c_dirent d;
2446                 u32 parent_snapshot = snapshot;
2447 
2448                 d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot);
2449                 ret = bkey_err(d.s_c);
2450                 if (ret && !bch2_err_matches(ret, ENOENT))
2451                         break;
2452 
2453                 if (!ret && !dirent_points_to_inode(d, &inode)) {
2454                         bch2_trans_iter_exit(trans, &dirent_iter);
2455                         ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
2456                 }
2457 
2458                 if (bch2_err_matches(ret, ENOENT)) {
2459                         ret = 0;
2460                         if (fsck_err(trans, inode_unreachable,
2461                                      "unreachable inode\n%s",
2462                                      (printbuf_reset(&buf),
2463                                       bch2_bkey_val_to_text(&buf, c, inode_k),
2464                                       buf.buf)))
2465                                 ret = reattach_inode(trans, &inode, snapshot);
2466                         goto out;
2467                 }
2468 
2469                 bch2_trans_iter_exit(trans, &dirent_iter);
2470 
2471                 if (!S_ISDIR(inode.bi_mode))
2472                         break;
2473 
2474                 ret = darray_push(p, ((struct pathbuf_entry) {
2475                         .inum           = inode.bi_inum,
2476                         .snapshot       = snapshot,
2477                 }));
2478                 if (ret)
2479                         return ret;
2480 
2481                 snapshot = parent_snapshot;
2482 
2483                 bch2_trans_iter_exit(trans, &inode_iter);
2484                 inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes,
2485                                              SPOS(0, inode.bi_dir, snapshot), 0);
2486                 ret = bkey_err(inode_k) ?:
2487                         !bkey_is_inode(inode_k.k) ? -BCH_ERR_ENOENT_inode
2488                         : bch2_inode_unpack(inode_k, &inode);
2489                 if (ret) {
2490                         /* Should have been caught in dirents pass */
2491                         bch_err_msg(c, ret, "error looking up parent directory");
2492                         break;
2493                 }
2494 
2495                 snapshot = inode_k.k->p.snapshot;
2496 
2497                 if (path_is_dup(p, inode.bi_inum, snapshot)) {
2498                         /* XXX print path */
2499                         bch_err(c, "directory structure loop");
2500 
2501                         darray_for_each(*p, i)
2502                                 pr_err("%llu:%u", i->inum, i->snapshot);
2503                         pr_err("%llu:%u", inode.bi_inum, snapshot);
2504 
2505                         if (fsck_err(trans, dir_loop, "directory structure loop")) {
2506                                 ret = remove_backpointer(trans, &inode);
2507                                 bch_err_msg(c, ret, "removing dirent");
2508                                 if (ret)
2509                                         break;
2510 
2511                                 ret = reattach_inode(trans, &inode, snapshot);
2512                                 bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
2513                         }
2514                         break;
2515                 }
2516         }
2517 out:
2518 fsck_err:
2519         bch2_trans_iter_exit(trans, &inode_iter);
2520         printbuf_exit(&buf);
2521         bch_err_fn(c, ret);
2522         return ret;
2523 }
2524 
2525 /*
2526  * Check for unreachable inodes, as well as loops in the directory structure:
2527  * After bch2_check_dirents(), if an inode backpointer doesn't exist that means it's
2528  * unreachable:
2529  */
2530 int bch2_check_directory_structure(struct bch_fs *c)
2531 {
2532         pathbuf path = { 0, };
2533         int ret;
2534 
2535         ret = bch2_trans_run(c,
2536                 for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN,
2537                                           BTREE_ITER_intent|
2538                                           BTREE_ITER_prefetch|
2539                                           BTREE_ITER_all_snapshots, k,
2540                                           NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
2541                         if (!bkey_is_inode(k.k))
2542                                 continue;
2543 
2544                         if (bch2_inode_flags(k) & BCH_INODE_unlinked)
2545                                 continue;
2546 
2547                         check_path(trans, &path, k);
2548                 })));
2549         darray_exit(&path);
2550 
2551         bch_err_fn(c, ret);
2552         return ret;
2553 }
2554 
2555 struct nlink_table {
2556         size_t          nr;
2557         size_t          size;
2558 
2559         struct nlink {
2560                 u64     inum;
2561                 u32     snapshot;
2562                 u32     count;
2563         }               *d;
2564 };
2565 
2566 static int add_nlink(struct bch_fs *c, struct nlink_table *t,
2567                      u64 inum, u32 snapshot)
2568 {
2569         if (t->nr == t->size) {
2570                 size_t new_size = max_t(size_t, 128UL, t->size * 2);
2571                 void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL);
2572 
2573                 if (!d) {
2574                         bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
2575                                 new_size);
2576                         return -BCH_ERR_ENOMEM_fsck_add_nlink;
2577                 }
2578 
2579                 if (t->d)
2580                         memcpy(d, t->d, t->size * sizeof(t->d[0]));
2581                 kvfree(t->d);
2582 
2583                 t->d = d;
2584                 t->size = new_size;
2585         }
2586 
2587 
2588         t->d[t->nr++] = (struct nlink) {
2589                 .inum           = inum,
2590                 .snapshot       = snapshot,
2591         };
2592 
2593         return 0;
2594 }
2595 
2596 static int nlink_cmp(const void *_l, const void *_r)
2597 {
2598         const struct nlink *l = _l;
2599         const struct nlink *r = _r;
2600 
2601         return cmp_int(l->inum, r->inum);
2602 }
2603 
2604 static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
2605                      struct nlink_table *links,
2606                      u64 range_start, u64 range_end, u64 inum, u32 snapshot)
2607 {
2608         struct nlink *link, key = {
2609                 .inum = inum, .snapshot = U32_MAX,
2610         };
2611 
2612         if (inum < range_start || inum >= range_end)
2613                 return;
2614 
2615         link = __inline_bsearch(&key, links->d, links->nr,
2616                                 sizeof(links->d[0]), nlink_cmp);
2617         if (!link)
2618                 return;
2619 
2620         while (link > links->d && link[0].inum == link[-1].inum)
2621                 --link;
2622 
2623         for (; link < links->d + links->nr && link->inum == inum; link++)
2624                 if (ref_visible(c, s, snapshot, link->snapshot)) {
2625                         link->count++;
2626                         if (link->snapshot >= snapshot)
2627                                 break;
2628                 }
2629 }
2630 
2631 noinline_for_stack
2632 static int check_nlinks_find_hardlinks(struct bch_fs *c,
2633                                        struct nlink_table *t,
2634                                        u64 start, u64 *end)
2635 {
2636         int ret = bch2_trans_run(c,
2637                 for_each_btree_key(trans, iter, BTREE_ID_inodes,
2638                                    POS(0, start),
2639                                    BTREE_ITER_intent|
2640                                    BTREE_ITER_prefetch|
2641                                    BTREE_ITER_all_snapshots, k, ({
2642                         if (!bkey_is_inode(k.k))
2643                                 continue;
2644 
2645                         /* Should never fail, checked by bch2_inode_invalid: */
2646                         struct bch_inode_unpacked u;
2647                         BUG_ON(bch2_inode_unpack(k, &u));
2648 
2649                         /*
2650                          * Backpointer and directory structure checks are sufficient for
2651                          * directories, since they can't have hardlinks:
2652                          */
2653                         if (S_ISDIR(u.bi_mode))
2654                                 continue;
2655 
2656                         if (!u.bi_nlink)
2657                                 continue;
2658 
2659                         ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
2660                         if (ret) {
2661                                 *end = k.k->p.offset;
2662                                 ret = 0;
2663                                 break;
2664                         }
2665                         0;
2666                 })));
2667 
2668         bch_err_fn(c, ret);
2669         return ret;
2670 }
2671 
2672 noinline_for_stack
2673 static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
2674                                      u64 range_start, u64 range_end)
2675 {
2676         struct snapshots_seen s;
2677 
2678         snapshots_seen_init(&s);
2679 
2680         int ret = bch2_trans_run(c,
2681                 for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN,
2682                                    BTREE_ITER_intent|
2683                                    BTREE_ITER_prefetch|
2684                                    BTREE_ITER_all_snapshots, k, ({
2685                         ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p);
2686                         if (ret)
2687                                 break;
2688 
2689                         if (k.k->type == KEY_TYPE_dirent) {
2690                                 struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
2691 
2692                                 if (d.v->d_type != DT_DIR &&
2693                                     d.v->d_type != DT_SUBVOL)
2694                                         inc_link(c, &s, links, range_start, range_end,
2695                                                  le64_to_cpu(d.v->d_inum), d.k->p.snapshot);
2696                         }
2697                         0;
2698                 })));
2699 
2700         snapshots_seen_exit(&s);
2701 
2702         bch_err_fn(c, ret);
2703         return ret;
2704 }
2705 
2706 static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter,
2707                                      struct bkey_s_c k,
2708                                      struct nlink_table *links,
2709                                      size_t *idx, u64 range_end)
2710 {
2711         struct bch_inode_unpacked u;
2712         struct nlink *link = &links->d[*idx];
2713         int ret = 0;
2714 
2715         if (k.k->p.offset >= range_end)
2716                 return 1;
2717 
2718         if (!bkey_is_inode(k.k))
2719                 return 0;
2720 
2721         BUG_ON(bch2_inode_unpack(k, &u));
2722 
2723         if (S_ISDIR(u.bi_mode))
2724                 return 0;
2725 
2726         if (!u.bi_nlink)
2727                 return 0;
2728 
2729         while ((cmp_int(link->inum, k.k->p.offset) ?:
2730                 cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
2731                 BUG_ON(*idx == links->nr);
2732                 link = &links->d[++*idx];
2733         }
2734 
2735         if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count,
2736                         trans, inode_wrong_nlink,
2737                         "inode %llu type %s has wrong i_nlink (%u, should be %u)",
2738                         u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
2739                         bch2_inode_nlink_get(&u), link->count)) {
2740                 bch2_inode_nlink_set(&u, link->count);
2741                 ret = __bch2_fsck_write_inode(trans, &u, k.k->p.snapshot);
2742         }
2743 fsck_err:
2744         return ret;
2745 }
2746 
2747 noinline_for_stack
2748 static int check_nlinks_update_hardlinks(struct bch_fs *c,
2749                                struct nlink_table *links,
2750                                u64 range_start, u64 range_end)
2751 {
2752         size_t idx = 0;
2753 
2754         int ret = bch2_trans_run(c,
2755                 for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
2756                                 POS(0, range_start),
2757                                 BTREE_ITER_intent|BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
2758                                 NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2759                         check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)));
2760         if (ret < 0) {
2761                 bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret));
2762                 return ret;
2763         }
2764 
2765         return 0;
2766 }
2767 
2768 int bch2_check_nlinks(struct bch_fs *c)
2769 {
2770         struct nlink_table links = { 0 };
2771         u64 this_iter_range_start, next_iter_range_start = 0;
2772         int ret = 0;
2773 
2774         do {
2775                 this_iter_range_start = next_iter_range_start;
2776                 next_iter_range_start = U64_MAX;
2777 
2778                 ret = check_nlinks_find_hardlinks(c, &links,
2779                                                   this_iter_range_start,
2780                                                   &next_iter_range_start);
2781 
2782                 ret = check_nlinks_walk_dirents(c, &links,
2783                                           this_iter_range_start,
2784                                           next_iter_range_start);
2785                 if (ret)
2786                         break;
2787 
2788                 ret = check_nlinks_update_hardlinks(c, &links,
2789                                          this_iter_range_start,
2790                                          next_iter_range_start);
2791                 if (ret)
2792                         break;
2793 
2794                 links.nr = 0;
2795         } while (next_iter_range_start != U64_MAX);
2796 
2797         kvfree(links.d);
2798         bch_err_fn(c, ret);
2799         return ret;
2800 }
2801 
2802 static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
2803                              struct bkey_s_c k)
2804 {
2805         struct bkey_s_c_reflink_p p;
2806         struct bkey_i_reflink_p *u;
2807 
2808         if (k.k->type != KEY_TYPE_reflink_p)
2809                 return 0;
2810 
2811         p = bkey_s_c_to_reflink_p(k);
2812 
2813         if (!p.v->front_pad && !p.v->back_pad)
2814                 return 0;
2815 
2816         u = bch2_trans_kmalloc(trans, sizeof(*u));
2817         int ret = PTR_ERR_OR_ZERO(u);
2818         if (ret)
2819                 return ret;
2820 
2821         bkey_reassemble(&u->k_i, k);
2822         u->v.front_pad  = 0;
2823         u->v.back_pad   = 0;
2824 
2825         return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_norun);
2826 }
2827 
2828 int bch2_fix_reflink_p(struct bch_fs *c)
2829 {
2830         if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix)
2831                 return 0;
2832 
2833         int ret = bch2_trans_run(c,
2834                 for_each_btree_key_commit(trans, iter,
2835                                 BTREE_ID_extents, POS_MIN,
2836                                 BTREE_ITER_intent|BTREE_ITER_prefetch|
2837                                 BTREE_ITER_all_snapshots, k,
2838                                 NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2839                         fix_reflink_p_key(trans, &iter, k)));
2840         bch_err_fn(c, ret);
2841         return ret;
2842 }
2843 

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

kernel.org | git.kernel.org | LWN.net | Project Home | SVN repository | Mail admin

Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.

sflogo.php