
TOMOYO Linux Cross Reference
Linux/fs/xfs/scrub/rmap_repair.c


  1 // SPDX-License-Identifier: GPL-2.0-or-later
  2 /*
  3  * Copyright (c) 2018-2024 Oracle.  All Rights Reserved.
  4  * Author: Darrick J. Wong <djwong@kernel.org>
  5  */
  6 #include "xfs.h"
  7 #include "xfs_fs.h"
  8 #include "xfs_shared.h"
  9 #include "xfs_format.h"
 10 #include "xfs_trans_resv.h"
 11 #include "xfs_mount.h"
 12 #include "xfs_defer.h"
 13 #include "xfs_btree.h"
 14 #include "xfs_btree_staging.h"
 15 #include "xfs_buf_mem.h"
 16 #include "xfs_btree_mem.h"
 17 #include "xfs_bit.h"
 18 #include "xfs_log_format.h"
 19 #include "xfs_trans.h"
 20 #include "xfs_sb.h"
 21 #include "xfs_alloc.h"
 22 #include "xfs_alloc_btree.h"
 23 #include "xfs_ialloc.h"
 24 #include "xfs_ialloc_btree.h"
 25 #include "xfs_rmap.h"
 26 #include "xfs_rmap_btree.h"
 27 #include "xfs_inode.h"
 28 #include "xfs_icache.h"
 29 #include "xfs_bmap.h"
 30 #include "xfs_bmap_btree.h"
 31 #include "xfs_refcount.h"
 32 #include "xfs_refcount_btree.h"
 33 #include "xfs_ag.h"
 34 #include "scrub/xfs_scrub.h"
 35 #include "scrub/scrub.h"
 36 #include "scrub/common.h"
 37 #include "scrub/btree.h"
 38 #include "scrub/trace.h"
 39 #include "scrub/repair.h"
 40 #include "scrub/bitmap.h"
 41 #include "scrub/agb_bitmap.h"
 42 #include "scrub/xfile.h"
 43 #include "scrub/xfarray.h"
 44 #include "scrub/iscan.h"
 45 #include "scrub/newbt.h"
 46 #include "scrub/reap.h"
 47 
 48 /*
 49  * Reverse Mapping Btree Repair
 50  * ============================
 51  *
 52  * This is the most involved of all the AG space btree rebuilds.  Everywhere
 53  * else in XFS we lock inodes and then AG data structures, but generating the
 54  * list of rmap records requires that we be able to scan both block mapping
 55  * btrees of every inode in the filesystem to see if it owns any extents in
 56  * this AG.  We can't tolerate any inode updates while we do this, so we
 57  * freeze the filesystem to lock everyone else out, and grant ourselves
 58  * special privileges to run transactions with regular background reclamation
 59  * turned off.
 60  *
 61  * We also have to be very careful not to allow inode reclaim to start a
 62  * transaction because all transactions (other than our own) will block.
 63  * Deferred inode inactivation helps us out there.
 64  *
 65  * I) Reverse mappings for all non-space metadata and file data are collected
 66  * according to the following algorithm:
 67  *
 68  * 1. For each fork of each inode:
 69  * 1.1. Create a bitmap BMBIT to track bmbt blocks if necessary.
 70  * 1.2. If the incore extent map isn't loaded, walk the bmbt to accumulate
 71  *      bmaps into rmap records (see 1.4).  Set bits in BMBIT for each btree
 72  *      block.
 73  * 1.3. If the incore extent map is loaded but the fork is in btree format,
 74  *      just visit the bmbt blocks to set the corresponding BMBIT areas.
 75  * 1.4. From the incore extent map, accumulate each bmap that falls into our
 76  *      target AG.  Remember, multiple bmap records can map to a single rmap
 77  *      record, so we cannot simply emit rmap records 1:1.
 78  * 1.5. Emit rmap records for each extent in BMBIT and free it.
 79  * 2. Create bitmaps INOBIT and ICHUNKBIT.
 80  * 3. For each record in the inobt, set the corresponding areas in ICHUNKBIT,
 81  *    and set bits in INOBIT for each btree block.  If the inobt has no records
 82  *    at all, we must be careful to record its root in INOBIT.
 83  * 4. For each block in the finobt, set the corresponding INOBIT area.
 84  * 5. Emit rmap records for each extent in INOBIT and ICHUNKBIT and free them.
 85  * 6. Create bitmaps REFCBIT and COWBIT.
 86  * 7. For each CoW staging extent in the refcountbt, set the corresponding
 87  *    areas in COWBIT.
 88  * 8. For each block in the refcountbt, set the corresponding REFCBIT area.
 89  * 9. Emit rmap records for each extent in REFCBIT and COWBIT and free them.
 90  * A. Emit rmap for the AG headers.
 91  * B. Emit rmap for the log, if there is one.
 92  *
 93  * II) The rmapbt shape and space metadata rmaps are computed as follows:
 94  *
 95  * 1. Count the rmaps collected in the previous step. (= NR)
 96  * 2. Estimate the number of rmapbt blocks needed to store NR records. (= RMB)
 97  * 3. Reserve RMB blocks through the newbt using the allocator in normap mode.
 98  * 4. Create bitmap AGBIT.
 99  * 5. For each reservation in the newbt, set the corresponding areas in AGBIT.
100  * 6. For each block in the AGFL, bnobt, and cntbt, set the bits in AGBIT.
101  * 7. Count the extents in AGBIT. (= AGNR)
102  * 8. Estimate the number of rmapbt blocks needed for NR + AGNR rmaps. (= RMB')
103  * 9. If RMB' > RMB, reserve RMB' - RMB more newbt blocks, set RMB = RMB',
104  *    and clear AGBIT.  Go to step 5.
105  * A. Emit rmaps for each extent in AGBIT.
106  *
107  * III) The rmapbt is constructed and set in place as follows:
108  *
109  * 1. Sort the rmap records.
110  * 2. Bulk load the rmaps.
111  *
112  * IV) Reap the old btree blocks.
113  *
114  * 1. Create a bitmap OLDRMBIT.
115  * 2. For each gap in the new rmapbt, set the corresponding areas of OLDRMBIT.
116  * 3. For each extent in the bnobt, clear the corresponding parts of OLDRMBIT.
117  * 4. Reap the extents corresponding to the set areas in OLDRMBIT.  These are
118  *    the parts of the AG that the rmap didn't find during its scan of the
119  *    primary metadata and aren't known to be in the free space, which implies
120  *    that they were the old rmapbt blocks.
121  * 5. Commit.
122  *
123  * We use the 'xrep_rmap' prefix for all the rmap functions.
124  */
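
/*
 * Editor's sketch, not part of the upstream file: one plausible way the
 * four sections above fit together, using the helpers defined later in
 * this listing.  The real entrypoint (xrep_rmapbt, further down in this
 * file) also sets up the in-memory btree, the inode scan cursor, and the
 * live-update hooks before running these phases.
 */
#if 0
STATIC int
xrep_rmapbt_sketch(
        struct xrep_rmap        *rr)
{
        int                     error;

        /* Section (I): collect rmaps from primary metadata. */
        error = xrep_rmap_find_rmaps(rr);
        if (error)
                return error;

        /* Sections (II) and (III): reserve space, then bulk load. */
        error = xrep_rmap_build_new_tree(rr);
        if (error)
                return error;

        /* Section (IV): reap whatever belonged to the old btree. */
        return xrep_rmap_remove_old_tree(rr);
}
#endif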
125 
126 /* Context for collecting rmaps */
127 struct xrep_rmap {
128         /* new rmapbt information */
129         struct xrep_newbt       new_btree;
130 
131         /* lock for the xfbtree and xfile */
132         struct mutex            lock;
133 
134         /* rmap records generated from primary metadata */
135         struct xfbtree          rmap_btree;
136 
137         struct xfs_scrub        *sc;
138 
139         /* in-memory btree cursor for the xfs_btree_bload iteration */
140         struct xfs_btree_cur    *mcur;
141 
142         /* Hooks into rmap update code. */
143         struct xfs_rmap_hook    rhook;
144 
145         /* inode scan cursor */
146         struct xchk_iscan       iscan;
147 
148         /* Number of non-freespace records found. */
149         unsigned long long      nr_records;
150 
151         /* bnobt/cntbt contribution to btreeblks */
152         xfs_agblock_t           freesp_btblocks;
153 
154         /* old agf_rmap_blocks counter */
155         unsigned int            old_rmapbt_fsbcount;
156 };
157 
158 /* Set us up to repair reverse mapping btrees. */
159 int
160 xrep_setup_ag_rmapbt(
161         struct xfs_scrub        *sc)
162 {
163         struct xrep_rmap        *rr;
164         char                    *descr;
165         int                     error;
166 
167         xchk_fsgates_enable(sc, XCHK_FSGATES_RMAP);
168 
169         descr = xchk_xfile_ag_descr(sc, "reverse mapping records");
170         error = xrep_setup_xfbtree(sc, descr);
171         kfree(descr);
172         if (error)
173                 return error;
174 
175         rr = kzalloc(sizeof(struct xrep_rmap), XCHK_GFP_FLAGS);
176         if (!rr)
177                 return -ENOMEM;
178 
179         rr->sc = sc;
180         sc->buf = rr;
181         return 0;
182 }
183 
184 /* Make sure there's nothing funny about this mapping. */
185 STATIC int
186 xrep_rmap_check_mapping(
187         struct xfs_scrub        *sc,
188         const struct xfs_rmap_irec *rec)
189 {
190         enum xbtree_recpacking  outcome;
191         int                     error;
192 
193         if (xfs_rmap_check_irec(sc->sa.pag, rec) != NULL)
194                 return -EFSCORRUPTED;
195 
196         /* Make sure this isn't free space. */
197         error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rm_startblock,
198                         rec->rm_blockcount, &outcome);
199         if (error)
200                 return error;
201         if (outcome != XBTREE_RECPACKING_EMPTY)
202                 return -EFSCORRUPTED;
203 
204         return 0;
205 }
206 
207 /* Store a reverse-mapping record. */
208 static inline int
209 xrep_rmap_stash(
210         struct xrep_rmap        *rr,
211         xfs_agblock_t           startblock,
212         xfs_extlen_t            blockcount,
213         uint64_t                owner,
214         uint64_t                offset,
215         unsigned int            flags)
216 {
217         struct xfs_rmap_irec    rmap = {
218                 .rm_startblock  = startblock,
219                 .rm_blockcount  = blockcount,
220                 .rm_owner       = owner,
221                 .rm_offset      = offset,
222                 .rm_flags       = flags,
223         };
224         struct xfs_scrub        *sc = rr->sc;
225         struct xfs_btree_cur    *mcur;
226         int                     error = 0;
227 
228         if (xchk_should_terminate(sc, &error))
229                 return error;
230 
231         if (xchk_iscan_aborted(&rr->iscan))
232                 return -EFSCORRUPTED;
233 
234         trace_xrep_rmap_found(sc->mp, sc->sa.pag->pag_agno, &rmap);
235 
236         mutex_lock(&rr->lock);
237         mcur = xfs_rmapbt_mem_cursor(sc->sa.pag, sc->tp, &rr->rmap_btree);
238         error = xfs_rmap_map_raw(mcur, &rmap);
239         xfs_btree_del_cursor(mcur, error);
240         if (error)
241                 goto out_cancel;
242 
243         error = xfbtree_trans_commit(&rr->rmap_btree, sc->tp);
244         if (error)
245                 goto out_abort;
246 
247         mutex_unlock(&rr->lock);
248         return 0;
249 
250 out_cancel:
251         xfbtree_trans_cancel(&rr->rmap_btree, sc->tp);
252 out_abort:
253         xchk_iscan_abort(&rr->iscan);
254         mutex_unlock(&rr->lock);
255         return error;
256 }
257 
258 struct xrep_rmap_stash_run {
259         struct xrep_rmap        *rr;
260         uint64_t                owner;
261         unsigned int            rmap_flags;
262 };
263 
264 static int
265 xrep_rmap_stash_run(
266         uint32_t                        start,
267         uint32_t                        len,
268         void                            *priv)
269 {
270         struct xrep_rmap_stash_run      *rsr = priv;
271         struct xrep_rmap                *rr = rsr->rr;
272 
273         return xrep_rmap_stash(rr, start, len, rsr->owner, 0, rsr->rmap_flags);
274 }
275 
276 /*
277  * Emit rmaps for every extent of bits set in the bitmap.  Caller must ensure
278  * that the ranges are in units of FS blocks.
279  */
280 STATIC int
281 xrep_rmap_stash_bitmap(
282         struct xrep_rmap                *rr,
283         struct xagb_bitmap              *bitmap,
284         const struct xfs_owner_info     *oinfo)
285 {
286         struct xrep_rmap_stash_run      rsr = {
287                 .rr                     = rr,
288                 .owner                  = oinfo->oi_owner,
289                 .rmap_flags             = 0,
290         };
291 
292         if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK)
293                 rsr.rmap_flags |= XFS_RMAP_ATTR_FORK;
294         if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
295                 rsr.rmap_flags |= XFS_RMAP_BMBT_BLOCK;
296 
297         return xagb_bitmap_walk(bitmap, xrep_rmap_stash_run, &rsr);
298 }
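
/*
 * Editor's sketch, not part of the upstream file: the usage pattern for
 * xrep_rmap_stash_bitmap that the callers below follow.  Blocks are first
 * accumulated in an xagb_bitmap, then emitted as one rmap record per run
 * of set bits, all with the same owner.  The agbno range here is made up.
 */
#if 0
STATIC int
xrep_rmap_stash_bitmap_example(
        struct xrep_rmap        *rr)
{
        struct xagb_bitmap      bitmap;
        int                     error;

        xagb_bitmap_init(&bitmap);

        /* Pretend agblocks 10-14 belong to the inode btree. */
        error = xagb_bitmap_set(&bitmap, 10, 5);
        if (!error)
                error = xrep_rmap_stash_bitmap(rr, &bitmap,
                                &XFS_RMAP_OINFO_INOBT);

        xagb_bitmap_destroy(&bitmap);
        return error;
}
#endif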
299 
300 /* Section (I): Finding all file and bmbt extents. */
301 
302 /* Context for accumulating rmaps for an inode fork. */
303 struct xrep_rmap_ifork {
304         /*
305          * Accumulate rmap data here to turn multiple adjacent bmaps into a
306          * single rmap.
307          */
308         struct xfs_rmap_irec    accum;
309 
310         /* Bitmap of bmbt blocks in this AG. */
311         struct xagb_bitmap      bmbt_blocks;
312 
313         struct xrep_rmap        *rr;
314 
315         /* Which inode fork? */
316         int                     whichfork;
317 };
318 
319 /* Stash an rmap that we accumulated while walking an inode fork. */
320 STATIC int
321 xrep_rmap_stash_accumulated(
322         struct xrep_rmap_ifork  *rf)
323 {
324         if (rf->accum.rm_blockcount == 0)
325                 return 0;
326 
327         return xrep_rmap_stash(rf->rr, rf->accum.rm_startblock,
328                         rf->accum.rm_blockcount, rf->accum.rm_owner,
329                         rf->accum.rm_offset, rf->accum.rm_flags);
330 }
331 
332 /* Accumulate a bmbt record. */
333 STATIC int
334 xrep_rmap_visit_bmbt(
335         struct xfs_btree_cur    *cur,
336         struct xfs_bmbt_irec    *rec,
337         void                    *priv)
338 {
339         struct xrep_rmap_ifork  *rf = priv;
340         struct xfs_mount        *mp = rf->rr->sc->mp;
341         struct xfs_rmap_irec    *accum = &rf->accum;
342         xfs_agblock_t           agbno;
343         unsigned int            rmap_flags = 0;
344         int                     error;
345 
346         if (XFS_FSB_TO_AGNO(mp, rec->br_startblock) !=
347                         rf->rr->sc->sa.pag->pag_agno)
348                 return 0;
349 
350         agbno = XFS_FSB_TO_AGBNO(mp, rec->br_startblock);
351         if (rf->whichfork == XFS_ATTR_FORK)
352                 rmap_flags |= XFS_RMAP_ATTR_FORK;
353         if (rec->br_state == XFS_EXT_UNWRITTEN)
354                 rmap_flags |= XFS_RMAP_UNWRITTEN;
355 
356         /* If this bmap is adjacent to the previous one, just add it. */
357         if (accum->rm_blockcount > 0 &&
358             rec->br_startoff == accum->rm_offset + accum->rm_blockcount &&
359             agbno == accum->rm_startblock + accum->rm_blockcount &&
360             rmap_flags == accum->rm_flags) {
361                 accum->rm_blockcount += rec->br_blockcount;
362                 return 0;
363         }
364 
365         /* Otherwise stash the old rmap and start accumulating a new one. */
366         error = xrep_rmap_stash_accumulated(rf);
367         if (error)
368                 return error;
369 
370         accum->rm_startblock = agbno;
371         accum->rm_blockcount = rec->br_blockcount;
372         accum->rm_offset = rec->br_startoff;
373         accum->rm_flags = rmap_flags;
374         return 0;
375 }
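
/*
 * Editor's sketch, not part of the upstream file: the merging rule above
 * in action.  Two mappings that are contiguous in file offset and in AG
 * block space (and share flags) collapse into a single accumulated rmap
 * of (agbno 100, len 12, offset 0); nothing is stashed until a
 * discontiguous or differently-flagged mapping arrives.  @rf would come
 * from a caller such as xrep_rmap_scan_ifork below.
 */
#if 0
STATIC int
xrep_rmap_visit_bmbt_example(
        struct xrep_rmap_ifork  *rf)
{
        struct xfs_mount        *mp = rf->rr->sc->mp;
        xfs_agnumber_t          agno = rf->rr->sc->sa.pag->pag_agno;
        struct xfs_bmbt_irec    r1 = {
                .br_startoff    = 0,
                .br_startblock  = XFS_AGB_TO_FSB(mp, agno, 100),
                .br_blockcount  = 8,
                .br_state       = XFS_EXT_NORM,
        };
        struct xfs_bmbt_irec    r2 = {
                .br_startoff    = 8,
                .br_startblock  = XFS_AGB_TO_FSB(mp, agno, 108),
                .br_blockcount  = 4,
                .br_state       = XFS_EXT_NORM,
        };
        int                     error;

        /* Starts the accumulator at (agbno 100, len 8, offset 0). */
        error = xrep_rmap_visit_bmbt(NULL, &r1, rf);
        if (error)
                return error;

        /* Contiguous with r1, so the accumulator grows to len 12. */
        return xrep_rmap_visit_bmbt(NULL, &r2, rf);
}
#endif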
376 
377 /* Add a btree block to the bitmap. */
378 STATIC int
379 xrep_rmap_visit_iroot_btree_block(
380         struct xfs_btree_cur    *cur,
381         int                     level,
382         void                    *priv)
383 {
384         struct xrep_rmap_ifork  *rf = priv;
385         struct xfs_buf          *bp;
386         xfs_fsblock_t           fsbno;
387         xfs_agblock_t           agbno;
388 
389         xfs_btree_get_block(cur, level, &bp);
390         if (!bp)
391                 return 0;
392 
393         fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp));
394         if (XFS_FSB_TO_AGNO(cur->bc_mp, fsbno) != rf->rr->sc->sa.pag->pag_agno)
395                 return 0;
396 
397         agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
398         return xagb_bitmap_set(&rf->bmbt_blocks, agbno, 1);
399 }
400 
401 /*
402  * Iterate a metadata btree rooted in an inode to collect rmap records for
403  * anything in this fork that matches the AG.
404  */
405 STATIC int
406 xrep_rmap_scan_iroot_btree(
407         struct xrep_rmap_ifork  *rf,
408         struct xfs_btree_cur    *cur)
409 {
410         struct xfs_owner_info   oinfo;
411         struct xrep_rmap        *rr = rf->rr;
412         int                     error;
413 
414         xagb_bitmap_init(&rf->bmbt_blocks);
415 
416         /* Record all the blocks in the btree itself. */
417         error = xfs_btree_visit_blocks(cur, xrep_rmap_visit_iroot_btree_block,
418                         XFS_BTREE_VISIT_ALL, rf);
419         if (error)
420                 goto out;
421 
422         /* Emit rmaps for the btree blocks. */
423         xfs_rmap_ino_bmbt_owner(&oinfo, rf->accum.rm_owner, rf->whichfork);
424         error = xrep_rmap_stash_bitmap(rr, &rf->bmbt_blocks, &oinfo);
425         if (error)
426                 goto out;
427 
428         /* Stash any remaining accumulated rmaps. */
429         error = xrep_rmap_stash_accumulated(rf);
430 out:
431         xagb_bitmap_destroy(&rf->bmbt_blocks);
432         return error;
433 }
434 
435 /*
436  * Iterate the block mapping btree to collect rmap records for anything in this
437  * fork that matches the AG.  Sets @mappings_done to true if we've scanned the
438  * block mappings in this fork.
439  */
440 STATIC int
441 xrep_rmap_scan_bmbt(
442         struct xrep_rmap_ifork  *rf,
443         struct xfs_inode        *ip,
444         bool                    *mappings_done)
445 {
446         struct xrep_rmap        *rr = rf->rr;
447         struct xfs_btree_cur    *cur;
448         struct xfs_ifork        *ifp;
449         int                     error;
450 
451         *mappings_done = false;
452         ifp = xfs_ifork_ptr(ip, rf->whichfork);
453         cur = xfs_bmbt_init_cursor(rr->sc->mp, rr->sc->tp, ip, rf->whichfork);
454 
455         if (!xfs_ifork_is_realtime(ip, rf->whichfork) &&
456             xfs_need_iread_extents(ifp)) {
457                 /*
458                  * If the incore extent cache isn't loaded, scan the bmbt for
459                  * mapping records.  This avoids loading the incore extent
460                  * tree, which will increase memory pressure at a time when
461                  * we're trying to run as quickly as we possibly can.  Ignore
462                  * realtime extents.
463                  */
464                 error = xfs_bmap_query_all(cur, xrep_rmap_visit_bmbt, rf);
465                 if (error)
466                         goto out_cur;
467 
468                 *mappings_done = true;
469         }
470 
471         /* Scan for the bmbt blocks, which always live on the data device. */
472         error = xrep_rmap_scan_iroot_btree(rf, cur);
473 out_cur:
474         xfs_btree_del_cursor(cur, error);
475         return error;
476 }
477 
478 /*
479  * Iterate the in-core extent cache to collect rmap records for anything in
480  * this fork that matches the AG.
481  */
482 STATIC int
483 xrep_rmap_scan_iext(
484         struct xrep_rmap_ifork  *rf,
485         struct xfs_ifork        *ifp)
486 {
487         struct xfs_bmbt_irec    rec;
488         struct xfs_iext_cursor  icur;
489         int                     error;
490 
491         for_each_xfs_iext(ifp, &icur, &rec) {
492                 if (isnullstartblock(rec.br_startblock))
493                         continue;
494                 error = xrep_rmap_visit_bmbt(NULL, &rec, rf);
495                 if (error)
496                         return error;
497         }
498 
499         return xrep_rmap_stash_accumulated(rf);
500 }
501 
502 /* Find all the extents from a given AG in an inode fork. */
503 STATIC int
504 xrep_rmap_scan_ifork(
505         struct xrep_rmap        *rr,
506         struct xfs_inode        *ip,
507         int                     whichfork)
508 {
509         struct xrep_rmap_ifork  rf = {
510                 .accum          = { .rm_owner = ip->i_ino, },
511                 .rr             = rr,
512                 .whichfork      = whichfork,
513         };
514         struct xfs_ifork        *ifp = xfs_ifork_ptr(ip, whichfork);
515         int                     error = 0;
516 
517         if (!ifp)
518                 return 0;
519 
520         if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
521                 bool            mappings_done;
522 
523                 /*
524                  * Scan the bmap btree for data device mappings.  This includes
525                  * the btree blocks themselves, even if this is a realtime
526                  * file.
527                  */
528                 error = xrep_rmap_scan_bmbt(&rf, ip, &mappings_done);
529                 if (error || mappings_done)
530                         return error;
531         } else if (ifp->if_format != XFS_DINODE_FMT_EXTENTS) {
532                 return 0;
533         }
534 
535         /* Scan incore extent cache if this isn't a realtime file. */
536         if (xfs_ifork_is_realtime(ip, whichfork))
537                 return 0;
538 
539         return xrep_rmap_scan_iext(&rf, ifp);
540 }
541 
542 /*
543  * Take ILOCK on a file that we want to scan.
544  *
545  * Select ILOCK_EXCL if the file has an unloaded data bmbt or has an unloaded
546  * attr bmbt.  Otherwise, take ILOCK_SHARED.
547  */
548 static inline unsigned int
549 xrep_rmap_scan_ilock(
550         struct xfs_inode        *ip)
551 {
552         uint                    lock_mode = XFS_ILOCK_SHARED;
553 
554         if (xfs_need_iread_extents(&ip->i_df)) {
555                 lock_mode = XFS_ILOCK_EXCL;
556                 goto lock;
557         }
558 
559         if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af))
560                 lock_mode = XFS_ILOCK_EXCL;
561 
562 lock:
563         xfs_ilock(ip, lock_mode);
564         return lock_mode;
565 }
566 
567 /* Record reverse mappings for a file. */
568 STATIC int
569 xrep_rmap_scan_inode(
570         struct xrep_rmap        *rr,
571         struct xfs_inode        *ip)
572 {
573         unsigned int            lock_mode = xrep_rmap_scan_ilock(ip);
574         int                     error;
575 
576         /* Check the data fork. */
577         error = xrep_rmap_scan_ifork(rr, ip, XFS_DATA_FORK);
578         if (error)
579                 goto out_unlock;
580 
581         /* Check the attr fork. */
582         error = xrep_rmap_scan_ifork(rr, ip, XFS_ATTR_FORK);
583         if (error)
584                 goto out_unlock;
585 
586         /* COW fork extents are "owned" by the refcount btree. */
587 
588         xchk_iscan_mark_visited(&rr->iscan, ip);
589 out_unlock:
590         xfs_iunlock(ip, lock_mode);
591         return error;
592 }
593 
594 /* Section (I): Find all AG metadata extents except for free space metadata. */
595 
596 struct xrep_rmap_inodes {
597         struct xrep_rmap        *rr;
598         struct xagb_bitmap      inobt_blocks;   /* INOBIT */
599         struct xagb_bitmap      ichunk_blocks;  /* ICHUNKBIT */
600 };
601 
602 /* Record inode btree rmaps. */
603 STATIC int
604 xrep_rmap_walk_inobt(
605         struct xfs_btree_cur            *cur,
606         const union xfs_btree_rec       *rec,
607         void                            *priv)
608 {
609         struct xfs_inobt_rec_incore     irec;
610         struct xrep_rmap_inodes         *ri = priv;
611         struct xfs_mount                *mp = cur->bc_mp;
612         xfs_agblock_t                   agbno;
613         xfs_extlen_t                    aglen;
614         xfs_agino_t                     agino;
615         xfs_agino_t                     iperhole;
616         unsigned int                    i;
617         int                             error;
618 
619         /* Record the inobt blocks. */
620         error = xagb_bitmap_set_btcur_path(&ri->inobt_blocks, cur);
621         if (error)
622                 return error;
623 
624         xfs_inobt_btrec_to_irec(mp, rec, &irec);
625         if (xfs_inobt_check_irec(cur->bc_ag.pag, &irec) != NULL)
626                 return -EFSCORRUPTED;
627 
628         agino = irec.ir_startino;
629 
630         /* Record a non-sparse inode chunk. */
631         if (!xfs_inobt_issparse(irec.ir_holemask)) {
632                 agbno = XFS_AGINO_TO_AGBNO(mp, agino);
633                 aglen = max_t(xfs_extlen_t, 1,
634                                 XFS_INODES_PER_CHUNK / mp->m_sb.sb_inopblock);
635 
636                 return xagb_bitmap_set(&ri->ichunk_blocks, agbno, aglen);
637         }
638 
639         /* Iterate each chunk. */
640         iperhole = max_t(xfs_agino_t, mp->m_sb.sb_inopblock,
641                         XFS_INODES_PER_HOLEMASK_BIT);
642         aglen = iperhole / mp->m_sb.sb_inopblock;
643         for (i = 0, agino = irec.ir_startino;
644              i < XFS_INOBT_HOLEMASK_BITS;
645              i += iperhole / XFS_INODES_PER_HOLEMASK_BIT, agino += iperhole) {
646                 /* Skip holes. */
647                 if (irec.ir_holemask & (1 << i))
648                         continue;
649 
650                 /* Record the inode chunk otherwise. */
651                 agbno = XFS_AGINO_TO_AGBNO(mp, agino);
652                 error = xagb_bitmap_set(&ri->ichunk_blocks, agbno, aglen);
653                 if (error)
654                         return error;
655         }
656 
657         return 0;
658 }
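
/*
 * Editor's note, a worked example that is not part of the upstream file:
 * with 4k blocks and 512-byte inodes, sb_inopblock is 8 and each holemask
 * bit covers XFS_INODES_PER_HOLEMASK_BIT (4) inodes.  Then iperhole =
 * max(8, 4) = 8 inodes, aglen = 8 / 8 = 1 block, and the loop above walks
 * the holemask two bits at a stride (i += 8 / 4), testing the first bit of
 * each pair and mapping each non-hole stride to one AG block in ICHUNKBIT.
 */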
659 
660 /* Collect rmaps for the blocks containing inode btrees and the inode chunks. */
661 STATIC int
662 xrep_rmap_find_inode_rmaps(
663         struct xrep_rmap        *rr)
664 {
665         struct xrep_rmap_inodes ri = {
666                 .rr             = rr,
667         };
668         struct xfs_scrub        *sc = rr->sc;
669         int                     error;
670 
671         xagb_bitmap_init(&ri.inobt_blocks);
672         xagb_bitmap_init(&ri.ichunk_blocks);
673 
674         /*
675          * Iterate every record in the inobt so we can capture all the inode
676          * chunks and the blocks in the inobt itself.
677          */
678         error = xfs_btree_query_all(sc->sa.ino_cur, xrep_rmap_walk_inobt, &ri);
679         if (error)
680                 goto out_bitmap;
681 
682         /*
683          * Note that if there are zero records in the inobt then query_all does
684          * nothing and we have to account the empty inobt root manually.
685          */
686         if (xagb_bitmap_empty(&ri.ichunk_blocks)) {
687                 struct xfs_agi  *agi = sc->sa.agi_bp->b_addr;
688 
689                 error = xagb_bitmap_set(&ri.inobt_blocks,
690                                 be32_to_cpu(agi->agi_root), 1);
691                 if (error)
692                         goto out_bitmap;
693         }
694 
695         /* Scan the finobt too. */
696         if (xfs_has_finobt(sc->mp)) {
697                 error = xagb_bitmap_set_btblocks(&ri.inobt_blocks,
698                                 sc->sa.fino_cur);
699                 if (error)
700                         goto out_bitmap;
701         }
702 
703         /* Generate rmaps for everything. */
704         error = xrep_rmap_stash_bitmap(rr, &ri.inobt_blocks,
705                         &XFS_RMAP_OINFO_INOBT);
706         if (error)
707                 goto out_bitmap;
708         error = xrep_rmap_stash_bitmap(rr, &ri.ichunk_blocks,
709                         &XFS_RMAP_OINFO_INODES);
710 
711 out_bitmap:
712         xagb_bitmap_destroy(&ri.inobt_blocks);
713         xagb_bitmap_destroy(&ri.ichunk_blocks);
714         return error;
715 }
716 
717 /* Record a CoW staging extent. */
718 STATIC int
719 xrep_rmap_walk_cowblocks(
720         struct xfs_btree_cur            *cur,
721         const struct xfs_refcount_irec  *irec,
722         void                            *priv)
723 {
724         struct xagb_bitmap              *bitmap = priv;
725 
726         if (!xfs_refcount_check_domain(irec) ||
727             irec->rc_domain != XFS_REFC_DOMAIN_COW)
728                 return -EFSCORRUPTED;
729 
730         return xagb_bitmap_set(bitmap, irec->rc_startblock, irec->rc_blockcount);
731 }
732 
733 /*
734  * Collect rmaps for the blocks containing the refcount btree, and all CoW
735  * staging extents.
736  */
737 STATIC int
738 xrep_rmap_find_refcount_rmaps(
739         struct xrep_rmap        *rr)
740 {
741         struct xagb_bitmap      refcountbt_blocks;      /* REFCBIT */
742         struct xagb_bitmap      cow_blocks;             /* COWBIT */
743         struct xfs_refcount_irec low = {
744                 .rc_startblock  = 0,
745                 .rc_domain      = XFS_REFC_DOMAIN_COW,
746         };
747         struct xfs_refcount_irec high = {
748                 .rc_startblock  = -1U,
749                 .rc_domain      = XFS_REFC_DOMAIN_COW,
750         };
751         struct xfs_scrub        *sc = rr->sc;
752         int                     error;
753 
754         if (!xfs_has_reflink(sc->mp))
755                 return 0;
756 
757         xagb_bitmap_init(&refcountbt_blocks);
758         xagb_bitmap_init(&cow_blocks);
759 
760         /* refcountbt */
761         error = xagb_bitmap_set_btblocks(&refcountbt_blocks, sc->sa.refc_cur);
762         if (error)
763                 goto out_bitmap;
764 
765         /* Collect rmaps for CoW staging extents. */
766         error = xfs_refcount_query_range(sc->sa.refc_cur, &low, &high,
767                         xrep_rmap_walk_cowblocks, &cow_blocks);
768         if (error)
769                 goto out_bitmap;
770 
771         /* Generate rmaps for everything. */
772         error = xrep_rmap_stash_bitmap(rr, &cow_blocks, &XFS_RMAP_OINFO_COW);
773         if (error)
774                 goto out_bitmap;
775         error = xrep_rmap_stash_bitmap(rr, &refcountbt_blocks,
776                         &XFS_RMAP_OINFO_REFC);
777 
778 out_bitmap:
779         xagb_bitmap_destroy(&cow_blocks);
780         xagb_bitmap_destroy(&refcountbt_blocks);
781         return error;
782 }
783 
784 /* Generate rmaps for the AG headers (AGI/AGF/AGFL) */
785 STATIC int
786 xrep_rmap_find_agheader_rmaps(
787         struct xrep_rmap        *rr)
788 {
789         struct xfs_scrub        *sc = rr->sc;
790 
791         /* Create a record for the AG sb->agfl. */
792         return xrep_rmap_stash(rr, XFS_SB_BLOCK(sc->mp),
793                         XFS_AGFL_BLOCK(sc->mp) - XFS_SB_BLOCK(sc->mp) + 1,
794                         XFS_RMAP_OWN_FS, 0, 0);
795 }
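
/*
 * Editor's note, a worked example that is not part of the upstream file:
 * with 512-byte sectors and 4k blocks, the superblock, AGF, AGI, and AGFL
 * all land in agblock 0, so this emits a single OWN_FS record of length 1.
 * With sector size equal to block size, each header occupies its own block
 * and the record covers agblocks 0-3.
 */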
796 
797 /* Generate rmaps for the log, if it's in this AG. */
798 STATIC int
799 xrep_rmap_find_log_rmaps(
800         struct xrep_rmap        *rr)
801 {
802         struct xfs_scrub        *sc = rr->sc;
803 
804         if (!xfs_ag_contains_log(sc->mp, sc->sa.pag->pag_agno))
805                 return 0;
806 
807         return xrep_rmap_stash(rr,
808                         XFS_FSB_TO_AGBNO(sc->mp, sc->mp->m_sb.sb_logstart),
809                         sc->mp->m_sb.sb_logblocks, XFS_RMAP_OWN_LOG, 0, 0);
810 }
811 
812 /* Check and count all the records that we gathered. */
813 STATIC int
814 xrep_rmap_check_record(
815         struct xfs_btree_cur            *cur,
816         const struct xfs_rmap_irec      *rec,
817         void                            *priv)
818 {
819         struct xrep_rmap                *rr = priv;
820         int                             error;
821 
822         error = xrep_rmap_check_mapping(rr->sc, rec);
823         if (error)
824                 return error;
825 
826         rr->nr_records++;
827         return 0;
828 }
829 
830 /*
831  * Generate all the reverse-mappings for this AG, a list of the old rmapbt
832  * blocks, and the new btreeblks count.  Figure out if we have enough free
 833  * space to reconstruct the rmap btree.  The caller must clean up the lists
834  * if anything goes wrong.  This implements section (I) above.
835  */
836 STATIC int
837 xrep_rmap_find_rmaps(
838         struct xrep_rmap        *rr)
839 {
840         struct xfs_scrub        *sc = rr->sc;
841         struct xchk_ag          *sa = &sc->sa;
842         struct xfs_inode        *ip;
843         struct xfs_btree_cur    *mcur;
844         int                     error;
845 
846         /* Find all the per-AG metadata. */
847         xrep_ag_btcur_init(sc, &sc->sa);
848 
849         error = xrep_rmap_find_inode_rmaps(rr);
850         if (error)
851                 goto end_agscan;
852 
853         error = xrep_rmap_find_refcount_rmaps(rr);
854         if (error)
855                 goto end_agscan;
856 
857         error = xrep_rmap_find_agheader_rmaps(rr);
858         if (error)
859                 goto end_agscan;
860 
861         error = xrep_rmap_find_log_rmaps(rr);
862 end_agscan:
863         xchk_ag_btcur_free(&sc->sa);
864         if (error)
865                 return error;
866 
867         /*
868          * Set up for a potentially lengthy filesystem scan by reducing our
869          * transaction resource usage for the duration.  Specifically:
870          *
871          * Unlock the AG header buffers and cancel the transaction to release
872          * the log grant space while we scan the filesystem.
873          *
874          * Create a new empty transaction to eliminate the possibility of the
875          * inode scan deadlocking on cyclical metadata.
876          *
877          * We pass the empty transaction to the file scanning function to avoid
878          * repeatedly cycling empty transactions.  This can be done even though
879          * we take the IOLOCK to quiesce the file because empty transactions
880          * do not take sb_internal.
881          */
882         sa->agf_bp = NULL;
883         sa->agi_bp = NULL;
884         xchk_trans_cancel(sc);
885         error = xchk_trans_alloc_empty(sc);
886         if (error)
887                 return error;
888 
 889         /* Scan every inode in the filesystem for rmap records. */
890         while ((error = xchk_iscan_iter(&rr->iscan, &ip)) == 1) {
891                 error = xrep_rmap_scan_inode(rr, ip);
892                 xchk_irele(sc, ip);
893                 if (error)
894                         break;
895 
896                 if (xchk_should_terminate(sc, &error))
897                         break;
898         }
899         xchk_iscan_iter_finish(&rr->iscan);
900         if (error)
901                 return error;
902 
903         /*
904          * Switch out for a real transaction and lock the AG headers in
905          * preparation for building a new tree.
906          */
907         xchk_trans_cancel(sc);
908         error = xchk_setup_fs(sc);
909         if (error)
910                 return error;
911         error = xchk_perag_drain_and_lock(sc);
912         if (error)
913                 return error;
914 
915         /*
916          * If a hook failed to update the in-memory btree, we lack the data to
917          * continue the repair.
918          */
919         if (xchk_iscan_aborted(&rr->iscan))
920                 return -EFSCORRUPTED;
921 
922         /*
923          * Now that we have everything locked again, we need to count the
924          * number of rmap records stashed in the btree.  This should reflect
925          * all actively-owned space in the filesystem.  At the same time, check
926          * all our records before we start building a new btree, which requires
927          * a bnobt cursor.
928          */
929         mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL, &rr->rmap_btree);
930         sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
931                         sc->sa.pag);
932 
933         rr->nr_records = 0;
934         error = xfs_rmap_query_all(mcur, xrep_rmap_check_record, rr);
935 
936         xfs_btree_del_cursor(sc->sa.bno_cur, error);
937         sc->sa.bno_cur = NULL;
938         xfs_btree_del_cursor(mcur, error);
939 
940         return error;
941 }
942 
943 /* Section (II): Reserving space for new rmapbt and setting free space bitmap */
944 
945 struct xrep_rmap_agfl {
946         struct xagb_bitmap      *bitmap;
947         xfs_agnumber_t          agno;
948 };
949 
950 /* Add an AGFL block to the rmap list. */
951 STATIC int
952 xrep_rmap_walk_agfl(
953         struct xfs_mount        *mp,
954         xfs_agblock_t           agbno,
955         void                    *priv)
956 {
957         struct xrep_rmap_agfl   *ra = priv;
958 
959         return xagb_bitmap_set(ra->bitmap, agbno, 1);
960 }
961 
962 /*
963  * Run one round of reserving space for the new rmapbt and recomputing the
964  * number of blocks needed to store the previously observed rmapbt records and
965  * the ones we'll create for the free space metadata.  When we don't need more
966  * blocks, return a bitmap of OWN_AG extents in @freesp_blocks and set @done to
967  * true.
968  */
969 STATIC int
970 xrep_rmap_try_reserve(
971         struct xrep_rmap        *rr,
972         struct xfs_btree_cur    *rmap_cur,
973         struct xagb_bitmap      *freesp_blocks,
974         uint64_t                *blocks_reserved,
975         bool                    *done)
976 {
977         struct xrep_rmap_agfl   ra = {
978                 .bitmap         = freesp_blocks,
979                 .agno           = rr->sc->sa.pag->pag_agno,
980         };
981         struct xfs_scrub        *sc = rr->sc;
982         struct xrep_newbt_resv  *resv, *n;
983         struct xfs_agf          *agf = sc->sa.agf_bp->b_addr;
984         struct xfs_buf          *agfl_bp;
985         uint64_t                nr_blocks;      /* RMB */
986         uint64_t                freesp_records;
987         int                     error;
988 
989         /*
990          * We're going to recompute new_btree.bload.nr_blocks at the end of
991          * this function to reflect however many btree blocks we need to store
992          * all the rmap records (including the ones that reflect the changes we
993          * made to support the new rmapbt blocks), so we save the old value
994          * here so we can decide if we've reserved enough blocks.
995          */
996         nr_blocks = rr->new_btree.bload.nr_blocks;
997 
998         /*
999          * Make sure we've reserved enough space for the new btree.  This can
1000          * change the shape of the free space btrees, which can cause secondary
1001          * interactions with the rmap records because all three space btrees
1002          * have the same rmap owner.  We'll account for all that below.
1003          */
1004         error = xrep_newbt_alloc_blocks(&rr->new_btree,
1005                         nr_blocks - *blocks_reserved);
1006         if (error)
1007                 return error;
1008 
1009         *blocks_reserved = rr->new_btree.bload.nr_blocks;
1010 
1011         /* Clear everything in the bitmap. */
1012         xagb_bitmap_destroy(freesp_blocks);
1013 
1014         /* Set all the bnobt blocks in the bitmap. */
1015         sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
1016                         sc->sa.pag);
1017         error = xagb_bitmap_set_btblocks(freesp_blocks, sc->sa.bno_cur);
1018         xfs_btree_del_cursor(sc->sa.bno_cur, error);
1019         sc->sa.bno_cur = NULL;
1020         if (error)
1021                 return error;
1022 
1023         /* Set all the cntbt blocks in the bitmap. */
1024         sc->sa.cnt_cur = xfs_cntbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
1025                         sc->sa.pag);
1026         error = xagb_bitmap_set_btblocks(freesp_blocks, sc->sa.cnt_cur);
1027         xfs_btree_del_cursor(sc->sa.cnt_cur, error);
1028         sc->sa.cnt_cur = NULL;
1029         if (error)
1030                 return error;
1031 
1032         /* Record our new btreeblks value. */
1033         rr->freesp_btblocks = xagb_bitmap_hweight(freesp_blocks) - 2;
1034 
1035         /* Set all the new rmapbt blocks in the bitmap. */
1036         list_for_each_entry_safe(resv, n, &rr->new_btree.resv_list, list) {
1037                 error = xagb_bitmap_set(freesp_blocks, resv->agbno, resv->len);
1038                 if (error)
1039                         return error;
1040         }
1041 
1042         /* Set all the AGFL blocks in the bitmap. */
1043         error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
1044         if (error)
1045                 return error;
1046 
1047         error = xfs_agfl_walk(sc->mp, agf, agfl_bp, xrep_rmap_walk_agfl, &ra);
1048         if (error)
1049                 return error;
1050 
1051         /* Count the extents in the bitmap. */
1052         freesp_records = xagb_bitmap_count_set_regions(freesp_blocks);
1053 
1054         /* Compute how many blocks we'll need for all the rmaps. */
1055         error = xfs_btree_bload_compute_geometry(rmap_cur,
1056                         &rr->new_btree.bload, rr->nr_records + freesp_records);
1057         if (error)
1058                 return error;
1059 
1060         /* We're done when we don't need more blocks. */
1061         *done = nr_blocks >= rr->new_btree.bload.nr_blocks;
1062         return 0;
1063 }
1064 
1065 /*
1066  * Iteratively reserve space for rmap btree while recording OWN_AG rmaps for
1067  * the free space metadata.  This implements section (II) above.
1068  */
1069 STATIC int
1070 xrep_rmap_reserve_space(
1071         struct xrep_rmap        *rr,
1072         struct xfs_btree_cur    *rmap_cur)
1073 {
1074         struct xagb_bitmap      freesp_blocks;  /* AGBIT */
1075         uint64_t                blocks_reserved = 0;
1076         bool                    done = false;
1077         int                     error;
1078 
1079         /* Compute how many blocks we'll need for the rmaps collected so far. */
1080         error = xfs_btree_bload_compute_geometry(rmap_cur,
1081                         &rr->new_btree.bload, rr->nr_records);
1082         if (error)
1083                 return error;
1084 
1085         /* Last chance to abort before we start committing fixes. */
1086         if (xchk_should_terminate(rr->sc, &error))
1087                 return error;
1088 
1089         xagb_bitmap_init(&freesp_blocks);
1090 
1091         /*
1092          * Iteratively reserve space for the new rmapbt and recompute the
1093          * number of blocks needed to store the previously observed rmapbt
1094          * records and the ones we'll create for the free space metadata.
1095          * Finish when we don't need more blocks.
1096          */
1097         do {
1098                 error = xrep_rmap_try_reserve(rr, rmap_cur, &freesp_blocks,
1099                                 &blocks_reserved, &done);
1100                 if (error)
1101                         goto out_bitmap;
1102         } while (!done);
1103 
1104         /* Emit rmaps for everything in the free space bitmap. */
1105         xrep_ag_btcur_init(rr->sc, &rr->sc->sa);
1106         error = xrep_rmap_stash_bitmap(rr, &freesp_blocks, &XFS_RMAP_OINFO_AG);
1107         xchk_ag_btcur_free(&rr->sc->sa);
1108 
1109 out_bitmap:
1110         xagb_bitmap_destroy(&freesp_blocks);
1111         return error;
1112 }
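
/*
 * Editor's note, a worked example that is not part of the upstream file:
 * suppose the collected records need RMB = 4 btree blocks.  Reserving
 * those 4 blocks can reshape the bnobt/cntbt and refill the AGFL, adding
 * OWN_AG records, so the recomputed estimate might come back as RMB' = 5;
 * xrep_rmap_try_reserve then reserves one more block and rescans.  Each
 * round adds only a handful of OWN_AG extents while capacity grows by
 * whole blocks, so nr_blocks converges and *done becomes true.
 */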
1113 
1114 /* Section (III): Building the new rmap btree. */
1115 
1116 /* Update the AGF counters. */
1117 STATIC int
1118 xrep_rmap_reset_counters(
1119         struct xrep_rmap        *rr)
1120 {
1121         struct xfs_scrub        *sc = rr->sc;
1122         struct xfs_perag        *pag = sc->sa.pag;
1123         struct xfs_agf          *agf = sc->sa.agf_bp->b_addr;
1124         xfs_agblock_t           rmap_btblocks;
1125 
1126         /*
1127          * The AGF header contains extra information related to the reverse
1128          * mapping btree, so we must update those fields here.
1129          */
1130         rmap_btblocks = rr->new_btree.afake.af_blocks - 1;
1131         agf->agf_btreeblks = cpu_to_be32(rr->freesp_btblocks + rmap_btblocks);
1132         xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_BTREEBLKS);
1133 
1134         /*
1135          * After we commit the new btree to disk, it is possible that the
1136          * process to reap the old btree blocks will race with the AIL trying
1137          * to checkpoint the old btree blocks into the filesystem.  If the new
1138          * tree is shorter than the old one, the rmapbt write verifier will
1139          * fail and the AIL will shut down the filesystem.
1140          *
1141          * To avoid this, save the old incore btree height values as the alt
1142          * height values before re-initializing the perag info from the updated
1143          * AGF to capture all the new values.
1144          */
1145         pag->pagf_repair_rmap_level = pag->pagf_rmap_level;
1146 
1147         /* Reinitialize with the values we just logged. */
1148         return xrep_reinit_pagf(sc);
1149 }
1150 
1151 /* Retrieve rmapbt data for bulk load. */
1152 STATIC int
1153 xrep_rmap_get_records(
1154         struct xfs_btree_cur    *cur,
1155         unsigned int            idx,
1156         struct xfs_btree_block  *block,
1157         unsigned int            nr_wanted,
1158         void                    *priv)
1159 {
1160         struct xrep_rmap        *rr = priv;
1161         union xfs_btree_rec     *block_rec;
1162         unsigned int            loaded;
1163         int                     error;
1164 
1165         for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
1166                 int             stat = 0;
1167 
1168                 error = xfs_btree_increment(rr->mcur, 0, &stat);
1169                 if (error)
1170                         return error;
1171                 if (!stat)
1172                         return -EFSCORRUPTED;
1173 
1174                 error = xfs_rmap_get_rec(rr->mcur, &cur->bc_rec.r, &stat);
1175                 if (error)
1176                         return error;
1177                 if (!stat)
1178                         return -EFSCORRUPTED;
1179 
1180                 block_rec = xfs_btree_rec_addr(cur, idx, block);
1181                 cur->bc_ops->init_rec_from_cur(cur, block_rec);
1182         }
1183 
1184         return loaded;
1185 }
1186 
1187 /* Feed one of the new btree blocks to the bulk loader. */
1188 STATIC int
1189 xrep_rmap_claim_block(
1190         struct xfs_btree_cur    *cur,
1191         union xfs_btree_ptr     *ptr,
1192         void                    *priv)
1193 {
1194         struct xrep_rmap        *rr = priv;
1195 
1196         return xrep_newbt_claim_block(cur, &rr->new_btree, ptr);
1197 }
1198 
1199 /* Custom allocation function for new rmap btrees. */
1200 STATIC int
1201 xrep_rmap_alloc_vextent(
1202         struct xfs_scrub        *sc,
1203         struct xfs_alloc_arg    *args,
1204         xfs_fsblock_t           alloc_hint)
1205 {
1206         int                     error;
1207 
1208         /*
1209          * We don't want an rmap update on the allocation, since we iteratively
1210          * compute the OWN_AG records /after/ allocating blocks for the records
1211          * that we already know we need to store.  Therefore, fix the freelist
1212          * with the NORMAP flag set so that we don't also try to create an rmap
1213          * for new AGFL blocks.
1214          */
1215         error = xrep_fix_freelist(sc, XFS_ALLOC_FLAG_NORMAP);
1216         if (error)
1217                 return error;
1218 
1219         /*
1220          * If xrep_fix_freelist fixed the freelist by moving blocks from the
1221          * free space btrees or by removing blocks from the AGFL and queueing
1222          * an EFI to free the block, the transaction will be dirty.  This
1223          * second case is of interest to us.
1224          *
1225          * Later on, we will need to compare gaps in the new recordset against
1226          * the block usage of all OWN_AG owners in order to free the old
1227          * btree's blocks, which means that we can't have EFIs for former AGFL
1228          * blocks attached to the repair transaction when we commit the new
1229          * btree.
1230          *
1231          * xrep_newbt_alloc_blocks guarantees this for us by calling
1232          * xrep_defer_finish to commit anything that fix_freelist may have
1233          * added to the transaction.
1234          */
1235         return xfs_alloc_vextent_near_bno(args, alloc_hint);
1236 }
1237 
1238 
1239 /* Count the records in this btree. */
1240 STATIC int
1241 xrep_rmap_count_records(
1242         struct xfs_btree_cur    *cur,
1243         unsigned long long      *nr)
1244 {
1245         int                     running = 1;
1246         int                     error;
1247 
1248         *nr = 0;
1249 
1250         error = xfs_btree_goto_left_edge(cur);
1251         if (error)
1252                 return error;
1253 
1254         while (running && !(error = xfs_btree_increment(cur, 0, &running))) {
1255                 if (running)
1256                         (*nr)++;
1257         }
1258 
1259         return error;
1260 }
1261 /*
1262  * Use the collected rmap information to stage a new rmap btree.  If this is
1263  * successful we'll return with the new btree root information logged to the
1264  * repair transaction but not yet committed.  This implements section (III)
1265  * above.
1266  */
1267 STATIC int
1268 xrep_rmap_build_new_tree(
1269         struct xrep_rmap        *rr)
1270 {
1271         struct xfs_scrub        *sc = rr->sc;
1272         struct xfs_perag        *pag = sc->sa.pag;
1273         struct xfs_agf          *agf = sc->sa.agf_bp->b_addr;
1274         struct xfs_btree_cur    *rmap_cur;
1275         xfs_fsblock_t           fsbno;
1276         int                     error;
1277 
1278         /*
1279          * Preserve the old rmapbt block count so that we can adjust the
1280          * per-AG rmapbt reservation after we commit the new btree root and
1281          * want to dispose of the old btree blocks.
1282          */
1283         rr->old_rmapbt_fsbcount = be32_to_cpu(agf->agf_rmap_blocks);
1284 
1285         /*
1286          * Prepare to construct the new btree by reserving disk space for the
1287          * new btree and setting up all the accounting information we'll need
1288          * to root the new btree while it's under construction and before we
1289          * attach it to the AG header.  The new blocks are accounted to the
1290          * rmapbt per-AG reservation, which we will adjust further after
1291          * committing the new btree.
1292          */
1293         fsbno = XFS_AGB_TO_FSB(sc->mp, pag->pag_agno, XFS_RMAP_BLOCK(sc->mp));
1294         xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_SKIP_UPDATE,
1295                         fsbno, XFS_AG_RESV_RMAPBT);
1296         rr->new_btree.bload.get_records = xrep_rmap_get_records;
1297         rr->new_btree.bload.claim_block = xrep_rmap_claim_block;
1298         rr->new_btree.alloc_vextent = xrep_rmap_alloc_vextent;
1299         rmap_cur = xfs_rmapbt_init_cursor(sc->mp, NULL, NULL, pag);
1300         xfs_btree_stage_afakeroot(rmap_cur, &rr->new_btree.afake);
1301 
1302         /*
1303          * Initialize @rr->new_btree, reserve space for the new rmapbt,
1304          * and compute OWN_AG rmaps.
1305          */
1306         error = xrep_rmap_reserve_space(rr, rmap_cur);
1307         if (error)
1308                 goto err_cur;
1309 
1310         /*
1311          * Count the rmapbt records again, because the space reservation
1312          * for the rmapbt itself probably added more records to the btree.
1313          */
1314         rr->mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL,
1315                         &rr->rmap_btree);
1316 
1317         error = xrep_rmap_count_records(rr->mcur, &rr->nr_records);
1318         if (error)
1319                 goto err_mcur;
1320 
1321         /*
1322          * Due to btree slack factors, it's possible for a new btree to be one
1323          * level taller than the old btree.  Update the incore btree height so
1324          * that we don't trip the verifiers when writing the new btree blocks
1325          * to disk.
1326          */
1327         pag->pagf_repair_rmap_level = rr->new_btree.bload.btree_height;
1328 
1329         /*
1330          * Move the cursor to the left edge of the tree so that the first
1331          * increment in ->get_records positions us at the first record.
1332          */
1333         error = xfs_btree_goto_left_edge(rr->mcur);
1334         if (error)
1335                 goto err_level;
1336 
1337         /* Add all observed rmap records. */
1338         error = xfs_btree_bload(rmap_cur, &rr->new_btree.bload, rr);
1339         if (error)
1340                 goto err_level;
1341 
1342         /*
1343          * Install the new btree in the AG header.  After this point the old
1344          * btree is no longer accessible and the new tree is live.
1345          */
1346         xfs_rmapbt_commit_staged_btree(rmap_cur, sc->tp, sc->sa.agf_bp);
1347         xfs_btree_del_cursor(rmap_cur, 0);
1348         xfs_btree_del_cursor(rr->mcur, 0);
1349         rr->mcur = NULL;
1350 
1351         /*
1352          * Now that we've written the new btree to disk, we don't need to keep
1353          * updating the in-memory btree.  Abort the scan to stop live updates.
1354          */
1355         xchk_iscan_abort(&rr->iscan);
1356 
1357         /*
1358          * The newly committed rmap recordset includes mappings for the blocks
1359          * that we reserved to build the new btree.  If there is excess space
1360          * reservation to be freed, the corresponding rmap records must also be
1361          * removed.
1362          */
1363         rr->new_btree.oinfo = XFS_RMAP_OINFO_AG;
1364 
1365         /* Reset the AGF counters now that we've changed the btree shape. */
1366         error = xrep_rmap_reset_counters(rr);
1367         if (error)
1368                 goto err_newbt;
1369 
1370         /* Dispose of any unused blocks and the accounting information. */
1371         error = xrep_newbt_commit(&rr->new_btree);
1372         if (error)
1373                 return error;
1374 
1375         return xrep_roll_ag_trans(sc);
1376 
1377 err_level:
1378         pag->pagf_repair_rmap_level = 0;
1379 err_mcur:
1380         xfs_btree_del_cursor(rr->mcur, error);
1381 err_cur:
1382         xfs_btree_del_cursor(rmap_cur, error);
1383 err_newbt:
1384         xrep_newbt_cancel(&rr->new_btree);
1385         return error;
1386 }
1387 
1388 /* Section (IV): Reaping the old btree. */
1389 
1390 struct xrep_rmap_find_gaps {
1391         struct xagb_bitmap      rmap_gaps;
1392         xfs_agblock_t           next_agbno;
1393 };
1394 
1395 /* Subtract each free extent in the bnobt from the rmap gaps. */
1396 STATIC int
1397 xrep_rmap_find_freesp(
1398         struct xfs_btree_cur            *cur,
1399         const struct xfs_alloc_rec_incore *rec,
1400         void                            *priv)
1401 {
1402         struct xrep_rmap_find_gaps      *rfg = priv;
1403 
1404         return xagb_bitmap_clear(&rfg->rmap_gaps, rec->ar_startblock,
1405                         rec->ar_blockcount);
1406 }
1407 
1408 /* Record the free space we find, as part of cleaning out the btree. */
1409 STATIC int
1410 xrep_rmap_find_gaps(
1411         struct xfs_btree_cur            *cur,
1412         const struct xfs_rmap_irec      *rec,
1413         void                            *priv)
1414 {
1415         struct xrep_rmap_find_gaps      *rfg = priv;
1416         int                             error;
1417 
1418         if (rec->rm_startblock > rfg->next_agbno) {
1419                 error = xagb_bitmap_set(&rfg->rmap_gaps, rfg->next_agbno,
1420                                 rec->rm_startblock - rfg->next_agbno);
1421                 if (error)
1422                         return error;
1423         }
1424 
1425         rfg->next_agbno = max_t(xfs_agblock_t, rfg->next_agbno,
1426                                 rec->rm_startblock + rec->rm_blockcount);
1427         return 0;
1428 }
1429 
1430 /*
1431  * Reap the old rmapbt blocks.  Now that the rmapbt is fully rebuilt, we make
1432  * a list of gaps in the rmap records and a list of the extents mentioned in
1433  * the bnobt.  Any block that's in the new rmapbt gap list but not mentioned
1434  * in the bnobt is a block from the old rmapbt and can be removed.
1435  */
1436 STATIC int
1437 xrep_rmap_remove_old_tree(
1438         struct xrep_rmap        *rr)
1439 {
1440         struct xrep_rmap_find_gaps rfg = {
1441                 .next_agbno     = 0,
1442         };
1443         struct xfs_scrub        *sc = rr->sc;
1444         struct xfs_agf          *agf = sc->sa.agf_bp->b_addr;
1445         struct xfs_perag        *pag = sc->sa.pag;
1446         struct xfs_btree_cur    *mcur;
1447         xfs_agblock_t           agend;
1448         int                     error;
1449 
1450         xagb_bitmap_init(&rfg.rmap_gaps);
1451 
1452         /* Compute the gaps (unmapped space) from the new rmapbt. */
1453         mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL, &rr->rmap_btree);
1454 
1455         error = xfs_rmap_query_all(mcur, xrep_rmap_find_gaps, &rfg);
1456         xfs_btree_del_cursor(mcur, error);
1457         if (error)
1458                 goto out_bitmap;
1459 
1460         /* Mark the gap between the last rmap and EOAG. */
1461         agend = be32_to_cpu(agf->agf_length);
1462         if (rfg.next_agbno < agend) {
1463                 error = xagb_bitmap_set(&rfg.rmap_gaps, rfg.next_agbno,
1464                                 agend - rfg.next_agbno);
1465                 if (error)
1466                         goto out_bitmap;
1467         }
1468 
1469         /* Subtract the free space tracked by the existing bnobt. */
1470         sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
1471                         sc->sa.pag);
1472         error = xfs_alloc_query_all(sc->sa.bno_cur, xrep_rmap_find_freesp,
1473                         &rfg);
1474         xfs_btree_del_cursor(sc->sa.bno_cur, error);
1475         sc->sa.bno_cur = NULL;
1476         if (error)
1477                 goto out_bitmap;
1478 
1479         /*
1480          * Free the "free" blocks that the new rmapbt knows about but the bnobt
1481          * doesn't--these are the old rmapbt blocks.  Credit the old rmapbt
1482          * block usage count back to the per-AG rmapbt reservation (and not
1483          * fdblocks, since the rmap btree lives in free space) to keep the
1484          * reservation and free space accounting correct.
1485          */
1486         error = xrep_reap_agblocks(sc, &rfg.rmap_gaps,
1487                         &XFS_RMAP_OINFO_ANY_OWNER, XFS_AG_RESV_RMAPBT);
1488         if (error)
1489                 goto out_bitmap;
1490 
1491         /*
1492          * Now that we've zapped all the old rmapbt blocks, we can turn off
1493          * the alternate height mechanism and reset the per-AG space
1494          * reservation.
1495          */
1496         pag->pagf_repair_rmap_level = 0;
1497         sc->flags |= XREP_RESET_PERAG_RESV;
1498 out_bitmap:
1499         xagb_bitmap_destroy(&rfg.rmap_gaps);
1500         return error;
1501 }
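
     /*
      * In set terms, the function above reaps
      *
      *     old rmapbt blocks = (AG size - rmap coverage) - bnobt free space
      *
      * and credits the reaped blocks to XFS_AG_RESV_RMAPBT rather than
      * to fdblocks, since rmapbt blocks are accounted against the
      * per-AG reservation that is carved out of free space.
      */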
1502 
1503 static inline bool
1504 xrep_rmapbt_want_live_update(
1505         struct xchk_iscan               *iscan,
1506         const struct xfs_owner_info     *oi)
1507 {
1508         if (xchk_iscan_aborted(iscan))
1509                 return false;
1510 
1511         /*
1512          * Before unlocking the AG header to perform the inode scan, we
1513          * recorded reverse mappings for all AG metadata except for the OWN_AG
1514          * metadata.  IOWs, the in-memory btree knows about the AG headers, the
1515          * two inode btrees, the CoW staging extents, and the refcount btrees.
1516          * For these types of metadata, we need to record the live updates in
1517          * the in-memory rmap btree.
1518          *
1519          * However, we do not scan the free space btrees or the AGFL until we
1520          * have re-locked the AGF and are ready to reserve space for the new
1521          * rmap btree, so we do not want live updates for OWN_AG metadata.
1522          */
1523         if (XFS_RMAP_NON_INODE_OWNER(oi->oi_owner))
1524                 return oi->oi_owner != XFS_RMAP_OWN_AG;
1525 
1526         /* Ignore updates to files that the scanner hasn't visited yet. */
1527         return xchk_iscan_want_live_update(iscan, oi->oi_owner);
1528 }
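
     /*
      * Worked examples: an update tagged XFS_RMAP_OWN_AG (free space
      * btrees, AGFL, and rmapbt blocks) is never applied here; an
      * update for another special owner such as XFS_RMAP_OWN_INOBT is
      * always applied; an update whose owner is a regular inode number
      * is applied only if the inode scan has already visited that
      * inode.
      */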
1529 
1530 /*
1531  * Apply an rmapbt update from the regular filesystem to our shadow btree.
1532  * We're running from the thread that owns the AGF buffer and is generating
1533  * the update, so we must be careful about which parts of the struct
1534  * xrep_rmap we change.
1535  */
1536 static int
1537 xrep_rmapbt_live_update(
1538         struct notifier_block           *nb,
1539         unsigned long                   action,
1540         void                            *data)
1541 {
1542         struct xfs_rmap_update_params   *p = data;
1543         struct xrep_rmap                *rr;
1544         struct xfs_mount                *mp;
1545         struct xfs_btree_cur            *mcur;
1546         struct xfs_trans                *tp;
1547         void                            *txcookie;
1548         int                             error;
1549 
1550         rr = container_of(nb, struct xrep_rmap, rhook.rmap_hook.nb);
1551         mp = rr->sc->mp;
1552 
1553         if (!xrep_rmapbt_want_live_update(&rr->iscan, &p->oinfo))
1554                 goto out_unlock;
1555 
1556         trace_xrep_rmap_live_update(mp, rr->sc->sa.pag->pag_agno, action, p);
1557 
1558         error = xrep_trans_alloc_hook_dummy(mp, &txcookie, &tp);
1559         if (error)
1560                 goto out_abort;
1561 
1562         mutex_lock(&rr->lock);
1563         mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, tp, &rr->rmap_btree);
1564         error = __xfs_rmap_finish_intent(mcur, action, p->startblock,
1565                         p->blockcount, &p->oinfo, p->unwritten);
1566         xfs_btree_del_cursor(mcur, error);
1567         if (error)
1568                 goto out_cancel;
1569 
1570         error = xfbtree_trans_commit(&rr->rmap_btree, tp);
1571         if (error)
1572                 goto out_cancel;
1573 
1574         xrep_trans_cancel_hook_dummy(&txcookie, tp);
1575         mutex_unlock(&rr->lock);
1576         return NOTIFY_DONE;
1577 
1578 out_cancel:
1579         xfbtree_trans_cancel(&rr->rmap_btree, tp);
1580         xrep_trans_cancel_hook_dummy(&txcookie, tp);
1581 out_abort:
1582         mutex_unlock(&rr->lock);
1583         xchk_iscan_abort(&rr->iscan);
1584 out_unlock:
1585         return NOTIFY_DONE;
1586 }
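
     /*
      * The notifier_block is embedded in rr->rhook, so the
      * container_of() above recovers the repair context from the nb
      * pointer -- the usual kernel idiom for carrying state through a
      * notifier chain.  Note the error policy: any failure to apply a
      * live update aborts the inode scan, which in turn causes
      * xrep_rmapbt_want_live_update() to reject all subsequent updates,
      * so a stale shadow btree can never be committed.
      */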
1587 
1588 /* Set up the filesystem scan components. */
1589 STATIC int
1590 xrep_rmap_setup_scan(
1591         struct xrep_rmap        *rr)
1592 {
1593         struct xfs_scrub        *sc = rr->sc;
1594         int                     error;
1595 
1596         mutex_init(&rr->lock);
1597 
1598         /* Set up in-memory rmap btree */
1599         error = xfs_rmapbt_mem_init(sc->mp, &rr->rmap_btree, sc->xmbtp,
1600                         sc->sa.pag->pag_agno);
1601         if (error)
1602                 goto out_mutex;
1603 
1604         /* Retry iget every tenth of a second for up to 30 seconds. */
1605         xchk_iscan_start(sc, 30000, 100, &rr->iscan);
1606 
1607         /*
1608          * Hook into live rmap operations so that we can update our in-memory
1609          * btree to reflect live changes on the filesystem.  Since we drop the
1610          * AGF buffer to scan all the inodes, we need this piece to avoid
1611          * installing a stale btree.
1612          */
1613         ASSERT(sc->flags & XCHK_FSGATES_RMAP);
1614         xfs_rmap_hook_setup(&rr->rhook, xrep_rmapbt_live_update);
1615         error = xfs_rmap_hook_add(sc->sa.pag, &rr->rhook);
1616         if (error)
1617                 goto out_iscan;
1618         return 0;
1619 
1620 out_iscan:
1621         xchk_iscan_teardown(&rr->iscan);
1622         xfbtree_destroy(&rr->rmap_btree);
1623 out_mutex:
1624         mutex_destroy(&rr->lock);
1625         return error;
1626 }
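
     /*
      * Setup order matters: the shadow btree must exist before the
      * inode scan starts, and the scan must be initialized before the
      * hook is installed, because a hook invocation dereferences both
      * rr->rmap_btree and rr->iscan.  The error paths unwind in the
      * opposite order.
      */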
1627 
1628 /* Tear down scan components. */
1629 STATIC void
1630 xrep_rmap_teardown(
1631         struct xrep_rmap        *rr)
1632 {
1633         struct xfs_scrub        *sc = rr->sc;
1634 
1635         xchk_iscan_abort(&rr->iscan);
1636         xfs_rmap_hook_del(sc->sa.pag, &rr->rhook);
1637         xchk_iscan_teardown(&rr->iscan);
1638         xfbtree_destroy(&rr->rmap_btree);
1639         mutex_destroy(&rr->lock);
1640 }
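
     /*
      * Teardown aborts the scan before removing the hook so that a
      * live update racing with teardown observes the aborted scan and
      * returns without touching the shadow btree; only then is it safe
      * to destroy the scan state and the xfbtree.
      */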
1641 
1642 /* Repair the rmap btree for some AG. */
1643 int
1644 xrep_rmapbt(
1645         struct xfs_scrub        *sc)
1646 {
1647         struct xrep_rmap        *rr = sc->buf;
1648         int                     error;
1649 
1650         error = xrep_rmap_setup_scan(rr);
1651         if (error)
1652                 return error;
1653 
1654         /*
1655          * Collect rmaps for everything in this AG that isn't space metadata.
1656          * These rmaps won't change even as we try to allocate blocks.
1657          */
1658         error = xrep_rmap_find_rmaps(rr);
1659         if (error)
1660                 goto out_records;
1661 
1662         /* Rebuild the rmap information. */
1663         error = xrep_rmap_build_new_tree(rr);
1664         if (error)
1665                 goto out_records;
1666 
1667         /* Kill the old tree. */
1668         error = xrep_rmap_remove_old_tree(rr);
1669         if (error)
1670                 goto out_records;
1671 
1672 out_records:
1673         xrep_rmap_teardown(rr);
1674         return error;
1675 }
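
     /*
      * For reference, userspace reaches this entry point through the
      * scrub ioctl.  A minimal sketch (error handling omitted; fd is an
      * open fd on the filesystem whose AG needs repair):
      *
      *     struct xfs_scrub_metadata sm = {
      *             .sm_type  = XFS_SCRUB_TYPE_RMAPBT,
      *             .sm_flags = XFS_SCRUB_IFLAG_REPAIR,
      *             .sm_agno  = agno,
      *     };
      *
      *     error = ioctl(fd, XFS_IOC_SCRUB_METADATA, &sm);
      */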
1676 
