~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

TOMOYO Linux Cross Reference
Linux/fs/xfs/scrub/alloc_repair.c

Version: ~ [ linux-6.11-rc3 ] ~ [ linux-6.10.4 ] ~ [ linux-6.9.12 ] ~ [ linux-6.8.12 ] ~ [ linux-6.7.12 ] ~ [ linux-6.6.45 ] ~ [ linux-6.5.13 ] ~ [ linux-6.4.16 ] ~ [ linux-6.3.13 ] ~ [ linux-6.2.16 ] ~ [ linux-6.1.104 ] ~ [ linux-6.0.19 ] ~ [ linux-5.19.17 ] ~ [ linux-5.18.19 ] ~ [ linux-5.17.15 ] ~ [ linux-5.16.20 ] ~ [ linux-5.15.164 ] ~ [ linux-5.14.21 ] ~ [ linux-5.13.19 ] ~ [ linux-5.12.19 ] ~ [ linux-5.11.22 ] ~ [ linux-5.10.223 ] ~ [ linux-5.9.16 ] ~ [ linux-5.8.18 ] ~ [ linux-5.7.19 ] ~ [ linux-5.6.19 ] ~ [ linux-5.5.19 ] ~ [ linux-5.4.281 ] ~ [ linux-5.3.18 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.319 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.336 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.337 ] ~ [ linux-4.4.302 ] ~ [ linux-3.10.108 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.9 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 // SPDX-License-Identifier: GPL-2.0-or-later
  2 /*
  3  * Copyright (C) 2018-2023 Oracle.  All Rights Reserved.
  4  * Author: Darrick J. Wong <djwong@kernel.org>
  5  */
  6 #include "xfs.h"
  7 #include "xfs_fs.h"
  8 #include "xfs_shared.h"
  9 #include "xfs_format.h"
 10 #include "xfs_trans_resv.h"
 11 #include "xfs_mount.h"
 12 #include "xfs_defer.h"
 13 #include "xfs_btree.h"
 14 #include "xfs_btree_staging.h"
 15 #include "xfs_bit.h"
 16 #include "xfs_log_format.h"
 17 #include "xfs_trans.h"
 18 #include "xfs_sb.h"
 19 #include "xfs_alloc.h"
 20 #include "xfs_alloc_btree.h"
 21 #include "xfs_rmap.h"
 22 #include "xfs_rmap_btree.h"
 23 #include "xfs_inode.h"
 24 #include "xfs_refcount.h"
 25 #include "xfs_extent_busy.h"
 26 #include "xfs_health.h"
 27 #include "xfs_bmap.h"
 28 #include "xfs_ialloc.h"
 29 #include "xfs_ag.h"
 30 #include "scrub/xfs_scrub.h"
 31 #include "scrub/scrub.h"
 32 #include "scrub/common.h"
 33 #include "scrub/btree.h"
 34 #include "scrub/trace.h"
 35 #include "scrub/repair.h"
 36 #include "scrub/bitmap.h"
 37 #include "scrub/agb_bitmap.h"
 38 #include "scrub/xfile.h"
 39 #include "scrub/xfarray.h"
 40 #include "scrub/newbt.h"
 41 #include "scrub/reap.h"
 42 
 43 /*
 44  * Free Space Btree Repair
 45  * =======================
 46  *
 47  * The reverse mappings are supposed to record all space usage for the entire
 48  * AG.  Therefore, we can recreate the free extent records in an AG by looking
 49  * for gaps in the physical extents recorded in the rmapbt.  These records are
 50  * staged in @free_records.  Identifying the gaps is more difficult on a
 51  * reflink filesystem because rmap records are allowed to overlap.
 52  *
 53  * Because the final step of building a new index is to free the space used by
 54  * the old index, repair needs to find that space.  Unfortunately, all
 55  * structures that live in the free space (bnobt, cntbt, rmapbt, agfl) share
 56  * the same rmapbt owner code (OWN_AG), so this is not straightforward.
 57  *
 58  * The scan of the reverse mapping information records the space used by OWN_AG
 59  * in @old_allocbt_blocks, which (at this stage) is somewhat misnamed.  While
 60  * walking the rmapbt records, we create a second bitmap @not_allocbt_blocks to
 61  * record all visited rmap btree blocks and all blocks owned by the AGFL.
 62  *
 63  * After that is where the definition of old_allocbt_blocks shifts.  This
 64  * expression identifies possible former bnobt/cntbt blocks:
 65  *
 66  *      (OWN_AG blocks) & ~(rmapbt blocks | agfl blocks);
 67  *
 68  * Substituting from above definitions, that becomes:
 69  *
 70  *      old_allocbt_blocks & ~not_allocbt_blocks
 71  *
 72  * The OWN_AG bitmap itself isn't needed after this point, so what we really do
 73  * instead is:
 74  *
 75  *      old_allocbt_blocks &= ~not_allocbt_blocks;
 76  *
 77  * After this point, @old_allocbt_blocks is a bitmap of alleged former
 78  * bnobt/cntbt blocks.  The xagb_bitmap_disunion operation modifies its first
 79  * parameter in place to avoid copying records around.
 80  *
 81  * Next, some of the space described by @free_records are diverted to the newbt
 82  * reservation and used to format new btree blocks.  The remaining records are
 83  * written to the new btree indices.  We reconstruct both bnobt and cntbt at
 84  * the same time since we've already done all the work.
 85  *
 86  * We use the prefix 'xrep_abt' here because we regenerate both free space
 87  * allocation btrees at the same time.
 88  */
 89 
/* In-memory state for a free space (bnobt/cntbt) btree repair. */
struct xrep_abt {
	/* Blocks owned by the rmapbt or the agfl. */
	struct xagb_bitmap	not_allocbt_blocks;

	/*
	 * All OWN_AG blocks.  After disunion with @not_allocbt_blocks this
	 * becomes the set of alleged former bnobt/cntbt blocks (see the
	 * theory-of-operation comment above).
	 */
	struct xagb_bitmap	old_allocbt_blocks;

	/*
	 * New bnobt information.  All btree block reservations are added to
	 * the reservation list in new_bnobt.
	 */
	struct xrep_newbt	new_bnobt;

	/* new cntbt information */
	struct xrep_newbt	new_cntbt;

	/* Free space extents staged for loading into the new btrees. */
	struct xfarray		*free_records;

	/* Scrub context for this repair. */
	struct xfs_scrub	*sc;

	/* Number of non-null records in @free_records. */
	uint64_t		nr_real_records;

	/* get_records()'s position in the free space record array. */
	xfarray_idx_t		array_cur;

	/*
	 * Next block we anticipate seeing in the rmap records.  If the next
	 * rmap record is greater than next_agbno, we have found unused space.
	 */
	xfs_agblock_t		next_agbno;

	/* Number of free blocks in this AG; becomes agf_freeblks. */
	xfs_agblock_t		nr_blocks;

	/* Longest free extent we found in the AG; becomes agf_longest. */
	xfs_agblock_t		longest;
};
129 
130 /* Set up to repair AG free space btrees. */
131 int
132 xrep_setup_ag_allocbt(
133         struct xfs_scrub        *sc)
134 {
135         unsigned int            busy_gen;
136 
137         /*
138          * Make sure the busy extent list is clear because we can't put extents
139          * on there twice.
140          */
141         busy_gen = READ_ONCE(sc->sa.pag->pagb_gen);
142         if (xfs_extent_busy_list_empty(sc->sa.pag))
143                 return 0;
144 
145         return xfs_extent_busy_flush(sc->tp, sc->sa.pag, busy_gen, 0);
146 }
147 
148 /* Check for any obvious conflicts in the free extent. */
149 STATIC int
150 xrep_abt_check_free_ext(
151         struct xfs_scrub        *sc,
152         const struct xfs_alloc_rec_incore *rec)
153 {
154         enum xbtree_recpacking  outcome;
155         int                     error;
156 
157         if (xfs_alloc_check_irec(sc->sa.pag, rec) != NULL)
158                 return -EFSCORRUPTED;
159 
160         /* Must not be an inode chunk. */
161         error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur,
162                         rec->ar_startblock, rec->ar_blockcount, &outcome);
163         if (error)
164                 return error;
165         if (outcome != XBTREE_RECPACKING_EMPTY)
166                 return -EFSCORRUPTED;
167 
168         /* Must not be shared or CoW staging. */
169         if (sc->sa.refc_cur) {
170                 error = xfs_refcount_has_records(sc->sa.refc_cur,
171                                 XFS_REFC_DOMAIN_SHARED, rec->ar_startblock,
172                                 rec->ar_blockcount, &outcome);
173                 if (error)
174                         return error;
175                 if (outcome != XBTREE_RECPACKING_EMPTY)
176                         return -EFSCORRUPTED;
177 
178                 error = xfs_refcount_has_records(sc->sa.refc_cur,
179                                 XFS_REFC_DOMAIN_COW, rec->ar_startblock,
180                                 rec->ar_blockcount, &outcome);
181                 if (error)
182                         return error;
183                 if (outcome != XBTREE_RECPACKING_EMPTY)
184                         return -EFSCORRUPTED;
185         }
186 
187         return 0;
188 }
189 
190 /*
191  * Stash a free space record for all the space since the last bno we found
192  * all the way up to @end.
193  */
194 static int
195 xrep_abt_stash(
196         struct xrep_abt         *ra,
197         xfs_agblock_t           end)
198 {
199         struct xfs_alloc_rec_incore arec = {
200                 .ar_startblock  = ra->next_agbno,
201                 .ar_blockcount  = end - ra->next_agbno,
202         };
203         struct xfs_scrub        *sc = ra->sc;
204         int                     error = 0;
205 
206         if (xchk_should_terminate(sc, &error))
207                 return error;
208 
209         error = xrep_abt_check_free_ext(ra->sc, &arec);
210         if (error)
211                 return error;
212 
213         trace_xrep_abt_found(sc->mp, sc->sa.pag->pag_agno, &arec);
214 
215         error = xfarray_append(ra->free_records, &arec);
216         if (error)
217                 return error;
218 
219         ra->nr_blocks += arec.ar_blockcount;
220         return 0;
221 }
222 
223 /* Record extents that aren't in use from gaps in the rmap records. */
224 STATIC int
225 xrep_abt_walk_rmap(
226         struct xfs_btree_cur            *cur,
227         const struct xfs_rmap_irec      *rec,
228         void                            *priv)
229 {
230         struct xrep_abt                 *ra = priv;
231         int                             error;
232 
233         /* Record all the OWN_AG blocks... */
234         if (rec->rm_owner == XFS_RMAP_OWN_AG) {
235                 error = xagb_bitmap_set(&ra->old_allocbt_blocks,
236                                 rec->rm_startblock, rec->rm_blockcount);
237                 if (error)
238                         return error;
239         }
240 
241         /* ...and all the rmapbt blocks... */
242         error = xagb_bitmap_set_btcur_path(&ra->not_allocbt_blocks, cur);
243         if (error)
244                 return error;
245 
246         /* ...and all the free space. */
247         if (rec->rm_startblock > ra->next_agbno) {
248                 error = xrep_abt_stash(ra, rec->rm_startblock);
249                 if (error)
250                         return error;
251         }
252 
253         /*
254          * rmap records can overlap on reflink filesystems, so project
255          * next_agbno as far out into the AG space as we currently know about.
256          */
257         ra->next_agbno = max_t(xfs_agblock_t, ra->next_agbno,
258                         rec->rm_startblock + rec->rm_blockcount);
259         return 0;
260 }
261 
262 /* Collect an AGFL block for the not-to-release list. */
263 static int
264 xrep_abt_walk_agfl(
265         struct xfs_mount        *mp,
266         xfs_agblock_t           agbno,
267         void                    *priv)
268 {
269         struct xrep_abt         *ra = priv;
270 
271         return xagb_bitmap_set(&ra->not_allocbt_blocks, agbno, 1);
272 }
273 
274 /*
275  * Compare two free space extents by block number.  We want to sort in order of
276  * increasing block number.
277  */
278 static int
279 xrep_bnobt_extent_cmp(
280         const void              *a,
281         const void              *b)
282 {
283         const struct xfs_alloc_rec_incore *ap = a;
284         const struct xfs_alloc_rec_incore *bp = b;
285 
286         if (ap->ar_startblock > bp->ar_startblock)
287                 return 1;
288         else if (ap->ar_startblock < bp->ar_startblock)
289                 return -1;
290         return 0;
291 }
292 
293 /*
294  * Re-sort the free extents by block number so that we can put the records into
295  * the bnobt in the correct order.  Make sure the records do not overlap in
296  * physical space.
297  */
298 STATIC int
299 xrep_bnobt_sort_records(
300         struct xrep_abt                 *ra)
301 {
302         struct xfs_alloc_rec_incore     arec;
303         xfarray_idx_t                   cur = XFARRAY_CURSOR_INIT;
304         xfs_agblock_t                   next_agbno = 0;
305         int                             error;
306 
307         error = xfarray_sort(ra->free_records, xrep_bnobt_extent_cmp, 0);
308         if (error)
309                 return error;
310 
311         while ((error = xfarray_iter(ra->free_records, &cur, &arec)) == 1) {
312                 if (arec.ar_startblock < next_agbno)
313                         return -EFSCORRUPTED;
314 
315                 next_agbno = arec.ar_startblock + arec.ar_blockcount;
316         }
317 
318         return error;
319 }
320 
321 /*
322  * Compare two free space extents by length and then block number.  We want
323  * to sort first in order of increasing length and then in order of increasing
324  * block number.
325  */
326 static int
327 xrep_cntbt_extent_cmp(
328         const void                      *a,
329         const void                      *b)
330 {
331         const struct xfs_alloc_rec_incore *ap = a;
332         const struct xfs_alloc_rec_incore *bp = b;
333 
334         if (ap->ar_blockcount > bp->ar_blockcount)
335                 return 1;
336         else if (ap->ar_blockcount < bp->ar_blockcount)
337                 return -1;
338         return xrep_bnobt_extent_cmp(a, b);
339 }
340 
341 /*
342  * Sort the free extents by length so so that we can put the records into the
343  * cntbt in the correct order.  Don't let userspace kill us if we're resorting
344  * after allocating btree blocks.
345  */
346 STATIC int
347 xrep_cntbt_sort_records(
348         struct xrep_abt                 *ra,
349         bool                            is_resort)
350 {
351         return xfarray_sort(ra->free_records, xrep_cntbt_extent_cmp,
352                         is_resort ? 0 : XFARRAY_SORT_KILLABLE);
353 }
354 
/*
 * Iterate all reverse mappings to find (1) the gaps between rmap records (all
 * unowned space), (2) the OWN_AG extents (which encompass the free space
 * btrees, the rmapbt, and the agfl), (3) the rmapbt blocks, and (4) the AGFL
 * blocks.  The free space is (1) + (2) - (3) - (4).
 */
STATIC int
xrep_abt_find_freespace(
	struct xrep_abt		*ra)
{
	struct xfs_scrub	*sc = ra->sc;
	struct xfs_mount	*mp = sc->mp;
	struct xfs_agf		*agf = sc->sa.agf_bp->b_addr;
	struct xfs_buf		*agfl_bp;
	xfs_agblock_t		agend;
	int			error;

	xagb_bitmap_init(&ra->not_allocbt_blocks);

	/* Set up AG btree cursors so the walks can cross-reference. */
	xrep_ag_btcur_init(sc, &sc->sa);

	/*
	 * Iterate all the reverse mappings to find gaps in the physical
	 * mappings, all the OWN_AG blocks, and all the rmapbt extents.
	 */
	error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_abt_walk_rmap, ra);
	if (error)
		goto err;

	/* Insert a record for space between the last rmap and EOAG. */
	agend = be32_to_cpu(agf->agf_length);
	if (ra->next_agbno < agend) {
		error = xrep_abt_stash(ra, agend);
		if (error)
			goto err;
	}

	/* Collect all the AGFL blocks. */
	error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
	if (error)
		goto err;

	error = xfs_agfl_walk(mp, agf, agfl_bp, xrep_abt_walk_agfl, ra);
	if (error)
		goto err_agfl;

	/* Compute the old bnobt/cntbt blocks. */
	error = xagb_bitmap_disunion(&ra->old_allocbt_blocks,
			&ra->not_allocbt_blocks);
	if (error)
		goto err_agfl;

	ra->nr_real_records = xfarray_length(ra->free_records);
	/* Fall through into cleanup; error is zero here. */
err_agfl:
	xfs_trans_brelse(sc->tp, agfl_bp);
err:
	xchk_ag_btcur_free(&sc->sa);
	xagb_bitmap_destroy(&ra->not_allocbt_blocks);
	return error;
}
415 
/*
 * We're going to use the observed free space records to reserve blocks for the
 * new free space btrees, so we play an iterative game where we try to converge
 * on the number of blocks we need:
 *
 * 1. Estimate how many blocks we'll need to store the records.
 * 2. If the first free record has more blocks than we need, we're done.
 *    We will have to re-sort the records prior to building the cntbt.
 * 3. If that record has exactly the number of blocks we need, null out the
 *    record.  We're done.
 * 4. Otherwise, we still need more blocks.  Null out the record, subtract its
 *    length from the number of blocks we need, and go back to step 1.
 *
 * Fortunately, we don't have to do any transaction work to play this game, so
 * we don't have to tear down the staging cursors.
 */
STATIC int
xrep_abt_reserve_space(
	struct xrep_abt		*ra,
	struct xfs_btree_cur	*bno_cur,
	struct xfs_btree_cur	*cnt_cur,
	bool			*needs_resort)
{
	struct xfs_scrub	*sc = ra->sc;
	xfarray_idx_t		record_nr;
	unsigned int		allocated = 0;
	int			error = 0;

	/*
	 * Consume records from the tail of the array, which (after the cntbt
	 * sort) holds the longest extents.
	 */
	record_nr = xfarray_length(ra->free_records) - 1;
	do {
		struct xfs_alloc_rec_incore arec;
		uint64_t		required;
		unsigned int		desired;
		unsigned int		len;

		/* Compute how many blocks we'll need. */
		error = xfs_btree_bload_compute_geometry(cnt_cur,
				&ra->new_cntbt.bload, ra->nr_real_records);
		if (error)
			break;

		error = xfs_btree_bload_compute_geometry(bno_cur,
				&ra->new_bnobt.bload, ra->nr_real_records);
		if (error)
			break;

		/* How many btree blocks do we need to store all records? */
		required = ra->new_bnobt.bload.nr_blocks +
			   ra->new_cntbt.bload.nr_blocks;
		ASSERT(required < INT_MAX);

		/* If we've reserved enough blocks, we're done. */
		if (allocated >= required)
			break;

		desired = required - allocated;

		/* We need space but there's none left; bye! */
		if (ra->nr_real_records == 0) {
			error = -ENOSPC;
			break;
		}

		/* Grab the record at the tail of the list. */
		error = xfarray_load(ra->free_records, record_nr, &arec);
		if (error)
			break;

		ASSERT(arec.ar_blockcount <= UINT_MAX);
		len = min_t(unsigned int, arec.ar_blockcount, desired);

		trace_xrep_newbt_alloc_ag_blocks(sc->mp, sc->sa.pag->pag_agno,
				arec.ar_startblock, len, XFS_RMAP_OWN_AG);

		/* Both btrees draw from the new_bnobt reservation list. */
		error = xrep_newbt_add_extent(&ra->new_bnobt, sc->sa.pag,
				arec.ar_startblock, len);
		if (error)
			break;
		allocated += len;
		ra->nr_blocks -= len;

		if (arec.ar_blockcount > desired) {
			/*
			 * Record has more space than we need.  The number of
			 * free records doesn't change, so shrink the free
			 * record, inform the caller that the records are no
			 * longer sorted by length, and exit.
			 */
			arec.ar_startblock += desired;
			arec.ar_blockcount -= desired;
			error = xfarray_store(ra->free_records, record_nr,
					&arec);
			if (error)
				break;

			*needs_resort = true;
			return 0;
		}

		/*
		 * We're going to use up the entire record, so unset it and
		 * move on to the next one.  This changes the number of free
		 * records (but doesn't break the sorting order), so we must
		 * go around the loop once more to re-run _bload_init.
		 */
		error = xfarray_unset(ra->free_records, record_nr);
		if (error)
			break;
		ra->nr_real_records--;
		record_nr--;
	} while (1);

	return error;
}
530 
/*
 * Dispose of a single block reservation: add a deferred OWN_AG rmap for the
 * blocks that were consumed by the new btrees, and return any unused blocks
 * to the free space btrees.
 */
STATIC int
xrep_abt_dispose_one(
	struct xrep_abt		*ra,
	struct xrep_newbt_resv	*resv)
{
	struct xfs_scrub	*sc = ra->sc;
	struct xfs_perag	*pag = sc->sa.pag;
	xfs_agblock_t		free_agbno = resv->agbno + resv->used;
	xfs_extlen_t		free_aglen = resv->len - resv->used;
	int			error;

	/* The reservation should belong to the AG under repair. */
	ASSERT(pag == resv->pag);

	/* Add a deferred rmap for each extent we used. */
	if (resv->used > 0)
		xfs_rmap_alloc_extent(sc->tp, pag->pag_agno, resv->agbno,
				resv->used, XFS_RMAP_OWN_AG);

	/*
	 * For each reserved btree block we didn't use, add it to the free
	 * space btree.  We didn't touch fdblocks when we reserved them, so
	 * we don't touch it now.
	 */
	if (free_aglen == 0)
		return 0;

	trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno,
			free_aglen, ra->new_bnobt.oinfo.oi_owner);

	error = __xfs_free_extent(sc->tp, resv->pag, free_agbno, free_aglen,
			&ra->new_bnobt.oinfo, XFS_AG_RESV_IGNORE, true);
	if (error)
		return error;

	/* Finish the deferred work before handling the next reservation. */
	return xrep_defer_finish(sc);
}
567 
568 /*
569  * Deal with all the space we reserved.  Blocks that were allocated for the
570  * free space btrees need to have a (deferred) rmap added for the OWN_AG
571  * allocation, and blocks that didn't get used can be freed via the usual
572  * (deferred) means.
573  */
574 STATIC void
575 xrep_abt_dispose_reservations(
576         struct xrep_abt         *ra,
577         int                     error)
578 {
579         struct xrep_newbt_resv  *resv, *n;
580 
581         if (error)
582                 goto junkit;
583 
584         list_for_each_entry_safe(resv, n, &ra->new_bnobt.resv_list, list) {
585                 error = xrep_abt_dispose_one(ra, resv);
586                 if (error)
587                         goto junkit;
588         }
589 
590 junkit:
591         list_for_each_entry_safe(resv, n, &ra->new_bnobt.resv_list, list) {
592                 xfs_perag_put(resv->pag);
593                 list_del(&resv->list);
594                 kfree(resv);
595         }
596 
597         xrep_newbt_cancel(&ra->new_bnobt);
598         xrep_newbt_cancel(&ra->new_cntbt);
599 }
600 
/* Retrieve free space data for bulk load. */
STATIC int
xrep_abt_get_records(
	struct xfs_btree_cur		*cur,
	unsigned int			idx,
	struct xfs_btree_block		*block,
	unsigned int			nr_wanted,
	void				*priv)
{
	struct xfs_alloc_rec_incore	*arec = &cur->bc_rec.a;
	struct xrep_abt			*ra = priv;
	union xfs_btree_rec		*block_rec;
	unsigned int			loaded;
	int				error;

	for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
		/* Pull the next staged record into the cursor's scratchpad. */
		error = xfarray_load_next(ra->free_records, &ra->array_cur,
				arec);
		if (error)
			return error;

		/* Track the longest extent seen; used for agf_longest. */
		ra->longest = max(ra->longest, arec->ar_blockcount);

		/* Copy the record into the new btree block. */
		block_rec = xfs_btree_rec_addr(cur, idx, block);
		cur->bc_ops->init_rec_from_cur(cur, block_rec);
	}

	return loaded;
}
630 
631 /* Feed one of the new btree blocks to the bulk loader. */
632 STATIC int
633 xrep_abt_claim_block(
634         struct xfs_btree_cur    *cur,
635         union xfs_btree_ptr     *ptr,
636         void                    *priv)
637 {
638         struct xrep_abt         *ra = priv;
639 
640         return xrep_newbt_claim_block(cur, &ra->new_bnobt, ptr);
641 }
642 
/*
 * Reset the AGF counters to reflect the free space btrees that we just
 * rebuilt, then reinitialize the per-AG data.
 */
STATIC int
xrep_abt_reset_counters(
	struct xrep_abt		*ra)
{
	struct xfs_scrub	*sc = ra->sc;
	struct xfs_perag	*pag = sc->sa.pag;
	struct xfs_agf		*agf = sc->sa.agf_bp->b_addr;
	unsigned int		freesp_btreeblks = 0;

	/*
	 * Compute the contribution to agf_btreeblks for the new free space
	 * btrees.  This is the computed btree size minus anything we didn't
	 * use.  Each btree contributes nr_blocks - 1, mirroring the
	 * agf_rmap_blocks - 1 term below.
	 */
	freesp_btreeblks += ra->new_bnobt.bload.nr_blocks - 1;
	freesp_btreeblks += ra->new_cntbt.bload.nr_blocks - 1;

	freesp_btreeblks -= xrep_newbt_unused_blocks(&ra->new_bnobt);
	freesp_btreeblks -= xrep_newbt_unused_blocks(&ra->new_cntbt);

	/*
	 * The AGF header contains extra information related to the free space
	 * btrees, so we must update those fields here.
	 */
	agf->agf_btreeblks = cpu_to_be32(freesp_btreeblks +
				(be32_to_cpu(agf->agf_rmap_blocks) - 1));
	agf->agf_freeblks = cpu_to_be32(ra->nr_blocks);
	agf->agf_longest = cpu_to_be32(ra->longest);
	xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_BTREEBLKS |
						 XFS_AGF_LONGEST |
						 XFS_AGF_FREEBLKS);

	/*
	 * After we commit the new btree to disk, it is possible that the
	 * process to reap the old btree blocks will race with the AIL trying
	 * to checkpoint the old btree blocks into the filesystem.  If the new
	 * tree is shorter than the old one, the allocbt write verifier will
	 * fail and the AIL will shut down the filesystem.
	 *
	 * To avoid this, save the old incore btree height values as the alt
	 * height values before re-initializing the perag info from the updated
	 * AGF to capture all the new values.
	 */
	pag->pagf_repair_bno_level = pag->pagf_bno_level;
	pag->pagf_repair_cnt_level = pag->pagf_cnt_level;

	/* Reinitialize with the values we just logged. */
	return xrep_reinit_pagf(sc);
}
696 
/*
 * Use the collected free space information to stage new free space btrees.
 * If this is successful we'll return with the new btree root
 * information logged to the repair transaction but not yet committed.
 */
STATIC int
xrep_abt_build_new_trees(
	struct xrep_abt		*ra)
{
	struct xfs_scrub	*sc = ra->sc;
	struct xfs_btree_cur	*bno_cur;
	struct xfs_btree_cur	*cnt_cur;
	struct xfs_perag	*pag = sc->sa.pag;
	bool			needs_resort = false;
	int			error;

	/*
	 * Sort the free extents by length so that we can set up the free space
	 * btrees in as few extents as possible.  This reduces the amount of
	 * deferred rmap / free work we have to do at the end.
	 */
	error = xrep_cntbt_sort_records(ra, false);
	if (error)
		return error;

	/*
	 * Prepare to construct the new btree by reserving disk space for the
	 * new btree and setting up all the accounting information we'll need
	 * to root the new btree while it's under construction and before we
	 * attach it to the AG header.
	 */
	xrep_newbt_init_bare(&ra->new_bnobt, sc);
	xrep_newbt_init_bare(&ra->new_cntbt, sc);

	/* Both btrees pull records from the same staged record array. */
	ra->new_bnobt.bload.get_records = xrep_abt_get_records;
	ra->new_cntbt.bload.get_records = xrep_abt_get_records;

	ra->new_bnobt.bload.claim_block = xrep_abt_claim_block;
	ra->new_cntbt.bload.claim_block = xrep_abt_claim_block;

	/* Allocate cursors for the staged btrees. */
	bno_cur = xfs_bnobt_init_cursor(sc->mp, NULL, NULL, pag);
	xfs_btree_stage_afakeroot(bno_cur, &ra->new_bnobt.afake);

	cnt_cur = xfs_cntbt_init_cursor(sc->mp, NULL, NULL, pag);
	xfs_btree_stage_afakeroot(cnt_cur, &ra->new_cntbt.afake);

	/* Last chance to abort before we start committing fixes. */
	if (xchk_should_terminate(sc, &error))
		goto err_cur;

	/* Reserve the space we'll need for the new btrees. */
	error = xrep_abt_reserve_space(ra, bno_cur, cnt_cur, &needs_resort);
	if (error)
		goto err_cur;

	/*
	 * If we need to re-sort the free extents by length, do so so that we
	 * can put the records into the cntbt in the correct order.
	 */
	if (needs_resort) {
		error = xrep_cntbt_sort_records(ra, needs_resort);
		if (error)
			goto err_cur;
	}

	/*
	 * Due to btree slack factors, it's possible for a new btree to be one
	 * level taller than the old btree.  Update the alternate incore btree
	 * height so that we don't trip the verifiers when writing the new
	 * btree blocks to disk.
	 */
	pag->pagf_repair_bno_level = ra->new_bnobt.bload.btree_height;
	pag->pagf_repair_cnt_level = ra->new_cntbt.bload.btree_height;

	/* Load the free space by length tree. */
	ra->array_cur = XFARRAY_CURSOR_INIT;
	ra->longest = 0;
	error = xfs_btree_bload(cnt_cur, &ra->new_cntbt.bload, ra);
	if (error)
		goto err_levels;

	/* Re-sort by block number for the bnobt load. */
	error = xrep_bnobt_sort_records(ra);
	if (error)
		goto err_levels;

	/* Load the free space by block number tree. */
	ra->array_cur = XFARRAY_CURSOR_INIT;
	error = xfs_btree_bload(bno_cur, &ra->new_bnobt.bload, ra);
	if (error)
		goto err_levels;

	/*
	 * Install the new btrees in the AG header.  After this point the old
	 * btrees are no longer accessible and the new trees are live.
	 */
	xfs_allocbt_commit_staged_btree(bno_cur, sc->tp, sc->sa.agf_bp);
	xfs_btree_del_cursor(bno_cur, 0);
	xfs_allocbt_commit_staged_btree(cnt_cur, sc->tp, sc->sa.agf_bp);
	xfs_btree_del_cursor(cnt_cur, 0);

	/* Reset the AGF counters now that we've changed the btree shape. */
	error = xrep_abt_reset_counters(ra);
	if (error)
		goto err_newbt;

	/* Dispose of any unused blocks and the accounting information. */
	xrep_abt_dispose_reservations(ra, error);

	return xrep_roll_ag_trans(sc);

err_levels:
	/* Undo the alternate height overrides set above. */
	pag->pagf_repair_bno_level = 0;
	pag->pagf_repair_cnt_level = 0;
err_cur:
	xfs_btree_del_cursor(cnt_cur, error);
	xfs_btree_del_cursor(bno_cur, error);
err_newbt:
	xrep_abt_dispose_reservations(ra, error);
	return error;
}
818 
819 /*
820  * Now that we've logged the roots of the new btrees, invalidate all of the
821  * old blocks and free them.
822  */
823 STATIC int
824 xrep_abt_remove_old_trees(
825         struct xrep_abt         *ra)
826 {
827         struct xfs_perag        *pag = ra->sc->sa.pag;
828         int                     error;
829 
830         /* Free the old btree blocks if they're not in use. */
831         error = xrep_reap_agblocks(ra->sc, &ra->old_allocbt_blocks,
832                         &XFS_RMAP_OINFO_AG, XFS_AG_RESV_IGNORE);
833         if (error)
834                 return error;
835 
836         /*
837          * Now that we've zapped all the old allocbt blocks we can turn off
838          * the alternate height mechanism.
839          */
840         pag->pagf_repair_bno_level = 0;
841         pag->pagf_repair_cnt_level = 0;
842         return 0;
843 }
844 
845 /* Repair the freespace btrees for some AG. */
846 int
847 xrep_allocbt(
848         struct xfs_scrub        *sc)
849 {
850         struct xrep_abt         *ra;
851         struct xfs_mount        *mp = sc->mp;
852         char                    *descr;
853         int                     error;
854 
855         /* We require the rmapbt to rebuild anything. */
856         if (!xfs_has_rmapbt(mp))
857                 return -EOPNOTSUPP;
858 
859         ra = kzalloc(sizeof(struct xrep_abt), XCHK_GFP_FLAGS);
860         if (!ra)
861                 return -ENOMEM;
862         ra->sc = sc;
863 
864         /* We rebuild both data structures. */
865         sc->sick_mask = XFS_SICK_AG_BNOBT | XFS_SICK_AG_CNTBT;
866 
867         /*
868          * Make sure the busy extent list is clear because we can't put extents
869          * on there twice.  In theory we cleared this before we started, but
870          * let's not risk the filesystem.
871          */
872         if (!xfs_extent_busy_list_empty(sc->sa.pag)) {
873                 error = -EDEADLOCK;
874                 goto out_ra;
875         }
876 
877         /* Set up enough storage to handle maximally fragmented free space. */
878         descr = xchk_xfile_ag_descr(sc, "free space records");
879         error = xfarray_create(descr, mp->m_sb.sb_agblocks / 2,
880                         sizeof(struct xfs_alloc_rec_incore),
881                         &ra->free_records);
882         kfree(descr);
883         if (error)
884                 goto out_ra;
885 
886         /* Collect the free space data and find the old btree blocks. */
887         xagb_bitmap_init(&ra->old_allocbt_blocks);
888         error = xrep_abt_find_freespace(ra);
889         if (error)
890                 goto out_bitmap;
891 
892         /* Rebuild the free space information. */
893         error = xrep_abt_build_new_trees(ra);
894         if (error)
895                 goto out_bitmap;
896 
897         /* Kill the old trees. */
898         error = xrep_abt_remove_old_trees(ra);
899         if (error)
900                 goto out_bitmap;
901 
902 out_bitmap:
903         xagb_bitmap_destroy(&ra->old_allocbt_blocks);
904         xfarray_destroy(ra->free_records);
905 out_ra:
906         kfree(ra);
907         return error;
908 }
909 
910 /* Make sure both btrees are ok after we've rebuilt them. */
911 int
912 xrep_revalidate_allocbt(
913         struct xfs_scrub        *sc)
914 {
915         __u32                   old_type = sc->sm->sm_type;
916         int                     error;
917 
918         /*
919          * We must update sm_type temporarily so that the tree-to-tree cross
920          * reference checks will work in the correct direction, and also so
921          * that tracing will report correctly if there are more errors.
922          */
923         sc->sm->sm_type = XFS_SCRUB_TYPE_BNOBT;
924         error = xchk_allocbt(sc);
925         if (error)
926                 goto out;
927 
928         sc->sm->sm_type = XFS_SCRUB_TYPE_CNTBT;
929         error = xchk_allocbt(sc);
930 out:
931         sc->sm->sm_type = old_type;
932         return error;
933 }
934 

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

kernel.org | git.kernel.org | LWN.net | Project Home | SVN repository | Mail admin

Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.

sflogo.php