~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

TOMOYO Linux Cross Reference
Linux/fs/xfs/scrub/dir_repair.c

Version: ~ [ linux-6.11.5 ] ~ [ linux-6.10.14 ] ~ [ linux-6.9.12 ] ~ [ linux-6.8.12 ] ~ [ linux-6.7.12 ] ~ [ linux-6.6.58 ] ~ [ linux-6.5.13 ] ~ [ linux-6.4.16 ] ~ [ linux-6.3.13 ] ~ [ linux-6.2.16 ] ~ [ linux-6.1.114 ] ~ [ linux-6.0.19 ] ~ [ linux-5.19.17 ] ~ [ linux-5.18.19 ] ~ [ linux-5.17.15 ] ~ [ linux-5.16.20 ] ~ [ linux-5.15.169 ] ~ [ linux-5.14.21 ] ~ [ linux-5.13.19 ] ~ [ linux-5.12.19 ] ~ [ linux-5.11.22 ] ~ [ linux-5.10.228 ] ~ [ linux-5.9.16 ] ~ [ linux-5.8.18 ] ~ [ linux-5.7.19 ] ~ [ linux-5.6.19 ] ~ [ linux-5.5.19 ] ~ [ linux-5.4.284 ] ~ [ linux-5.3.18 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.322 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.336 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.337 ] ~ [ linux-4.4.302 ] ~ [ linux-3.10.108 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.9 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 // SPDX-License-Identifier: GPL-2.0-or-later
  2 /*
  3  * Copyright (c) 2020-2024 Oracle.  All Rights Reserved.
  4  * Author: Darrick J. Wong <djwong@kernel.org>
  5  */
  6 #include "xfs.h"
  7 #include "xfs_fs.h"
  8 #include "xfs_shared.h"
  9 #include "xfs_format.h"
 10 #include "xfs_trans_resv.h"
 11 #include "xfs_mount.h"
 12 #include "xfs_defer.h"
 13 #include "xfs_bit.h"
 14 #include "xfs_log_format.h"
 15 #include "xfs_trans.h"
 16 #include "xfs_sb.h"
 17 #include "xfs_inode.h"
 18 #include "xfs_icache.h"
 19 #include "xfs_da_format.h"
 20 #include "xfs_da_btree.h"
 21 #include "xfs_dir2.h"
 22 #include "xfs_dir2_priv.h"
 23 #include "xfs_bmap.h"
 24 #include "xfs_quota.h"
 25 #include "xfs_bmap_btree.h"
 26 #include "xfs_trans_space.h"
 27 #include "xfs_bmap_util.h"
 28 #include "xfs_exchmaps.h"
 29 #include "xfs_exchrange.h"
 30 #include "xfs_ag.h"
 31 #include "xfs_parent.h"
 32 #include "scrub/xfs_scrub.h"
 33 #include "scrub/scrub.h"
 34 #include "scrub/common.h"
 35 #include "scrub/trace.h"
 36 #include "scrub/repair.h"
 37 #include "scrub/tempfile.h"
 38 #include "scrub/tempexch.h"
 39 #include "scrub/xfile.h"
 40 #include "scrub/xfarray.h"
 41 #include "scrub/xfblob.h"
 42 #include "scrub/iscan.h"
 43 #include "scrub/readdir.h"
 44 #include "scrub/reap.h"
 45 #include "scrub/findparent.h"
 46 #include "scrub/orphanage.h"
 47 #include "scrub/listxattr.h"
 48 
 49 /*
 50  * Directory Repair
 51  * ================
 52  *
 53  * We repair directories by reading the directory data blocks looking for
 54  * directory entries that look salvageable (name passes verifiers, entry points
 55  * to a valid allocated inode, etc).  Each entry worth salvaging is stashed in
 56  * memory, and the stashed entries are periodically replayed into a temporary
 57  * directory to constrain memory use.  Batching the construction of the
 58  * temporary directory in this fashion reduces lock cycling of the directory
 59  * being repaired and the temporary directory, and will later become important
 60  * for parent pointer scanning.
 61  *
 62  * If parent pointers are enabled on this filesystem, we instead reconstruct
 63  * the directory by visiting each parent pointer of each file in the filesystem
 64  * and translating the relevant parent pointer records into dirents.  In this
 65  * case, it is advantageous to stash all directory entries created from parent
 66  * pointers for a single child file before replaying them into the temporary
 67  * directory.  To save memory, the live filesystem scan reuses the findparent
 68  * fields.  Directory repair chooses either parent pointer scanning or
 69  * directory entry salvaging, but not both.
 70  *
 71  * Directory entries added to the temporary directory do not elevate the link
 72  * counts of the inodes found.  When salvaging completes, the remaining stashed
 73  * entries are replayed to the temporary directory.  An atomic mapping exchange
 74  * is used to commit the new directory blocks to the directory being repaired.
 75  * This will disrupt readdir cursors.
 76  *
 77  * Locking Issues
 78  * --------------
 79  *
 80  * If /a, /a/b, and /c are all directories, the VFS does not take i_rwsem on
 81  * /a/b for a "mv /a/b /c/" operation.  This means that only b's ILOCK protects
 82  * b's dotdot update.  This is in contrast to every other dotdot update (link,
 83  * remove, mkdir).  If the repair code drops the ILOCK, it must either
 84  * revalidate the dotdot entry or use dirent hooks to capture updates from
 85  * other threads.
 86  */
 87 
 88 /* Create a dirent in the tempdir. */
 89 #define XREP_DIRENT_ADD         (1)
 90 
 91 /* Remove a dirent from the tempdir. */
 92 #define XREP_DIRENT_REMOVE      (2)
 93 
 94 /* Directory entry to be restored in the new directory. */
 95 struct xrep_dirent {
 96         /* Cookie for retrieval of the dirent name. */
 97         xfblob_cookie           name_cookie;
 98 
 99         /* Target inode number. */
100         xfs_ino_t               ino;
101 
102         /* Length of the dirent name. */
103         uint8_t                 namelen;
104 
105         /* File type of the dirent. */
106         uint8_t                 ftype;
107 
108         /* XREP_DIRENT_{ADD,REMOVE} */
109         uint8_t                 action;
110 };
111 
112 /*
113  * Stash up to 8 pages of recovered dirent data in dir_entries and dir_names
114  * before we write them to the temp dir.
115  */
116 #define XREP_DIR_MAX_STASH_BYTES        (PAGE_SIZE * 8)
117 
118 struct xrep_dir {
119         struct xfs_scrub        *sc;
120 
121         /* Fixed-size array of xrep_dirent structures. */
122         struct xfarray          *dir_entries;
123 
124         /* Blobs containing directory entry names. */
125         struct xfblob           *dir_names;
126 
127         /* Information for exchanging data forks at the end. */
128         struct xrep_tempexch    tx;
129 
130         /* Preallocated args struct for performing dir operations */
131         struct xfs_da_args      args;
132 
133         /*
134          * Information used to scan the filesystem to find the inumber of the
135          * dotdot entry for this directory.  For directory salvaging when
136          * parent pointers are not enabled, we use the findparent_* functions
137          * on this object and access only the parent_ino field directly.
138          *
139          * When parent pointers are enabled, however, the pptr scanner uses the
140          * iscan, hooks, lock, and parent_ino fields of this object directly.
141          * @pscan.lock coordinates access to dir_entries, dir_names,
142          * parent_ino, subdirs, dirents, and args.  This reduces the memory
143          * requirements of this structure.
144          */
145         struct xrep_parent_scan_info pscan;
146 
147         /*
148          * Context information for attaching this directory to the lost+found
149          * if this directory does not have a parent.
150          */
151         struct xrep_adoption    adoption;
152 
153         /* How many subdirectories did we find? */
154         uint64_t                subdirs;
155 
156         /* How many dirents did we find? */
157         unsigned int            dirents;
158 
159         /* Should we move this directory to the orphanage? */
160         bool                    needs_adoption;
161 
162         /* Directory entry name, plus the trailing null. */
163         struct xfs_name         xname;
164         unsigned char           namebuf[MAXNAMELEN];
165 };
166 
167 /* Tear down all the incore stuff we created. */
168 static void
169 xrep_dir_teardown(
170         struct xfs_scrub        *sc)
171 {
172         struct xrep_dir         *rd = sc->buf;
173 
174         xrep_findparent_scan_teardown(&rd->pscan);
175         xfblob_destroy(rd->dir_names);
176         xfarray_destroy(rd->dir_entries);
177 }
178 
179 /* Set up for a directory repair. */
180 int
181 xrep_setup_directory(
182         struct xfs_scrub        *sc)
183 {
184         struct xrep_dir         *rd;
185         int                     error;
186 
187         xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS);
188 
189         error = xrep_orphanage_try_create(sc);
190         if (error)
191                 return error;
192 
193         error = xrep_tempfile_create(sc, S_IFDIR);
194         if (error)
195                 return error;
196 
197         rd = kvzalloc(sizeof(struct xrep_dir), XCHK_GFP_FLAGS);
198         if (!rd)
199                 return -ENOMEM;
200         rd->sc = sc;
201         rd->xname.name = rd->namebuf;
202         sc->buf = rd;
203 
204         return 0;
205 }
206 
207 /*
208  * Look up the dotdot entry and confirm that it's really the parent.
209  * Returns NULLFSINO if we don't know what to do.
210  */
211 static inline xfs_ino_t
212 xrep_dir_lookup_parent(
213         struct xrep_dir         *rd)
214 {
215         struct xfs_scrub        *sc = rd->sc;
216         xfs_ino_t               ino;
217         int                     error;
218 
219         error = xfs_dir_lookup(sc->tp, sc->ip, &xfs_name_dotdot, &ino, NULL);
220         if (error)
221                 return NULLFSINO;
222         if (!xfs_verify_dir_ino(sc->mp, ino))
223                 return NULLFSINO;
224 
225         error = xrep_findparent_confirm(sc, &ino);
226         if (error)
227                 return NULLFSINO;
228 
229         return ino;
230 }
231 
232 /*
233  * Look up '..' in the dentry cache and confirm that it's really the parent.
234  * Returns NULLFSINO if the dcache misses or if the hit is implausible.
235  */
236 static inline xfs_ino_t
237 xrep_dir_dcache_parent(
238         struct xrep_dir         *rd)
239 {
240         struct xfs_scrub        *sc = rd->sc;
241         xfs_ino_t               parent_ino;
242         int                     error;
243 
244         parent_ino = xrep_findparent_from_dcache(sc);
245         if (parent_ino == NULLFSINO)
246                 return parent_ino;
247 
248         error = xrep_findparent_confirm(sc, &parent_ino);
249         if (error)
250                 return NULLFSINO;
251 
252         return parent_ino;
253 }
254 
255 /* Try to find the parent of the directory being repaired. */
256 STATIC int
257 xrep_dir_find_parent(
258         struct xrep_dir         *rd)
259 {
260         xfs_ino_t               ino;
261 
262         ino = xrep_findparent_self_reference(rd->sc);
263         if (ino != NULLFSINO) {
264                 xrep_findparent_scan_finish_early(&rd->pscan, ino);
265                 return 0;
266         }
267 
268         ino = xrep_dir_dcache_parent(rd);
269         if (ino != NULLFSINO) {
270                 xrep_findparent_scan_finish_early(&rd->pscan, ino);
271                 return 0;
272         }
273 
274         ino = xrep_dir_lookup_parent(rd);
275         if (ino != NULLFSINO) {
276                 xrep_findparent_scan_finish_early(&rd->pscan, ino);
277                 return 0;
278         }
279 
280         /*
281          * A full filesystem scan is the last resort.  On a busy filesystem,
282          * the scan can fail with -EBUSY if we cannot grab IOLOCKs.  That means
283          * that we don't know what who the parent is, so we should return to
284          * userspace.
285          */
286         return xrep_findparent_scan(&rd->pscan);
287 }
288 
289 /*
290  * Decide if we want to salvage this entry.  We don't bother with oversized
291  * names or the dot entry.
292  */
293 STATIC int
294 xrep_dir_want_salvage(
295         struct xrep_dir         *rd,
296         const char              *name,
297         int                     namelen,
298         xfs_ino_t               ino)
299 {
300         struct xfs_mount        *mp = rd->sc->mp;
301 
302         /* No pointers to ourselves or to garbage. */
303         if (ino == rd->sc->ip->i_ino)
304                 return false;
305         if (!xfs_verify_dir_ino(mp, ino))
306                 return false;
307 
308         /* No weird looking names or dot entries. */
309         if (namelen >= MAXNAMELEN || namelen <= 0)
310                 return false;
311         if (namelen == 1 && name[0] == '.')
312                 return false;
313         if (!xfs_dir2_namecheck(name, namelen))
314                 return false;
315 
316         return true;
317 }
318 
319 /*
320  * Remember that we want to create a dirent in the tempdir.  These stashed
321  * actions will be replayed later.
322  */
323 STATIC int
324 xrep_dir_stash_createname(
325         struct xrep_dir         *rd,
326         const struct xfs_name   *name,
327         xfs_ino_t               ino)
328 {
329         struct xrep_dirent      dirent = {
330                 .action         = XREP_DIRENT_ADD,
331                 .ino            = ino,
332                 .namelen        = name->len,
333                 .ftype          = name->type,
334         };
335         int                     error;
336 
337         trace_xrep_dir_stash_createname(rd->sc->tempip, name, ino);
338 
339         error = xfblob_storename(rd->dir_names, &dirent.name_cookie, name);
340         if (error)
341                 return error;
342 
343         return xfarray_append(rd->dir_entries, &dirent);
344 }
345 
346 /*
347  * Remember that we want to remove a dirent from the tempdir.  These stashed
348  * actions will be replayed later.
349  */
350 STATIC int
351 xrep_dir_stash_removename(
352         struct xrep_dir         *rd,
353         const struct xfs_name   *name,
354         xfs_ino_t               ino)
355 {
356         struct xrep_dirent      dirent = {
357                 .action         = XREP_DIRENT_REMOVE,
358                 .ino            = ino,
359                 .namelen        = name->len,
360                 .ftype          = name->type,
361         };
362         int                     error;
363 
364         trace_xrep_dir_stash_removename(rd->sc->tempip, name, ino);
365 
366         error = xfblob_storename(rd->dir_names, &dirent.name_cookie, name);
367         if (error)
368                 return error;
369 
370         return xfarray_append(rd->dir_entries, &dirent);
371 }
372 
373 /* Allocate an in-core record to hold entries while we rebuild the dir data. */
374 STATIC int
375 xrep_dir_salvage_entry(
376         struct xrep_dir         *rd,
377         unsigned char           *name,
378         unsigned int            namelen,
379         xfs_ino_t               ino)
380 {
381         struct xfs_name         xname = {
382                 .name           = name,
383         };
384         struct xfs_scrub        *sc = rd->sc;
385         struct xfs_inode        *ip;
386         unsigned int            i = 0;
387         int                     error = 0;
388 
389         if (xchk_should_terminate(sc, &error))
390                 return error;
391 
392         /*
393          * Truncate the name to the first character that would trip namecheck.
394          * If we no longer have a name after that, ignore this entry.
395          */
396         while (i < namelen && name[i] != 0 && name[i] != '/')
397                 i++;
398         if (i == 0)
399                 return 0;
400         xname.len = i;
401 
402         /* Ignore '..' entries; we already picked the new parent. */
403         if (xname.len == 2 && name[0] == '.' && name[1] == '.') {
404                 trace_xrep_dir_salvaged_parent(sc->ip, ino);
405                 return 0;
406         }
407 
408         trace_xrep_dir_salvage_entry(sc->ip, &xname, ino);
409 
410         /*
411          * Compute the ftype or dump the entry if we can't.  We don't lock the
412          * inode because inodes can't change type while we have a reference.
413          */
414         error = xchk_iget(sc, ino, &ip);
415         if (error)
416                 return 0;
417 
418         xname.type = xfs_mode_to_ftype(VFS_I(ip)->i_mode);
419         xchk_irele(sc, ip);
420 
421         return xrep_dir_stash_createname(rd, &xname, ino);
422 }
423 
424 /* Record a shortform directory entry for later reinsertion. */
425 STATIC int
426 xrep_dir_salvage_sf_entry(
427         struct xrep_dir                 *rd,
428         struct xfs_dir2_sf_hdr          *sfp,
429         struct xfs_dir2_sf_entry        *sfep)
430 {
431         xfs_ino_t                       ino;
432 
433         ino = xfs_dir2_sf_get_ino(rd->sc->mp, sfp, sfep);
434         if (!xrep_dir_want_salvage(rd, sfep->name, sfep->namelen, ino))
435                 return 0;
436 
437         return xrep_dir_salvage_entry(rd, sfep->name, sfep->namelen, ino);
438 }
439 
440 /* Record a regular directory entry for later reinsertion. */
441 STATIC int
442 xrep_dir_salvage_data_entry(
443         struct xrep_dir                 *rd,
444         struct xfs_dir2_data_entry      *dep)
445 {
446         xfs_ino_t                       ino;
447 
448         ino = be64_to_cpu(dep->inumber);
449         if (!xrep_dir_want_salvage(rd, dep->name, dep->namelen, ino))
450                 return 0;
451 
452         return xrep_dir_salvage_entry(rd, dep->name, dep->namelen, ino);
453 }
454 
455 /* Try to recover block/data format directory entries. */
456 STATIC int
457 xrep_dir_recover_data(
458         struct xrep_dir         *rd,
459         struct xfs_buf          *bp)
460 {
461         struct xfs_da_geometry  *geo = rd->sc->mp->m_dir_geo;
462         unsigned int            offset;
463         unsigned int            end;
464         int                     error = 0;
465 
466         /*
467          * Loop over the data portion of the block.
468          * Each object is a real entry (dep) or an unused one (dup).
469          */
470         offset = geo->data_entry_offset;
471         end = min_t(unsigned int, BBTOB(bp->b_length),
472                         xfs_dir3_data_end_offset(geo, bp->b_addr));
473 
474         while (offset < end) {
475                 struct xfs_dir2_data_unused     *dup = bp->b_addr + offset;
476                 struct xfs_dir2_data_entry      *dep = bp->b_addr + offset;
477 
478                 if (xchk_should_terminate(rd->sc, &error))
479                         return error;
480 
481                 /* Skip unused entries. */
482                 if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
483                         offset += be16_to_cpu(dup->length);
484                         continue;
485                 }
486 
487                 /* Don't walk off the end of the block. */
488                 offset += xfs_dir2_data_entsize(rd->sc->mp, dep->namelen);
489                 if (offset > end)
490                         break;
491 
492                 /* Ok, let's save this entry. */
493                 error = xrep_dir_salvage_data_entry(rd, dep);
494                 if (error)
495                         return error;
496 
497         }
498 
499         return 0;
500 }
501 
502 /* Try to recover shortform directory entries. */
503 STATIC int
504 xrep_dir_recover_sf(
505         struct xrep_dir                 *rd)
506 {
507         struct xfs_dir2_sf_hdr          *hdr;
508         struct xfs_dir2_sf_entry        *sfep;
509         struct xfs_dir2_sf_entry        *next;
510         struct xfs_ifork                *ifp;
511         xfs_ino_t                       ino;
512         unsigned char                   *end;
513         int                             error = 0;
514 
515         ifp = xfs_ifork_ptr(rd->sc->ip, XFS_DATA_FORK);
516         hdr = ifp->if_data;
517         end = (unsigned char *)ifp->if_data + ifp->if_bytes;
518 
519         ino = xfs_dir2_sf_get_parent_ino(hdr);
520         trace_xrep_dir_salvaged_parent(rd->sc->ip, ino);
521 
522         sfep = xfs_dir2_sf_firstentry(hdr);
523         while ((unsigned char *)sfep < end) {
524                 if (xchk_should_terminate(rd->sc, &error))
525                         return error;
526 
527                 next = xfs_dir2_sf_nextentry(rd->sc->mp, hdr, sfep);
528                 if ((unsigned char *)next > end)
529                         break;
530 
531                 /* Ok, let's save this entry. */
532                 error = xrep_dir_salvage_sf_entry(rd, hdr, sfep);
533                 if (error)
534                         return error;
535 
536                 sfep = next;
537         }
538 
539         return 0;
540 }
541 
542 /*
543  * Try to figure out the format of this directory from the data fork mappings
544  * and the directory size.  If we can be reasonably sure of format, we can be
545  * more aggressive in salvaging directory entries.  On return, @magic_guess
546  * will be set to DIR3_BLOCK_MAGIC if we think this is a "block format"
547  * directory; DIR3_DATA_MAGIC if we think this is a "data format" directory,
548  * and 0 if we can't tell.
549  */
550 STATIC void
551 xrep_dir_guess_format(
552         struct xrep_dir         *rd,
553         __be32                  *magic_guess)
554 {
555         struct xfs_inode        *dp = rd->sc->ip;
556         struct xfs_mount        *mp = rd->sc->mp;
557         struct xfs_da_geometry  *geo = mp->m_dir_geo;
558         xfs_fileoff_t           last;
559         int                     error;
560 
561         ASSERT(xfs_has_crc(mp));
562 
563         *magic_guess = 0;
564 
565         /*
566          * If there's a single directory block and the directory size is
567          * exactly one block, this has to be a single block format directory.
568          */
569         error = xfs_bmap_last_offset(dp, &last, XFS_DATA_FORK);
570         if (!error && XFS_FSB_TO_B(mp, last) == geo->blksize &&
571             dp->i_disk_size == geo->blksize) {
572                 *magic_guess = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
573                 return;
574         }
575 
576         /*
577          * If the last extent before the leaf offset matches the directory
578          * size and the directory size is larger than 1 block, this is a
579          * data format directory.
580          */
581         last = geo->leafblk;
582         error = xfs_bmap_last_before(rd->sc->tp, dp, &last, XFS_DATA_FORK);
583         if (!error &&
584             XFS_FSB_TO_B(mp, last) > geo->blksize &&
585             XFS_FSB_TO_B(mp, last) == dp->i_disk_size) {
586                 *magic_guess = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
587                 return;
588         }
589 }
590 
591 /* Recover directory entries from a specific directory block. */
592 STATIC int
593 xrep_dir_recover_dirblock(
594         struct xrep_dir         *rd,
595         __be32                  magic_guess,
596         xfs_dablk_t             dabno)
597 {
598         struct xfs_dir2_data_hdr *hdr;
599         struct xfs_buf          *bp;
600         __be32                  oldmagic;
601         int                     error;
602 
603         /*
604          * Try to read buffer.  We invalidate them in the next step so we don't
605          * bother to set a buffer type or ops.
606          */
607         error = xfs_da_read_buf(rd->sc->tp, rd->sc->ip, dabno,
608                         XFS_DABUF_MAP_HOLE_OK, &bp, XFS_DATA_FORK, NULL);
609         if (error || !bp)
610                 return error;
611 
612         hdr = bp->b_addr;
613         oldmagic = hdr->magic;
614 
615         trace_xrep_dir_recover_dirblock(rd->sc->ip, dabno,
616                         be32_to_cpu(hdr->magic), be32_to_cpu(magic_guess));
617 
618         /*
619          * If we're sure of the block's format, proceed with the salvage
620          * operation using the specified magic number.
621          */
622         if (magic_guess) {
623                 hdr->magic = magic_guess;
624                 goto recover;
625         }
626 
627         /*
628          * If we couldn't guess what type of directory this is, then we will
629          * only salvage entries from directory blocks that match the magic
630          * number and pass verifiers.
631          */
632         switch (hdr->magic) {
633         case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
634         case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
635                 if (!xrep_buf_verify_struct(bp, &xfs_dir3_block_buf_ops))
636                         goto out;
637                 if (xfs_dir3_block_header_check(bp, rd->sc->ip->i_ino) != NULL)
638                         goto out;
639                 break;
640         case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
641         case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
642                 if (!xrep_buf_verify_struct(bp, &xfs_dir3_data_buf_ops))
643                         goto out;
644                 if (xfs_dir3_data_header_check(bp, rd->sc->ip->i_ino) != NULL)
645                         goto out;
646                 break;
647         default:
648                 goto out;
649         }
650 
651 recover:
652         error = xrep_dir_recover_data(rd, bp);
653 
654 out:
655         hdr->magic = oldmagic;
656         xfs_trans_brelse(rd->sc->tp, bp);
657         return error;
658 }
659 
660 static inline void
661 xrep_dir_init_args(
662         struct xrep_dir         *rd,
663         struct xfs_inode        *dp,
664         const struct xfs_name   *name)
665 {
666         memset(&rd->args, 0, sizeof(struct xfs_da_args));
667         rd->args.geo = rd->sc->mp->m_dir_geo;
668         rd->args.whichfork = XFS_DATA_FORK;
669         rd->args.owner = rd->sc->ip->i_ino;
670         rd->args.trans = rd->sc->tp;
671         rd->args.dp = dp;
672         if (!name)
673                 return;
674         rd->args.name = name->name;
675         rd->args.namelen = name->len;
676         rd->args.filetype = name->type;
677         rd->args.hashval = xfs_dir2_hashname(rd->sc->mp, name);
678 }
679 
680 /* Replay a stashed createname into the temporary directory. */
681 STATIC int
682 xrep_dir_replay_createname(
683         struct xrep_dir         *rd,
684         const struct xfs_name   *name,
685         xfs_ino_t               inum,
686         xfs_extlen_t            total)
687 {
688         struct xfs_scrub        *sc = rd->sc;
689         struct xfs_inode        *dp = rd->sc->tempip;
690         int                     error;
691 
692         ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
693 
694         error = xfs_dir_ino_validate(sc->mp, inum);
695         if (error)
696                 return error;
697 
698         trace_xrep_dir_replay_createname(dp, name, inum);
699 
700         xrep_dir_init_args(rd, dp, name);
701         rd->args.inumber = inum;
702         rd->args.total = total;
703         rd->args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
704         return xfs_dir_createname_args(&rd->args);
705 }
706 
707 /* Replay a stashed removename onto the temporary directory. */
708 STATIC int
709 xrep_dir_replay_removename(
710         struct xrep_dir         *rd,
711         const struct xfs_name   *name,
712         xfs_extlen_t            total)
713 {
714         struct xfs_inode        *dp = rd->args.dp;
715 
716         ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
717 
718         xrep_dir_init_args(rd, dp, name);
719         rd->args.op_flags = 0;
720         rd->args.total = total;
721 
722         trace_xrep_dir_replay_removename(dp, name, 0);
723         return xfs_dir_removename_args(&rd->args);
724 }
725 
726 /*
727  * Add this stashed incore directory entry to the temporary directory.
728  * The caller must hold the tempdir's IOLOCK, must not hold any ILOCKs, and
729  * must not be in transaction context.
730  */
731 STATIC int
732 xrep_dir_replay_update(
733         struct xrep_dir                 *rd,
734         const struct xfs_name           *xname,
735         const struct xrep_dirent        *dirent)
736 {
737         struct xfs_mount                *mp = rd->sc->mp;
738 #ifdef DEBUG
739         xfs_ino_t                       ino;
740 #endif
741         uint                            resblks;
742         int                             error;
743 
744         resblks = xfs_link_space_res(mp, xname->len);
745         error = xchk_trans_alloc(rd->sc, resblks);
746         if (error)
747                 return error;
748 
749         /* Lock the temporary directory and join it to the transaction */
750         xrep_tempfile_ilock(rd->sc);
751         xfs_trans_ijoin(rd->sc->tp, rd->sc->tempip, 0);
752 
753         switch (dirent->action) {
754         case XREP_DIRENT_ADD:
755                 /*
756                  * Create a replacement dirent in the temporary directory.
757                  * Note that _createname doesn't check for existing entries.
758                  * There shouldn't be any in the temporary dir, but we'll
759                  * verify this in debug mode.
760                  */
761 #ifdef DEBUG
762                 error = xchk_dir_lookup(rd->sc, rd->sc->tempip, xname, &ino);
763                 if (error != -ENOENT) {
764                         ASSERT(error != -ENOENT);
765                         goto out_cancel;
766                 }
767 #endif
768 
769                 error = xrep_dir_replay_createname(rd, xname, dirent->ino,
770                                 resblks);
771                 if (error)
772                         goto out_cancel;
773 
774                 if (xname->type == XFS_DIR3_FT_DIR)
775                         rd->subdirs++;
776                 rd->dirents++;
777                 break;
778         case XREP_DIRENT_REMOVE:
779                 /*
780                  * Remove a dirent from the temporary directory.  Note that
781                  * _removename doesn't check the inode target of the exist
782                  * entry.  There should be a perfect match in the temporary
783                  * dir, but we'll verify this in debug mode.
784                  */
785 #ifdef DEBUG
786                 error = xchk_dir_lookup(rd->sc, rd->sc->tempip, xname, &ino);
787                 if (error) {
788                         ASSERT(error != 0);
789                         goto out_cancel;
790                 }
791                 if (ino != dirent->ino) {
792                         ASSERT(ino == dirent->ino);
793                         error = -EIO;
794                         goto out_cancel;
795                 }
796 #endif
797 
798                 error = xrep_dir_replay_removename(rd, xname, resblks);
799                 if (error)
800                         goto out_cancel;
801 
802                 if (xname->type == XFS_DIR3_FT_DIR)
803                         rd->subdirs--;
804                 rd->dirents--;
805                 break;
806         default:
807                 ASSERT(0);
808                 error = -EIO;
809                 goto out_cancel;
810         }
811 
812         /* Commit and unlock. */
813         error = xrep_trans_commit(rd->sc);
814         if (error)
815                 return error;
816 
817         xrep_tempfile_iunlock(rd->sc);
818         return 0;
819 out_cancel:
820         xchk_trans_cancel(rd->sc);
821         xrep_tempfile_iunlock(rd->sc);
822         return error;
823 }
824 
825 /*
826  * Flush stashed incore dirent updates that have been recorded by the scanner.
827  * This is done to reduce the memory requirements of the directory rebuild,
828  * since directories can contain up to 32GB of directory data.
829  *
830  * Caller must not hold transactions or ILOCKs.  Caller must hold the tempdir
831  * IOLOCK.
832  */
833 STATIC int
834 xrep_dir_replay_updates(
835         struct xrep_dir         *rd)
836 {
837         xfarray_idx_t           array_cur;
838         int                     error;
839 
840         /* Add all the salvaged dirents to the temporary directory. */
841         mutex_lock(&rd->pscan.lock);
842         foreach_xfarray_idx(rd->dir_entries, array_cur) {
843                 struct xrep_dirent      dirent;
844 
845                 error = xfarray_load(rd->dir_entries, array_cur, &dirent);
846                 if (error)
847                         goto out_unlock;
848 
849                 error = xfblob_loadname(rd->dir_names, dirent.name_cookie,
850                                 &rd->xname, dirent.namelen);
851                 if (error)
852                         goto out_unlock;
853                 rd->xname.type = dirent.ftype;
854                 mutex_unlock(&rd->pscan.lock);
855 
856                 error = xrep_dir_replay_update(rd, &rd->xname, &dirent);
857                 if (error)
858                         return error;
859                 mutex_lock(&rd->pscan.lock);
860         }
861 
862         /* Empty out both arrays now that we've added the entries. */
863         xfarray_truncate(rd->dir_entries);
864         xfblob_truncate(rd->dir_names);
865         mutex_unlock(&rd->pscan.lock);
866         return 0;
867 out_unlock:
868         mutex_unlock(&rd->pscan.lock);
869         return error;
870 }
871 
872 /*
873  * Periodically flush stashed directory entries to the temporary dir.  This
874  * is done to reduce the memory requirements of the directory rebuild, since
875  * directories can contain up to 32GB of directory data.
876  */
877 STATIC int
878 xrep_dir_flush_stashed(
879         struct xrep_dir         *rd)
880 {
881         int                     error;
882 
883         /*
884          * Entering this function, the scrub context has a reference to the
885          * inode being repaired, the temporary file, and a scrub transaction
886          * that we use during dirent salvaging to avoid livelocking if there
887          * are cycles in the directory structures.  We hold ILOCK_EXCL on both
888          * the inode being repaired and the temporary file, though they are
889          * not ijoined to the scrub transaction.
890          *
891          * To constrain kernel memory use, we occasionally write salvaged
892          * dirents from the xfarray and xfblob structures into the temporary
893          * directory in preparation for exchanging the directory structures at
894          * the end.  Updating the temporary file requires a transaction, so we
895          * commit the scrub transaction and drop the two ILOCKs so that
896          * we can allocate whatever transaction we want.
897          *
898          * We still hold IOLOCK_EXCL on the inode being repaired, which
899          * prevents anyone from accessing the damaged directory data while we
900          * repair it.
901          */
902         error = xrep_trans_commit(rd->sc);
903         if (error)
904                 return error;
905         xchk_iunlock(rd->sc, XFS_ILOCK_EXCL);
906 
907         /*
908          * Take the IOLOCK of the temporary file while we modify dirents.  This
909          * isn't strictly required because the temporary file is never revealed
910          * to userspace, but we follow the same locking rules.  We still hold
911          * sc->ip's IOLOCK.
912          */
913         error = xrep_tempfile_iolock_polled(rd->sc);
914         if (error)
915                 return error;
916 
917         /* Write to the tempdir all the updates that we've stashed. */
918         error = xrep_dir_replay_updates(rd);
919         xrep_tempfile_iounlock(rd->sc);
920         if (error)
921                 return error;
922 
923         /*
924          * Recreate the salvage transaction and relock the dir we're salvaging.
925          */
926         error = xchk_trans_alloc(rd->sc, 0);
927         if (error)
928                 return error;
929         xchk_ilock(rd->sc, XFS_ILOCK_EXCL);
930         return 0;
931 }
932 
933 /* Decide if we've stashed too much dirent data in memory. */
934 static inline bool
935 xrep_dir_want_flush_stashed(
936         struct xrep_dir         *rd)
937 {
938         unsigned long long      bytes;
939 
940         bytes = xfarray_bytes(rd->dir_entries) + xfblob_bytes(rd->dir_names);
941         return bytes > XREP_DIR_MAX_STASH_BYTES;
942 }
943 
944 /* Extract as many directory entries as we can. */
945 STATIC int
946 xrep_dir_recover(
947         struct xrep_dir         *rd)
948 {
949         struct xfs_bmbt_irec    got;
950         struct xfs_scrub        *sc = rd->sc;
951         struct xfs_da_geometry  *geo = sc->mp->m_dir_geo;
952         xfs_fileoff_t           offset;
953         xfs_dablk_t             dabno;
954         __be32                  magic_guess;
955         int                     nmap;
956         int                     error;
957 
958         xrep_dir_guess_format(rd, &magic_guess);
959 
960         /* Iterate each directory data block in the data fork. */
961         for (offset = 0;
962              offset < geo->leafblk;
963              offset = got.br_startoff + got.br_blockcount) {
964                 nmap = 1;
965                 error = xfs_bmapi_read(sc->ip, offset, geo->leafblk - offset,
966                                 &got, &nmap, 0);
967                 if (error)
968                         return error;
969                 if (nmap != 1)
970                         return -EFSCORRUPTED;
971                 if (!xfs_bmap_is_written_extent(&got))
972                         continue;
973 
974                 for (dabno = round_up(got.br_startoff, geo->fsbcount);
975                      dabno < got.br_startoff + got.br_blockcount;
976                      dabno += geo->fsbcount) {
977                         if (xchk_should_terminate(rd->sc, &error))
978                                 return error;
979 
980                         error = xrep_dir_recover_dirblock(rd,
981                                         magic_guess, dabno);
982                         if (error)
983                                 return error;
984 
985                         /* Flush dirents to constrain memory usage. */
986                         if (xrep_dir_want_flush_stashed(rd)) {
987                                 error = xrep_dir_flush_stashed(rd);
988                                 if (error)
989                                         return error;
990                         }
991                 }
992         }
993 
994         return 0;
995 }
996 
997 /*
998  * Find all the directory entries for this inode by scraping them out of the
999  * directory leaf blocks by hand, and flushing them into the temp dir.
1000  */
1001 STATIC int
1002 xrep_dir_find_entries(
1003         struct xrep_dir         *rd)
1004 {
1005         struct xfs_inode        *dp = rd->sc->ip;
1006         int                     error;
1007 
1008         /*
1009          * Salvage directory entries from the old directory, and write them to
1010          * the temporary directory.
1011          */
1012         if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
1013                 error = xrep_dir_recover_sf(rd);
1014         } else {
1015                 error = xfs_iread_extents(rd->sc->tp, dp, XFS_DATA_FORK);
1016                 if (error)
1017                         return error;
1018 
1019                 error = xrep_dir_recover(rd);
1020         }
1021         if (error)
1022                 return error;
1023 
1024         return xrep_dir_flush_stashed(rd);
1025 }
1026 
1027 /* Scan all files in the filesystem for dirents. */
1028 STATIC int
1029 xrep_dir_salvage_entries(
1030         struct xrep_dir         *rd)
1031 {
1032         struct xfs_scrub        *sc = rd->sc;
1033         int                     error;
1034 
1035         /*
1036          * Drop the ILOCK on this directory so that we can scan for this
1037          * directory's parent.  Figure out who is going to be the parent of
1038          * this directory, then retake the ILOCK so that we can salvage
1039          * directory entries.
1040          */
1041         xchk_iunlock(sc, XFS_ILOCK_EXCL);
1042         error = xrep_dir_find_parent(rd);
1043         xchk_ilock(sc, XFS_ILOCK_EXCL);
1044         if (error)
1045                 return error;
1046 
1047         /*
1048          * Collect directory entries by parsing raw leaf blocks to salvage
1049          * whatever we can.  When we're done, free the staging memory before
1050          * exchanging the directories to reduce memory usage.
1051          */
1052         error = xrep_dir_find_entries(rd);
1053         if (error)
1054                 return error;
1055 
1056         /*
1057          * Cancel the repair transaction and drop the ILOCK so that we can
1058          * (later) use the atomic mapping exchange functions to compute the
1059          * correct block reservations and re-lock the inodes.
1060          *
1061          * We still hold IOLOCK_EXCL (aka i_rwsem) which will prevent directory
1062          * modifications, but there's nothing to prevent userspace from reading
1063          * the directory until we're ready for the exchange operation.  Reads
1064          * will return -EIO without shutting down the fs, so we're ok with
1065          * that.
1066          *
1067          * The VFS can change dotdot on us, but the findparent scan will keep
1068          * our incore parent inode up to date.  See the note on locking issues
1069          * for more details.
1070          */
1071         error = xrep_trans_commit(sc);
1072         if (error)
1073                 return error;
1074 
1075         xchk_iunlock(sc, XFS_ILOCK_EXCL);
1076         return 0;
1077 }
1078 
1079 
1080 /*
1081  * Examine a parent pointer of a file.  If it leads us back to the directory
1082  * that we're rebuilding, create an incore dirent from the parent pointer and
1083  * stash it.
1084  */
1085 STATIC int
1086 xrep_dir_scan_pptr(
1087         struct xfs_scrub                *sc,
1088         struct xfs_inode                *ip,
1089         unsigned int                    attr_flags,
1090         const unsigned char             *name,
1091         unsigned int                    namelen,
1092         const void                      *value,
1093         unsigned int                    valuelen,
1094         void                            *priv)
1095 {
1096         struct xfs_name                 xname = {
1097                 .name                   = name,
1098                 .len                    = namelen,
1099                 .type                   = xfs_mode_to_ftype(VFS_I(ip)->i_mode),
1100         };
1101         xfs_ino_t                       parent_ino;
1102         uint32_t                        parent_gen;
1103         struct xrep_dir                 *rd = priv;
1104         int                             error;
1105 
1106         if (!(attr_flags & XFS_ATTR_PARENT))
1107                 return 0;
1108 
1109         /*
1110          * Ignore parent pointers that point back to a different dir, list the
1111          * wrong generation number, or are invalid.
1112          */
1113         error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value,
1114                         valuelen, &parent_ino, &parent_gen);
1115         if (error)
1116                 return error;
1117 
1118         if (parent_ino != sc->ip->i_ino ||
1119             parent_gen != VFS_I(sc->ip)->i_generation)
1120                 return 0;
1121 
1122         mutex_lock(&rd->pscan.lock);
1123         error = xrep_dir_stash_createname(rd, &xname, ip->i_ino);
1124         mutex_unlock(&rd->pscan.lock);
1125         return error;
1126 }
1127 
1128 /*
1129  * If this child dirent points to the directory being repaired, remember that
1130  * fact so that we can reset the dotdot entry if necessary.
1131  */
1132 STATIC int
1133 xrep_dir_scan_dirent(
1134         struct xfs_scrub        *sc,
1135         struct xfs_inode        *dp,
1136         xfs_dir2_dataptr_t      dapos,
1137         const struct xfs_name   *name,
1138         xfs_ino_t               ino,
1139         void                    *priv)
1140 {
1141         struct xrep_dir         *rd = priv;
1142 
1143         /* Dirent doesn't point to this directory. */
1144         if (ino != rd->sc->ip->i_ino)
1145                 return 0;
1146 
1147         /* Ignore garbage inum. */
1148         if (!xfs_verify_dir_ino(rd->sc->mp, ino))
1149                 return 0;
1150 
1151         /* No weird looking names. */
1152         if (name->len >= MAXNAMELEN || name->len <= 0)
1153                 return 0;
1154 
1155         /* Don't pick up dot or dotdot entries; we only want child dirents. */
1156         if (xfs_dir2_samename(name, &xfs_name_dotdot) ||
1157             xfs_dir2_samename(name, &xfs_name_dot))
1158                 return 0;
1159 
1160         trace_xrep_dir_stash_createname(sc->tempip, &xfs_name_dotdot,
1161                         dp->i_ino);
1162 
1163         xrep_findparent_scan_found(&rd->pscan, dp->i_ino);
1164         return 0;
1165 }
1166 
1167 /*
1168  * Decide if we want to look for child dirents or parent pointers in this file.
1169  * Skip the dir being repaired and any files being used to stage repairs.
1170  */
1171 static inline bool
1172 xrep_dir_want_scan(
1173         struct xrep_dir         *rd,
1174         const struct xfs_inode  *ip)
1175 {
1176         return ip != rd->sc->ip && !xrep_is_tempfile(ip);
1177 }
1178 
1179 /*
1180  * Take ILOCK on a file that we want to scan.
1181  *
1182  * Select ILOCK_EXCL if the file is a directory with an unloaded data bmbt or
1183  * has an unloaded attr bmbt.  Otherwise, take ILOCK_SHARED.
1184  */
1185 static inline unsigned int
1186 xrep_dir_scan_ilock(
1187         struct xrep_dir         *rd,
1188         struct xfs_inode        *ip)
1189 {
1190         uint                    lock_mode = XFS_ILOCK_SHARED;
1191 
1192         /* Need to take the shared ILOCK to advance the iscan cursor. */
1193         if (!xrep_dir_want_scan(rd, ip))
1194                 goto lock;
1195 
1196         if (S_ISDIR(VFS_I(ip)->i_mode) && xfs_need_iread_extents(&ip->i_df)) {
1197                 lock_mode = XFS_ILOCK_EXCL;
1198                 goto lock;
1199         }
1200 
1201         if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af))
1202                 lock_mode = XFS_ILOCK_EXCL;
1203 
1204 lock:
1205         xfs_ilock(ip, lock_mode);
1206         return lock_mode;
1207 }
1208 
1209 /*
1210  * Scan this file for relevant child dirents or parent pointers that point to
1211  * the directory we're rebuilding.
1212  */
1213 STATIC int
1214 xrep_dir_scan_file(
1215         struct xrep_dir         *rd,
1216         struct xfs_inode        *ip)
1217 {
1218         unsigned int            lock_mode;
1219         int                     error = 0;
1220 
1221         lock_mode = xrep_dir_scan_ilock(rd, ip);
1222 
1223         if (!xrep_dir_want_scan(rd, ip))
1224                 goto scan_done;
1225 
1226         /*
1227          * If the extended attributes look as though they has been zapped by
1228          * the inode record repair code, we cannot scan for parent pointers.
1229          */
1230         if (xchk_pptr_looks_zapped(ip)) {
1231                 error = -EBUSY;
1232                 goto scan_done;
1233         }
1234 
1235         error = xchk_xattr_walk(rd->sc, ip, xrep_dir_scan_pptr, NULL, rd);
1236         if (error)
1237                 goto scan_done;
1238 
1239         if (S_ISDIR(VFS_I(ip)->i_mode)) {
1240                 /*
1241                  * If the directory looks as though it has been zapped by the
1242                  * inode record repair code, we cannot scan for child dirents.
1243                  */
1244                 if (xchk_dir_looks_zapped(ip)) {
1245                         error = -EBUSY;
1246                         goto scan_done;
1247                 }
1248 
1249                 error = xchk_dir_walk(rd->sc, ip, xrep_dir_scan_dirent, rd);
1250                 if (error)
1251                         goto scan_done;
1252         }
1253 
1254 scan_done:
1255         xchk_iscan_mark_visited(&rd->pscan.iscan, ip);
1256         xfs_iunlock(ip, lock_mode);
1257         return error;
1258 }
1259 
1260 /*
1261  * Scan all files in the filesystem for parent pointers that we can turn into
1262  * replacement dirents, and a dirent that we can use to set the dotdot pointer.
1263  */
1264 STATIC int
1265 xrep_dir_scan_dirtree(
1266         struct xrep_dir         *rd)
1267 {
1268         struct xfs_scrub        *sc = rd->sc;
1269         struct xfs_inode        *ip;
1270         int                     error;
1271 
1272         /* Roots of directory trees are their own parents. */
1273         if (sc->ip == sc->mp->m_rootip)
1274                 xrep_findparent_scan_found(&rd->pscan, sc->ip->i_ino);
1275 
1276         /*
1277          * Filesystem scans are time consuming.  Drop the directory ILOCK and
1278          * all other resources for the duration of the scan and hope for the
1279          * best.  The live update hooks will keep our scan information up to
1280          * date even though we've dropped the locks.
1281          */
1282         xchk_trans_cancel(sc);
1283         if (sc->ilock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL))
1284                 xchk_iunlock(sc, sc->ilock_flags & (XFS_ILOCK_SHARED |
1285                                                     XFS_ILOCK_EXCL));
1286         error = xchk_trans_alloc_empty(sc);
1287         if (error)
1288                 return error;
1289 
1290         while ((error = xchk_iscan_iter(&rd->pscan.iscan, &ip)) == 1) {
1291                 bool            flush;
1292 
1293                 error = xrep_dir_scan_file(rd, ip);
1294                 xchk_irele(sc, ip);
1295                 if (error)
1296                         break;
1297 
1298                 /* Flush stashed dirent updates to constrain memory usage. */
1299                 mutex_lock(&rd->pscan.lock);
1300                 flush = xrep_dir_want_flush_stashed(rd);
1301                 mutex_unlock(&rd->pscan.lock);
1302                 if (flush) {
1303                         xchk_trans_cancel(sc);
1304 
1305                         error = xrep_tempfile_iolock_polled(sc);
1306                         if (error)
1307                                 break;
1308 
1309                         error = xrep_dir_replay_updates(rd);
1310                         xrep_tempfile_iounlock(sc);
1311                         if (error)
1312                                 break;
1313 
1314                         error = xchk_trans_alloc_empty(sc);
1315                         if (error)
1316                                 break;
1317                 }
1318 
1319                 if (xchk_should_terminate(sc, &error))
1320                         break;
1321         }
1322         xchk_iscan_iter_finish(&rd->pscan.iscan);
1323         if (error) {
1324                 /*
1325                  * If we couldn't grab an inode that was busy with a state
1326                  * change, change the error code so that we exit to userspace
1327                  * as quickly as possible.
1328                  */
1329                 if (error == -EBUSY)
1330                         return -ECANCELED;
1331                 return error;
1332         }
1333 
1334         /*
1335          * Cancel the empty transaction so that we can (later) use the atomic
1336          * file mapping exchange functions to lock files and commit the new
1337          * directory.
1338          */
1339         xchk_trans_cancel(rd->sc);
1340         return 0;
1341 }
1342 
1343 /*
1344  * Capture dirent updates being made by other threads which are relevant to the
1345  * directory being repaired.
1346  */
1347 STATIC int
1348 xrep_dir_live_update(
1349         struct notifier_block           *nb,
1350         unsigned long                   action,
1351         void                            *data)
1352 {
1353         struct xfs_dir_update_params    *p = data;
1354         struct xrep_dir                 *rd;
1355         struct xfs_scrub                *sc;
1356         int                             error = 0;
1357 
1358         rd = container_of(nb, struct xrep_dir, pscan.dhook.dirent_hook.nb);
1359         sc = rd->sc;
1360 
1361         /*
1362          * This thread updated a child dirent in the directory that we're
1363          * rebuilding.  Stash the update for replay against the temporary
1364          * directory.
1365          */
1366         if (p->dp->i_ino == sc->ip->i_ino &&
1367             xchk_iscan_want_live_update(&rd->pscan.iscan, p->ip->i_ino)) {
1368                 mutex_lock(&rd->pscan.lock);
1369                 if (p->delta > 0)
1370                         error = xrep_dir_stash_createname(rd, p->name,
1371                                         p->ip->i_ino);
1372                 else
1373                         error = xrep_dir_stash_removename(rd, p->name,
1374                                         p->ip->i_ino);
1375                 mutex_unlock(&rd->pscan.lock);
1376                 if (error)
1377                         goto out_abort;
1378         }
1379 
1380         /*
1381          * This thread updated another directory's child dirent that points to
1382          * the directory that we're rebuilding, so remember the new dotdot
1383          * target.
1384          */
1385         if (p->ip->i_ino == sc->ip->i_ino &&
1386             xchk_iscan_want_live_update(&rd->pscan.iscan, p->dp->i_ino)) {
1387                 if (p->delta > 0) {
1388                         trace_xrep_dir_stash_createname(sc->tempip,
1389                                         &xfs_name_dotdot,
1390                                         p->dp->i_ino);
1391 
1392                         xrep_findparent_scan_found(&rd->pscan, p->dp->i_ino);
1393                 } else {
1394                         trace_xrep_dir_stash_removename(sc->tempip,
1395                                         &xfs_name_dotdot,
1396                                         rd->pscan.parent_ino);
1397 
1398                         xrep_findparent_scan_found(&rd->pscan, NULLFSINO);
1399                 }
1400         }
1401 
1402         return NOTIFY_DONE;
1403 out_abort:
1404         xchk_iscan_abort(&rd->pscan.iscan);
1405         return NOTIFY_DONE;
1406 }
1407 
1408 /*
1409  * Free all the directory blocks and reset the data fork.  The caller must
1410  * join the inode to the transaction.  This function returns with the inode
1411  * joined to a clean scrub transaction.
1412  */
1413 STATIC int
1414 xrep_dir_reset_fork(
1415         struct xrep_dir         *rd,
1416         xfs_ino_t               parent_ino)
1417 {
1418         struct xfs_scrub        *sc = rd->sc;
1419         struct xfs_ifork        *ifp = xfs_ifork_ptr(sc->tempip, XFS_DATA_FORK);
1420         int                     error;
1421 
1422         /* Unmap all the directory buffers. */
1423         if (xfs_ifork_has_extents(ifp)) {
1424                 error = xrep_reap_ifork(sc, sc->tempip, XFS_DATA_FORK);
1425                 if (error)
1426                         return error;
1427         }
1428 
1429         trace_xrep_dir_reset_fork(sc->tempip, parent_ino);
1430 
1431         /* Reset the data fork to an empty data fork. */
1432         xfs_idestroy_fork(ifp);
1433         ifp->if_bytes = 0;
1434         sc->tempip->i_disk_size = 0;
1435 
1436         /* Reinitialize the short form directory. */
1437         xrep_dir_init_args(rd, sc->tempip, NULL);
1438         return xfs_dir2_sf_create(&rd->args, parent_ino);
1439 }
1440 
1441 /*
1442  * Prepare both inodes' directory forks for exchanging mappings.  Promote the
1443  * tempfile from short format to leaf format, and if the file being repaired
1444  * has a short format data fork, turn it into an empty extent list.
1445  */
1446 STATIC int
1447 xrep_dir_swap_prep(
1448         struct xfs_scrub        *sc,
1449         bool                    temp_local,
1450         bool                    ip_local)
1451 {
1452         int                     error;
1453 
1454         /*
1455          * If the tempfile's directory is in shortform format, convert that to
1456          * a single leaf extent so that we can use the atomic mapping exchange.
1457          */
1458         if (temp_local) {
1459                 struct xfs_da_args      args = {
1460                         .dp             = sc->tempip,
1461                         .geo            = sc->mp->m_dir_geo,
1462                         .whichfork      = XFS_DATA_FORK,
1463                         .trans          = sc->tp,
1464                         .total          = 1,
1465                         .owner          = sc->ip->i_ino,
1466                 };
1467 
1468                 error = xfs_dir2_sf_to_block(&args);
1469                 if (error)
1470                         return error;
1471 
1472                 /*
1473                  * Roll the deferred log items to get us back to a clean
1474                  * transaction.
1475                  */
1476                 error = xfs_defer_finish(&sc->tp);
1477                 if (error)
1478                         return error;
1479         }
1480 
1481         /*
1482          * If the file being repaired had a shortform data fork, convert that
1483          * to an empty extent list in preparation for the atomic mapping
1484          * exchange.
1485          */
1486         if (ip_local) {
1487                 struct xfs_ifork        *ifp;
1488 
1489                 ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
1490                 xfs_idestroy_fork(ifp);
1491                 ifp->if_format = XFS_DINODE_FMT_EXTENTS;
1492                 ifp->if_nextents = 0;
1493                 ifp->if_bytes = 0;
1494                 ifp->if_data = NULL;
1495                 ifp->if_height = 0;
1496 
1497                 xfs_trans_log_inode(sc->tp, sc->ip,
1498                                 XFS_ILOG_CORE | XFS_ILOG_DDATA);
1499         }
1500 
1501         return 0;
1502 }
1503 
1504 /*
1505  * Replace the inode number of a directory entry.
1506  */
1507 static int
1508 xrep_dir_replace(
1509         struct xrep_dir         *rd,
1510         struct xfs_inode        *dp,
1511         const struct xfs_name   *name,
1512         xfs_ino_t               inum,
1513         xfs_extlen_t            total)
1514 {
1515         struct xfs_scrub        *sc = rd->sc;
1516         int                     error;
1517 
1518         ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
1519 
1520         error = xfs_dir_ino_validate(sc->mp, inum);
1521         if (error)
1522                 return error;
1523 
1524         xrep_dir_init_args(rd, dp, name);
1525         rd->args.inumber = inum;
1526         rd->args.total = total;
1527         return xfs_dir_replace_args(&rd->args);
1528 }
1529 
1530 /*
1531  * Reset the link count of this directory and adjust the unlinked list pointers
1532  * as needed.
1533  */
1534 STATIC int
1535 xrep_dir_set_nlink(
1536         struct xrep_dir         *rd)
1537 {
1538         struct xfs_scrub        *sc = rd->sc;
1539         struct xfs_inode        *dp = sc->ip;
1540         struct xfs_perag        *pag;
1541         unsigned int            new_nlink = min_t(unsigned long long,
1542                                                   rd->subdirs + 2,
1543                                                   XFS_NLINK_PINNED);
1544         int                     error;
1545 
1546         /*
1547          * The directory is not on the incore unlinked list, which means that
1548          * it needs to be reachable via the directory tree.  Update the nlink
1549          * with our observed link count.  If the directory has no parent, it
1550          * will be moved to the orphanage.
1551          */
1552         if (!xfs_inode_on_unlinked_list(dp))
1553                 goto reset_nlink;
1554 
1555         /*
1556          * The directory is on the unlinked list and we did not find any
1557          * dirents.  Set the link count to zero and let the directory
1558          * inactivate when the last reference drops.
1559          */
1560         if (rd->dirents == 0) {
1561                 rd->needs_adoption = false;
1562                 new_nlink = 0;
1563                 goto reset_nlink;
1564         }
1565 
1566         /*
1567          * The directory is on the unlinked list and we found dirents.  This
1568          * directory needs to be reachable via the directory tree.  Remove the
1569          * dir from the unlinked list and update nlink with the observed link
1570          * count.  If the directory has no parent, it will be moved to the
1571          * orphanage.
1572          */
1573         pag = xfs_perag_get(sc->mp, XFS_INO_TO_AGNO(sc->mp, dp->i_ino));
1574         if (!pag) {
1575                 ASSERT(0);
1576                 return -EFSCORRUPTED;
1577         }
1578 
1579         error = xfs_iunlink_remove(sc->tp, pag, dp);
1580         xfs_perag_put(pag);
1581         if (error)
1582                 return error;
1583 
1584 reset_nlink:
1585         if (VFS_I(dp)->i_nlink != new_nlink)
1586                 set_nlink(VFS_I(dp), new_nlink);
1587         return 0;
1588 }
1589 
1590 /*
1591  * Finish replaying stashed dirent updates, allocate a transaction for
1592  * exchanging data fork mappings, and take the ILOCKs of both directories
1593  * before we commit the new directory structure.
1594  */
1595 STATIC int
1596 xrep_dir_finalize_tempdir(
1597         struct xrep_dir         *rd)
1598 {
1599         struct xfs_scrub        *sc = rd->sc;
1600         int                     error;
1601 
1602         if (!xfs_has_parent(sc->mp))
1603                 return xrep_tempexch_trans_alloc(sc, XFS_DATA_FORK, &rd->tx);
1604 
1605         /*
1606          * Repair relies on the ILOCK to quiesce all possible dirent updates.
1607          * Replay all queued dirent updates into the tempdir before exchanging
1608          * the contents, even if that means dropping the ILOCKs and the
1609          * transaction.
1610          */
1611         do {
1612                 error = xrep_dir_replay_updates(rd);
1613                 if (error)
1614                         return error;
1615 
1616                 error = xrep_tempexch_trans_alloc(sc, XFS_DATA_FORK, &rd->tx);
1617                 if (error)
1618                         return error;
1619 
1620                 if (xfarray_length(rd->dir_entries) == 0)
1621                         break;
1622 
1623                 xchk_trans_cancel(sc);
1624                 xrep_tempfile_iunlock_both(sc);
1625         } while (!xchk_should_terminate(sc, &error));
1626         return error;
1627 }
1628 
1629 /*
      * Exchange the temporary directory's data fork with the one being repaired.
      *
      * If no parent was found (rd->pscan.parent_ino is NULLFSINO), the root
      * directory inode number from the superblock is substituted and
      * rd->needs_adoption is set.  Unless the parent is the root directory, the
      * tempdir's '..' entry is replaced to point at the parent.  When both data
      * forks are in local format and the tempdir contents fit in the repaired
      * file's data fork, the contents are copied out directly; otherwise the
      * transaction is rolled, both forks are prepared by xrep_dir_swap_prep(),
      * and the contents are exchanged with xrep_tempexch_contents().
      *
      * Returns the first nonzero error reported by a helper; otherwise the result
      * of the final xrep_dir_set_nlink() or xrep_tempexch_contents() call.
      */
1630 STATIC int
1631 xrep_dir_swap(
1632         struct xrep_dir         *rd)
1633 {
1634         struct xfs_scrub        *sc = rd->sc;
1635         bool                    ip_local, temp_local;
1636         int                     error = 0;
1637 
1638         /*
1639          * If we never found the parent for this directory, temporarily assign
1640          * the root dir as the parent; we'll move this to the orphanage after
1641          * exchanging the dir contents.  We hold the ILOCK of the dir being
1642          * repaired, so we're not worried about racy updates of dotdot.
1643          */
1644         ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL);
1645         if (rd->pscan.parent_ino == NULLFSINO) {
1646                 rd->needs_adoption = true;
1647                 rd->pscan.parent_ino = rd->sc->mp->m_sb.sb_rootino;
1648         }
1649 
1650         /*
1651          * Reset the temporary directory's '..' entry to point to the parent
1652          * that we found.  The temporary directory was created with the root
1653          * directory as the parent, so we can skip this if repairing a
1654          * subdirectory of the root.
1655          *
1656          * It's also possible that this replacement could expand a sf
1657          * tempdir into block format.
1658          */
1659         if (rd->pscan.parent_ino != sc->mp->m_rootip->i_ino) {
1660                 error = xrep_dir_replace(rd, rd->sc->tempip, &xfs_name_dotdot,
1661                                 rd->pscan.parent_ino, rd->tx.req.resblks);
1662                 if (error)
1663                         return error;
1664         }
1665 
1666         /*
1667          * Changing the dot and dotdot entries could have changed the shape of
1668          * the directory, so we recompute whether each data fork is in local
                   * format only now.
1669          */
1670         ip_local = sc->ip->i_df.if_format == XFS_DINODE_FMT_LOCAL;
1671         temp_local = sc->tempip->i_df.if_format == XFS_DINODE_FMT_LOCAL;
1672 
1673         /*
1674          * If both files have a local format data fork and the rebuilt
1675          * directory data would fit in the repaired file's data fork, copy
1676          * the contents from the tempfile and update the directory link count.
1677          * We're done now.
1678          */
1679         if (ip_local && temp_local &&
1680             sc->tempip->i_disk_size <= xfs_inode_data_fork_size(sc->ip)) {
1681                 xrep_tempfile_copyout_local(sc, XFS_DATA_FORK);
1682                 return xrep_dir_set_nlink(rd);
1683         }
1684 
1685         /*
1686          * Clean the transaction before we start working on exchanging
1687          * directory contents.
1688          */
1689         error = xrep_tempfile_roll_trans(rd->sc);
1690         if (error)
1691                 return error;
1692 
1693         /* Otherwise, make sure both data forks are in block-mapping mode. */
1694         error = xrep_dir_swap_prep(sc, temp_local, ip_local);
1695         if (error)
1696                 return error;
1697 
1698         /*
1699          * Set nlink of the directory in the same transaction sequence that
1700          * (atomically) commits the new directory data.
1701          */
1702         error = xrep_dir_set_nlink(rd);
1703         if (error)
1704                 return error;
1705 
1706         return xrep_tempexch_contents(sc, &rd->tx);
1707 }
1708 
1709 /*
1710  * Exchange the new directory contents (which we created in the tempfile) with
1711  * the directory being repaired.
      *
      * The steps are: take the tempfile's IOLOCK, finalize the tempdir, exchange
      * it with the directory under repair, reset the tempdir's fork, and roll the
      * transaction before dropping the tempfile's ILOCK and IOLOCK.
      *
      * Returns 0 on success, -ECANCELED if the parent scan's iscan was aborted,
      * or the error returned by the helper that failed.  The tempfile unlock calls
      * at the end are only reached on the success path.
1712  */
1713 STATIC int
1714 xrep_dir_rebuild_tree(
1715         struct xrep_dir         *rd)
1716 {
1717         struct xfs_scrub        *sc = rd->sc;
1718         int                     error;
1719 
1720         trace_xrep_dir_rebuild_tree(sc->ip, rd->pscan.parent_ino);
1721 
1722         /*
1723          * Take the IOLOCK on the temporary file so that we can run dir
1724          * operations with the same locks held as we would for a normal file.
1725          * We still hold sc->ip's IOLOCK.
1726          */
1727         error = xrep_tempfile_iolock_polled(rd->sc);
1728         if (error)
1729                 return error;
1730 
1731         /*
1732          * Allocate transaction, lock inodes, and make sure that we've replayed
1733          * all the stashed dirent updates to the tempdir.  After this point,
1734          * we're ready to exchange data fork mappings.
1735          */
1736         error = xrep_dir_finalize_tempdir(rd);
1737         if (error)
1738                 return error;
1739 
             /* Give up if the parent scan's iscan reports that it was aborted. */
1740         if (xchk_iscan_aborted(&rd->pscan.iscan))
1741                 return -ECANCELED;
1742 
1743         /*
1744          * Exchange the tempdir's data fork with the file being repaired.  This
1745          * recreates the transaction and re-takes the ILOCK in the scrub
1746          * context.
1747          */
1748         error = xrep_dir_swap(rd);
1749         if (error)
1750                 return error;
1751 
1752         /*
1753          * Release the old directory blocks and reset the data fork of the temp
1754          * directory to an empty shortform directory because inactivation does
1755          * nothing for directories.
1756          */
1757         error = xrep_dir_reset_fork(rd, sc->mp->m_rootip->i_ino);
1758         if (error)
1759                 return error;
1760 
1761         /*
1762          * Roll to get a transaction without any inodes joined to it.  Then we
1763          * can drop the tempfile's ILOCK and IOLOCK before doing more work on
1764          * the scrub target directory.
1765          */
1766         error = xfs_trans_roll(&sc->tp);
1767         if (error)
1768                 return error;
1769 
1770         xrep_tempfile_iunlock(sc);
1771         xrep_tempfile_iounlock(sc);
1772         return 0;
1773 }
1774 
1775 /*
      * Set up the filesystem scan so we can regenerate directory entries.
      *
      * Creates the rd->dir_entries xfarray and the rd->dir_names xfblob, then
      * starts the find-parent scan on rd->pscan.  If a later step fails, whatever
      * was created earlier in this function is destroyed again and the matching
      * rd pointer is reset to NULL.
      *
      * Returns 0 on success or the error from the helper that failed.
      */
1776 STATIC int
1777 xrep_dir_setup_scan(
1778         struct xrep_dir         *rd)
1779 {
1780         struct xfs_scrub        *sc = rd->sc;
1781         char                    *descr;
1782         int                     error;
1783 
1784         /*
                   * Set up some staging memory for salvaging dirents.  The description
                   * string is freed right after the create call, before the result is
                   * checked, so it is released on both success and failure.
                   */
1785         descr = xchk_xfile_ino_descr(sc, "directory entries");
1786         error = xfarray_create(descr, 0, sizeof(struct xrep_dirent),
1787                         &rd->dir_entries);
1788         kfree(descr);
1789         if (error)
1790                 return error;
1791 
             /* Same pattern for the blob that stores the entry names. */
1792         descr = xchk_xfile_ino_descr(sc, "directory entry names");
1793         error = xfblob_create(descr, &rd->dir_names);
1794         kfree(descr);
1795         if (error)
1796                 goto out_xfarray;
1797 
             /*
              * With parent pointers enabled, start the scan with
              * xrep_dir_live_update passed in; otherwise use the plain variant.
              */
1798         if (xfs_has_parent(sc->mp))
1799                 error = __xrep_findparent_scan_start(sc, &rd->pscan,
1800                                 xrep_dir_live_update);
1801         else
1802                 error = xrep_findparent_scan_start(sc, &rd->pscan);
1803         if (error)
1804                 goto out_xfblob;
1805 
1806         return 0;
1807 
     /* Error unwind: undo the setup steps in reverse order. */
1808 out_xfblob:
1809         xfblob_destroy(rd->dir_names);
1810         rd->dir_names = NULL;
1811 out_xfarray:
1812         xfarray_destroy(rd->dir_entries);
1813         rd->dir_entries = NULL;
1814         return error;
1815 }
1816 
1817 /*
1818  * Move the current file to the orphanage.
1819  *
1820  * Caller must hold IOLOCK_EXCL on @sc->ip, and no other inode locks.  Upon
1821  * successful return, the scrub transaction will have enough extra reservation
1822  * to make the move; it will hold IOLOCK_EXCL and ILOCK_EXCL of @sc->ip and the
1823  * orphanage; and both inodes will be ijoined.
      *
      * NOTE(review): the body drops XFS_ILOCK_EXCL on sc->ip before locking the
      * orphanage, and releases the orphanage's ILOCK and IOLOCK just before
      * returning 0, so the lock description above may be out of date -- confirm
      * against the callers.
      *
      * The move itself only happens if the dotdot entry is unchanged across the
      * unlocked window and the directory still has a nonzero link count.
      *
      * Returns 0 on success (whether or not the directory was actually moved) or
      * the error from the helper that failed.
1824  */
1825 STATIC int
1826 xrep_dir_move_to_orphanage(
1827         struct xrep_dir         *rd)
1828 {
1829         struct xfs_scrub        *sc = rd->sc;
1830         xfs_ino_t               orig_parent, new_parent;
1831         int                     error;
1832 
1833         /*
1834          * We are about to drop the ILOCK on sc->ip to lock the orphanage and
1835          * prepare for the adoption.  Therefore, look up the old dotdot entry
1836          * for sc->ip so that we can compare it after we re-lock sc->ip.
1837          */
1838         error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &orig_parent);
1839         if (error)
1840                 return error;
1841 
1842         /*
1843          * Drop the ILOCK on the scrub target and commit the transaction.
1844          * Adoption computes its own resource requirements and gathers the
1845          * necessary components.
1846          */
1847         error = xrep_trans_commit(sc);
1848         if (error)
1849                 return error;
1850         xchk_iunlock(sc, XFS_ILOCK_EXCL);
1851 
1852         /*
                   * If we can take the orphanage's iolock then we're ready to move.
                   * Otherwise release every lock recorded in sc->ilock_flags and call
                   * xrep_orphanage_iolock_two() instead (presumably a blocking
                   * acquisition of both IOLOCKs -- confirm).
                   */
1853         if (!xrep_orphanage_ilock_nowait(sc, XFS_IOLOCK_EXCL)) {
1854                 xchk_iunlock(sc, sc->ilock_flags);
1855                 error = xrep_orphanage_iolock_two(sc);
1856                 if (error)
1857                         return error;
1858         }
1859 
1860         /* Grab transaction and ILOCK the two files. */
1861         error = xrep_adoption_trans_alloc(sc, &rd->adoption);
1862         if (error)
1863                 return error;
1864 
             /* Compute the name for the adoption into rd->xname. */
1865         error = xrep_adoption_compute_name(&rd->adoption, &rd->xname);
1866         if (error)
1867                 return error;
1868 
1869         /*
1870          * Now that we've reacquired the ILOCK on sc->ip, look up the dotdot
1871          * entry again.  If the parent changed or the child was unlinked while
1872          * the child directory was unlocked, we don't need to move the child to
1873          * the orphanage after all.
1874          */
1875         error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &new_parent);
1876         if (error)
1877                 return error;
1878 
1879         /*
1880          * Attach to the orphanage if we still have a linked directory and it
1881          * hasn't been moved.
1882          */
1883         if (orig_parent == new_parent && VFS_I(sc->ip)->i_nlink > 0) {
1884                 error = xrep_adoption_move(&rd->adoption);
1885                 if (error)
1886                         return error;
1887         }
1888 
1889         /*
1890          * Launder the scrub transaction so we can drop the orphanage ILOCK
1891          * and IOLOCK.  Return holding the scrub target's ILOCK and IOLOCK.
1892          */
1893         error = xrep_adoption_trans_roll(&rd->adoption);
1894         if (error)
1895                 return error;
1896 
1897         xrep_orphanage_iunlock(sc, XFS_ILOCK_EXCL);
1898         xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL);
1899         return 0;
1900 }
1901 
1902 /*
1903  * Repair the directory metadata.
1904  *
1905  * XXX: Directory entry buffers can be multiple fsblocks in size.  The buffer
1906  * cache in XFS can't handle aliased multiblock buffers, so this might
1907  * misbehave if the directory blocks are crosslinked with other filesystem
1908  * metadata.
1909  *
1910  * XXX: Is it necessary to check the dcache for this directory to make sure
1911  * that we always recreate every cached entry?
      *
      * The repair context is read from sc->buf.  Returns -EOPNOTSUPP if the
      * filesystem lacks rmapbt or exchange-range support, otherwise 0 or the
      * error from the step that failed or asked to terminate.  Once
      * xrep_dir_setup_scan() has succeeded, every exit path runs
      * xrep_dir_teardown(); the earlier returns do not.
1912  */
1913 int
1914 xrep_directory(
1915         struct xfs_scrub        *sc)
1916 {
1917         struct xrep_dir         *rd = sc->buf;
1918         int                     error;
1919 
1920         /* The rmapbt is required to reap the old data fork. */
1921         if (!xfs_has_rmapbt(sc->mp))
1922                 return -EOPNOTSUPP;
1923         /* We require atomic file exchange range to rebuild anything. */
1924         if (!xfs_has_exchange_range(sc->mp))
1925                 return -EOPNOTSUPP;
1926 
1927         error = xrep_dir_setup_scan(rd);
1928         if (error)
1929                 return error;
1930 
             /*
              * Gather the directory contents: scan the directory tree when parent
              * pointers are enabled, otherwise salvage entries.
              */
1931         if (xfs_has_parent(sc->mp))
1932                 error = xrep_dir_scan_dirtree(rd);
1933         else
1934                 error = xrep_dir_salvage_entries(rd);
1935         if (error)
1936                 goto out_teardown;
1937 
1938         /* Last chance to abort before we start committing fixes. */
1939         if (xchk_should_terminate(sc, &error))
1940                 goto out_teardown;
1941 
1942         error = xrep_dir_rebuild_tree(rd);
1943         if (error)
1944                 goto out_teardown;
1945 
             /*
              * needs_adoption is set during the swap when no parent was found.
              * Refuse with -EFSCORRUPTED if the orphanage cannot adopt.
              */
1946         if (rd->needs_adoption) {
1947                 if (!xrep_orphanage_can_adopt(rd->sc))
1948                         error = -EFSCORRUPTED;
1949                 else
1950                         error = xrep_dir_move_to_orphanage(rd);
1951                 if (error)
1952                         goto out_teardown;
1953         }
1954 
     /* Reached on success as well as on error; error is returned as-is. */
1955 out_teardown:
1956         xrep_dir_teardown(sc);
1957         return error;
1958 }
1959 

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

kernel.org | git.kernel.org | LWN.net | Project Home | SVN repository | Mail admin

Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.

sflogo.php