TOMOYO Linux Cross Reference
Linux/fs/xfs/xfs_inode.c


  1 // SPDX-License-Identifier: GPL-2.0
  2 /*
  3  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
  4  * All Rights Reserved.
  5  */
  6 #include <linux/iversion.h>
  7 
  8 #include "xfs.h"
  9 #include "xfs_fs.h"
 10 #include "xfs_shared.h"
 11 #include "xfs_format.h"
 12 #include "xfs_log_format.h"
 13 #include "xfs_trans_resv.h"
 14 #include "xfs_mount.h"
 15 #include "xfs_defer.h"
 16 #include "xfs_inode.h"
 17 #include "xfs_dir2.h"
 18 #include "xfs_attr.h"
 19 #include "xfs_bit.h"
 20 #include "xfs_trans_space.h"
 21 #include "xfs_trans.h"
 22 #include "xfs_buf_item.h"
 23 #include "xfs_inode_item.h"
 24 #include "xfs_iunlink_item.h"
 25 #include "xfs_ialloc.h"
 26 #include "xfs_bmap.h"
 27 #include "xfs_bmap_util.h"
 28 #include "xfs_errortag.h"
 29 #include "xfs_error.h"
 30 #include "xfs_quota.h"
 31 #include "xfs_filestream.h"
 32 #include "xfs_trace.h"
 33 #include "xfs_icache.h"
 34 #include "xfs_symlink.h"
 35 #include "xfs_trans_priv.h"
 36 #include "xfs_log.h"
 37 #include "xfs_bmap_btree.h"
 38 #include "xfs_reflink.h"
 39 #include "xfs_ag.h"
 40 #include "xfs_log_priv.h"
 41 #include "xfs_health.h"
 42 #include "xfs_pnfs.h"
 43 #include "xfs_parent.h"
 44 #include "xfs_xattr.h"
 45 #include "xfs_inode_util.h"
 46 
 47 struct kmem_cache *xfs_inode_cache;
 48 
 49 /*
 50  * These two are wrapper routines around the xfs_ilock() routine used to
 51  * centralize some grungy code.  They are used in places that wish to lock the
 52  * inode solely for reading the extents.  The reason these places can't just
 53  * call xfs_ilock(ip, XFS_ILOCK_SHARED) is that the inode lock also guards the
 54  * bringing in of the extents from disk for a file in b-tree format.  If the
 55  * inode is in b-tree format, then we need to lock the inode exclusively until
 56  * the extents are read in.  Locking it exclusively all the time would limit
 57  * our parallelism unnecessarily, though.  What we do instead is check to see
 58  * if the extents have been read in yet, and only lock the inode exclusively
 59  * if they have not.
 60  *
 61  * The functions return a value which should be given to the corresponding
 62  * xfs_iunlock() call.
 63  */
 64 uint
 65 xfs_ilock_data_map_shared(
 66         struct xfs_inode        *ip)
 67 {
 68         uint                    lock_mode = XFS_ILOCK_SHARED;
 69 
 70         if (xfs_need_iread_extents(&ip->i_df))
 71                 lock_mode = XFS_ILOCK_EXCL;
 72         xfs_ilock(ip, lock_mode);
 73         return lock_mode;
 74 }
 75 
 76 uint
 77 xfs_ilock_attr_map_shared(
 78         struct xfs_inode        *ip)
 79 {
 80         uint                    lock_mode = XFS_ILOCK_SHARED;
 81 
 82         if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af))
 83                 lock_mode = XFS_ILOCK_EXCL;
 84         xfs_ilock(ip, lock_mode);
 85         return lock_mode;
 86 }
 87 
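/*
 * Illustrative caller sketch (hypothetical helper, not part of xfs_inode.c):
 * whichever mode the wrapper above chose must be handed back to xfs_iunlock()
 * so the shared/exclusive choice is undone correctly.
 */
static void example_walk_data_fork(struct xfs_inode *ip)
{
        uint    lock_mode;

        lock_mode = xfs_ilock_data_map_shared(ip);

        /* ... read-only walk of the data fork extent map ... */

        xfs_iunlock(ip, lock_mode);
}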
 88 /*
 89  * You can't set both SHARED and EXCL for the same lock,
 90  * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_MMAPLOCK_SHARED,
 91  * XFS_MMAPLOCK_EXCL, XFS_ILOCK_SHARED, XFS_ILOCK_EXCL are valid values
 92  * to set in lock_flags.
 93  */
 94 static inline void
 95 xfs_lock_flags_assert(
 96         uint            lock_flags)
 97 {
 98         ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
 99                 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
100         ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
101                 (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
102         ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
103                 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
104         ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
105         ASSERT(lock_flags != 0);
106 }
107 
108 /*
109  * In addition to i_rwsem in the VFS inode, the xfs inode contains 2
110  * multi-reader locks: invalidate_lock and the i_lock.  This routine allows
111  * various combinations of the locks to be obtained.
112  *
113  * The 3 locks should always be ordered so that the IO lock is obtained first,
114  * the mmap lock second and the ilock last in order to prevent deadlock.
115  *
116  * Basic locking order:
117  *
118  * i_rwsem -> invalidate_lock -> page_lock -> i_ilock
119  *
120  * mmap_lock locking order:
121  *
122  * i_rwsem -> page lock -> mmap_lock
123  * mmap_lock -> invalidate_lock -> page_lock
124  *
125  * The difference in mmap_lock locking order means that we cannot hold the
126  * invalidate_lock over syscall based read(2)/write(2) based IO. These IO paths
127  * can fault in pages during copy in/out (for buffered IO) or require the
128  * mmap_lock in get_user_pages() to map the user pages into the kernel address
129  * space for direct IO. Similarly the i_rwsem cannot be taken inside a page
130  * fault because page faults already hold the mmap_lock.
131  *
132  * Hence to serialise fully against both syscall and mmap based IO, we need to
133  * take both the i_rwsem and the invalidate_lock. These locks should *only* be
134  * both taken in places where we need to invalidate the page cache in a race
135  * free manner (e.g. truncate, hole punch and other extent manipulation
136  * functions).
137  */
138 void
139 xfs_ilock(
140         xfs_inode_t             *ip,
141         uint                    lock_flags)
142 {
143         trace_xfs_ilock(ip, lock_flags, _RET_IP_);
144 
145         xfs_lock_flags_assert(lock_flags);
146 
147         if (lock_flags & XFS_IOLOCK_EXCL) {
148                 down_write_nested(&VFS_I(ip)->i_rwsem,
149                                   XFS_IOLOCK_DEP(lock_flags));
150         } else if (lock_flags & XFS_IOLOCK_SHARED) {
151                 down_read_nested(&VFS_I(ip)->i_rwsem,
152                                  XFS_IOLOCK_DEP(lock_flags));
153         }
154 
155         if (lock_flags & XFS_MMAPLOCK_EXCL) {
156                 down_write_nested(&VFS_I(ip)->i_mapping->invalidate_lock,
157                                   XFS_MMAPLOCK_DEP(lock_flags));
158         } else if (lock_flags & XFS_MMAPLOCK_SHARED) {
159                 down_read_nested(&VFS_I(ip)->i_mapping->invalidate_lock,
160                                  XFS_MMAPLOCK_DEP(lock_flags));
161         }
162 
163         if (lock_flags & XFS_ILOCK_EXCL)
164                 down_write_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
165         else if (lock_flags & XFS_ILOCK_SHARED)
166                 down_read_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
167 }
168 
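/*
 * Illustrative sketch (hypothetical caller, not part of xfs_inode.c): a path
 * that invalidates the page cache takes both the IO lock and the mmap lock in
 * one call, which preserves the i_rwsem -> invalidate_lock ordering described
 * above; the ILOCK is typically added later, once a transaction is allocated.
 */
static void example_lock_for_invalidation(struct xfs_inode *ip)
{
        xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);

        /* ... flush and invalidate the page cache, manipulate extents ... */

        xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
}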
169 /*
170  * This is just like xfs_ilock(), except that the caller
171  * is guaranteed not to sleep.  It returns 1 if it gets
172  * the requested locks and 0 otherwise.  If the IO lock is
173  * obtained but the inode lock cannot be, then the IO lock
174  * is dropped before returning.
175  *
176  * ip -- the inode being locked
177  * lock_flags -- this parameter indicates the inode's locks to be
178  *       locked.  See the comment for xfs_ilock() for a list
179  *       of valid values.
180  */
181 int
182 xfs_ilock_nowait(
183         xfs_inode_t             *ip,
184         uint                    lock_flags)
185 {
186         trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
187 
188         xfs_lock_flags_assert(lock_flags);
189 
190         if (lock_flags & XFS_IOLOCK_EXCL) {
191                 if (!down_write_trylock(&VFS_I(ip)->i_rwsem))
192                         goto out;
193         } else if (lock_flags & XFS_IOLOCK_SHARED) {
194                 if (!down_read_trylock(&VFS_I(ip)->i_rwsem))
195                         goto out;
196         }
197 
198         if (lock_flags & XFS_MMAPLOCK_EXCL) {
199                 if (!down_write_trylock(&VFS_I(ip)->i_mapping->invalidate_lock))
200                         goto out_undo_iolock;
201         } else if (lock_flags & XFS_MMAPLOCK_SHARED) {
202                 if (!down_read_trylock(&VFS_I(ip)->i_mapping->invalidate_lock))
203                         goto out_undo_iolock;
204         }
205 
206         if (lock_flags & XFS_ILOCK_EXCL) {
207                 if (!down_write_trylock(&ip->i_lock))
208                         goto out_undo_mmaplock;
209         } else if (lock_flags & XFS_ILOCK_SHARED) {
210                 if (!down_read_trylock(&ip->i_lock))
211                         goto out_undo_mmaplock;
212         }
213         return 1;
214 
215 out_undo_mmaplock:
216         if (lock_flags & XFS_MMAPLOCK_EXCL)
217                 up_write(&VFS_I(ip)->i_mapping->invalidate_lock);
218         else if (lock_flags & XFS_MMAPLOCK_SHARED)
219                 up_read(&VFS_I(ip)->i_mapping->invalidate_lock);
220 out_undo_iolock:
221         if (lock_flags & XFS_IOLOCK_EXCL)
222                 up_write(&VFS_I(ip)->i_rwsem);
223         else if (lock_flags & XFS_IOLOCK_SHARED)
224                 up_read(&VFS_I(ip)->i_rwsem);
225 out:
226         return 0;
227 }
228 
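/*
 * Illustrative sketch (hypothetical caller, not part of xfs_inode.c): a
 * non-blocking path backs off entirely when the locks cannot be obtained,
 * rather than waiting and risking a deadlock.
 */
static int example_try_iolock(struct xfs_inode *ip)
{
        if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL))
                return -EAGAIN;

        /* ... work that required the exclusive IO lock ... */

        xfs_iunlock(ip, XFS_IOLOCK_EXCL);
        return 0;
}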
229 /*
230  * xfs_iunlock() is used to drop the inode locks acquired with
231  * xfs_ilock() and xfs_ilock_nowait().  The caller must pass
232  * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
233  * that we know which locks to drop.
234  *
235  * ip -- the inode being unlocked
236  * lock_flags -- this parameter indicates the inode's locks to be
237  *       unlocked.  See the comment for xfs_ilock() for a list
238  *       of valid values for this parameter.
239  *
240  */
241 void
242 xfs_iunlock(
243         xfs_inode_t             *ip,
244         uint                    lock_flags)
245 {
246         xfs_lock_flags_assert(lock_flags);
247 
248         if (lock_flags & XFS_IOLOCK_EXCL)
249                 up_write(&VFS_I(ip)->i_rwsem);
250         else if (lock_flags & XFS_IOLOCK_SHARED)
251                 up_read(&VFS_I(ip)->i_rwsem);
252 
253         if (lock_flags & XFS_MMAPLOCK_EXCL)
254                 up_write(&VFS_I(ip)->i_mapping->invalidate_lock);
255         else if (lock_flags & XFS_MMAPLOCK_SHARED)
256                 up_read(&VFS_I(ip)->i_mapping->invalidate_lock);
257 
258         if (lock_flags & XFS_ILOCK_EXCL)
259                 up_write(&ip->i_lock);
260         else if (lock_flags & XFS_ILOCK_SHARED)
261                 up_read(&ip->i_lock);
262 
263         trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
264 }
265 
266 /*
267  * give up write locks.  the i/o lock cannot be held nested
268  * if it is being demoted.
269  */
270 void
271 xfs_ilock_demote(
272         xfs_inode_t             *ip,
273         uint                    lock_flags)
274 {
275         ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL));
276         ASSERT((lock_flags &
277                 ~(XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
278 
279         if (lock_flags & XFS_ILOCK_EXCL)
280                 downgrade_write(&ip->i_lock);
281         if (lock_flags & XFS_MMAPLOCK_EXCL)
282                 downgrade_write(&VFS_I(ip)->i_mapping->invalidate_lock);
283         if (lock_flags & XFS_IOLOCK_EXCL)
284                 downgrade_write(&VFS_I(ip)->i_rwsem);
285 
286         trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
287 }
288 
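/*
 * Illustrative sketch (hypothetical caller, not part of xfs_inode.c): take a
 * lock exclusively for setup, then demote it so concurrent readers can make
 * progress while the remaining work only needs shared access.
 */
static void example_demote_iolock(struct xfs_inode *ip)
{
        xfs_ilock(ip, XFS_IOLOCK_EXCL);

        /* ... work that requires exclusive access ... */

        xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);

        /* ... work that only needs the lock held shared ... */

        xfs_iunlock(ip, XFS_IOLOCK_SHARED);
}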
289 void
290 xfs_assert_ilocked(
291         struct xfs_inode        *ip,
292         uint                    lock_flags)
293 {
294         /*
295          * Sometimes we assert the ILOCK is held exclusively, but we're in
296          * a workqueue, so lockdep doesn't know we're the owner.
297          */
298         if (lock_flags & XFS_ILOCK_SHARED)
299                 rwsem_assert_held(&ip->i_lock);
300         else if (lock_flags & XFS_ILOCK_EXCL)
301                 rwsem_assert_held_write_nolockdep(&ip->i_lock);
302 
303         if (lock_flags & XFS_MMAPLOCK_SHARED)
304                 rwsem_assert_held(&VFS_I(ip)->i_mapping->invalidate_lock);
305         else if (lock_flags & XFS_MMAPLOCK_EXCL)
306                 rwsem_assert_held_write(&VFS_I(ip)->i_mapping->invalidate_lock);
307 
308         if (lock_flags & XFS_IOLOCK_SHARED)
309                 rwsem_assert_held(&VFS_I(ip)->i_rwsem);
310         else if (lock_flags & XFS_IOLOCK_EXCL)
311                 rwsem_assert_held_write(&VFS_I(ip)->i_rwsem);
312 }
313 
314 /*
315  * xfs_lockdep_subclass_ok() is only used in an ASSERT, so is only called when
316  * DEBUG or XFS_WARN is set. And MAX_LOCKDEP_SUBCLASSES is then only defined
317  * when CONFIG_LOCKDEP is set. Hence the complex define below to avoid build
318  * errors and warnings.
319  */
320 #if (defined(DEBUG) || defined(XFS_WARN)) && defined(CONFIG_LOCKDEP)
321 static bool
322 xfs_lockdep_subclass_ok(
323         int subclass)
324 {
325         return subclass < MAX_LOCKDEP_SUBCLASSES;
326 }
327 #else
328 #define xfs_lockdep_subclass_ok(subclass)       (true)
329 #endif
330 
331 /*
332  * Bump the subclass so xfs_lock_inodes() acquires each lock with a different
333  * value. This can be called for any type of inode lock combination, including
334  * parent locking. Care must be taken to ensure we don't overrun the subclass
335  * storage fields in the class mask we build.
336  */
337 static inline uint
338 xfs_lock_inumorder(
339         uint    lock_mode,
340         uint    subclass)
341 {
342         uint    class = 0;
343 
344         ASSERT(!(lock_mode & (XFS_ILOCK_PARENT | XFS_ILOCK_RTBITMAP |
345                               XFS_ILOCK_RTSUM)));
346         ASSERT(xfs_lockdep_subclass_ok(subclass));
347 
348         if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) {
349                 ASSERT(subclass <= XFS_IOLOCK_MAX_SUBCLASS);
350                 class += subclass << XFS_IOLOCK_SHIFT;
351         }
352 
353         if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) {
354                 ASSERT(subclass <= XFS_MMAPLOCK_MAX_SUBCLASS);
355                 class += subclass << XFS_MMAPLOCK_SHIFT;
356         }
357 
358         if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) {
359                 ASSERT(subclass <= XFS_ILOCK_MAX_SUBCLASS);
360                 class += subclass << XFS_ILOCK_SHIFT;
361         }
362 
363         return (lock_mode & ~XFS_LOCK_SUBCLASS_MASK) | class;
364 }
365 
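/*
 * Illustrative worked example (not part of xfs_inode.c): locking the second
 * inode of a set exclusively would use
 *
 *      xfs_ilock(ips[1], xfs_lock_inumorder(XFS_ILOCK_EXCL, 1));
 *
 * which, per the code above, is XFS_ILOCK_EXCL with subclass 1 folded into the
 * XFS_ILOCK_SHIFT bits so lockdep sees a distinct class for each inode.
 */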
366 /*
367  * The following routine will lock n inodes in exclusive mode.  We assume the
368  * caller calls us with the inodes in i_ino order.
369  *
370  * We need to detect deadlock where an inode that we lock is in the AIL and we
371  * start waiting for another inode that is locked by a thread in a long running
372  * transaction (such as truncate). This can result in deadlock since the long
373  * running trans might need to wait for the inode we just locked in order to
374  * push the tail and free space in the log.
375  *
376  * xfs_lock_inodes() can only be used to lock one type of lock at a time -
377  * the iolock, the mmaplock or the ilock, but never more than one type. If we
378  * lock more than one at a time, lockdep will report false positives saying we
379  * have violated locking orders.
380  */
381 void
382 xfs_lock_inodes(
383         struct xfs_inode        **ips,
384         int                     inodes,
385         uint                    lock_mode)
386 {
387         int                     attempts = 0;
388         uint                    i;
389         int                     j;
390         bool                    try_lock;
391         struct xfs_log_item     *lp;
392 
393         /*
394          * Currently supports between 2 and 5 inodes with exclusive locking.  We
395          * support an arbitrary depth of locking here, but absolute limits on
396          * inodes depend on the type of locking and the limits placed by
397          * lockdep annotations in xfs_lock_inumorder.  These are all checked by
398          * the asserts.
399          */
400         ASSERT(ips && inodes >= 2 && inodes <= 5);
401         ASSERT(lock_mode & (XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL |
402                             XFS_ILOCK_EXCL));
403         ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED | XFS_MMAPLOCK_SHARED |
404                               XFS_ILOCK_SHARED)));
405         ASSERT(!(lock_mode & XFS_MMAPLOCK_EXCL) ||
406                 inodes <= XFS_MMAPLOCK_MAX_SUBCLASS + 1);
407         ASSERT(!(lock_mode & XFS_ILOCK_EXCL) ||
408                 inodes <= XFS_ILOCK_MAX_SUBCLASS + 1);
409 
410         if (lock_mode & XFS_IOLOCK_EXCL) {
411                 ASSERT(!(lock_mode & (XFS_MMAPLOCK_EXCL | XFS_ILOCK_EXCL)));
412         } else if (lock_mode & XFS_MMAPLOCK_EXCL)
413                 ASSERT(!(lock_mode & XFS_ILOCK_EXCL));
414 
415 again:
416         try_lock = false;
417         i = 0;
418         for (; i < inodes; i++) {
419                 ASSERT(ips[i]);
420 
421                 if (i && (ips[i] == ips[i - 1]))        /* Already locked */
422                         continue;
423 
424                 /*
425                  * If try_lock is not set yet, make sure all locked inodes are
426                  * not in the AIL.  If any are, set try_lock to be used later.
427                  */
428                 if (!try_lock) {
429                         for (j = (i - 1); j >= 0 && !try_lock; j--) {
430                                 lp = &ips[j]->i_itemp->ili_item;
431                                 if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags))
432                                         try_lock = true;
433                         }
434                 }
435 
436                 /*
437                  * If any of the previous locks we have locked is in the AIL,
438                  * we must TRY to get the second and subsequent locks. If
439                  * we can't get any, we must release all we have
440                  * and try again.
441                  */
442                 if (!try_lock) {
443                         xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
444                         continue;
445                 }
446 
447                 /* try_lock means we have an inode locked that is in the AIL. */
448                 ASSERT(i != 0);
449                 if (xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i)))
450                         continue;
451 
452                 /*
453                  * Unlock all previous guys and try again.  xfs_iunlock will try
454                  * to push the tail if the inode is in the AIL.
455                  */
456                 attempts++;
457                 for (j = i - 1; j >= 0; j--) {
458                         /*
459                          * Check to see if we've already unlocked this one.  Not
460                          * the first one going back, and the inode ptr is the
461                          * same.
462                          */
463                         if (j != (i - 1) && ips[j] == ips[j + 1])
464                                 continue;
465 
466                         xfs_iunlock(ips[j], lock_mode);
467                 }
468 
469                 if ((attempts % 5) == 0) {
470                         delay(1); /* Don't just spin the CPU */
471                 }
472                 goto again;
473         }
474 }
475 
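/*
 * Illustrative sketch (hypothetical caller, not part of xfs_inode.c): callers
 * sort the inodes by i_ino first, then lock them all with one exclusive lock
 * type, much as the rename code does with its set of involved inodes.  This
 * sketch assumes the two inodes are distinct.
 */
static void example_lock_inode_pair(struct xfs_inode *ip_a,
                                    struct xfs_inode *ip_b)
{
        struct xfs_inode        *ips[2] = { ip_a, ip_b };

        if (ips[0]->i_ino > ips[1]->i_ino)
                swap(ips[0], ips[1]);

        xfs_lock_inodes(ips, 2, XFS_ILOCK_EXCL);

        /* ... update both inodes under one transaction ... */

        xfs_iunlock(ips[0], XFS_ILOCK_EXCL);
        xfs_iunlock(ips[1], XFS_ILOCK_EXCL);
}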
476 /*
477  * xfs_lock_two_inodes() can only be used to lock ilock. The iolock and
478  * mmaplock must be double-locked separately since we use i_rwsem and
479  * invalidate_lock for that. We now support taking one lock EXCL and the
480  * other SHARED.
481  */
482 void
483 xfs_lock_two_inodes(
484         struct xfs_inode        *ip0,
485         uint                    ip0_mode,
486         struct xfs_inode        *ip1,
487         uint                    ip1_mode)
488 {
489         int                     attempts = 0;
490         struct xfs_log_item     *lp;
491 
492         ASSERT(hweight32(ip0_mode) == 1);
493         ASSERT(hweight32(ip1_mode) == 1);
494         ASSERT(!(ip0_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)));
495         ASSERT(!(ip1_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)));
496         ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)));
497         ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)));
498         ASSERT(ip0->i_ino != ip1->i_ino);
499 
500         if (ip0->i_ino > ip1->i_ino) {
501                 swap(ip0, ip1);
502                 swap(ip0_mode, ip1_mode);
503         }
504 
505  again:
506         xfs_ilock(ip0, xfs_lock_inumorder(ip0_mode, 0));
507 
508         /*
509          * If the first lock we have locked is in the AIL, we must TRY to get
510          * the second lock. If we can't get it, we must release the first one
511          * and try again.
512          */
513         lp = &ip0->i_itemp->ili_item;
514         if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags)) {
515                 if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(ip1_mode, 1))) {
516                         xfs_iunlock(ip0, ip0_mode);
517                         if ((++attempts % 5) == 0)
518                                 delay(1); /* Don't just spin the CPU */
519                         goto again;
520                 }
521         } else {
522                 xfs_ilock(ip1, xfs_lock_inumorder(ip1_mode, 1));
523         }
524 }
525 
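/*
 * Illustrative sketch (hypothetical caller, not part of xfs_inode.c):
 * xfs_lock_two_inodes() orders the pair by inode number itself, so the caller
 * only specifies which ILOCK mode each inode needs.
 */
static void example_lock_two(struct xfs_inode *ip0, struct xfs_inode *ip1)
{
        xfs_lock_two_inodes(ip0, XFS_ILOCK_EXCL, ip1, XFS_ILOCK_SHARED);

        /* ... e.g. compare or copy metadata between the two inodes ... */

        xfs_iunlock(ip1, XFS_ILOCK_SHARED);
        xfs_iunlock(ip0, XFS_ILOCK_EXCL);
}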
526 /*
527  * Looks up an inode from "name". If ci_name is not NULL, then a CI match
528  * is allowed, otherwise it has to be an exact match. If a CI match is found,
529  * ci_name->name will point to the actual name (caller must free) or
530  * will be set to NULL if an exact match is found.
531  */
532 int
533 xfs_lookup(
534         struct xfs_inode        *dp,
535         const struct xfs_name   *name,
536         struct xfs_inode        **ipp,
537         struct xfs_name         *ci_name)
538 {
539         xfs_ino_t               inum;
540         int                     error;
541 
542         trace_xfs_lookup(dp, name);
543 
544         if (xfs_is_shutdown(dp->i_mount))
545                 return -EIO;
546         if (xfs_ifork_zapped(dp, XFS_DATA_FORK))
547                 return -EIO;
548 
549         error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
550         if (error)
551                 goto out_unlock;
552 
553         error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
554         if (error)
555                 goto out_free_name;
556 
557         return 0;
558 
559 out_free_name:
560         if (ci_name)
561                 kfree(ci_name->name);
562 out_unlock:
563         *ipp = NULL;
564         return error;
565 }
566 
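/*
 * Illustrative sketch (hypothetical caller, not part of xfs_inode.c): when a
 * case-insensitive match is requested, the caller owns ci_name.name on
 * success and must free it; it is NULL when the match was exact.
 */
static int example_ci_lookup(struct xfs_inode *dp, const struct xfs_name *name,
                             struct xfs_inode **ipp)
{
        struct xfs_name ci_name = { };
        int             error;

        error = xfs_lookup(dp, name, ipp, &ci_name);
        if (error)
                return error;

        /* ... use ci_name.name (the on-disk spelling) if it is non-NULL ... */

        kfree(ci_name.name);
        return 0;
}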
567 /*
568  * Initialise a newly allocated inode and return the in-core inode to the
569  * caller locked exclusively.
570  *
571  * Caller is responsible for unlocking the inode manually upon return
572  */
573 int
574 xfs_icreate(
575         struct xfs_trans        *tp,
576         xfs_ino_t               ino,
577         const struct xfs_icreate_args *args,
578         struct xfs_inode        **ipp)
579 {
580         struct xfs_mount        *mp = tp->t_mountp;
581         struct xfs_inode        *ip = NULL;
582         int                     error;
583 
584         /*
585          * Get the in-core inode with the lock held exclusively to prevent
586          * others from looking at it until we're done.
587          */
588         error = xfs_iget(mp, tp, ino, XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip);
589         if (error)
590                 return error;
591 
592         ASSERT(ip != NULL);
593         xfs_trans_ijoin(tp, ip, 0);
594         xfs_inode_init(tp, args, ip);
595 
596         /* now that we have an i_mode we can setup the inode structure */
597         xfs_setup_inode(ip);
598 
599         *ipp = ip;
600         return 0;
601 }
602 
603 /* Return dquots for the ids that will be assigned to a new file. */
604 int
605 xfs_icreate_dqalloc(
606         const struct xfs_icreate_args   *args,
607         struct xfs_dquot                **udqpp,
608         struct xfs_dquot                **gdqpp,
609         struct xfs_dquot                **pdqpp)
610 {
611         struct inode                    *dir = VFS_I(args->pip);
612         kuid_t                          uid = GLOBAL_ROOT_UID;
613         kgid_t                          gid = GLOBAL_ROOT_GID;
614         prid_t                          prid = 0;
615         unsigned int                    flags = XFS_QMOPT_QUOTALL;
616 
617         if (args->idmap) {
618                 /*
619                  * The uid/gid computation code must match what the VFS uses to
620                  * assign i_[ug]id.  INHERIT adjusts the gid computation for
621                  * setgid/grpid systems.
622                  */
623                 uid = mapped_fsuid(args->idmap, i_user_ns(dir));
624                 gid = mapped_fsgid(args->idmap, i_user_ns(dir));
625                 prid = xfs_get_initial_prid(args->pip);
626                 flags |= XFS_QMOPT_INHERIT;
627         }
628 
629         *udqpp = *gdqpp = *pdqpp = NULL;
630 
631         return xfs_qm_vop_dqalloc(args->pip, uid, gid, prid, flags, udqpp,
632                         gdqpp, pdqpp);
633 }
634 
635 int
636 xfs_create(
637         const struct xfs_icreate_args *args,
638         struct xfs_name         *name,
639         struct xfs_inode        **ipp)
640 {
641         struct xfs_inode        *dp = args->pip;
642         struct xfs_dir_update   du = {
643                 .dp             = dp,
644                 .name           = name,
645         };
646         struct xfs_mount        *mp = dp->i_mount;
647         struct xfs_trans        *tp = NULL;
648         struct xfs_dquot        *udqp;
649         struct xfs_dquot        *gdqp;
650         struct xfs_dquot        *pdqp;
651         struct xfs_trans_res    *tres;
652         xfs_ino_t               ino;
653         bool                    unlock_dp_on_error = false;
654         bool                    is_dir = S_ISDIR(args->mode);
655         uint                    resblks;
656         int                     error;
657 
658         trace_xfs_create(dp, name);
659 
660         if (xfs_is_shutdown(mp))
661                 return -EIO;
662         if (xfs_ifork_zapped(dp, XFS_DATA_FORK))
663                 return -EIO;
664 
665         /* Make sure that we have allocated dquot(s) on disk. */
666         error = xfs_icreate_dqalloc(args, &udqp, &gdqp, &pdqp);
667         if (error)
668                 return error;
669 
670         if (is_dir) {
671                 resblks = xfs_mkdir_space_res(mp, name->len);
672                 tres = &M_RES(mp)->tr_mkdir;
673         } else {
674                 resblks = xfs_create_space_res(mp, name->len);
675                 tres = &M_RES(mp)->tr_create;
676         }
677 
678         error = xfs_parent_start(mp, &du.ppargs);
679         if (error)
680                 goto out_release_dquots;
681 
682         /*
683          * Initially assume that the file does not exist and
684          * reserve the resources for that case.  If that is not
685          * the case we'll drop the one we have and get a more
686          * appropriate transaction later.
687          */
688         error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks,
689                         &tp);
690         if (error == -ENOSPC) {
691                 /* flush outstanding delalloc blocks and retry */
692                 xfs_flush_inodes(mp);
693                 error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp,
694                                 resblks, &tp);
695         }
696         if (error)
697                 goto out_parent;
698 
699         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
700         unlock_dp_on_error = true;
701 
702         /*
703          * A newly created regular or special file just has one directory
704  * entry pointing to it, but a directory also has the "." entry
705          * pointing to itself.
706          */
707         error = xfs_dialloc(&tp, dp->i_ino, args->mode, &ino);
708         if (!error)
709                 error = xfs_icreate(tp, ino, args, &du.ip);
710         if (error)
711                 goto out_trans_cancel;
712 
713         /*
714          * Now we join the directory inode to the transaction.  We do not do it
715          * earlier because xfs_dialloc might commit the previous transaction
716          * (and release all the locks).  An error from here on will result in
717          * the transaction cancel unlocking dp so don't do it explicitly in the
718          * error path.
719          */
720         xfs_trans_ijoin(tp, dp, 0);
721 
722         error = xfs_dir_create_child(tp, resblks, &du);
723         if (error)
724                 goto out_trans_cancel;
725 
726         /*
727          * If this is a synchronous mount, make sure that the
728          * create transaction goes to disk before returning to
729          * the user.
730          */
731         if (xfs_has_wsync(mp) || xfs_has_dirsync(mp))
732                 xfs_trans_set_sync(tp);
733 
734         /*
735          * Attach the dquot(s) to the inodes and modify them incore.
736  * The ids of the inode couldn't have changed since the new
737          * inode has been locked ever since it was created.
738          */
739         xfs_qm_vop_create_dqattach(tp, du.ip, udqp, gdqp, pdqp);
740 
741         error = xfs_trans_commit(tp);
742         if (error)
743                 goto out_release_inode;
744 
745         xfs_qm_dqrele(udqp);
746         xfs_qm_dqrele(gdqp);
747         xfs_qm_dqrele(pdqp);
748 
749         *ipp = du.ip;
750         xfs_iunlock(du.ip, XFS_ILOCK_EXCL);
751         xfs_iunlock(dp, XFS_ILOCK_EXCL);
752         xfs_parent_finish(mp, du.ppargs);
753         return 0;
754 
755  out_trans_cancel:
756         xfs_trans_cancel(tp);
757  out_release_inode:
758         /*
759          * Wait until after the current transaction is aborted to finish the
760          * setup of the inode and release the inode.  This prevents recursive
761          * transactions and deadlocks from xfs_inactive.
762          */
763         if (du.ip) {
764                 xfs_iunlock(du.ip, XFS_ILOCK_EXCL);
765                 xfs_finish_inode_setup(du.ip);
766                 xfs_irele(du.ip);
767         }
768  out_parent:
769         xfs_parent_finish(mp, du.ppargs);
770  out_release_dquots:
771         xfs_qm_dqrele(udqp);
772         xfs_qm_dqrele(gdqp);
773         xfs_qm_dqrele(pdqp);
774 
775         if (unlock_dp_on_error)
776                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
777         return error;
778 }
779 
780 int
781 xfs_create_tmpfile(
782         const struct xfs_icreate_args *args,
783         struct xfs_inode        **ipp)
784 {
785         struct xfs_inode        *dp = args->pip;
786         struct xfs_mount        *mp = dp->i_mount;
787         struct xfs_inode        *ip = NULL;
788         struct xfs_trans        *tp = NULL;
789         struct xfs_dquot        *udqp;
790         struct xfs_dquot        *gdqp;
791         struct xfs_dquot        *pdqp;
792         struct xfs_trans_res    *tres;
793         xfs_ino_t               ino;
794         uint                    resblks;
795         int                     error;
796 
797         ASSERT(args->flags & XFS_ICREATE_TMPFILE);
798 
799         if (xfs_is_shutdown(mp))
800                 return -EIO;
801 
802         /* Make sure that we have allocated dquot(s) on disk. */
803         error = xfs_icreate_dqalloc(args, &udqp, &gdqp, &pdqp);
804         if (error)
805                 return error;
806 
807         resblks = XFS_IALLOC_SPACE_RES(mp);
808         tres = &M_RES(mp)->tr_create_tmpfile;
809 
810         error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks,
811                         &tp);
812         if (error)
813                 goto out_release_dquots;
814 
815         error = xfs_dialloc(&tp, dp->i_ino, args->mode, &ino);
816         if (!error)
817                 error = xfs_icreate(tp, ino, args, &ip);
818         if (error)
819                 goto out_trans_cancel;
820 
821         if (xfs_has_wsync(mp))
822                 xfs_trans_set_sync(tp);
823 
824         /*
825          * Attach the dquot(s) to the inodes and modify them incore.
826  * The ids of the inode couldn't have changed since the new
827          * inode has been locked ever since it was created.
828          */
829         xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
830 
831         error = xfs_iunlink(tp, ip);
832         if (error)
833                 goto out_trans_cancel;
834 
835         error = xfs_trans_commit(tp);
836         if (error)
837                 goto out_release_inode;
838 
839         xfs_qm_dqrele(udqp);
840         xfs_qm_dqrele(gdqp);
841         xfs_qm_dqrele(pdqp);
842 
843         *ipp = ip;
844         xfs_iunlock(ip, XFS_ILOCK_EXCL);
845         return 0;
846 
847  out_trans_cancel:
848         xfs_trans_cancel(tp);
849  out_release_inode:
850         /*
851          * Wait until after the current transaction is aborted to finish the
852          * setup of the inode and release the inode.  This prevents recursive
853          * transactions and deadlocks from xfs_inactive.
854          */
855         if (ip) {
856                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
857                 xfs_finish_inode_setup(ip);
858                 xfs_irele(ip);
859         }
860  out_release_dquots:
861         xfs_qm_dqrele(udqp);
862         xfs_qm_dqrele(gdqp);
863         xfs_qm_dqrele(pdqp);
864 
865         return error;
866 }
867 
868 int
869 xfs_link(
870         struct xfs_inode        *tdp,
871         struct xfs_inode        *sip,
872         struct xfs_name         *target_name)
873 {
874         struct xfs_dir_update   du = {
875                 .dp             = tdp,
876                 .name           = target_name,
877                 .ip             = sip,
878         };
879         struct xfs_mount        *mp = tdp->i_mount;
880         struct xfs_trans        *tp;
881         int                     error, nospace_error = 0;
882         int                     resblks;
883 
884         trace_xfs_link(tdp, target_name);
885 
886         ASSERT(!S_ISDIR(VFS_I(sip)->i_mode));
887 
888         if (xfs_is_shutdown(mp))
889                 return -EIO;
890         if (xfs_ifork_zapped(tdp, XFS_DATA_FORK))
891                 return -EIO;
892 
893         error = xfs_qm_dqattach(sip);
894         if (error)
895                 goto std_return;
896 
897         error = xfs_qm_dqattach(tdp);
898         if (error)
899                 goto std_return;
900 
901         error = xfs_parent_start(mp, &du.ppargs);
902         if (error)
903                 goto std_return;
904 
905         resblks = xfs_link_space_res(mp, target_name->len);
906         error = xfs_trans_alloc_dir(tdp, &M_RES(mp)->tr_link, sip, &resblks,
907                         &tp, &nospace_error);
908         if (error)
909                 goto out_parent;
910 
911         /*
912          * We don't allow reservationless or quotaless hardlinking when parent
913          * pointers are enabled because we can't back out if the xattrs must
914          * grow.
915          */
916         if (du.ppargs && nospace_error) {
917                 error = nospace_error;
918                 goto error_return;
919         }
920 
921         /*
922          * If we are using project inheritance, we only allow hard link
923          * creation in our tree when the project IDs are the same; else
924          * the tree quota mechanism could be circumvented.
925          */
926         if (unlikely((tdp->i_diflags & XFS_DIFLAG_PROJINHERIT) &&
927                      tdp->i_projid != sip->i_projid)) {
928                 /*
929                  * Project quota setup skips special files which can
930                  * leave inodes in a PROJINHERIT directory without a
931                  * project ID set. We need to allow links to be made
932                  * to these "project-less" inodes because userspace
933                  * expects them to succeed after project ID setup,
934                  * but everything else should be rejected.
935                  */
936                 if (!special_file(VFS_I(sip)->i_mode) ||
937                     sip->i_projid != 0) {
938                         error = -EXDEV;
939                         goto error_return;
940                 }
941         }
942 
943         error = xfs_dir_add_child(tp, resblks, &du);
944         if (error)
945                 goto error_return;
946 
947         /*
948          * If this is a synchronous mount, make sure that the
949          * link transaction goes to disk before returning to
950          * the user.
951          */
952         if (xfs_has_wsync(mp) || xfs_has_dirsync(mp))
953                 xfs_trans_set_sync(tp);
954 
955         error = xfs_trans_commit(tp);
956         xfs_iunlock(tdp, XFS_ILOCK_EXCL);
957         xfs_iunlock(sip, XFS_ILOCK_EXCL);
958         xfs_parent_finish(mp, du.ppargs);
959         return error;
960 
961  error_return:
962         xfs_trans_cancel(tp);
963         xfs_iunlock(tdp, XFS_ILOCK_EXCL);
964         xfs_iunlock(sip, XFS_ILOCK_EXCL);
965  out_parent:
966         xfs_parent_finish(mp, du.ppargs);
967  std_return:
968         if (error == -ENOSPC && nospace_error)
969                 error = nospace_error;
970         return error;
971 }
972 
973 /* Clear the reflink flag and the cowblocks tag if possible. */
974 static void
975 xfs_itruncate_clear_reflink_flags(
976         struct xfs_inode        *ip)
977 {
978         struct xfs_ifork        *dfork;
979         struct xfs_ifork        *cfork;
980 
981         if (!xfs_is_reflink_inode(ip))
982                 return;
983         dfork = xfs_ifork_ptr(ip, XFS_DATA_FORK);
984         cfork = xfs_ifork_ptr(ip, XFS_COW_FORK);
985         if (dfork->if_bytes == 0 && cfork->if_bytes == 0)
986                 ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
987         if (cfork->if_bytes == 0)
988                 xfs_inode_clear_cowblocks_tag(ip);
989 }
990 
991 /*
992  * Free up the underlying blocks past new_size.  The new size must be smaller
993  * than the current size.  This routine can be used both for the attribute and
994  * data fork, and does not modify the inode size, which is left to the caller.
995  *
996  * The transaction passed to this routine must have made a permanent log
997  * reservation of at least XFS_ITRUNCATE_LOG_RES.  This routine may commit the
998  * given transaction and start new ones, so make sure everything involved in
999  * the transaction is tidy before calling here.  Some transaction will be
1000  * returned to the caller to be committed.  The incoming transaction must
1001  * already include the inode, and both inode locks must be held exclusively.
1002  * The inode must also be "held" within the transaction.  On return the inode
1003  * will be "held" within the returned transaction.  This routine does NOT
1004  * require any disk space to be reserved for it within the transaction.
1005  *
1006  * If we get an error, we must return with the inode locked and linked into the
1007  * current transaction. This keeps things simple for the higher level code,
1008  * because it always knows that the inode is locked and held in the transaction
1009  * that returns to it whether errors occur or not.  We don't mark the inode
1010  * dirty on error so that transactions can be easily aborted if possible.
1011  */
1012 int
1013 xfs_itruncate_extents_flags(
1014         struct xfs_trans        **tpp,
1015         struct xfs_inode        *ip,
1016         int                     whichfork,
1017         xfs_fsize_t             new_size,
1018         int                     flags)
1019 {
1020         struct xfs_mount        *mp = ip->i_mount;
1021         struct xfs_trans        *tp = *tpp;
1022         xfs_fileoff_t           first_unmap_block;
1023         int                     error = 0;
1024 
1025         xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
1026         if (atomic_read(&VFS_I(ip)->i_count))
1027                 xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL);
1028         ASSERT(new_size <= XFS_ISIZE(ip));
1029         ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
1030         ASSERT(ip->i_itemp != NULL);
1031         ASSERT(ip->i_itemp->ili_lock_flags == 0);
1032         ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
1033 
1034         trace_xfs_itruncate_extents_start(ip, new_size);
1035 
1036         flags |= xfs_bmapi_aflag(whichfork);
1037 
1038         /*
1039          * Since it is possible for space to become allocated beyond
1040          * the end of the file (in a crash where the space is allocated
1041          * but the inode size is not yet updated), simply remove any
1042          * blocks which show up between the new EOF and the maximum
1043          * possible file size.
1044          *
1045          * We have to free all the blocks to the bmbt maximum offset, even if
1046          * the page cache can't scale that far.
1047          */
1048         first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
1049         if (!xfs_verify_fileoff(mp, first_unmap_block)) {
1050                 WARN_ON_ONCE(first_unmap_block > XFS_MAX_FILEOFF);
1051                 return 0;
1052         }
1053 
1054         error = xfs_bunmapi_range(&tp, ip, flags, first_unmap_block,
1055                         XFS_MAX_FILEOFF);
1056         if (error)
1057                 goto out;
1058 
1059         if (whichfork == XFS_DATA_FORK) {
1060                 /* Remove all pending CoW reservations. */
1061                 error = xfs_reflink_cancel_cow_blocks(ip, &tp,
1062                                 first_unmap_block, XFS_MAX_FILEOFF, true);
1063                 if (error)
1064                         goto out;
1065 
1066                 xfs_itruncate_clear_reflink_flags(ip);
1067         }
1068 
1069         /*
1070          * Always re-log the inode so that our permanent transaction can keep
1071          * on rolling it forward in the log.
1072          */
1073         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1074 
1075         trace_xfs_itruncate_extents_end(ip, new_size);
1076 
1077 out:
1078         *tpp = tp;
1079         return error;
1080 }
1081 
1082 int
1083 xfs_release(
1084         xfs_inode_t     *ip)
1085 {
1086         xfs_mount_t     *mp = ip->i_mount;
1087         int             error = 0;
1088 
1089         if (!S_ISREG(VFS_I(ip)->i_mode) || (VFS_I(ip)->i_mode == 0))
1090                 return 0;
1091 
1092         /* If this is a read-only mount, don't do this (would generate I/O) */
1093         if (xfs_is_readonly(mp))
1094                 return 0;
1095 
1096         if (!xfs_is_shutdown(mp)) {
1097                 int truncated;
1098 
1099                 /*
1100                  * If we previously truncated this file and removed old data
1101                  * in the process, we want to initiate "early" writeout on
1102                  * the last close.  This is an attempt to combat the notorious
1103                  * NULL files problem which is particularly noticeable from a
1104                  * truncate down, buffered (re-)write (delalloc), followed by
1105                  * a crash.  What we are effectively doing here is
1106                  * significantly reducing the time window where we'd otherwise
1107                  * be exposed to that problem.
1108                  */
1109                 truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
1110                 if (truncated) {
1111                         xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
1112                         if (ip->i_delayed_blks > 0) {
1113                                 error = filemap_flush(VFS_I(ip)->i_mapping);
1114                                 if (error)
1115                                         return error;
1116                         }
1117                 }
1118         }
1119 
1120         if (VFS_I(ip)->i_nlink == 0)
1121                 return 0;
1122 
1123         /*
1124          * If we can't get the iolock just skip truncating the blocks past EOF
1125          * because we could deadlock with the mmap_lock otherwise. We'll get
1126          * another chance to drop them once the last reference to the inode is
1127          * dropped, so we'll never leak blocks permanently.
1128          */
1129         if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL))
1130                 return 0;
1131 
1132         if (xfs_can_free_eofblocks(ip)) {
1133                 /*
1134                  * If the inode is being opened, written and closed
1135                  * frequently and we have delayed allocation blocks outstanding
1136                  * (e.g. streaming writes from the NFS server), truncating the
1137                  * blocks past EOF will cause fragmentation to occur.
1138                  *
1139                  * In this case don't do the truncation, but we have to be
1140                  * careful how we detect this case. Blocks beyond EOF show up as
1141                  * i_delayed_blks even when the inode is clean, so we need to
1142                  * truncate them away first before checking for a dirty release.
1143                  * Hence on the first dirty close we will still remove the
1144                  * speculative allocation, but after that we will leave it in
1145                  * place.
1146                  */
1147                 if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
1148                         goto out_unlock;
1149 
1150                 error = xfs_free_eofblocks(ip);
1151                 if (error)
1152                         goto out_unlock;
1153 
1154                 /* delalloc blocks after truncation means it really is dirty */
1155                 if (ip->i_delayed_blks)
1156                         xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
1157         }
1158 
1159 out_unlock:
1160         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1161         return error;
1162 }
1163 
1164 /*
1165  * Mark all the buffers attached to this directory stale.  In theory we should
1166  * never be freeing a directory with any blocks at all, but this covers the
1167  * case where we've recovered a directory swap with a "temporary" directory
1168  * created by online repair and now need to dump it.
1169  */
1170 STATIC void
1171 xfs_inactive_dir(
1172         struct xfs_inode        *dp)
1173 {
1174         struct xfs_iext_cursor  icur;
1175         struct xfs_bmbt_irec    got;
1176         struct xfs_mount        *mp = dp->i_mount;
1177         struct xfs_da_geometry  *geo = mp->m_dir_geo;
1178         struct xfs_ifork        *ifp = xfs_ifork_ptr(dp, XFS_DATA_FORK);
1179         xfs_fileoff_t           off;
1180 
1181         /*
1182          * Invalidate each directory block.  All directory blocks are of
1183          * fsbcount length and alignment, so we only need to walk those same
1184          * offsets.  We hold the only reference to this inode, so we must wait
1185          * for the buffer locks.
1186          */
1187         for_each_xfs_iext(ifp, &icur, &got) {
1188                 for (off = round_up(got.br_startoff, geo->fsbcount);
1189                      off < got.br_startoff + got.br_blockcount;
1190                      off += geo->fsbcount) {
1191                         struct xfs_buf  *bp = NULL;
1192                         xfs_fsblock_t   fsbno;
1193                         int             error;
1194 
1195                         fsbno = (off - got.br_startoff) + got.br_startblock;
1196                         error = xfs_buf_incore(mp->m_ddev_targp,
1197                                         XFS_FSB_TO_DADDR(mp, fsbno),
1198                                         XFS_FSB_TO_BB(mp, geo->fsbcount),
1199                                         XBF_LIVESCAN, &bp);
1200                         if (error)
1201                                 continue;
1202 
1203                         xfs_buf_stale(bp);
1204                         xfs_buf_relse(bp);
1205                 }
1206         }
1207 }
1208 
1209 /*
1210  * xfs_inactive_truncate
1211  *
1212  * Called to perform a truncate when an inode becomes unlinked.
1213  */
1214 STATIC int
1215 xfs_inactive_truncate(
1216         struct xfs_inode *ip)
1217 {
1218         struct xfs_mount        *mp = ip->i_mount;
1219         struct xfs_trans        *tp;
1220         int                     error;
1221 
1222         error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
1223         if (error) {
1224                 ASSERT(xfs_is_shutdown(mp));
1225                 return error;
1226         }
1227         xfs_ilock(ip, XFS_ILOCK_EXCL);
1228         xfs_trans_ijoin(tp, ip, 0);
1229 
1230         /*
1231          * Log the inode size first to prevent stale data exposure in the event
1232          * of a system crash before the truncate completes. See the related
1233          * comment in xfs_vn_setattr_size() for details.
1234          */
1235         ip->i_disk_size = 0;
1236         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1237 
1238         error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
1239         if (error)
1240                 goto error_trans_cancel;
1241 
1242         ASSERT(ip->i_df.if_nextents == 0);
1243 
1244         error = xfs_trans_commit(tp);
1245         if (error)
1246                 goto error_unlock;
1247 
1248         xfs_iunlock(ip, XFS_ILOCK_EXCL);
1249         return 0;
1250 
1251 error_trans_cancel:
1252         xfs_trans_cancel(tp);
1253 error_unlock:
1254         xfs_iunlock(ip, XFS_ILOCK_EXCL);
1255         return error;
1256 }
1257 
1258 /*
1259  * xfs_inactive_ifree()
1260  *
1261  * Perform the inode free when an inode is unlinked.
1262  */
1263 STATIC int
1264 xfs_inactive_ifree(
1265         struct xfs_inode *ip)
1266 {
1267         struct xfs_mount        *mp = ip->i_mount;
1268         struct xfs_trans        *tp;
1269         int                     error;
1270 
1271         /*
1272          * We try to use a per-AG reservation for any block needed by the finobt
1273          * tree, but as the finobt feature predates the per-AG reservation
1274          * support a degraded file system might not have enough space for the
1275          * reservation at mount time.  In that case try to dip into the reserved
1276          * pool and pray.
1277          *
1278          * Send a warning if the reservation does happen to fail, as the inode
1279          * now remains allocated and sits on the unlinked list until the fs is
1280          * repaired.
1281          */
1282         if (unlikely(mp->m_finobt_nores)) {
1283                 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
1284                                 XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE,
1285                                 &tp);
1286         } else {
1287                 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 0, 0, 0, &tp);
1288         }
1289         if (error) {
1290                 if (error == -ENOSPC) {
1291                         xfs_warn_ratelimited(mp,
1292                         "Failed to remove inode(s) from unlinked list. "
1293                         "Please free space, unmount and run xfs_repair.");
1294                 } else {
1295                         ASSERT(xfs_is_shutdown(mp));
1296                 }
1297                 return error;
1298         }
1299 
1300         /*
1301          * We do not hold the inode locked across the entire rolling transaction
1302          * here. We only need to hold it for the first transaction that
1303          * xfs_ifree() builds, which may mark the inode XFS_ISTALE if the
1304          * underlying cluster buffer is freed. Relogging an XFS_ISTALE inode
1305          * here breaks the relationship between cluster buffer invalidation and
1306          * stale inode invalidation on cluster buffer item journal commit
1307          * completion, and can result in leaving dirty stale inodes hanging
1308          * around in memory.
1309          *
1310          * We have no need for serialising this inode operation against other
1311          * operations - we freed the inode and hence reallocation is required
1312          * and that will serialise on reallocating the space the deferops need
1313          * to free. Hence we can unlock the inode on the first commit of
1314          * the transaction rather than roll it right through the deferops. This
1315          * avoids relogging the XFS_ISTALE inode.
1316          *
1317          * We check that xfs_ifree() hasn't grown an internal transaction roll
1318          * by asserting that the inode is still locked when it returns.
1319          */
1320         xfs_ilock(ip, XFS_ILOCK_EXCL);
1321         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1322 
1323         error = xfs_ifree(tp, ip);
1324         xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
1325         if (error) {
1326                 /*
1327                  * If we fail to free the inode, shut down.  The cancel
1328                  * might do that; we need to make sure.  Otherwise the
1329                  * inode might be lost for a long time or forever.
1330                  */
1331                 if (!xfs_is_shutdown(mp)) {
1332                         xfs_notice(mp, "%s: xfs_ifree returned error %d",
1333                                 __func__, error);
1334                         xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1335                 }
1336                 xfs_trans_cancel(tp);
1337                 return error;
1338         }
1339 
1340         /*
1341          * Credit the quota account(s). The inode is gone.
1342          */
1343         xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
1344 
1345         return xfs_trans_commit(tp);
1346 }
1347 
1348 /*
1349  * Returns true if we need to update the on-disk metadata before we can free
1350  * the memory used by this inode.  Updates include freeing post-eof
1351  * preallocations; freeing COW staging extents; and marking the inode free in
1352  * the inobt if it is on the unlinked list.
1353  */
1354 bool
1355 xfs_inode_needs_inactive(
1356         struct xfs_inode        *ip)
1357 {
1358         struct xfs_mount        *mp = ip->i_mount;
1359         struct xfs_ifork        *cow_ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
1360 
1361         /*
1362          * If the inode is already free, then there can be nothing
1363          * to clean up here.
1364          */
1365         if (VFS_I(ip)->i_mode == 0)
1366                 return false;
1367 
1368         /*
1369          * If this is a read-only mount, don't do this (would generate I/O)
1370          * unless we're in log recovery and cleaning the iunlinked list.
1371          */
1372         if (xfs_is_readonly(mp) && !xlog_recovery_needed(mp->m_log))
1373                 return false;
1374 
1375         /* If the log isn't running, push inodes straight to reclaim. */
1376         if (xfs_is_shutdown(mp) || xfs_has_norecovery(mp))
1377                 return false;
1378 
1379         /* Metadata inodes require explicit resource cleanup. */
1380         if (xfs_is_metadata_inode(ip))
1381                 return false;
1382 
1383         /* Want to clean out the cow blocks if there are any. */
1384         if (cow_ifp && cow_ifp->if_bytes > 0)
1385                 return true;
1386 
1387         /* Unlinked files must be freed. */
1388         if (VFS_I(ip)->i_nlink == 0)
1389                 return true;
1390 
1391         /*
1392          * This file isn't being freed, so check if there are post-eof blocks
1393          * to free.
1394          *
1395          * Note: don't bother with iolock here since lockdep complains about
1396          * acquiring it in reclaim context. We have the only reference to the
1397          * inode at this point anyway.
1398          */
1399         return xfs_can_free_eofblocks(ip);
1400 }
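
/*
 * Editorial sketch, not part of xfs_inode.c: one plausible way a caller could
 * use xfs_inode_needs_inactive() to route an inode whose last reference has
 * been dropped.  The helpers queue_for_inactivation() and queue_for_reclaim()
 * are hypothetical placeholders, not real XFS functions.
 */
static void example_route_unreferenced_inode(struct xfs_inode *ip)
{
	if (xfs_inode_needs_inactive(ip)) {
		/* on-disk updates needed: eofblocks, COW extents, or ifree */
		queue_for_inactivation(ip);		/* hypothetical */
	} else {
		/* nothing to write back; tear down the in-memory inode */
		queue_for_reclaim(ip);			/* hypothetical */
	}
}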
1401 
1402 /*
1403  * Save health status somewhere, if we're dumping an inode with uncorrected
1404  * errors and online repair isn't running.
1405  */
1406 static inline void
1407 xfs_inactive_health(
1408         struct xfs_inode        *ip)
1409 {
1410         struct xfs_mount        *mp = ip->i_mount;
1411         struct xfs_perag        *pag;
1412         unsigned int            sick;
1413         unsigned int            checked;
1414 
1415         xfs_inode_measure_sickness(ip, &sick, &checked);
1416         if (!sick)
1417                 return;
1418 
1419         trace_xfs_inode_unfixed_corruption(ip, sick);
1420 
1421         if (sick & XFS_SICK_INO_FORGET)
1422                 return;
1423 
1424         pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
1425         if (!pag) {
1426                 /* There had better still be a perag structure! */
1427                 ASSERT(0);
1428                 return;
1429         }
1430 
1431         xfs_ag_mark_sick(pag, XFS_SICK_AG_INODES);
1432         xfs_perag_put(pag);
1433 }
1434 
1435 /*
1436  * xfs_inactive
1437  *
1438  * This is called when the reference count for the vnode
1439  * goes to zero.  If the file has been unlinked, then it must
1440  * now be truncated.  Also, we clear all of the read-ahead state
1441  * kept for the inode here since the file is now closed.
1442  */
1443 int
1444 xfs_inactive(
1445         xfs_inode_t     *ip)
1446 {
1447         struct xfs_mount        *mp;
1448         int                     error = 0;
1449         int                     truncate = 0;
1450 
1451         /*
1452          * If the inode is already free, then there can be nothing
1453          * to clean up here.
1454          */
1455         if (VFS_I(ip)->i_mode == 0) {
1456                 ASSERT(ip->i_df.if_broot_bytes == 0);
1457                 goto out;
1458         }
1459 
1460         mp = ip->i_mount;
1461         ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY));
1462 
1463         xfs_inactive_health(ip);
1464 
1465         /*
1466          * If this is a read-only mount, don't do this (would generate I/O)
1467          * unless we're in log recovery and cleaning the iunlinked list.
1468          */
1469         if (xfs_is_readonly(mp) && !xlog_recovery_needed(mp->m_log))
1470                 goto out;
1471 
1472         /* Metadata inodes require explicit resource cleanup. */
1473         if (xfs_is_metadata_inode(ip))
1474                 goto out;
1475 
1476         /* Try to clean out the cow blocks if there are any. */
1477         if (xfs_inode_has_cow_data(ip))
1478                 xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true);
1479 
1480         if (VFS_I(ip)->i_nlink != 0) {
1481                 /*
1482                  * Note: don't bother with iolock here since lockdep complains
1483                  * about acquiring it in reclaim context. We have the only
1484                  * reference to the inode at this point anyway.
1485                  */
1486                 if (xfs_can_free_eofblocks(ip))
1487                         error = xfs_free_eofblocks(ip);
1488 
1489                 goto out;
1490         }
1491 
1492         if (S_ISREG(VFS_I(ip)->i_mode) &&
1493             (ip->i_disk_size != 0 || XFS_ISIZE(ip) != 0 ||
1494              ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0))
1495                 truncate = 1;
1496 
1497         if (xfs_iflags_test(ip, XFS_IQUOTAUNCHECKED)) {
1498                 /*
1499                  * If this inode is being inactivated during a quotacheck and
1500                  * has not yet been scanned by quotacheck, we /must/ remove
1501                  * the dquots from the inode before inactivation changes the
1502                  * block and inode counts.  Most probably this is a result of
1503                  * reloading the incore iunlinked list to purge unrecovered
1504                  * unlinked inodes.
1505                  */
1506                 xfs_qm_dqdetach(ip);
1507         } else {
1508                 error = xfs_qm_dqattach(ip);
1509                 if (error)
1510                         goto out;
1511         }
1512 
1513         if (S_ISDIR(VFS_I(ip)->i_mode) && ip->i_df.if_nextents > 0) {
1514                 xfs_inactive_dir(ip);
1515                 truncate = 1;
1516         }
1517 
1518         if (S_ISLNK(VFS_I(ip)->i_mode))
1519                 error = xfs_inactive_symlink(ip);
1520         else if (truncate)
1521                 error = xfs_inactive_truncate(ip);
1522         if (error)
1523                 goto out;
1524 
1525         /*
1526          * If there are attributes associated with the file then blow them away
1527          * now.  The code calls a routine that recursively deconstructs the
1528          * attribute fork. It also blows away the in-core attribute fork.
1529          */
1530         if (xfs_inode_has_attr_fork(ip)) {
1531                 error = xfs_attr_inactive(ip);
1532                 if (error)
1533                         goto out;
1534         }
1535 
1536         ASSERT(ip->i_forkoff == 0);
1537 
1538         /*
1539          * Free the inode.
1540          */
1541         error = xfs_inactive_ifree(ip);
1542 
1543 out:
1544         /*
1545          * We're done making metadata updates for this inode, so we can release
1546          * the attached dquots.
1547          */
1548         xfs_qm_dqdetach(ip);
1549         return error;
1550 }
1551 
1552 /*
1553  * Find an inode on the unlinked list. This does not take references to the
1554  * inode because holding the AGI buffer lock guarantees its existence and because
1555  * only unlinked, referenced inodes can be on the unlinked inode list.  If we
1556  * don't find the inode in cache, then let the caller handle the situation.
1557  */
1558 struct xfs_inode *
1559 xfs_iunlink_lookup(
1560         struct xfs_perag        *pag,
1561         xfs_agino_t             agino)
1562 {
1563         struct xfs_inode        *ip;
1564 
1565         rcu_read_lock();
1566         ip = radix_tree_lookup(&pag->pag_ici_root, agino);
1567         if (!ip) {
1568                 /* Caller can handle inode not being in memory. */
1569                 rcu_read_unlock();
1570                 return NULL;
1571         }
1572 
1573         /*
1574          * Inode in RCU freeing limbo should not happen.  Warn about this and
1575          * let the caller handle the failure.
1576          */
1577         if (WARN_ON_ONCE(!ip->i_ino)) {
1578                 rcu_read_unlock();
1579                 return NULL;
1580         }
1581         ASSERT(!xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM));
1582         rcu_read_unlock();
1583         return ip;
1584 }
1585 
1586 /*
1587  * Load the inode @next_agino into the cache and set its prev_unlinked pointer
1588  * to @prev_agino.  Caller must hold the AGI to synchronize with other changes
1589  * to the unlinked list.
1590  */
1591 int
1592 xfs_iunlink_reload_next(
1593         struct xfs_trans        *tp,
1594         struct xfs_buf          *agibp,
1595         xfs_agino_t             prev_agino,
1596         xfs_agino_t             next_agino)
1597 {
1598         struct xfs_perag        *pag = agibp->b_pag;
1599         struct xfs_mount        *mp = pag->pag_mount;
1600         struct xfs_inode        *next_ip = NULL;
1601         xfs_ino_t               ino;
1602         int                     error;
1603 
1604         ASSERT(next_agino != NULLAGINO);
1605 
1606 #ifdef DEBUG
1607         rcu_read_lock();
1608         next_ip = radix_tree_lookup(&pag->pag_ici_root, next_agino);
1609         ASSERT(next_ip == NULL);
1610         rcu_read_unlock();
1611 #endif
1612 
1613         xfs_info_ratelimited(mp,
1614  "Found unrecovered unlinked inode 0x%x in AG 0x%x.  Initiating recovery.",
1615                         next_agino, pag->pag_agno);
1616 
1617         /*
1618          * Use an untrusted lookup just to be cautious in case the AGI has been
1619          * corrupted and now points at a free inode.  That shouldn't happen,
1620          * but we'd rather shut down now since we're already running in a weird
1621          * situation.
1622          */
1623         ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino);
1624         error = xfs_iget(mp, tp, ino, XFS_IGET_UNTRUSTED, 0, &next_ip);
1625         if (error) {
1626                 xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
1627                 return error;
1628         }
1629 
1630         /* If this is not an unlinked inode, something is very wrong. */
1631         if (VFS_I(next_ip)->i_nlink != 0) {
1632                 xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
1633                 error = -EFSCORRUPTED;
1634                 goto rele;
1635         }
1636 
1637         next_ip->i_prev_unlinked = prev_agino;
1638         trace_xfs_iunlink_reload_next(next_ip);
1639 rele:
1640         ASSERT(!(VFS_I(next_ip)->i_state & I_DONTCACHE));
1641         if (xfs_is_quotacheck_running(mp) && next_ip)
1642                 xfs_iflags_set(next_ip, XFS_IQUOTAUNCHECKED);
1643         xfs_irele(next_ip);
1644         return error;
1645 }
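
/*
 * Editorial sketch, not part of xfs_inode.c: how the two helpers above are
 * typically combined while walking an unlinked list with the AGI buffer held.
 * If the next inode is not in memory it is reloaded (and its i_prev_unlinked
 * back pointer fixed up) before the walk continues.  example_walk_step() is a
 * hypothetical wrapper for illustration only.
 */
static int example_walk_step(struct xfs_trans *tp, struct xfs_buf *agibp,
			     xfs_agino_t prev_agino, xfs_agino_t next_agino)
{
	struct xfs_inode	*ip;

	/* AGI buffer lock held: the inode cannot be freed under us. */
	ip = xfs_iunlink_lookup(agibp->b_pag, next_agino);
	if (ip)
		return 0;

	/* Not cached: pull it in and set its back pointer. */
	return xfs_iunlink_reload_next(tp, agibp, prev_agino, next_agino);
}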
1646 
1647 /*
1648  * Look up the inode number specified and if it is not already marked XFS_ISTALE
1649  * mark it stale. We should only find clean inodes in this lookup that aren't
1650  * already stale.
1651  */
1652 static void
1653 xfs_ifree_mark_inode_stale(
1654         struct xfs_perag        *pag,
1655         struct xfs_inode        *free_ip,
1656         xfs_ino_t               inum)
1657 {
1658         struct xfs_mount        *mp = pag->pag_mount;
1659         struct xfs_inode_log_item *iip;
1660         struct xfs_inode        *ip;
1661 
1662 retry:
1663         rcu_read_lock();
1664         ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum));
1665 
1666         /* Inode not in memory, nothing to do */
1667         if (!ip) {
1668                 rcu_read_unlock();
1669                 return;
1670         }
1671 
1672         /*
1673          * Because this is an RCU-protected lookup, we could find a recently
1674          * freed or even reallocated inode during the lookup. We need to check
1675          * under the i_flags_lock for a valid inode here. Skip it if it is not
1676          * valid, the wrong inode or stale.
1677          */
1678         spin_lock(&ip->i_flags_lock);
1679         if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE))
1680                 goto out_iflags_unlock;
1681 
1682         /*
1683          * Don't try to lock/unlock the current inode, but we _cannot_ skip the
1684          * other inodes that we did not find in the list attached to the buffer
1685          * and are not already marked stale. If we can't lock it, back off and
1686          * retry.
1687          */
1688         if (ip != free_ip) {
1689                 if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
1690                         spin_unlock(&ip->i_flags_lock);
1691                         rcu_read_unlock();
1692                         delay(1);
1693                         goto retry;
1694                 }
1695         }
1696         ip->i_flags |= XFS_ISTALE;
1697 
1698         /*
1699          * If the inode is flushing, it is already attached to the buffer.  All
1700          * we need to do here is mark the inode stale so buffer IO completion
1701          * will remove it from the AIL.
1702          */
1703         iip = ip->i_itemp;
1704         if (__xfs_iflags_test(ip, XFS_IFLUSHING)) {
1705                 ASSERT(!list_empty(&iip->ili_item.li_bio_list));
1706                 ASSERT(iip->ili_last_fields);
1707                 goto out_iunlock;
1708         }
1709 
1710         /*
1711          * Inodes not attached to the buffer can be released immediately.
1712          * Everything else has to go through xfs_iflush_abort() on journal
1713          * commit as the flock synchronises removal of the inode from the
1714          * cluster buffer against inode reclaim.
1715          */
1716         if (!iip || list_empty(&iip->ili_item.li_bio_list))
1717                 goto out_iunlock;
1718 
1719         __xfs_iflags_set(ip, XFS_IFLUSHING);
1720         spin_unlock(&ip->i_flags_lock);
1721         rcu_read_unlock();
1722 
1723         /* we have a dirty inode in memory that has not yet been flushed. */
1724         spin_lock(&iip->ili_lock);
1725         iip->ili_last_fields = iip->ili_fields;
1726         iip->ili_fields = 0;
1727         iip->ili_fsync_fields = 0;
1728         spin_unlock(&iip->ili_lock);
1729         ASSERT(iip->ili_last_fields);
1730 
1731         if (ip != free_ip)
1732                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1733         return;
1734 
1735 out_iunlock:
1736         if (ip != free_ip)
1737                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1738 out_iflags_unlock:
1739         spin_unlock(&ip->i_flags_lock);
1740         rcu_read_unlock();
1741 }
1742 
1743 /*
1744  * A big issue when freeing the inode cluster is that we _cannot_ skip any
1745  * inodes that are in memory - they all must be marked stale and attached to
1746  * the cluster buffer.
1747  */
1748 static int
1749 xfs_ifree_cluster(
1750         struct xfs_trans        *tp,
1751         struct xfs_perag        *pag,
1752         struct xfs_inode        *free_ip,
1753         struct xfs_icluster     *xic)
1754 {
1755         struct xfs_mount        *mp = free_ip->i_mount;
1756         struct xfs_ino_geometry *igeo = M_IGEO(mp);
1757         struct xfs_buf          *bp;
1758         xfs_daddr_t             blkno;
1759         xfs_ino_t               inum = xic->first_ino;
1760         int                     nbufs;
1761         int                     i, j;
1762         int                     ioffset;
1763         int                     error;
1764 
1765         nbufs = igeo->ialloc_blks / igeo->blocks_per_cluster;
1766 
1767         for (j = 0; j < nbufs; j++, inum += igeo->inodes_per_cluster) {
1768                 /*
1769                  * The allocation bitmap tells us which inodes of the chunk were
1770                  * physically allocated. Skip the cluster if an inode falls into
1771                  * a sparse region.
1772                  */
1773                 ioffset = inum - xic->first_ino;
1774                 if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) {
1775                         ASSERT(ioffset % igeo->inodes_per_cluster == 0);
1776                         continue;
1777                 }
1778 
1779                 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
1780                                          XFS_INO_TO_AGBNO(mp, inum));
1781 
1782                 /*
1783                  * We obtain and lock the backing buffer first in the process
1784                  * here to ensure dirty inodes attached to the buffer remain in
1785                  * the flushing state while we mark them stale.
1786                  *
1787                  * If we scan the in-memory inodes first, then buffer IO can
1788                  * complete before we get a lock on it, and hence we may fail
1789                  * to mark all the active inodes on the buffer stale.
1790                  */
1791                 error = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
1792                                 mp->m_bsize * igeo->blocks_per_cluster,
1793                                 XBF_UNMAPPED, &bp);
1794                 if (error)
1795                         return error;
1796 
1797                 /*
1798                  * This buffer may not have been correctly initialised as we
1799                  * didn't read it from disk. That's not important because we are
1800          * only using it to mark the buffer as stale in the log, and to
1801                  * attach stale cached inodes on it.
1802                  *
1803                  * For the inode that triggered the cluster freeing, this
1804                  * attachment may occur in xfs_inode_item_precommit() after we
1805                  * have marked this buffer stale.  If this buffer was not in
1806                  * memory before xfs_ifree_cluster() started, it will not be
1807                  * marked XBF_DONE and this will cause problems later in
1808                  * xfs_inode_item_precommit() when we trip over a (stale, !done)
1809          * buffer attached to the transaction.
1810                  *
1811          * Hence we have to mark the buffer as XBF_DONE here. This is
1812                  * safe because we are also marking the buffer as XBF_STALE and
1813                  * XFS_BLI_STALE. That means it will never be dispatched for
1814                  * IO and it won't be unlocked until the cluster freeing has
1815                  * been committed to the journal and the buffer unpinned. If it
1816                  * is written, we want to know about it, and we want it to
1817          * fail. We can achieve this by adding a write verifier to the
1818                  * buffer.
1819                  */
1820                 bp->b_flags |= XBF_DONE;
1821                 bp->b_ops = &xfs_inode_buf_ops;
1822 
1823                 /*
1824                  * Now we need to set all the cached clean inodes as XFS_ISTALE,
1825                  * too. This requires lookups, and will skip inodes that we've
1826                  * already marked XFS_ISTALE.
1827                  */
1828                 for (i = 0; i < igeo->inodes_per_cluster; i++)
1829                         xfs_ifree_mark_inode_stale(pag, free_ip, inum + i);
1830 
1831                 xfs_trans_stale_inode_buf(tp, bp);
1832                 xfs_trans_binval(tp, bp);
1833         }
1834         return 0;
1835 }
1836 
1837 /*
1838  * This is called to return an inode to the inode free list.  The inode should
1839  * already be truncated to 0 length and have no pages associated with it.  This
1840  * routine also assumes that the inode is already a part of the transaction.
1841  *
1842  * The on-disk copy of the inode will have been added to the list of unlinked
1843  * inodes in the AGI. We need to remove the inode from that list atomically with
1844  * respect to freeing it here.
1845  */
1846 int
1847 xfs_ifree(
1848         struct xfs_trans        *tp,
1849         struct xfs_inode        *ip)
1850 {
1851         struct xfs_mount        *mp = ip->i_mount;
1852         struct xfs_perag        *pag;
1853         struct xfs_icluster     xic = { 0 };
1854         struct xfs_inode_log_item *iip = ip->i_itemp;
1855         int                     error;
1856 
1857         xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
1858         ASSERT(VFS_I(ip)->i_nlink == 0);
1859         ASSERT(ip->i_df.if_nextents == 0);
1860         ASSERT(ip->i_disk_size == 0 || !S_ISREG(VFS_I(ip)->i_mode));
1861         ASSERT(ip->i_nblocks == 0);
1862 
1863         pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
1864 
1865         error = xfs_inode_uninit(tp, pag, ip, &xic);
1866         if (error)
1867                 goto out;
1868 
1869         if (xfs_iflags_test(ip, XFS_IPRESERVE_DM_FIELDS))
1870                 xfs_iflags_clear(ip, XFS_IPRESERVE_DM_FIELDS);
1871 
1872         /* Don't attempt to replay owner changes for a deleted inode */
1873         spin_lock(&iip->ili_lock);
1874         iip->ili_fields &= ~(XFS_ILOG_AOWNER | XFS_ILOG_DOWNER);
1875         spin_unlock(&iip->ili_lock);
1876 
1877         if (xic.deleted)
1878                 error = xfs_ifree_cluster(tp, pag, ip, &xic);
1879 out:
1880         xfs_perag_put(pag);
1881         return error;
1882 }
1883 
1884 /*
1885  * This is called to unpin an inode.  The caller must have the inode locked
1886  * in at least shared mode so that the buffer cannot be subsequently pinned
1887  * once someone is waiting for it to be unpinned.
1888  */
1889 static void
1890 xfs_iunpin(
1891         struct xfs_inode        *ip)
1892 {
1893         xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED);
1894 
1895         trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
1896 
1897         /* Give the log a push to start the unpinning I/O */
1898         xfs_log_force_seq(ip->i_mount, ip->i_itemp->ili_commit_seq, 0, NULL);
1899 
1900 }
1901 
1902 static void
1903 __xfs_iunpin_wait(
1904         struct xfs_inode        *ip)
1905 {
1906         wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT);
1907         DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT);
1908 
1909         xfs_iunpin(ip);
1910 
1911         do {
1912                 prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
1913                 if (xfs_ipincount(ip))
1914                         io_schedule();
1915         } while (xfs_ipincount(ip));
1916         finish_wait(wq, &wait.wq_entry);
1917 }
1918 
1919 void
1920 xfs_iunpin_wait(
1921         struct xfs_inode        *ip)
1922 {
1923         if (xfs_ipincount(ip))
1924                 __xfs_iunpin_wait(ip);
1925 }
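
/*
 * Editorial sketch, not part of xfs_inode.c: the locking requirement stated
 * above xfs_iunpin(): hold the ILOCK in at least shared mode across the wait
 * so the inode cannot be repinned while someone is waiting on it.
 * example_wait_unpinned() is a hypothetical wrapper for illustration only.
 */
static void example_wait_unpinned(struct xfs_inode *ip)
{
	xfs_ilock(ip, XFS_ILOCK_SHARED);
	xfs_iunpin_wait(ip);	/* returns once xfs_ipincount(ip) reaches 0 */
	xfs_iunlock(ip, XFS_ILOCK_SHARED);
}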
1926 
1927 /*
1928  * Removing an inode from the namespace involves removing the directory entry
1929  * and dropping the link count on the inode. Removing the directory entry can
1930  * result in locking an AGF (directory blocks were freed) and removing a link
1931  * count can result in placing the inode on an unlinked list which results in
1932  * locking an AGI.
1933  *
1934  * The big problem here is that we have an ordering constraint on AGF and AGI
1935  * locking - inode allocation locks the AGI, then can allocate a new extent for
1936  * new inodes, locking the AGF after the AGI. Similarly, freeing the inode
1937  * removes the inode from the unlinked list, requiring that we lock the AGI
1938  * first, and then freeing the inode can result in an inode chunk being freed
1939  * and hence freeing disk space requiring that we lock an AGF.
1940  *
1941  * Hence the ordering that is imposed by other parts of the code is AGI before
1942  * AGF. This means we cannot remove the directory entry before we drop the inode
1943  * reference count and put it on the unlinked list as this results in a lock
1944  * order of AGF then AGI, and this can deadlock against inode allocation and
1945  * freeing. Therefore we must drop the link counts before we remove the
1946  * directory entry.
1947  *
1948  * This is still safe from a transactional point of view - it is not until we
1949  * get to xfs_defer_finish() that we have the possibility of multiple
1950  * transactions in this operation. Hence as long as we remove the directory
1951  * entry and drop the link count in the first transaction of the remove
1952  * operation, there are no transactional constraints on the ordering here.
1953  */
1954 int
1955 xfs_remove(
1956         struct xfs_inode        *dp,
1957         struct xfs_name         *name,
1958         struct xfs_inode        *ip)
1959 {
1960         struct xfs_dir_update   du = {
1961                 .dp             = dp,
1962                 .name           = name,
1963                 .ip             = ip,
1964         };
1965         struct xfs_mount        *mp = dp->i_mount;
1966         struct xfs_trans        *tp = NULL;
1967         int                     is_dir = S_ISDIR(VFS_I(ip)->i_mode);
1968         int                     dontcare;
1969         int                     error = 0;
1970         uint                    resblks;
1971 
1972         trace_xfs_remove(dp, name);
1973 
1974         if (xfs_is_shutdown(mp))
1975                 return -EIO;
1976         if (xfs_ifork_zapped(dp, XFS_DATA_FORK))
1977                 return -EIO;
1978 
1979         error = xfs_qm_dqattach(dp);
1980         if (error)
1981                 goto std_return;
1982 
1983         error = xfs_qm_dqattach(ip);
1984         if (error)
1985                 goto std_return;
1986 
1987         error = xfs_parent_start(mp, &du.ppargs);
1988         if (error)
1989                 goto std_return;
1990 
1991         /*
1992          * We try to get the real space reservation first, allowing for
1993          * directory btree deletion(s) implying possible bmap insert(s).  If we
1994          * can't get the space reservation then we use 0 instead, and avoid the
1995          * bmap btree insert(s) in the directory code: if a bmap insert would
1996          * be needed, we instead trim the LAST block from the directory.
1997          *
1998          * Ignore EDQUOT and ENOSPC being returned via nospace_error because
1999          * the directory code can handle a reservationless update and we don't
2000          * want to prevent a user from trying to free space by deleting things.
2001          */
2002         resblks = xfs_remove_space_res(mp, name->len);
2003         error = xfs_trans_alloc_dir(dp, &M_RES(mp)->tr_remove, ip, &resblks,
2004                         &tp, &dontcare);
2005         if (error) {
2006                 ASSERT(error != -ENOSPC);
2007                 goto out_parent;
2008         }
2009 
2010         error = xfs_dir_remove_child(tp, resblks, &du);
2011         if (error)
2012                 goto out_trans_cancel;
2013 
2014         /*
2015          * If this is a synchronous mount, make sure that the
2016          * remove transaction goes to disk before returning to
2017          * the user.
2018          */
2019         if (xfs_has_wsync(mp) || xfs_has_dirsync(mp))
2020                 xfs_trans_set_sync(tp);
2021 
2022         error = xfs_trans_commit(tp);
2023         if (error)
2024                 goto out_unlock;
2025 
2026         if (is_dir && xfs_inode_is_filestream(ip))
2027                 xfs_filestream_deassociate(ip);
2028 
2029         xfs_iunlock(ip, XFS_ILOCK_EXCL);
2030         xfs_iunlock(dp, XFS_ILOCK_EXCL);
2031         xfs_parent_finish(mp, du.ppargs);
2032         return 0;
2033 
2034  out_trans_cancel:
2035         xfs_trans_cancel(tp);
2036  out_unlock:
2037         xfs_iunlock(ip, XFS_ILOCK_EXCL);
2038         xfs_iunlock(dp, XFS_ILOCK_EXCL);
2039  out_parent:
2040         xfs_parent_finish(mp, du.ppargs);
2041  std_return:
2042         return error;
2043 }
2044 
2045 static inline void
2046 xfs_iunlock_rename(
2047         struct xfs_inode        **i_tab,
2048         int                     num_inodes)
2049 {
2050         int                     i;
2051 
2052         for (i = num_inodes - 1; i >= 0; i--) {
2053                 /* Skip duplicate inodes if src and target dps are the same */
2054                 if (!i_tab[i] || (i > 0 && i_tab[i] == i_tab[i - 1]))
2055                         continue;
2056                 xfs_iunlock(i_tab[i], XFS_ILOCK_EXCL);
2057         }
2058 }
2059 
2060 /*
2061  * Enter all inodes for a rename transaction into a sorted array.
2062  */
2063 #define __XFS_SORT_INODES       5
2064 STATIC void
2065 xfs_sort_for_rename(
2066         struct xfs_inode        *dp1,   /* in: old (source) directory inode */
2067         struct xfs_inode        *dp2,   /* in: new (target) directory inode */
2068         struct xfs_inode        *ip1,   /* in: inode of old entry */
2069         struct xfs_inode        *ip2,   /* in: inode of new entry */
2070         struct xfs_inode        *wip,   /* in: whiteout inode */
2071         struct xfs_inode        **i_tab,/* out: sorted array of inodes */
2072         int                     *num_inodes)  /* in/out: inodes in array */
2073 {
2074         int                     i;
2075 
2076         ASSERT(*num_inodes == __XFS_SORT_INODES);
2077         memset(i_tab, 0, *num_inodes * sizeof(struct xfs_inode *));
2078 
2079         /*
2080          * i_tab contains a list of pointers to inodes.  We initialize
2081          * the table here & we'll sort it.  We will then use it to
2082          * order the acquisition of the inode locks.
2083          *
2084          * Note that the table may contain duplicates.  e.g., dp1 == dp2.
2085          */
2086         i = 0;
2087         i_tab[i++] = dp1;
2088         i_tab[i++] = dp2;
2089         i_tab[i++] = ip1;
2090         if (ip2)
2091                 i_tab[i++] = ip2;
2092         if (wip)
2093                 i_tab[i++] = wip;
2094         *num_inodes = i;
2095 
2096         xfs_sort_inodes(i_tab, *num_inodes);
2097 }
2098 
2099 void
2100 xfs_sort_inodes(
2101         struct xfs_inode        **i_tab,
2102         unsigned int            num_inodes)
2103 {
2104         int                     i, j;
2105 
2106         ASSERT(num_inodes <= __XFS_SORT_INODES);
2107 
2108         /*
2109          * Sort the elements via bubble sort.  (Remember, there are at
2110          * most 5 elements to sort, so this is adequate.)
2111          */
2112         for (i = 0; i < num_inodes; i++) {
2113                 for (j = 1; j < num_inodes; j++) {
2114                         if (i_tab[j]->i_ino < i_tab[j-1]->i_ino)
2115                                 swap(i_tab[j], i_tab[j - 1]);
2116                 }
2117         }
2118 }
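
/*
 * Editorial sketch, not part of xfs_inode.c: how the sorted table is consumed.
 * Taking the ILOCKs in ascending inode number order (adjacent duplicates are
 * skipped) is what prevents AB-BA deadlocks between concurrent renames;
 * xfs_rename() below does this via xfs_lock_inodes(), and xfs_iunlock_rename()
 * above drops the locks again in the reverse order.
 */
static void example_lock_sorted(struct xfs_inode **i_tab, int num_inodes)
{
	xfs_sort_inodes(i_tab, num_inodes);
	xfs_lock_inodes(i_tab, num_inodes, XFS_ILOCK_EXCL);
}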
2119 
2120 /*
2121  * xfs_rename_alloc_whiteout()
2122  *
2123  * Return a referenced, unlinked, unlocked inode that can be used as a
2124  * whiteout in a rename transaction. We use a tmpfile inode here so that if we
2125  * crash between allocating the inode and linking it into the rename transaction,
2126  * recovery will free the inode and we won't leak it.
2127  */
2128 static int
2129 xfs_rename_alloc_whiteout(
2130         struct mnt_idmap        *idmap,
2131         struct xfs_name         *src_name,
2132         struct xfs_inode        *dp,
2133         struct xfs_inode        **wip)
2134 {
2135         struct xfs_icreate_args args = {
2136                 .idmap          = idmap,
2137                 .pip            = dp,
2138                 .mode           = S_IFCHR | WHITEOUT_MODE,
2139                 .flags          = XFS_ICREATE_TMPFILE,
2140         };
2141         struct xfs_inode        *tmpfile;
2142         struct qstr             name;
2143         int                     error;
2144 
2145         error = xfs_create_tmpfile(&args, &tmpfile);
2146         if (error)
2147                 return error;
2148 
2149         name.name = src_name->name;
2150         name.len = src_name->len;
2151         error = xfs_inode_init_security(VFS_I(tmpfile), VFS_I(dp), &name);
2152         if (error) {
2153                 xfs_finish_inode_setup(tmpfile);
2154                 xfs_irele(tmpfile);
2155                 return error;
2156         }
2157 
2158         /*
2159          * Prepare the tmpfile inode as if it were created through the VFS.
2160          * Complete the inode setup and flag it as linkable.  nlink is already
2161          * zero, so we can skip the drop_nlink.
2162          */
2163         xfs_setup_iops(tmpfile);
2164         xfs_finish_inode_setup(tmpfile);
2165         VFS_I(tmpfile)->i_state |= I_LINKABLE;
2166 
2167         *wip = tmpfile;
2168         return 0;
2169 }
2170 
2171 /*
2172  * xfs_rename
2173  */
2174 int
2175 xfs_rename(
2176         struct mnt_idmap        *idmap,
2177         struct xfs_inode        *src_dp,
2178         struct xfs_name         *src_name,
2179         struct xfs_inode        *src_ip,
2180         struct xfs_inode        *target_dp,
2181         struct xfs_name         *target_name,
2182         struct xfs_inode        *target_ip,
2183         unsigned int            flags)
2184 {
2185         struct xfs_dir_update   du_src = {
2186                 .dp             = src_dp,
2187                 .name           = src_name,
2188                 .ip             = src_ip,
2189         };
2190         struct xfs_dir_update   du_tgt = {
2191                 .dp             = target_dp,
2192                 .name           = target_name,
2193                 .ip             = target_ip,
2194         };
2195         struct xfs_dir_update   du_wip = { };
2196         struct xfs_mount        *mp = src_dp->i_mount;
2197         struct xfs_trans        *tp;
2198         struct xfs_inode        *inodes[__XFS_SORT_INODES];
2199         int                     i;
2200         int                     num_inodes = __XFS_SORT_INODES;
2201         bool                    new_parent = (src_dp != target_dp);
2202         bool                    src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode);
2203         int                     spaceres;
2204         bool                    retried = false;
2205         int                     error, nospace_error = 0;
2206 
2207         trace_xfs_rename(src_dp, target_dp, src_name, target_name);
2208 
2209         if ((flags & RENAME_EXCHANGE) && !target_ip)
2210                 return -EINVAL;
2211 
2212         /*
2213          * If we are doing a whiteout operation, allocate the whiteout inode
2214          * we will be placing at the target and ensure the type is set
2215          * appropriately.
2216          */
2217         if (flags & RENAME_WHITEOUT) {
2218                 error = xfs_rename_alloc_whiteout(idmap, src_name, target_dp,
2219                                 &du_wip.ip);
2220                 if (error)
2221                         return error;
2222 
2223                 /* setup target dirent info as whiteout */
2224                 src_name->type = XFS_DIR3_FT_CHRDEV;
2225         }
2226 
2227         xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, du_wip.ip,
2228                         inodes, &num_inodes);
2229 
2230         error = xfs_parent_start(mp, &du_src.ppargs);
2231         if (error)
2232                 goto out_release_wip;
2233 
2234         if (du_wip.ip) {
2235                 error = xfs_parent_start(mp, &du_wip.ppargs);
2236                 if (error)
2237                         goto out_src_ppargs;
2238         }
2239 
2240         if (target_ip) {
2241                 error = xfs_parent_start(mp, &du_tgt.ppargs);
2242                 if (error)
2243                         goto out_wip_ppargs;
2244         }
2245 
2246 retry:
2247         nospace_error = 0;
2248         spaceres = xfs_rename_space_res(mp, src_name->len, target_ip != NULL,
2249                         target_name->len, du_wip.ip != NULL);
2250         error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, spaceres, 0, 0, &tp);
2251         if (error == -ENOSPC) {
2252                 nospace_error = error;
2253                 spaceres = 0;
2254                 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, 0, 0, 0,
2255                                 &tp);
2256         }
2257         if (error)
2258                 goto out_tgt_ppargs;
2259 
2260         /*
2261          * We don't allow reservationless renaming when parent pointers are
2262          * enabled because we can't back out if the xattrs must grow.
2263          */
2264         if (du_src.ppargs && nospace_error) {
2265                 error = nospace_error;
2266                 xfs_trans_cancel(tp);
2267                 goto out_tgt_ppargs;
2268         }
2269 
2270         /*
2271          * Attach the dquots to the inodes
2272          */
2273         error = xfs_qm_vop_rename_dqattach(inodes);
2274         if (error) {
2275                 xfs_trans_cancel(tp);
2276                 goto out_tgt_ppargs;
2277         }
2278 
2279         /*
2280          * Lock all the participating inodes. Depending upon whether
2281          * the target_name exists in the target directory, and
2282          * whether the target directory is the same as the source
2283          * directory, we can lock from 2 to 5 inodes.
2284          */
2285         xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
2286 
2287         /*
2288          * Join all the inodes to the transaction.
2289          */
2290         xfs_trans_ijoin(tp, src_dp, 0);
2291         if (new_parent)
2292                 xfs_trans_ijoin(tp, target_dp, 0);
2293         xfs_trans_ijoin(tp, src_ip, 0);
2294         if (target_ip)
2295                 xfs_trans_ijoin(tp, target_ip, 0);
2296         if (du_wip.ip)
2297                 xfs_trans_ijoin(tp, du_wip.ip, 0);
2298 
2299         /*
2300          * If we are using project inheritance, we only allow renames
2301          * into our tree when the project IDs are the same; else the
2302          * tree quota mechanism would be circumvented.
2303          */
2304         if (unlikely((target_dp->i_diflags & XFS_DIFLAG_PROJINHERIT) &&
2305                      target_dp->i_projid != src_ip->i_projid)) {
2306                 error = -EXDEV;
2307                 goto out_trans_cancel;
2308         }
2309 
2310         /* RENAME_EXCHANGE is unique from here on. */
2311         if (flags & RENAME_EXCHANGE) {
2312                 error = xfs_dir_exchange_children(tp, &du_src, &du_tgt,
2313                                 spaceres);
2314                 if (error)
2315                         goto out_trans_cancel;
2316                 goto out_commit;
2317         }
2318 
2319         /*
2320          * Try to reserve quota to handle an expansion of the target directory.
2321          * We'll allow the rename to continue in reservationless mode if we hit
2322          * a space usage constraint.  If we trigger reservationless mode, save
2323          * the errno if there isn't any free space in the target directory.
2324          */
2325         if (spaceres != 0) {
2326                 error = xfs_trans_reserve_quota_nblks(tp, target_dp, spaceres,
2327                                 0, false);
2328                 if (error == -EDQUOT || error == -ENOSPC) {
2329                         if (!retried) {
2330                                 xfs_trans_cancel(tp);
2331                                 xfs_iunlock_rename(inodes, num_inodes);
2332                                 xfs_blockgc_free_quota(target_dp, 0);
2333                                 retried = true;
2334                                 goto retry;
2335                         }
2336 
2337                         nospace_error = error;
2338                         spaceres = 0;
2339                         error = 0;
2340                 }
2341                 if (error)
2342                         goto out_trans_cancel;
2343         }
2344 
2345         /*
2346          * We don't allow quotaless renaming when parent pointers are enabled
2347          * because we can't back out if the xattrs must grow.
2348          */
2349         if (du_src.ppargs && nospace_error) {
2350                 error = nospace_error;
2351                 goto out_trans_cancel;
2352         }
2353 
2354         /*
2355          * Lock the AGI buffers we need to handle bumping the nlink of the
2356          * whiteout inode off the unlinked list and to handle dropping the
2357          * nlink of the target inode.  Per locking order rules, do this in
2358          * increasing AG order and before directory block allocation tries to
2359          * grab AGFs because we grab AGIs before AGFs.
2360          *
2361          * The (vfs) caller must ensure that if src is a directory then
2362          * target_ip is either null or an empty directory.
2363          */
2364         for (i = 0; i < num_inodes && inodes[i] != NULL; i++) {
2365                 if (inodes[i] == du_wip.ip ||
2366                     (inodes[i] == target_ip &&
2367                      (VFS_I(target_ip)->i_nlink == 1 || src_is_directory))) {
2368                         struct xfs_perag        *pag;
2369                         struct xfs_buf          *bp;
2370 
2371                         pag = xfs_perag_get(mp,
2372                                         XFS_INO_TO_AGNO(mp, inodes[i]->i_ino));
2373                         error = xfs_read_agi(pag, tp, 0, &bp);
2374                         xfs_perag_put(pag);
2375                         if (error)
2376                                 goto out_trans_cancel;
2377                 }
2378         }
2379 
2380         error = xfs_dir_rename_children(tp, &du_src, &du_tgt, spaceres,
2381                         &du_wip);
2382         if (error)
2383                 goto out_trans_cancel;
2384 
2385         if (du_wip.ip) {
2386                 /*
2387                  * Now we have a real link, clear the "I'm a tmpfile" state
2388                  * flag from the inode so it doesn't accidentally get misused in
2389                  * future.
2390                  */
2391                 VFS_I(du_wip.ip)->i_state &= ~I_LINKABLE;
2392         }
2393 
2394 out_commit:
2395         /*
2396          * If this is a synchronous mount, make sure that the rename
2397          * transaction goes to disk before returning to the user.
2398          */
2399         if (xfs_has_wsync(tp->t_mountp) || xfs_has_dirsync(tp->t_mountp))
2400                 xfs_trans_set_sync(tp);
2401 
2402         error = xfs_trans_commit(tp);
2403         nospace_error = 0;
2404         goto out_unlock;
2405 
2406 out_trans_cancel:
2407         xfs_trans_cancel(tp);
2408 out_unlock:
2409         xfs_iunlock_rename(inodes, num_inodes);
2410 out_tgt_ppargs:
2411         xfs_parent_finish(mp, du_tgt.ppargs);
2412 out_wip_ppargs:
2413         xfs_parent_finish(mp, du_wip.ppargs);
2414 out_src_ppargs:
2415         xfs_parent_finish(mp, du_src.ppargs);
2416 out_release_wip:
2417         if (du_wip.ip)
2418                 xfs_irele(du_wip.ip);
2419         if (error == -ENOSPC && nospace_error)
2420                 error = nospace_error;
2421         return error;
2422 }
2423 
2424 static int
2425 xfs_iflush(
2426         struct xfs_inode        *ip,
2427         struct xfs_buf          *bp)
2428 {
2429         struct xfs_inode_log_item *iip = ip->i_itemp;
2430         struct xfs_dinode       *dip;
2431         struct xfs_mount        *mp = ip->i_mount;
2432         int                     error;
2433 
2434         xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED);
2435         ASSERT(xfs_iflags_test(ip, XFS_IFLUSHING));
2436         ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE ||
2437                ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
2438         ASSERT(iip->ili_item.li_buf == bp);
2439 
2440         dip = xfs_buf_offset(bp, ip->i_imap.im_boffset);
2441 
2442         /*
2443          * We don't flush the inode if any of the following checks fail, but we
2444          * do still update the log item and attach to the backing buffer as if
2445          * the flush happened. This is a formality to facilitate predictable
2446          * error handling as the caller will shutdown and fail the buffer.
2447          */
2448         error = -EFSCORRUPTED;
2449         if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
2450                                mp, XFS_ERRTAG_IFLUSH_1)) {
2451                 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2452                         "%s: Bad inode %llu magic number 0x%x, ptr "PTR_FMT,
2453                         __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
2454                 goto flush_out;
2455         }
2456         if (S_ISREG(VFS_I(ip)->i_mode)) {
2457                 if (XFS_TEST_ERROR(
2458                     ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
2459                     ip->i_df.if_format != XFS_DINODE_FMT_BTREE,
2460                     mp, XFS_ERRTAG_IFLUSH_3)) {
2461                         xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2462                                 "%s: Bad regular inode %llu, ptr "PTR_FMT,
2463                                 __func__, ip->i_ino, ip);
2464                         goto flush_out;
2465                 }
2466         } else if (S_ISDIR(VFS_I(ip)->i_mode)) {
2467                 if (XFS_TEST_ERROR(
2468                     ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
2469                     ip->i_df.if_format != XFS_DINODE_FMT_BTREE &&
2470                     ip->i_df.if_format != XFS_DINODE_FMT_LOCAL,
2471                     mp, XFS_ERRTAG_IFLUSH_4)) {
2472                         xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2473                                 "%s: Bad directory inode %llu, ptr "PTR_FMT,
2474                                 __func__, ip->i_ino, ip);
2475                         goto flush_out;
2476                 }
2477         }
2478         if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af) >
2479                                 ip->i_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) {
2480                 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2481                         "%s: detected corrupt incore inode %llu, "
2482                         "total extents = %llu nblocks = %lld, ptr "PTR_FMT,
2483                         __func__, ip->i_ino,
2484                         ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af),
2485                         ip->i_nblocks, ip);
2486                 goto flush_out;
2487         }
2488         if (XFS_TEST_ERROR(ip->i_forkoff > mp->m_sb.sb_inodesize,
2489                                 mp, XFS_ERRTAG_IFLUSH_6)) {
2490                 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2491                         "%s: bad inode %llu, forkoff 0x%x, ptr "PTR_FMT,
2492                         __func__, ip->i_ino, ip->i_forkoff, ip);
2493                 goto flush_out;
2494         }
2495 
2496         /*
2497          * Inode item log recovery for v2 inodes is dependent on the flushiter
2498          * count for correct sequencing.  We bump the flush iteration count so
2499          * we can detect flushes which postdate a log record during recovery.
2500          * This is redundant as we now log every change and hence this can't
2501          * happen, but we still need to do it to ensure backwards compatibility
2502          * with old kernels that predate logging all inode changes.
2503          */
2504         if (!xfs_has_v3inodes(mp))
2505                 ip->i_flushiter++;
2506 
2507         /*
2508          * If there are inline format data / attr forks attached to this inode,
2509          * make sure they are not corrupt.
2510          */
2511         if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL &&
2512             xfs_ifork_verify_local_data(ip))
2513                 goto flush_out;
2514         if (xfs_inode_has_attr_fork(ip) &&
2515             ip->i_af.if_format == XFS_DINODE_FMT_LOCAL &&
2516             xfs_ifork_verify_local_attr(ip))
2517                 goto flush_out;
2518 
2519         /*
2520          * Copy the dirty parts of the inode into the on-disk inode.  We always
2521          * copy out the core of the inode, because if the inode is dirty at all
2522          * the core must be.
2523          */
2524         xfs_inode_to_disk(ip, dip, iip->ili_item.li_lsn);
2525 
2526         /* Wrap, we never let the log put out DI_MAX_FLUSH */
2527         if (!xfs_has_v3inodes(mp)) {
2528                 if (ip->i_flushiter == DI_MAX_FLUSH)
2529                         ip->i_flushiter = 0;
2530         }
2531 
2532         xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK);
2533         if (xfs_inode_has_attr_fork(ip))
2534                 xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK);
2535 
2536         /*
2537          * We've recorded everything logged in the inode, so we'd like to clear
2538          * the ili_fields bits so we don't log and flush things unnecessarily.
2539          * However, we can't stop logging all this information until the data
2540          * we've copied into the disk buffer is written to disk.  If we did we
2541          * might overwrite the copy of the inode in the log with all the data
2542          * after re-logging only part of it, and in the face of a crash we
2543          * wouldn't have all the data we need to recover.
2544          *
2545          * What we do is move the bits to the ili_last_fields field.  When
2546          * logging the inode, these bits are moved back to the ili_fields field.
2547          * In the xfs_buf_inode_iodone() routine we clear ili_last_fields, since
2548          * we know that the information those bits represent is permanently on
2549          * disk.  As long as the flush completes before the inode is logged
2550          * again, then both ili_fields and ili_last_fields will be cleared.
2551          */
2552         error = 0;
2553 flush_out:
2554         spin_lock(&iip->ili_lock);
2555         iip->ili_last_fields = iip->ili_fields;
2556         iip->ili_fields = 0;
2557         iip->ili_fsync_fields = 0;
2558         set_bit(XFS_LI_FLUSHING, &iip->ili_item.li_flags);
2559         spin_unlock(&iip->ili_lock);
2560 
2561         /*
2562          * Store the current LSN of the inode so that we can tell whether the
2563          * item has moved in the AIL from xfs_buf_inode_iodone().
2564          */
2565         xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
2566                                 &iip->ili_item.li_lsn);
2567 
2568         /* generate the checksum. */
2569         xfs_dinode_calc_crc(mp, dip);
2570         if (error)
2571                 xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
2572         return error;
2573 }
2574 
2575 /*
2576  * Non-blocking flush of dirty inode metadata into the backing buffer.
2577  *
2578  * The caller must have a reference to the inode and hold the cluster buffer
2579  * locked. The function will walk across all the inodes on the cluster buffer it
2580  * can find and lock without blocking, and flush them to the cluster buffer.
2581  *
2582  * On successful flushing of at least one inode, the caller must write out the
2583  * buffer and release it. If no inodes are flushed, -EAGAIN will be returned and
2584  * the caller needs to release the buffer. On failure, the filesystem will be
2585  * shut down, the buffer will have been unlocked and released, and EFSCORRUPTED
2586  * will be returned.
2587  */
2588 int
2589 xfs_iflush_cluster(
2590         struct xfs_buf          *bp)
2591 {
2592         struct xfs_mount        *mp = bp->b_mount;
2593         struct xfs_log_item     *lip, *n;
2594         struct xfs_inode        *ip;
2595         struct xfs_inode_log_item *iip;
2596         int                     clcount = 0;
2597         int                     error = 0;
2598 
2599         /*
2600          * We must use the safe variant here as on shutdown xfs_iflush_abort()
2601          * will remove itself from the list.
2602          */
2603         list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) {
2604                 iip = (struct xfs_inode_log_item *)lip;
2605                 ip = iip->ili_inode;
2606 
2607                 /*
2608                  * Quick and dirty check to avoid locks if possible.
2609                  */
2610                 if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING))
2611                         continue;
2612                 if (xfs_ipincount(ip))
2613                         continue;
2614 
2615                 /*
2616                  * The inode is still attached to the buffer, which means it is
2617                  * dirty but reclaim might try to grab it. Check carefully for
2618                  * that, and grab the ilock while still holding the i_flags_lock
2619                  * to guarantee reclaim will not be able to reclaim this inode
2620                  * once we drop the i_flags_lock.
2621                  */
2622                 spin_lock(&ip->i_flags_lock);
2623                 ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE));
2624                 if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING)) {
2625                         spin_unlock(&ip->i_flags_lock);
2626                         continue;
2627                 }
2628 
2629                 /*
2630                  * ILOCK will pin the inode against reclaim and prevent
2631                  * concurrent transactions modifying the inode while we are
2632                  * flushing the inode. If we get the lock, set the flushing
2633                  * state before we drop the i_flags_lock.
2634                  */
2635                 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
2636                         spin_unlock(&ip->i_flags_lock);
2637                         continue;
2638                 }
2639                 __xfs_iflags_set(ip, XFS_IFLUSHING);
2640                 spin_unlock(&ip->i_flags_lock);
2641 
2642                 /*
2643                  * Abort flushing this inode if we are shut down because the
2644                  * inode may not currently be in the AIL. This can occur when
2645                  * log I/O failure unpins the inode without inserting into the
2646                  * AIL, leaving a dirty/unpinned inode attached to the buffer
2647                  * that otherwise looks like it should be flushed.
2648                  */
2649                 if (xlog_is_shutdown(mp->m_log)) {
2650                         xfs_iunpin_wait(ip);
2651                         xfs_iflush_abort(ip);
2652                         xfs_iunlock(ip, XFS_ILOCK_SHARED);
2653                         error = -EIO;
2654                         continue;
2655                 }
2656 
2657                 /* don't block waiting on a log force to unpin dirty inodes */
2658                 if (xfs_ipincount(ip)) {
2659                         xfs_iflags_clear(ip, XFS_IFLUSHING);
2660                         xfs_iunlock(ip, XFS_ILOCK_SHARED);
2661                         continue;
2662                 }
2663 
2664                 if (!xfs_inode_clean(ip))
2665                         error = xfs_iflush(ip, bp);
2666                 else
2667                         xfs_iflags_clear(ip, XFS_IFLUSHING);
2668                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
2669                 if (error)
2670                         break;
2671                 clcount++;
2672         }
2673 
2674         if (error) {
2675                 /*
2676                  * Shutdown first so we kill the log before we release this
2677                  * buffer. If it is an INODE_ALLOC buffer and pins the tail
2678                  * of the log, failing it before the _log_ is shut down can
2679                  * result in the log tail being moved forward in the journal
2680                  * on disk because log writes can still be taking place. Hence
2681                  * unpinning the tail will allow the ICREATE intent to be
2682                  * removed from the log and recovery will fail with uninitialised
2683                  * inode cluster buffers.
2684                  */
2685                 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
2686                 bp->b_flags |= XBF_ASYNC;
2687                 xfs_buf_ioend_fail(bp);
2688                 return error;
2689         }
2690 
2691         if (!clcount)
2692                 return -EAGAIN;
2693 
2694         XFS_STATS_INC(mp, xs_icluster_flushcnt);
2695         XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
2696         return 0;
2697 
2698 }
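
/*
 * Editorial sketch, not part of xfs_inode.c: the caller contract described in
 * the comment above xfs_iflush_cluster().  On success the buffer must be
 * written back and released by the caller; on -EAGAIN nothing was flushed and
 * the buffer only needs to be released; on any other error the filesystem has
 * been shut down and the buffer has already been released.
 * example_write_buf() is a hypothetical write-back helper.
 */
static int example_flush_cluster_buf(struct xfs_buf *bp)
{
	int	error = xfs_iflush_cluster(bp);

	if (!error)
		return example_write_buf(bp);	/* hypothetical submission */
	if (error == -EAGAIN)
		xfs_buf_relse(bp);		/* nothing flushed; just release */
	return error;
}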
2699 
2700 /* Release an inode. */
2701 void
2702 xfs_irele(
2703         struct xfs_inode        *ip)
2704 {
2705         trace_xfs_irele(ip, _RET_IP_);
2706         iput(VFS_I(ip));
2707 }
2708 
2709 /*
2710  * Ensure all committed transactions touching the inode are written to the log.
2711  */
2712 int
2713 xfs_log_force_inode(
2714         struct xfs_inode        *ip)
2715 {
2716         xfs_csn_t               seq = 0;
2717 
2718         xfs_ilock(ip, XFS_ILOCK_SHARED);
2719         if (xfs_ipincount(ip))
2720                 seq = ip->i_itemp->ili_commit_seq;
2721         xfs_iunlock(ip, XFS_ILOCK_SHARED);
2722 
2723         if (!seq)
2724                 return 0;
2725         return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, NULL);
2726 }
2727 
2728 /*
2729  * Grab the exclusive iolock for a data copy from src to dest, making sure to
2730  * abide by the VFS locking order (lowest pointer value goes first) and breaking the
2731  * layout leases before proceeding.  The loop is needed because we cannot call
2732  * the blocking break_layout() with the iolocks held, and therefore have to
2733  * back out both locks.
2734  */
2735 static int
2736 xfs_iolock_two_inodes_and_break_layout(
2737         struct inode            *src,
2738         struct inode            *dest)
2739 {
2740         int                     error;
2741 
2742         if (src > dest)
2743                 swap(src, dest);
2744 
2745 retry:
2746         /* Wait to break both inodes' layouts before we start locking. */
2747         error = break_layout(src, true);
2748         if (error)
2749                 return error;
2750         if (src != dest) {
2751                 error = break_layout(dest, true);
2752                 if (error)
2753                         return error;
2754         }
2755 
2756         /* Lock one inode and make sure nobody got in and leased it. */
2757         inode_lock(src);
2758         error = break_layout(src, false);
2759         if (error) {
2760                 inode_unlock(src);
2761                 if (error == -EWOULDBLOCK)
2762                         goto retry;
2763                 return error;
2764         }
2765 
2766         if (src == dest)
2767                 return 0;
2768 
2769         /* Lock the other inode and make sure nobody got in and leased it. */
2770         inode_lock_nested(dest, I_MUTEX_NONDIR2);
2771         error = break_layout(dest, false);
2772         if (error) {
2773                 inode_unlock(src);
2774                 inode_unlock(dest);
2775                 if (error == -EWOULDBLOCK)
2776                         goto retry;
2777                 return error;
2778         }
2779 
2780         return 0;
2781 }
2782 
2783 static int
2784 xfs_mmaplock_two_inodes_and_break_dax_layout(
2785         struct xfs_inode        *ip1,
2786         struct xfs_inode        *ip2)
2787 {
2788         int                     error;
2789         bool                    retry;
2790         struct page             *page;
2791 
2792         if (ip1->i_ino > ip2->i_ino)
2793                 swap(ip1, ip2);
2794 
2795 again:
2796         retry = false;
2797         /* Lock the first inode */
2798         xfs_ilock(ip1, XFS_MMAPLOCK_EXCL);
2799         error = xfs_break_dax_layouts(VFS_I(ip1), &retry);
2800         if (error || retry) {
2801                 xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
2802                 if (error == 0 && retry)
2803                         goto again;
2804                 return error;
2805         }
2806 
2807         if (ip1 == ip2)
2808                 return 0;
2809 
2810         /* Nested lock the second inode */
2811         xfs_ilock(ip2, xfs_lock_inumorder(XFS_MMAPLOCK_EXCL, 1));
2812         /*
2813          * We cannot use xfs_break_dax_layouts() directly here because it may
2814          * need to unlock & lock the XFS_MMAPLOCK_EXCL which is not suitable
2815          * for this nested lock case.
2816          */
2817         page = dax_layout_busy_page(VFS_I(ip2)->i_mapping);
2818         if (page && page_ref_count(page) != 1) {
2819                 xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
2820                 xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
2821                 goto again;
2822         }
2823 
2824         return 0;
2825 }
2826 
2827 /*
2828  * Lock two inodes so that userspace cannot initiate I/O via file syscalls or
2829  * mmap activity.
2830  */
2831 int
2832 xfs_ilock2_io_mmap(
2833         struct xfs_inode        *ip1,
2834         struct xfs_inode        *ip2)
2835 {
2836         int                     ret;
2837 
2838         ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2));
2839         if (ret)
2840                 return ret;
2841 
2842         if (IS_DAX(VFS_I(ip1)) && IS_DAX(VFS_I(ip2))) {
2843                 ret = xfs_mmaplock_two_inodes_and_break_dax_layout(ip1, ip2);
2844                 if (ret) {
2845                         inode_unlock(VFS_I(ip2));
2846                         if (ip1 != ip2)
2847                                 inode_unlock(VFS_I(ip1));
2848                         return ret;
2849                 }
2850         } else
2851                 filemap_invalidate_lock_two(VFS_I(ip1)->i_mapping,
2852                                             VFS_I(ip2)->i_mapping);
2853 
2854         return 0;
2855 }
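
/*
 * Editor's illustration, not part of the original file: the expected pairing
 * of xfs_ilock2_io_mmap() with xfs_iunlock2_io_mmap() around an operation
 * that must exclude both file I/O and mmap activity on two inodes; the
 * reflink remap paths use a pattern much like this.
 * xfs_example_two_inode_op() is a hypothetical name used only here.
 */
static int
xfs_example_two_inode_op(
        struct xfs_inode        *ip1,
        struct xfs_inode        *ip2)
{
        int                     error;

        /*
         * Take the inode locks and either the MMAPLOCKs (DAX) or the mapping
         * invalidate locks on both inodes, breaking layout leases first.
         */
        error = xfs_ilock2_io_mmap(ip1, ip2);
        if (error)
                return error;

        /* ... operate on both inodes while user I/O is excluded ... */

        xfs_iunlock2_io_mmap(ip1, ip2);
        return 0;
}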
2856 
2857 /* Unlock both inodes to allow IO and mmap activity. */
2858 void
2859 xfs_iunlock2_io_mmap(
2860         struct xfs_inode        *ip1,
2861         struct xfs_inode        *ip2)
2862 {
2863         if (IS_DAX(VFS_I(ip1)) && IS_DAX(VFS_I(ip2))) {
2864                 xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
2865                 if (ip1 != ip2)
2866                         xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
2867         } else
2868                 filemap_invalidate_unlock_two(VFS_I(ip1)->i_mapping,
2869                                               VFS_I(ip2)->i_mapping);
2870 
2871         inode_unlock(VFS_I(ip2));
2872         if (ip1 != ip2)
2873                 inode_unlock(VFS_I(ip1));
2874 }
2875 
2876 /* Drop the MMAPLOCK and the IOLOCK after a remap completes. */
2877 void
2878 xfs_iunlock2_remapping(
2879         struct xfs_inode        *ip1,
2880         struct xfs_inode        *ip2)
2881 {
2882         xfs_iflags_clear(ip1, XFS_IREMAPPING);
2883 
2884         if (ip1 != ip2)
2885                 xfs_iunlock(ip1, XFS_MMAPLOCK_SHARED);
2886         xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
2887 
2888         if (ip1 != ip2)
2889                 inode_unlock_shared(VFS_I(ip1));
2890         inode_unlock(VFS_I(ip2));
2891 }
2892 
2893 /*
2894  * Reload the incore inode list for this inode.  Caller should ensure that
2895  * the link count cannot change, either by taking ILOCK_SHARED or otherwise
2896  * preventing other threads from executing.
2897  */
2898 int
2899 xfs_inode_reload_unlinked_bucket(
2900         struct xfs_trans        *tp,
2901         struct xfs_inode        *ip)
2902 {
2903         struct xfs_mount        *mp = tp->t_mountp;
2904         struct xfs_buf          *agibp;
2905         struct xfs_agi          *agi;
2906         struct xfs_perag        *pag;
2907         xfs_agnumber_t          agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
2908         xfs_agino_t             agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
2909         xfs_agino_t             prev_agino, next_agino;
2910         unsigned int            bucket;
2911         bool                    foundit = false;
2912         int                     error;
2913 
2914         /* Grab the first inode in the list */
2915         pag = xfs_perag_get(mp, agno);
2916         error = xfs_ialloc_read_agi(pag, tp, 0, &agibp);
2917         xfs_perag_put(pag);
2918         if (error)
2919                 return error;
2920 
2921         /*
2922          * We've taken ILOCK_SHARED and the AGI buffer lock to stabilize the
2923          * incore unlinked list pointers for this inode.  Check once more to
2924          * see if we raced with anyone else to reload the unlinked list.
2925          */
2926         if (!xfs_inode_unlinked_incomplete(ip)) {
2927                 foundit = true;
2928                 goto out_agibp;
2929         }
2930 
2931         bucket = agino % XFS_AGI_UNLINKED_BUCKETS;
2932         agi = agibp->b_addr;
2933 
2934         trace_xfs_inode_reload_unlinked_bucket(ip);
2935 
2936         xfs_info_ratelimited(mp,
2937  "Found unrecovered unlinked inode 0x%x in AG 0x%x.  Initiating list recovery.",
2938                         agino, agno);
2939 
2940         prev_agino = NULLAGINO;
2941         next_agino = be32_to_cpu(agi->agi_unlinked[bucket]);
2942         while (next_agino != NULLAGINO) {
2943                 struct xfs_inode        *next_ip = NULL;
2944 
2945                 /* Found this caller's inode, set its backlink. */
2946                 if (next_agino == agino) {
2947                         next_ip = ip;
2948                         next_ip->i_prev_unlinked = prev_agino;
2949                         foundit = true;
2950                         goto next_inode;
2951                 }
2952 
2953                 /* Try in-memory lookup first. */
2954                 next_ip = xfs_iunlink_lookup(pag, next_agino);
2955                 if (next_ip)
2956                         goto next_inode;
2957 
2958                 /* Inode not in memory, try reloading it. */
2959                 error = xfs_iunlink_reload_next(tp, agibp, prev_agino,
2960                                 next_agino);
2961                 if (error)
2962                         break;
2963 
2964                 /* Grab the reloaded inode. */
2965                 next_ip = xfs_iunlink_lookup(pag, next_agino);
2966                 if (!next_ip) {
2967                         /* No incore inode at all?  We reloaded it... */
2968                         ASSERT(next_ip != NULL);
2969                         error = -EFSCORRUPTED;
2970                         break;
2971                 }
2972 
2973 next_inode:
2974                 prev_agino = next_agino;
2975                 next_agino = next_ip->i_next_unlinked;
2976         }
2977 
2978 out_agibp:
2979         xfs_trans_brelse(tp, agibp);
2980         /* Should have found this inode somewhere in the iunlinked bucket. */
2981         if (!error && !foundit)
2982                 error = -EFSCORRUPTED;
2983         return error;
2984 }
2985 
2986 /* Decide if this inode is missing its unlinked list and reload it. */
2987 int
2988 xfs_inode_reload_unlinked(
2989         struct xfs_inode        *ip)
2990 {
2991         struct xfs_trans        *tp;
2992         int                     error;
2993 
2994         error = xfs_trans_alloc_empty(ip->i_mount, &tp);
2995         if (error)
2996                 return error;
2997 
2998         xfs_ilock(ip, XFS_ILOCK_SHARED);
2999         if (xfs_inode_unlinked_incomplete(ip))
3000                 error = xfs_inode_reload_unlinked_bucket(tp, ip);
3001         xfs_iunlock(ip, XFS_ILOCK_SHARED);
3002         xfs_trans_cancel(tp);
3003 
3004         return error;
3005 }
3006 
3007 /* Has this inode fork been zapped by repair? */
3008 bool
3009 xfs_ifork_zapped(
3010         const struct xfs_inode  *ip,
3011         int                     whichfork)
3012 {
3013         unsigned int            datamask = 0;
3014 
3015         switch (whichfork) {
3016         case XFS_DATA_FORK:
3017                 switch (ip->i_vnode.i_mode & S_IFMT) {
3018                 case S_IFDIR:
3019                         datamask = XFS_SICK_INO_DIR_ZAPPED;
3020                         break;
3021                 case S_IFLNK:
3022                         datamask = XFS_SICK_INO_SYMLINK_ZAPPED;
3023                         break;
3024                 }
3025                 return ip->i_sick & (XFS_SICK_INO_BMBTD_ZAPPED | datamask);
3026         case XFS_ATTR_FORK:
3027                 return ip->i_sick & XFS_SICK_INO_BMBTA_ZAPPED;
3028         default:
3029                 return false;
3030         }
3031 }
3032 
3033 /* Compute the number of data and realtime blocks used by a file. */
3034 void
3035 xfs_inode_count_blocks(
3036         struct xfs_trans        *tp,
3037         struct xfs_inode        *ip,
3038         xfs_filblks_t           *dblocks,
3039         xfs_filblks_t           *rblocks)
3040 {
3041         struct xfs_ifork        *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
3042 
3043         *rblocks = 0;
3044         if (XFS_IS_REALTIME_INODE(ip))
3045                 xfs_bmap_count_leaves(ifp, rblocks);
3046         *dblocks = ip->i_nblocks - *rblocks;
3047 }
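
/*
 * Editor's illustration, not part of the original file: splitting an inode's
 * block count into data-device and realtime-device parts, as a caller doing
 * per-device accounting might.  xfs_example_report_blocks() is a
 * hypothetical name used only here.
 */
static void
xfs_example_report_blocks(
        struct xfs_trans        *tp,
        struct xfs_inode        *ip)
{
        xfs_filblks_t           dblocks, rblocks;

        xfs_inode_count_blocks(tp, ip, &dblocks, &rblocks);
        xfs_info(ip->i_mount, "ino 0x%llx: %llu data blocks, %llu rt blocks",
                        (unsigned long long)ip->i_ino,
                        (unsigned long long)dblocks,
                        (unsigned long long)rblocks);
}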
3048 
3049 static void
3050 xfs_wait_dax_page(
3051         struct inode            *inode)
3052 {
3053         struct xfs_inode        *ip = XFS_I(inode);
3054 
3055         xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
3056         schedule();
3057         xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
3058 }
3059 
3060 int
3061 xfs_break_dax_layouts(
3062         struct inode            *inode,
3063         bool                    *retry)
3064 {
3065         struct page             *page;
3066 
3067         xfs_assert_ilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL);
3068 
3069         page = dax_layout_busy_page(inode->i_mapping);
3070         if (!page)
3071                 return 0;
3072 
3073         *retry = true;
3074         return ___wait_var_event(&page->_refcount,
3075                         atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
3076                         0, 0, xfs_wait_dax_page(inode));
3077 }
3078 
3079 int
3080 xfs_break_layouts(
3081         struct inode            *inode,
3082         uint                    *iolock,
3083         enum layout_break_reason reason)
3084 {
3085         bool                    retry;
3086         int                     error;
3087 
3088         xfs_assert_ilocked(XFS_I(inode), XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL);
3089 
3090         do {
3091                 retry = false;
3092                 switch (reason) {
3093                 case BREAK_UNMAP:
3094                         error = xfs_break_dax_layouts(inode, &retry);
3095                         if (error || retry)
3096                                 break;
3097                         fallthrough;
3098                 case BREAK_WRITE:
3099                         error = xfs_break_leased_layouts(inode, iolock, &retry);
3100                         break;
3101                 default:
3102                         WARN_ON_ONCE(1);
3103                         error = -EINVAL;
3104                 }
3105         } while (error == 0 && retry);
3106 
3107         return error;
3108 }
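
/*
 * Editor's illustration, not part of the original file: a sketch of the
 * typical calling pattern for xfs_break_layouts() before an operation that
 * unmaps file blocks.  The IOLOCK must already be held, and *iolock is
 * passed by reference because breaking leased layouts may cycle that lock.
 * xfs_example_prepare_unmap() is a hypothetical name used only here.
 */
static int
xfs_example_prepare_unmap(
        struct xfs_inode        *ip)
{
        uint                    iolock = XFS_IOLOCK_EXCL;
        int                     error;

        xfs_ilock(ip, iolock);
        error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP);
        if (error) {
                xfs_iunlock(ip, iolock);
                return error;
        }

        /* ... punch holes, truncate, etc., then drop the IOLOCK ... */

        xfs_iunlock(ip, iolock);
        return 0;
}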
3109 
3110 /* Returns the size of the fundamental allocation unit for a file, in bytes. */
3111 unsigned int
3112 xfs_inode_alloc_unitsize(
3113         struct xfs_inode        *ip)
3114 {
3115         unsigned int            blocks = 1;
3116 
3117         if (XFS_IS_REALTIME_INODE(ip))
3118                 blocks = ip->i_mount->m_sb.sb_rextsize;
3119 
3120         return XFS_FSB_TO_B(ip->i_mount, blocks);
3121 }
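
/*
 * Editor's illustration, not part of the original file: rounding a file
 * offset down to the start of its allocation unit with the helper above.
 * On a 4096-byte-block filesystem with a realtime extent size of 16 blocks,
 * xfs_inode_alloc_unitsize() returns 65536 for a realtime file and 4096
 * otherwise.  xfs_example_alloc_unit_start() is a hypothetical name, and the
 * offset is assumed to be non-negative.
 */
static xfs_off_t
xfs_example_alloc_unit_start(
        struct xfs_inode        *ip,
        xfs_off_t               offset)
{
        unsigned int            unit = xfs_inode_alloc_unitsize(ip);
        u32                     mod;

        /* The realtime extent size need not be a power of two. */
        div_u64_rem(offset, unit, &mod);
        return offset - mod;
}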
3122 
3123 /* Should we always be using copy on write for file writes? */
3124 bool
3125 xfs_is_always_cow_inode(
3126         struct xfs_inode        *ip)
3127 {
3128         return ip->i_mount->m_always_cow && xfs_has_reflink(ip->i_mount);
3129 }
3130 
