
TOMOYO Linux Cross Reference
Linux/fs/ext4/extents.c


  1 // SPDX-License-Identifier: GPL-2.0
  2 /*
  3  * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
  4  * Written by Alex Tomas <alex@clusterfs.com>
  5  *
  6  * Architecture independence:
  7  *   Copyright (c) 2005, Bull S.A.
  8  *   Written by Pierre Peiffer <pierre.peiffer@bull.net>
  9  */
 10 
 11 /*
 12  * Extents support for EXT4
 13  *
 14  * TODO:
 15  *   - ext4*_error() should be used in some situations
 16  *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
 17  *   - smart tree reduction
 18  */
 19 
 20 #include <linux/fs.h>
 21 #include <linux/time.h>
 22 #include <linux/jbd2.h>
 23 #include <linux/highuid.h>
 24 #include <linux/pagemap.h>
 25 #include <linux/quotaops.h>
 26 #include <linux/string.h>
 27 #include <linux/slab.h>
 28 #include <linux/uaccess.h>
 29 #include <linux/fiemap.h>
 30 #include <linux/iomap.h>
 31 #include <linux/sched/mm.h>
 32 #include "ext4_jbd2.h"
 33 #include "ext4_extents.h"
 34 #include "xattr.h"
 35 
 36 #include <trace/events/ext4.h>
 37 
 38 /*
 39  * used by extent splitting.
 40  */
 41 #define EXT4_EXT_MAY_ZEROOUT    0x1  /* safe to zeroout if split fails \
 42                                         due to ENOSPC */
 43 #define EXT4_EXT_MARK_UNWRIT1   0x2  /* mark first half unwritten */
 44 #define EXT4_EXT_MARK_UNWRIT2   0x4  /* mark second half unwritten */
 45 
 46 #define EXT4_EXT_DATA_VALID1    0x8  /* first half contains valid data */
 47 #define EXT4_EXT_DATA_VALID2    0x10 /* second half contains valid data */
 48 
 49 static __le32 ext4_extent_block_csum(struct inode *inode,
 50                                      struct ext4_extent_header *eh)
 51 {
 52         struct ext4_inode_info *ei = EXT4_I(inode);
 53         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 54         __u32 csum;
 55 
 56         csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh,
 57                            EXT4_EXTENT_TAIL_OFFSET(eh));
 58         return cpu_to_le32(csum);
 59 }
 60 
 61 static int ext4_extent_block_csum_verify(struct inode *inode,
 62                                          struct ext4_extent_header *eh)
 63 {
 64         struct ext4_extent_tail *et;
 65 
 66         if (!ext4_has_metadata_csum(inode->i_sb))
 67                 return 1;
 68 
 69         et = find_ext4_extent_tail(eh);
 70         if (et->et_checksum != ext4_extent_block_csum(inode, eh))
 71                 return 0;
 72         return 1;
 73 }
 74 
 75 static void ext4_extent_block_csum_set(struct inode *inode,
 76                                        struct ext4_extent_header *eh)
 77 {
 78         struct ext4_extent_tail *et;
 79 
 80         if (!ext4_has_metadata_csum(inode->i_sb))
 81                 return;
 82 
 83         et = find_ext4_extent_tail(eh);
 84         et->et_checksum = ext4_extent_block_csum(inode, eh);
 85 }
 86 
 87 static int ext4_split_extent_at(handle_t *handle,
 88                              struct inode *inode,
 89                              struct ext4_ext_path **ppath,
 90                              ext4_lblk_t split,
 91                              int split_flag,
 92                              int flags);
 93 
 94 static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
 95 {
 96         /*
 97          * Drop i_data_sem to avoid deadlock with ext4_map_blocks.  At this
 98          * moment, get_block can be called only for blocks inside i_size since
  99          * page cache has already been dropped and writes are blocked by
100          * i_rwsem. So we can safely drop the i_data_sem here.
101          */
102         BUG_ON(EXT4_JOURNAL(inode) == NULL);
103         ext4_discard_preallocations(inode);
104         up_write(&EXT4_I(inode)->i_data_sem);
105         *dropped = 1;
106         return 0;
107 }
108 
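/*
 * Drop the buffer_head reference held at each level of @path, if any.
 * The path array itself is not freed here; ext4_free_ext_path() below
 * drops the references and then frees the array.
 */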
109 static void ext4_ext_drop_refs(struct ext4_ext_path *path)
110 {
111         int depth, i;
112 
113         if (!path)
114                 return;
115         depth = path->p_depth;
116         for (i = 0; i <= depth; i++, path++) {
117                 brelse(path->p_bh);
118                 path->p_bh = NULL;
119         }
120 }
121 
122 void ext4_free_ext_path(struct ext4_ext_path *path)
123 {
124         ext4_ext_drop_refs(path);
125         kfree(path);
126 }
127 
128 /*
129  * Make sure 'handle' has at least 'check_cred' credits. If not, restart
130  * transaction with 'restart_cred' credits. The function drops i_data_sem
131  * when restarting transaction and gets it after transaction is restarted.
132  *
133  * The function returns 0 on success, 1 if transaction had to be restarted,
134  * and < 0 in case of fatal error.
135  */
136 int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
137                                 int check_cred, int restart_cred,
138                                 int revoke_cred)
139 {
140         int ret;
141         int dropped = 0;
142 
143         ret = ext4_journal_ensure_credits_fn(handle, check_cred, restart_cred,
144                 revoke_cred, ext4_ext_trunc_restart_fn(inode, &dropped));
145         if (dropped)
146                 down_write(&EXT4_I(inode)->i_data_sem);
147         return ret;
148 }
149 
150 /*
151  * could return:
152  *  - EROFS
153  *  - ENOMEM
154  */
155 static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
156                                 struct ext4_ext_path *path)
157 {
158         int err = 0;
159 
160         if (path->p_bh) {
161                 /* path points to block */
162                 BUFFER_TRACE(path->p_bh, "get_write_access");
163                 err = ext4_journal_get_write_access(handle, inode->i_sb,
164                                                     path->p_bh, EXT4_JTR_NONE);
165                 /*
166                  * The extent buffer's verified bit will be set again in
167                  * __ext4_ext_dirty(). We could leave an inconsistent
 168                  * buffer if the extent updating procedure breaks off due
 169                  * to an error, so force it to be checked again.
170                  */
171                 if (!err)
172                         clear_buffer_verified(path->p_bh);
173         }
174         /* path points to leaf/index in inode body */
175         /* we use in-core data, no need to protect them */
176         return err;
177 }
178 
179 /*
180  * could return:
181  *  - EROFS
182  *  - ENOMEM
183  *  - EIO
184  */
185 static int __ext4_ext_dirty(const char *where, unsigned int line,
186                             handle_t *handle, struct inode *inode,
187                             struct ext4_ext_path *path)
188 {
189         int err;
190 
191         WARN_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
192         if (path->p_bh) {
193                 ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh));
194                 /* path points to block */
195                 err = __ext4_handle_dirty_metadata(where, line, handle,
196                                                    inode, path->p_bh);
197                 /* Extents updating done, re-set verified flag */
198                 if (!err)
199                         set_buffer_verified(path->p_bh);
200         } else {
201                 /* path points to leaf/index in inode body */
202                 err = ext4_mark_inode_dirty(handle, inode);
203         }
204         return err;
205 }
206 
207 #define ext4_ext_dirty(handle, inode, path) \
208                 __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
209 
210 static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
211                               struct ext4_ext_path *path,
212                               ext4_lblk_t block)
213 {
214         if (path) {
215                 int depth = path->p_depth;
216                 struct ext4_extent *ex;
217 
218                 /*
219                  * Try to predict block placement assuming that we are
220                  * filling in a file which will eventually be
221                  * non-sparse --- i.e., in the case of libbfd writing
 222                  * an ELF object's sections out-of-order but in a way
 223                  * that eventually results in a contiguous object or
224                  * executable file, or some database extending a table
225                  * space file.  However, this is actually somewhat
226                  * non-ideal if we are writing a sparse file such as
227                  * qemu or KVM writing a raw image file that is going
228                  * to stay fairly sparse, since it will end up
229                  * fragmenting the file system's free space.  Maybe we
 230                  * should have some heuristics or some way to allow
 231                  * userspace to pass a hint to the file system,
232                  * especially if the latter case turns out to be
233                  * common.
234                  */
235                 ex = path[depth].p_ext;
236                 if (ex) {
237                         ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
238                         ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);
239 
240                         if (block > ext_block)
241                                 return ext_pblk + (block - ext_block);
242                         else
243                                 return ext_pblk - (ext_block - block);
244                 }
245 
 246                 /* it looks like the index is empty;
 247                  * try to find the starting block from the index itself */
248                 if (path[depth].p_bh)
249                         return path[depth].p_bh->b_blocknr;
250         }
251 
252         /* OK. use inode's group */
253         return ext4_inode_to_goal_block(inode);
254 }
255 
256 /*
 257  * Allocation for a metadata block
258  */
259 static ext4_fsblk_t
260 ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
261                         struct ext4_ext_path *path,
262                         struct ext4_extent *ex, int *err, unsigned int flags)
263 {
264         ext4_fsblk_t goal, newblock;
265 
266         goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
267         newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
268                                         NULL, err);
269         return newblock;
270 }
271 
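/*
 * The ext4_ext_space_*() helpers return how many extent or index entries
 * fit in a tree node: the *_block variants for an on-disk block (block
 * size minus the header), the *_root variants for the root kept in the
 * inode's i_data.  With AGGRESSIVE_TEST defined the limits are capped at
 * a few entries so that deep trees can be exercised with small files.
 */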
272 static inline int ext4_ext_space_block(struct inode *inode, int check)
273 {
274         int size;
275 
276         size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
277                         / sizeof(struct ext4_extent);
278 #ifdef AGGRESSIVE_TEST
279         if (!check && size > 6)
280                 size = 6;
281 #endif
282         return size;
283 }
284 
285 static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
286 {
287         int size;
288 
289         size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
290                         / sizeof(struct ext4_extent_idx);
291 #ifdef AGGRESSIVE_TEST
292         if (!check && size > 5)
293                 size = 5;
294 #endif
295         return size;
296 }
297 
298 static inline int ext4_ext_space_root(struct inode *inode, int check)
299 {
300         int size;
301 
302         size = sizeof(EXT4_I(inode)->i_data);
303         size -= sizeof(struct ext4_extent_header);
304         size /= sizeof(struct ext4_extent);
305 #ifdef AGGRESSIVE_TEST
306         if (!check && size > 3)
307                 size = 3;
308 #endif
309         return size;
310 }
311 
312 static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
313 {
314         int size;
315 
316         size = sizeof(EXT4_I(inode)->i_data);
317         size -= sizeof(struct ext4_extent_header);
318         size /= sizeof(struct ext4_extent_idx);
319 #ifdef AGGRESSIVE_TEST
320         if (!check && size > 4)
321                 size = 4;
322 #endif
323         return size;
324 }
325 
326 static inline int
327 ext4_force_split_extent_at(handle_t *handle, struct inode *inode,
328                            struct ext4_ext_path **ppath, ext4_lblk_t lblk,
329                            int nofail)
330 {
331         struct ext4_ext_path *path = *ppath;
332         int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext);
333         int flags = EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO;
334 
335         if (nofail)
336                 flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_EX_NOFAIL;
337 
338         return ext4_split_extent_at(handle, inode, ppath, lblk, unwritten ?
339                         EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0,
340                         flags);
341 }
342 
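/*
 * Return the maximum number of entries permitted in a tree node at
 * @depth; the in-inode root has a smaller limit than an on-disk block.
 */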
343 static int
344 ext4_ext_max_entries(struct inode *inode, int depth)
345 {
346         int max;
347 
348         if (depth == ext_depth(inode)) {
349                 if (depth == 0)
350                         max = ext4_ext_space_root(inode, 1);
351                 else
352                         max = ext4_ext_space_root_idx(inode, 1);
353         } else {
354                 if (depth == 0)
355                         max = ext4_ext_space_block(inode, 1);
356                 else
357                         max = ext4_ext_space_block_idx(inode, 1);
358         }
359 
360         return max;
361 }
362 
363 static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
364 {
365         ext4_fsblk_t block = ext4_ext_pblock(ext);
366         int len = ext4_ext_get_actual_len(ext);
367         ext4_lblk_t lblock = le32_to_cpu(ext->ee_block);
368 
369         /*
370          * We allow neither:
371          *  - zero length
372          *  - overflow/wrap-around
373          */
374         if (lblock + len <= lblock)
375                 return 0;
376         return ext4_inode_block_valid(inode, block, len);
377 }
378 
379 static int ext4_valid_extent_idx(struct inode *inode,
380                                 struct ext4_extent_idx *ext_idx)
381 {
382         ext4_fsblk_t block = ext4_idx_pblock(ext_idx);
383 
384         return ext4_inode_block_valid(inode, block, 1);
385 }
386 
387 static int ext4_valid_extent_entries(struct inode *inode,
388                                      struct ext4_extent_header *eh,
389                                      ext4_lblk_t lblk, ext4_fsblk_t *pblk,
390                                      int depth)
391 {
392         unsigned short entries;
393         ext4_lblk_t lblock = 0;
394         ext4_lblk_t cur = 0;
395 
396         if (eh->eh_entries == 0)
397                 return 1;
398 
399         entries = le16_to_cpu(eh->eh_entries);
400 
401         if (depth == 0) {
402                 /* leaf entries */
403                 struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
404 
405                 /*
 406                  * The logical block in the first entry should be equal to
407                  * the number in the index block.
408                  */
409                 if (depth != ext_depth(inode) &&
410                     lblk != le32_to_cpu(ext->ee_block))
411                         return 0;
412                 while (entries) {
413                         if (!ext4_valid_extent(inode, ext))
414                                 return 0;
415 
416                         /* Check for overlapping extents */
417                         lblock = le32_to_cpu(ext->ee_block);
418                         if (lblock < cur) {
419                                 *pblk = ext4_ext_pblock(ext);
420                                 return 0;
421                         }
422                         cur = lblock + ext4_ext_get_actual_len(ext);
423                         ext++;
424                         entries--;
425                 }
426         } else {
427                 struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
428 
429                 /*
 430                  * The logical block in the first entry should be equal to
431                  * the number in the parent index block.
432                  */
433                 if (depth != ext_depth(inode) &&
434                     lblk != le32_to_cpu(ext_idx->ei_block))
435                         return 0;
436                 while (entries) {
437                         if (!ext4_valid_extent_idx(inode, ext_idx))
438                                 return 0;
439 
440                         /* Check for overlapping index extents */
441                         lblock = le32_to_cpu(ext_idx->ei_block);
442                         if (lblock < cur) {
443                                 *pblk = ext4_idx_pblock(ext_idx);
444                                 return 0;
445                         }
446                         ext_idx++;
447                         entries--;
448                         cur = lblock + 1;
449                 }
450         }
451         return 1;
452 }
453 
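/*
 * Sanity-check an extent tree node: header magic, depth, entry counts,
 * the entries themselves and, for non-root nodes, the block checksum.
 * On corruption the problem is reported via ext4_error_inode_err() and
 * -EFSCORRUPTED (or -EFSBADCRC on a checksum mismatch) is returned.
 */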
454 static int __ext4_ext_check(const char *function, unsigned int line,
455                             struct inode *inode, struct ext4_extent_header *eh,
456                             int depth, ext4_fsblk_t pblk, ext4_lblk_t lblk)
457 {
458         const char *error_msg;
459         int max = 0, err = -EFSCORRUPTED;
460 
461         if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
462                 error_msg = "invalid magic";
463                 goto corrupted;
464         }
465         if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
466                 error_msg = "unexpected eh_depth";
467                 goto corrupted;
468         }
469         if (unlikely(eh->eh_max == 0)) {
470                 error_msg = "invalid eh_max";
471                 goto corrupted;
472         }
473         max = ext4_ext_max_entries(inode, depth);
474         if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
475                 error_msg = "too large eh_max";
476                 goto corrupted;
477         }
478         if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
479                 error_msg = "invalid eh_entries";
480                 goto corrupted;
481         }
482         if (unlikely((eh->eh_entries == 0) && (depth > 0))) {
483                 error_msg = "eh_entries is 0 but eh_depth is > 0";
484                 goto corrupted;
485         }
486         if (!ext4_valid_extent_entries(inode, eh, lblk, &pblk, depth)) {
487                 error_msg = "invalid extent entries";
488                 goto corrupted;
489         }
490         if (unlikely(depth > 32)) {
491                 error_msg = "too large eh_depth";
492                 goto corrupted;
493         }
494         /* Verify checksum on non-root extent tree nodes */
495         if (ext_depth(inode) != depth &&
496             !ext4_extent_block_csum_verify(inode, eh)) {
497                 error_msg = "extent tree corrupted";
498                 err = -EFSBADCRC;
499                 goto corrupted;
500         }
501         return 0;
502 
503 corrupted:
504         ext4_error_inode_err(inode, function, line, 0, -err,
505                              "pblk %llu bad header/extent: %s - magic %x, "
506                              "entries %u, max %u(%u), depth %u(%u)",
507                              (unsigned long long) pblk, error_msg,
508                              le16_to_cpu(eh->eh_magic),
509                              le16_to_cpu(eh->eh_entries),
510                              le16_to_cpu(eh->eh_max),
511                              max, le16_to_cpu(eh->eh_depth), depth);
512         return err;
513 }
514 
515 #define ext4_ext_check(inode, eh, depth, pblk)                  \
516         __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk), 0)
517 
518 int ext4_ext_check_inode(struct inode *inode)
519 {
520         return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0);
521 }
522 
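/*
 * Cache all extents of leaf @eh in the extent status tree, inserting
 * hole entries for the gaps between consecutive extents.
 */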
523 static void ext4_cache_extents(struct inode *inode,
524                                struct ext4_extent_header *eh)
525 {
526         struct ext4_extent *ex = EXT_FIRST_EXTENT(eh);
527         ext4_lblk_t prev = 0;
528         int i;
529 
530         for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) {
531                 unsigned int status = EXTENT_STATUS_WRITTEN;
532                 ext4_lblk_t lblk = le32_to_cpu(ex->ee_block);
533                 int len = ext4_ext_get_actual_len(ex);
534 
535                 if (prev && (prev != lblk))
536                         ext4_es_cache_extent(inode, prev, lblk - prev, ~0,
537                                              EXTENT_STATUS_HOLE);
538 
539                 if (ext4_ext_is_unwritten(ex))
540                         status = EXTENT_STATUS_UNWRITTEN;
541                 ext4_es_cache_extent(inode, lblk, len,
542                                      ext4_ext_pblock(ex), status);
543                 prev = lblk + len;
544         }
545 }
546 
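/*
 * Read the extent tree block referenced by @idx, verify its header,
 * entries and checksum if the buffer is not already verified, and, for
 * leaf blocks, optionally cache its extents in the extent status tree.
 * Returns the buffer_head on success or an ERR_PTR() on failure.
 */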
547 static struct buffer_head *
548 __read_extent_tree_block(const char *function, unsigned int line,
549                          struct inode *inode, struct ext4_extent_idx *idx,
550                          int depth, int flags)
551 {
552         struct buffer_head              *bh;
553         int                             err;
554         gfp_t                           gfp_flags = __GFP_MOVABLE | GFP_NOFS;
555         ext4_fsblk_t                    pblk;
556 
557         if (flags & EXT4_EX_NOFAIL)
558                 gfp_flags |= __GFP_NOFAIL;
559 
560         pblk = ext4_idx_pblock(idx);
561         bh = sb_getblk_gfp(inode->i_sb, pblk, gfp_flags);
562         if (unlikely(!bh))
563                 return ERR_PTR(-ENOMEM);
564 
565         if (!bh_uptodate_or_lock(bh)) {
566                 trace_ext4_ext_load_extent(inode, pblk, _RET_IP_);
567                 err = ext4_read_bh(bh, 0, NULL);
568                 if (err < 0)
569                         goto errout;
570         }
571         if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE))
572                 return bh;
573         err = __ext4_ext_check(function, line, inode, ext_block_hdr(bh),
574                                depth, pblk, le32_to_cpu(idx->ei_block));
575         if (err)
576                 goto errout;
577         set_buffer_verified(bh);
578         /*
579          * If this is a leaf block, cache all of its entries
580          */
581         if (!(flags & EXT4_EX_NOCACHE) && depth == 0) {
582                 struct ext4_extent_header *eh = ext_block_hdr(bh);
583                 ext4_cache_extents(inode, eh);
584         }
585         return bh;
586 errout:
587         put_bh(bh);
588         return ERR_PTR(err);
589 
590 }
591 
592 #define read_extent_tree_block(inode, idx, depth, flags)                \
593         __read_extent_tree_block(__func__, __LINE__, (inode), (idx),    \
594                                  (depth), (flags))
595 
596 /*
597  * This function is called to cache a file's extent information in the
598  * extent status tree
599  */
600 int ext4_ext_precache(struct inode *inode)
601 {
602         struct ext4_inode_info *ei = EXT4_I(inode);
603         struct ext4_ext_path *path = NULL;
604         struct buffer_head *bh;
605         int i = 0, depth, ret = 0;
606 
607         if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
608                 return 0;       /* not an extent-mapped inode */
609 
610         down_read(&ei->i_data_sem);
611         depth = ext_depth(inode);
612 
613         /* Don't cache anything if there are no external extent blocks */
614         if (!depth) {
615                 up_read(&ei->i_data_sem);
616                 return ret;
617         }
618 
619         path = kcalloc(depth + 1, sizeof(struct ext4_ext_path),
620                        GFP_NOFS);
621         if (path == NULL) {
622                 up_read(&ei->i_data_sem);
623                 return -ENOMEM;
624         }
625 
626         path[0].p_hdr = ext_inode_hdr(inode);
627         ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0);
628         if (ret)
629                 goto out;
630         path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr);
631         while (i >= 0) {
632                 /*
633                  * If this is a leaf block or we've reached the end of
634                  * the index block, go up
635                  */
636                 if ((i == depth) ||
637                     path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) {
638                         brelse(path[i].p_bh);
639                         path[i].p_bh = NULL;
640                         i--;
641                         continue;
642                 }
643                 bh = read_extent_tree_block(inode, path[i].p_idx++,
644                                             depth - i - 1,
645                                             EXT4_EX_FORCE_CACHE);
646                 if (IS_ERR(bh)) {
647                         ret = PTR_ERR(bh);
648                         break;
649                 }
650                 i++;
651                 path[i].p_bh = bh;
652                 path[i].p_hdr = ext_block_hdr(bh);
653                 path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr);
654         }
655         ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
656 out:
657         up_read(&ei->i_data_sem);
658         ext4_free_ext_path(path);
659         return ret;
660 }
661 
662 #ifdef EXT_DEBUG
663 static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
664 {
665         int k, l = path->p_depth;
666 
667         ext_debug(inode, "path:");
668         for (k = 0; k <= l; k++, path++) {
669                 if (path->p_idx) {
670                         ext_debug(inode, "  %d->%llu",
671                                   le32_to_cpu(path->p_idx->ei_block),
672                                   ext4_idx_pblock(path->p_idx));
673                 } else if (path->p_ext) {
674                         ext_debug(inode, "  %d:[%d]%d:%llu ",
675                                   le32_to_cpu(path->p_ext->ee_block),
676                                   ext4_ext_is_unwritten(path->p_ext),
677                                   ext4_ext_get_actual_len(path->p_ext),
678                                   ext4_ext_pblock(path->p_ext));
679                 } else
680                         ext_debug(inode, "  []");
681         }
682         ext_debug(inode, "\n");
683 }
684 
685 static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
686 {
687         int depth = ext_depth(inode);
688         struct ext4_extent_header *eh;
689         struct ext4_extent *ex;
690         int i;
691 
692         if (!path)
693                 return;
694 
695         eh = path[depth].p_hdr;
696         ex = EXT_FIRST_EXTENT(eh);
697 
698         ext_debug(inode, "Displaying leaf extents\n");
699 
700         for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
701                 ext_debug(inode, "%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
702                           ext4_ext_is_unwritten(ex),
703                           ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
704         }
705         ext_debug(inode, "\n");
706 }
707 
708 static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
709                         ext4_fsblk_t newblock, int level)
710 {
711         int depth = ext_depth(inode);
712         struct ext4_extent *ex;
713 
714         if (depth != level) {
715                 struct ext4_extent_idx *idx;
716                 idx = path[level].p_idx;
717                 while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) {
718                         ext_debug(inode, "%d: move %d:%llu in new index %llu\n",
719                                   level, le32_to_cpu(idx->ei_block),
720                                   ext4_idx_pblock(idx), newblock);
721                         idx++;
722                 }
723 
724                 return;
725         }
726 
727         ex = path[depth].p_ext;
728         while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) {
729                 ext_debug(inode, "move %d:%llu:[%d]%d in new leaf %llu\n",
730                                 le32_to_cpu(ex->ee_block),
731                                 ext4_ext_pblock(ex),
732                                 ext4_ext_is_unwritten(ex),
733                                 ext4_ext_get_actual_len(ex),
734                                 newblock);
735                 ex++;
736         }
737 }
738 
739 #else
740 #define ext4_ext_show_path(inode, path)
741 #define ext4_ext_show_leaf(inode, path)
742 #define ext4_ext_show_move(inode, path, newblock, level)
743 #endif
744 
745 /*
746  * ext4_ext_binsearch_idx:
 747  * binary search for the closest index to the given block;
 748  * the header must be checked before calling this
749  */
750 static void
751 ext4_ext_binsearch_idx(struct inode *inode,
752                         struct ext4_ext_path *path, ext4_lblk_t block)
753 {
754         struct ext4_extent_header *eh = path->p_hdr;
755         struct ext4_extent_idx *r, *l, *m;
756 
757 
758         ext_debug(inode, "binsearch for %u(idx):  ", block);
759 
760         l = EXT_FIRST_INDEX(eh) + 1;
761         r = EXT_LAST_INDEX(eh);
762         while (l <= r) {
763                 m = l + (r - l) / 2;
764                 ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l,
765                           le32_to_cpu(l->ei_block), m, le32_to_cpu(m->ei_block),
766                           r, le32_to_cpu(r->ei_block));
767 
768                 if (block < le32_to_cpu(m->ei_block))
769                         r = m - 1;
770                 else
771                         l = m + 1;
772         }
773 
774         path->p_idx = l - 1;
775         ext_debug(inode, "  -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block),
776                   ext4_idx_pblock(path->p_idx));
777 
778 #ifdef CHECK_BINSEARCH
779         {
780                 struct ext4_extent_idx *chix, *ix;
781                 int k;
782 
783                 chix = ix = EXT_FIRST_INDEX(eh);
784                 for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
785                         if (k != 0 && le32_to_cpu(ix->ei_block) <=
786                             le32_to_cpu(ix[-1].ei_block)) {
787                                 printk(KERN_DEBUG "k=%d, ix=0x%p, "
788                                        "first=0x%p\n", k,
789                                        ix, EXT_FIRST_INDEX(eh));
790                                 printk(KERN_DEBUG "%u <= %u\n",
791                                        le32_to_cpu(ix->ei_block),
792                                        le32_to_cpu(ix[-1].ei_block));
793                         }
794                         BUG_ON(k && le32_to_cpu(ix->ei_block)
795                                            <= le32_to_cpu(ix[-1].ei_block));
796                         if (block < le32_to_cpu(ix->ei_block))
797                                 break;
798                         chix = ix;
799                 }
800                 BUG_ON(chix != path->p_idx);
801         }
802 #endif
803 
804 }
805 
806 /*
807  * ext4_ext_binsearch:
 808  * binary search for the closest extent to the given block;
 809  * the header must be checked before calling this
810  */
811 static void
812 ext4_ext_binsearch(struct inode *inode,
813                 struct ext4_ext_path *path, ext4_lblk_t block)
814 {
815         struct ext4_extent_header *eh = path->p_hdr;
816         struct ext4_extent *r, *l, *m;
817 
818         if (eh->eh_entries == 0) {
819                 /*
820                  * this leaf is empty:
821                  * we get such a leaf in split/add case
822                  */
823                 return;
824         }
825 
826         ext_debug(inode, "binsearch for %u:  ", block);
827 
828         l = EXT_FIRST_EXTENT(eh) + 1;
829         r = EXT_LAST_EXTENT(eh);
830 
831         while (l <= r) {
832                 m = l + (r - l) / 2;
833                 ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l,
834                           le32_to_cpu(l->ee_block), m, le32_to_cpu(m->ee_block),
835                           r, le32_to_cpu(r->ee_block));
836 
837                 if (block < le32_to_cpu(m->ee_block))
838                         r = m - 1;
839                 else
840                         l = m + 1;
841         }
842 
843         path->p_ext = l - 1;
844         ext_debug(inode, "  -> %d:%llu:[%d]%d ",
845                         le32_to_cpu(path->p_ext->ee_block),
846                         ext4_ext_pblock(path->p_ext),
847                         ext4_ext_is_unwritten(path->p_ext),
848                         ext4_ext_get_actual_len(path->p_ext));
849 
850 #ifdef CHECK_BINSEARCH
851         {
852                 struct ext4_extent *chex, *ex;
853                 int k;
854 
855                 chex = ex = EXT_FIRST_EXTENT(eh);
856                 for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
857                         BUG_ON(k && le32_to_cpu(ex->ee_block)
858                                           <= le32_to_cpu(ex[-1].ee_block));
859                         if (block < le32_to_cpu(ex->ee_block))
860                                 break;
861                         chex = ex;
862                 }
863                 BUG_ON(chex != path->p_ext);
864         }
865 #endif
866 
867 }
868 
869 void ext4_ext_tree_init(handle_t *handle, struct inode *inode)
870 {
871         struct ext4_extent_header *eh;
872 
873         eh = ext_inode_hdr(inode);
874         eh->eh_depth = 0;
875         eh->eh_entries = 0;
876         eh->eh_magic = EXT4_EXT_MAGIC;
877         eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
878         eh->eh_generation = 0;
879         ext4_mark_inode_dirty(handle, inode);
880 }
881 
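/*
 * ext4_find_extent:
 * Walk the extent tree from the root down to the level that covers
 * @block, filling in one ext4_ext_path entry per level.  A path passed
 * in via @orig_path is reused, or reallocated if the tree has grown
 * deeper.  Returns the path on success or an ERR_PTR() on failure, in
 * which case *orig_path is set to NULL.
 */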
882 struct ext4_ext_path *
883 ext4_find_extent(struct inode *inode, ext4_lblk_t block,
884                  struct ext4_ext_path **orig_path, int flags)
885 {
886         struct ext4_extent_header *eh;
887         struct buffer_head *bh;
888         struct ext4_ext_path *path = orig_path ? *orig_path : NULL;
889         short int depth, i, ppos = 0;
890         int ret;
891         gfp_t gfp_flags = GFP_NOFS;
892 
893         if (flags & EXT4_EX_NOFAIL)
894                 gfp_flags |= __GFP_NOFAIL;
895 
896         eh = ext_inode_hdr(inode);
897         depth = ext_depth(inode);
898         if (depth < 0 || depth > EXT4_MAX_EXTENT_DEPTH) {
899                 EXT4_ERROR_INODE(inode, "inode has invalid extent depth: %d",
900                                  depth);
901                 ret = -EFSCORRUPTED;
902                 goto err;
903         }
904 
905         if (path) {
906                 ext4_ext_drop_refs(path);
907                 if (depth > path[0].p_maxdepth) {
908                         kfree(path);
909                         *orig_path = path = NULL;
910                 }
911         }
912         if (!path) {
913                 /* account possible depth increase */
914                 path = kcalloc(depth + 2, sizeof(struct ext4_ext_path),
915                                 gfp_flags);
916                 if (unlikely(!path))
917                         return ERR_PTR(-ENOMEM);
918                 path[0].p_maxdepth = depth + 1;
919         }
920         path[0].p_hdr = eh;
921         path[0].p_bh = NULL;
922 
923         i = depth;
924         if (!(flags & EXT4_EX_NOCACHE) && depth == 0)
925                 ext4_cache_extents(inode, eh);
926         /* walk through the tree */
927         while (i) {
928                 ext_debug(inode, "depth %d: num %d, max %d\n",
929                           ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
930 
931                 ext4_ext_binsearch_idx(inode, path + ppos, block);
932                 path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
933                 path[ppos].p_depth = i;
934                 path[ppos].p_ext = NULL;
935 
936                 bh = read_extent_tree_block(inode, path[ppos].p_idx, --i, flags);
937                 if (IS_ERR(bh)) {
938                         ret = PTR_ERR(bh);
939                         goto err;
940                 }
941 
942                 eh = ext_block_hdr(bh);
943                 ppos++;
944                 path[ppos].p_bh = bh;
945                 path[ppos].p_hdr = eh;
946         }
947 
948         path[ppos].p_depth = i;
949         path[ppos].p_ext = NULL;
950         path[ppos].p_idx = NULL;
951 
952         /* find extent */
953         ext4_ext_binsearch(inode, path + ppos, block);
954         /* if not an empty leaf */
955         if (path[ppos].p_ext)
956                 path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
957 
958         ext4_ext_show_path(inode, path);
959 
960         if (orig_path)
961                 *orig_path = path;
962         return path;
963 
964 err:
965         ext4_free_ext_path(path);
966         if (orig_path)
967                 *orig_path = NULL;
968         return ERR_PTR(ret);
969 }
970 
971 /*
972  * ext4_ext_insert_index:
973  * insert new index [@logical;@ptr] into the block at @curp;
974  * check where to insert: before @curp or after @curp
975  */
976 static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
977                                  struct ext4_ext_path *curp,
978                                  int logical, ext4_fsblk_t ptr)
979 {
980         struct ext4_extent_idx *ix;
981         int len, err;
982 
983         err = ext4_ext_get_access(handle, inode, curp);
984         if (err)
985                 return err;
986 
987         if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
988                 EXT4_ERROR_INODE(inode,
989                                  "logical %d == ei_block %d!",
990                                  logical, le32_to_cpu(curp->p_idx->ei_block));
991                 return -EFSCORRUPTED;
992         }
993 
994         if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
995                              >= le16_to_cpu(curp->p_hdr->eh_max))) {
996                 EXT4_ERROR_INODE(inode,
997                                  "eh_entries %d >= eh_max %d!",
998                                  le16_to_cpu(curp->p_hdr->eh_entries),
999                                  le16_to_cpu(curp->p_hdr->eh_max));
1000                 return -EFSCORRUPTED;
1001         }
1002 
1003         if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
1004                 /* insert after */
1005                 ext_debug(inode, "insert new index %d after: %llu\n",
1006                           logical, ptr);
1007                 ix = curp->p_idx + 1;
1008         } else {
1009                 /* insert before */
1010                 ext_debug(inode, "insert new index %d before: %llu\n",
1011                           logical, ptr);
1012                 ix = curp->p_idx;
1013         }
1014 
1015         if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
1016                 EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
1017                 return -EFSCORRUPTED;
1018         }
1019 
1020         len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1;
1021         BUG_ON(len < 0);
1022         if (len > 0) {
1023                 ext_debug(inode, "insert new index %d: "
1024                                 "move %d indices from 0x%p to 0x%p\n",
1025                                 logical, len, ix, ix + 1);
1026                 memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx));
1027         }
1028 
1029         ix->ei_block = cpu_to_le32(logical);
1030         ext4_idx_store_pblock(ix, ptr);
1031         le16_add_cpu(&curp->p_hdr->eh_entries, 1);
1032 
1033         if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
1034                 EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
1035                 return -EFSCORRUPTED;
1036         }
1037 
1038         err = ext4_ext_dirty(handle, inode, curp);
1039         ext4_std_error(inode->i_sb, err);
1040 
1041         return err;
1042 }
1043 
1044 /*
1045  * ext4_ext_split:
1046  * inserts new subtree into the path, using free index entry
1047  * at depth @at:
1048  * - allocates all needed blocks (new leaf and all intermediate index blocks)
1049  * - makes decision where to split
1050  * - moves remaining extents and index entries (right to the split point)
1051  *   into the newly allocated blocks
1052  * - initializes subtree
1053  */
1054 static int ext4_ext_split(handle_t *handle, struct inode *inode,
1055                           unsigned int flags,
1056                           struct ext4_ext_path *path,
1057                           struct ext4_extent *newext, int at)
1058 {
1059         struct buffer_head *bh = NULL;
1060         int depth = ext_depth(inode);
1061         struct ext4_extent_header *neh;
1062         struct ext4_extent_idx *fidx;
1063         int i = at, k, m, a;
1064         ext4_fsblk_t newblock, oldblock;
1065         __le32 border;
1066         ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
1067         gfp_t gfp_flags = GFP_NOFS;
1068         int err = 0;
1069         size_t ext_size = 0;
1070 
1071         if (flags & EXT4_EX_NOFAIL)
1072                 gfp_flags |= __GFP_NOFAIL;
1073 
1074         /* make decision: where to split? */
1075         /* FIXME: now decision is simplest: at current extent */
1076 
1077         /* if current leaf will be split, then we should use
1078          * border from split point */
1079         if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
1080                 EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
1081                 return -EFSCORRUPTED;
1082         }
1083         if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
1084                 border = path[depth].p_ext[1].ee_block;
1085                 ext_debug(inode, "leaf will be split."
1086                                 " next leaf starts at %d\n",
1087                                   le32_to_cpu(border));
1088         } else {
1089                 border = newext->ee_block;
1090                 ext_debug(inode, "leaf will be added."
1091                                 " next leaf starts at %d\n",
1092                                 le32_to_cpu(border));
1093         }
1094 
1095         /*
1096          * If an error occurs, we stop processing and
1097          * mark the filesystem read-only. The index won't
1098          * be inserted and the tree will remain in a consistent
1099          * state. The next mount will repair the buffers too.
1100          */
1101 
1102         /*
1103          * Get an array to track all allocated blocks.
1104          * We need this to handle errors and to free the
1105          * blocks on failure.
1106          */
1107         ablocks = kcalloc(depth, sizeof(ext4_fsblk_t), gfp_flags);
1108         if (!ablocks)
1109                 return -ENOMEM;
1110 
1111         /* allocate all needed blocks */
1112         ext_debug(inode, "allocate %d blocks for indexes/leaf\n", depth - at);
1113         for (a = 0; a < depth - at; a++) {
1114                 newblock = ext4_ext_new_meta_block(handle, inode, path,
1115                                                    newext, &err, flags);
1116                 if (newblock == 0)
1117                         goto cleanup;
1118                 ablocks[a] = newblock;
1119         }
1120 
1121         /* initialize new leaf */
1122         newblock = ablocks[--a];
1123         if (unlikely(newblock == 0)) {
1124                 EXT4_ERROR_INODE(inode, "newblock == 0!");
1125                 err = -EFSCORRUPTED;
1126                 goto cleanup;
1127         }
1128         bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS);
1129         if (unlikely(!bh)) {
1130                 err = -ENOMEM;
1131                 goto cleanup;
1132         }
1133         lock_buffer(bh);
1134 
1135         err = ext4_journal_get_create_access(handle, inode->i_sb, bh,
1136                                              EXT4_JTR_NONE);
1137         if (err)
1138                 goto cleanup;
1139 
1140         neh = ext_block_hdr(bh);
1141         neh->eh_entries = 0;
1142         neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
1143         neh->eh_magic = EXT4_EXT_MAGIC;
1144         neh->eh_depth = 0;
1145         neh->eh_generation = 0;
1146 
1147         /* move remainder of path[depth] to the new leaf */
1148         if (unlikely(path[depth].p_hdr->eh_entries !=
1149                      path[depth].p_hdr->eh_max)) {
1150                 EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
1151                                  path[depth].p_hdr->eh_entries,
1152                                  path[depth].p_hdr->eh_max);
1153                 err = -EFSCORRUPTED;
1154                 goto cleanup;
1155         }
1156         /* start copy from next extent */
1157         m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++;
1158         ext4_ext_show_move(inode, path, newblock, depth);
1159         if (m) {
1160                 struct ext4_extent *ex;
1161                 ex = EXT_FIRST_EXTENT(neh);
1162                 memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m);
1163                 le16_add_cpu(&neh->eh_entries, m);
1164         }
1165 
1166         /* zero out unused area in the extent block */
1167         ext_size = sizeof(struct ext4_extent_header) +
1168                 sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries);
1169         memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size);
1170         ext4_extent_block_csum_set(inode, neh);
1171         set_buffer_uptodate(bh);
1172         unlock_buffer(bh);
1173 
1174         err = ext4_handle_dirty_metadata(handle, inode, bh);
1175         if (err)
1176                 goto cleanup;
1177         brelse(bh);
1178         bh = NULL;
1179 
1180         /* correct old leaf */
1181         if (m) {
1182                 err = ext4_ext_get_access(handle, inode, path + depth);
1183                 if (err)
1184                         goto cleanup;
1185                 le16_add_cpu(&path[depth].p_hdr->eh_entries, -m);
1186                 err = ext4_ext_dirty(handle, inode, path + depth);
1187                 if (err)
1188                         goto cleanup;
1189 
1190         }
1191 
1192         /* create intermediate indexes */
1193         k = depth - at - 1;
1194         if (unlikely(k < 0)) {
1195                 EXT4_ERROR_INODE(inode, "k %d < 0!", k);
1196                 err = -EFSCORRUPTED;
1197                 goto cleanup;
1198         }
1199         if (k)
1200                 ext_debug(inode, "create %d intermediate indices\n", k);
1201         /* insert new index into current index block */
1202         /* current depth stored in i var */
1203         i = depth - 1;
1204         while (k--) {
1205                 oldblock = newblock;
1206                 newblock = ablocks[--a];
1207                 bh = sb_getblk(inode->i_sb, newblock);
1208                 if (unlikely(!bh)) {
1209                         err = -ENOMEM;
1210                         goto cleanup;
1211                 }
1212                 lock_buffer(bh);
1213 
1214                 err = ext4_journal_get_create_access(handle, inode->i_sb, bh,
1215                                                      EXT4_JTR_NONE);
1216                 if (err)
1217                         goto cleanup;
1218 
1219                 neh = ext_block_hdr(bh);
1220                 neh->eh_entries = cpu_to_le16(1);
1221                 neh->eh_magic = EXT4_EXT_MAGIC;
1222                 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
1223                 neh->eh_depth = cpu_to_le16(depth - i);
1224                 neh->eh_generation = 0;
1225                 fidx = EXT_FIRST_INDEX(neh);
1226                 fidx->ei_block = border;
1227                 ext4_idx_store_pblock(fidx, oldblock);
1228 
1229                 ext_debug(inode, "int.index at %d (block %llu): %u -> %llu\n",
1230                                 i, newblock, le32_to_cpu(border), oldblock);
1231 
1232                 /* move remainder of path[i] to the new index block */
1233                 if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
1234                                         EXT_LAST_INDEX(path[i].p_hdr))) {
1235                         EXT4_ERROR_INODE(inode,
1236                                          "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
1237                                          le32_to_cpu(path[i].p_ext->ee_block));
1238                         err = -EFSCORRUPTED;
1239                         goto cleanup;
1240                 }
1241                 /* start copy indexes */
1242                 m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
1243                 ext_debug(inode, "cur 0x%p, last 0x%p\n", path[i].p_idx,
1244                                 EXT_MAX_INDEX(path[i].p_hdr));
1245                 ext4_ext_show_move(inode, path, newblock, i);
1246                 if (m) {
1247                         memmove(++fidx, path[i].p_idx,
1248                                 sizeof(struct ext4_extent_idx) * m);
1249                         le16_add_cpu(&neh->eh_entries, m);
1250                 }
1251                 /* zero out unused area in the extent block */
1252                 ext_size = sizeof(struct ext4_extent_header) +
1253                    (sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries));
1254                 memset(bh->b_data + ext_size, 0,
1255                         inode->i_sb->s_blocksize - ext_size);
1256                 ext4_extent_block_csum_set(inode, neh);
1257                 set_buffer_uptodate(bh);
1258                 unlock_buffer(bh);
1259 
1260                 err = ext4_handle_dirty_metadata(handle, inode, bh);
1261                 if (err)
1262                         goto cleanup;
1263                 brelse(bh);
1264                 bh = NULL;
1265 
1266                 /* correct old index */
1267                 if (m) {
1268                         err = ext4_ext_get_access(handle, inode, path + i);
1269                         if (err)
1270                                 goto cleanup;
1271                         le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
1272                         err = ext4_ext_dirty(handle, inode, path + i);
1273                         if (err)
1274                                 goto cleanup;
1275                 }
1276 
1277                 i--;
1278         }
1279 
1280         /* insert new index */
1281         err = ext4_ext_insert_index(handle, inode, path + at,
1282                                     le32_to_cpu(border), newblock);
1283 
1284 cleanup:
1285         if (bh) {
1286                 if (buffer_locked(bh))
1287                         unlock_buffer(bh);
1288                 brelse(bh);
1289         }
1290 
1291         if (err) {
1292                 /* free all allocated blocks in error case */
1293                 for (i = 0; i < depth; i++) {
1294                         if (!ablocks[i])
1295                                 continue;
1296                         ext4_free_blocks(handle, inode, NULL, ablocks[i], 1,
1297                                          EXT4_FREE_BLOCKS_METADATA);
1298                 }
1299         }
1300         kfree(ablocks);
1301 
1302         return err;
1303 }
1304 
1305 /*
1306  * ext4_ext_grow_indepth:
1307  * implements tree growing procedure:
1308  * - allocates new block
1309  * - moves top-level data (index block or leaf) into the new block
1310  * - initializes new top-level, creating index that points to the
1311  *   just created block
1312  */
1313 static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1314                                  unsigned int flags)
1315 {
1316         struct ext4_extent_header *neh;
1317         struct buffer_head *bh;
1318         ext4_fsblk_t newblock, goal = 0;
1319         struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
1320         int err = 0;
1321         size_t ext_size = 0;
1322 
1323         /* Try to prepend new index to old one */
1324         if (ext_depth(inode))
1325                 goal = ext4_idx_pblock(EXT_FIRST_INDEX(ext_inode_hdr(inode)));
1326         if (goal > le32_to_cpu(es->s_first_data_block)) {
1327                 flags |= EXT4_MB_HINT_TRY_GOAL;
1328                 goal--;
1329         } else
1330                 goal = ext4_inode_to_goal_block(inode);
1331         newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
1332                                         NULL, &err);
1333         if (newblock == 0)
1334                 return err;
1335 
1336         bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS);
1337         if (unlikely(!bh))
1338                 return -ENOMEM;
1339         lock_buffer(bh);
1340 
1341         err = ext4_journal_get_create_access(handle, inode->i_sb, bh,
1342                                              EXT4_JTR_NONE);
1343         if (err) {
1344                 unlock_buffer(bh);
1345                 goto out;
1346         }
1347 
1348         ext_size = sizeof(EXT4_I(inode)->i_data);
1349         /* move top-level index/leaf into new block */
1350         memmove(bh->b_data, EXT4_I(inode)->i_data, ext_size);
1351         /* zero out unused area in the extent block */
1352         memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size);
1353 
1354         /* set size of new block */
1355         neh = ext_block_hdr(bh);
1356         /* the old root could have indexes or leaves,
1357          * so calculate eh_max the right way */
1358         if (ext_depth(inode))
1359                 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
1360         else
1361                 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
1362         neh->eh_magic = EXT4_EXT_MAGIC;
1363         ext4_extent_block_csum_set(inode, neh);
1364         set_buffer_uptodate(bh);
1365         set_buffer_verified(bh);
1366         unlock_buffer(bh);
1367 
1368         err = ext4_handle_dirty_metadata(handle, inode, bh);
1369         if (err)
1370                 goto out;
1371 
1372         /* Update top-level index: num,max,pointer */
1373         neh = ext_inode_hdr(inode);
1374         neh->eh_entries = cpu_to_le16(1);
1375         ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock);
1376         if (neh->eh_depth == 0) {
1377                 /* Root extent block becomes index block */
1378                 neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
1379                 EXT_FIRST_INDEX(neh)->ei_block =
1380                         EXT_FIRST_EXTENT(neh)->ee_block;
1381         }
1382         ext_debug(inode, "new root: num %d(%d), lblock %d, ptr %llu\n",
1383                   le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
1384                   le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
1385                   ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
1386 
1387         le16_add_cpu(&neh->eh_depth, 1);
1388         err = ext4_mark_inode_dirty(handle, inode);
1389 out:
1390         brelse(bh);
1391 
1392         return err;
1393 }
1394 
1395 /*
1396  * ext4_ext_create_new_leaf:
1397  * finds empty index and adds new leaf.
1398  * if no free index is found, then it requests in-depth growing.
1399  */
1400 static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
1401                                     unsigned int mb_flags,
1402                                     unsigned int gb_flags,
1403                                     struct ext4_ext_path **ppath,
1404                                     struct ext4_extent *newext)
1405 {
1406         struct ext4_ext_path *path = *ppath;
1407         struct ext4_ext_path *curp;
1408         int depth, i, err = 0;
1409 
1410 repeat:
1411         i = depth = ext_depth(inode);
1412 
1413         /* walk up the tree and look for a free index entry */
1414         curp = path + depth;
1415         while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
1416                 i--;
1417                 curp--;
1418         }
1419 
1420         /* we use an already allocated block for the index block,
1421          * so subsequent data blocks should be contiguous */
1422         if (EXT_HAS_FREE_INDEX(curp)) {
1423                 /* if we found an index with a free entry, then use that
1424                  * entry: create all the needed subtree and add the new leaf */
1425                 err = ext4_ext_split(handle, inode, mb_flags, path, newext, i);
1426                 if (err)
1427                         goto out;
1428 
1429                 /* refill path */
1430                 path = ext4_find_extent(inode,
1431                                     (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1432                                     ppath, gb_flags);
1433                 if (IS_ERR(path))
1434                         err = PTR_ERR(path);
1435         } else {
1436                 /* tree is full, time to grow in depth */
1437                 err = ext4_ext_grow_indepth(handle, inode, mb_flags);
1438                 if (err)
1439                         goto out;
1440 
1441                 /* refill path */
1442                 path = ext4_find_extent(inode,
1443                                    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1444                                     ppath, gb_flags);
1445                 if (IS_ERR(path)) {
1446                         err = PTR_ERR(path);
1447                         goto out;
1448                 }
1449 
1450                 /*
1451                  * only the first grow (depth 0 -> 1) produces free space;
1452                  * in all other cases we have to split the grown tree
1453                  */
1454                 depth = ext_depth(inode);
1455                 if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
1456                         /* now we need to split */
1457                         goto repeat;
1458                 }
1459         }
1460 
1461 out:
1462         return err;
1463 }
1464 
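/*
 * A minimal sketch of the level-selection logic above, using a plain array
 * instead of the real path structure (choose_split_level_demo is a
 * hypothetical helper, not part of ext4): free_slots[i] holds the number of
 * free entries in the header at level i (0 = root, depth = leaf).  Walk up
 * from the leaf and split at the first level that still has room, or report
 * that the tree has to grow in depth.
 */
static int choose_split_level_demo(const unsigned int *free_slots, int depth)
{
	int i = depth;

	/* walk up the tree looking for a free index entry */
	while (i > 0 && free_slots[i] == 0)
		i--;

	return free_slots[i] ? i : -1;	/* -1: every level is full, grow */
}
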
1465 /*
1466  * search the closest allocated block to the left of *logical
1467  * and return it at @logical, plus its physical address at @phys.
1468  * if *logical is the smallest allocated block, the function
1469  * returns 0 at @phys.
1470  * the return value contains 0 (success) or an error code.
1471  */
1472 static int ext4_ext_search_left(struct inode *inode,
1473                                 struct ext4_ext_path *path,
1474                                 ext4_lblk_t *logical, ext4_fsblk_t *phys)
1475 {
1476         struct ext4_extent_idx *ix;
1477         struct ext4_extent *ex;
1478         int depth, ee_len;
1479 
1480         if (unlikely(path == NULL)) {
1481                 EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1482                 return -EFSCORRUPTED;
1483         }
1484         depth = path->p_depth;
1485         *phys = 0;
1486 
1487         if (depth == 0 && path->p_ext == NULL)
1488                 return 0;
1489 
1490         /* usually the extent in the path covers blocks smaller
1491          * than *logical, but it can be that the extent is the
1492          * first one in the file */
1493 
1494         ex = path[depth].p_ext;
1495         ee_len = ext4_ext_get_actual_len(ex);
1496         if (*logical < le32_to_cpu(ex->ee_block)) {
1497                 if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1498                         EXT4_ERROR_INODE(inode,
1499                                          "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
1500                                          *logical, le32_to_cpu(ex->ee_block));
1501                         return -EFSCORRUPTED;
1502                 }
1503                 while (--depth >= 0) {
1504                         ix = path[depth].p_idx;
1505                         if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1506                                 EXT4_ERROR_INODE(inode,
1507                                   "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
1508                                   ix != NULL ? le32_to_cpu(ix->ei_block) : 0,
1509                                   le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block),
1510                                   depth);
1511                                 return -EFSCORRUPTED;
1512                         }
1513                 }
1514                 return 0;
1515         }
1516 
1517         if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1518                 EXT4_ERROR_INODE(inode,
1519                                  "logical %d < ee_block %d + ee_len %d!",
1520                                  *logical, le32_to_cpu(ex->ee_block), ee_len);
1521                 return -EFSCORRUPTED;
1522         }
1523 
1524         *logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
1525         *phys = ext4_ext_pblock(ex) + ee_len - 1;
1526         return 0;
1527 }
1528 
1529 /*
1530  * Search the closest allocated block to the right of *logical
1531  * and return it at @logical, plus its physical address at @phys.
1532  * If no such block exists, return 0 and set @phys to 0. We will
1533  * return 1, which means we found an allocated block and ret_ex is
1534  * valid. Or return a (< 0) error code.
1535  */
1536 static int ext4_ext_search_right(struct inode *inode,
1537                                  struct ext4_ext_path *path,
1538                                  ext4_lblk_t *logical, ext4_fsblk_t *phys,
1539                                  struct ext4_extent *ret_ex)
1540 {
1541         struct buffer_head *bh = NULL;
1542         struct ext4_extent_header *eh;
1543         struct ext4_extent_idx *ix;
1544         struct ext4_extent *ex;
1545         int depth;      /* Note, NOT eh_depth; depth from top of tree */
1546         int ee_len;
1547 
1548         if (unlikely(path == NULL)) {
1549                 EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1550                 return -EFSCORRUPTED;
1551         }
1552         depth = path->p_depth;
1553         *phys = 0;
1554 
1555         if (depth == 0 && path->p_ext == NULL)
1556                 return 0;
1557 
1558         /* usually the extent in the path covers blocks smaller
1559          * than *logical, but it can be that the extent is the
1560          * first one in the file */
1561 
1562         ex = path[depth].p_ext;
1563         ee_len = ext4_ext_get_actual_len(ex);
1564         if (*logical < le32_to_cpu(ex->ee_block)) {
1565                 if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1566                         EXT4_ERROR_INODE(inode,
1567                                          "first_extent(path[%d].p_hdr) != ex",
1568                                          depth);
1569                         return -EFSCORRUPTED;
1570                 }
1571                 while (--depth >= 0) {
1572                         ix = path[depth].p_idx;
1573                         if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1574                                 EXT4_ERROR_INODE(inode,
1575                                                  "ix != EXT_FIRST_INDEX *logical %d!",
1576                                                  *logical);
1577                                 return -EFSCORRUPTED;
1578                         }
1579                 }
1580                 goto found_extent;
1581         }
1582 
1583         if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1584                 EXT4_ERROR_INODE(inode,
1585                                  "logical %d < ee_block %d + ee_len %d!",
1586                                  *logical, le32_to_cpu(ex->ee_block), ee_len);
1587                 return -EFSCORRUPTED;
1588         }
1589 
1590         if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
1591                 /* next allocated block in this leaf */
1592                 ex++;
1593                 goto found_extent;
1594         }
1595 
1596         /* go up and search for index to the right */
1597         while (--depth >= 0) {
1598                 ix = path[depth].p_idx;
1599                 if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
1600                         goto got_index;
1601         }
1602 
1603         /* we've gone up to the root and found no index to the right */
1604         return 0;
1605 
1606 got_index:
1607         /* we've found an index to the right, let's
1608          * follow it and find the closest allocated
1609          * block to the right */
1610         ix++;
1611         while (++depth < path->p_depth) {
1612                 /* subtract from p_depth to get proper eh_depth */
1613                 bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0);
1614                 if (IS_ERR(bh))
1615                         return PTR_ERR(bh);
1616                 eh = ext_block_hdr(bh);
1617                 ix = EXT_FIRST_INDEX(eh);
1618                 put_bh(bh);
1619         }
1620 
1621         bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0);
1622         if (IS_ERR(bh))
1623                 return PTR_ERR(bh);
1624         eh = ext_block_hdr(bh);
1625         ex = EXT_FIRST_EXTENT(eh);
1626 found_extent:
1627         *logical = le32_to_cpu(ex->ee_block);
1628         *phys = ext4_ext_pblock(ex);
1629         if (ret_ex)
1630                 *ret_ex = *ex;
1631         if (bh)
1632                 put_bh(bh);
1633         return 1;
1634 }
1635 
1636 /*
1637  * ext4_ext_next_allocated_block:
1638  * returns the allocated block in the subsequent extent or EXT_MAX_BLOCKS.
1639  * NOTE: it considers the block number from an index entry as an
1640  * allocated block. Thus, index entries have to be consistent
1641  * with leaves.
1642  */
1643 ext4_lblk_t
1644 ext4_ext_next_allocated_block(struct ext4_ext_path *path)
1645 {
1646         int depth;
1647 
1648         BUG_ON(path == NULL);
1649         depth = path->p_depth;
1650 
1651         if (depth == 0 && path->p_ext == NULL)
1652                 return EXT_MAX_BLOCKS;
1653 
1654         while (depth >= 0) {
1655                 struct ext4_ext_path *p = &path[depth];
1656 
1657                 if (depth == path->p_depth) {
1658                         /* leaf */
1659                         if (p->p_ext && p->p_ext != EXT_LAST_EXTENT(p->p_hdr))
1660                                 return le32_to_cpu(p->p_ext[1].ee_block);
1661                 } else {
1662                         /* index */
1663                         if (p->p_idx != EXT_LAST_INDEX(p->p_hdr))
1664                                 return le32_to_cpu(p->p_idx[1].ei_block);
1665                 }
1666                 depth--;
1667         }
1668 
1669         return EXT_MAX_BLOCKS;
1670 }
1671 
1672 /*
1673  * ext4_ext_next_leaf_block:
1674  * returns first allocated block from next leaf or EXT_MAX_BLOCKS
1675  */
1676 static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path)
1677 {
1678         int depth;
1679 
1680         BUG_ON(path == NULL);
1681         depth = path->p_depth;
1682 
1683         /* a zero-depth tree has no leaf blocks at all */
1684         if (depth == 0)
1685                 return EXT_MAX_BLOCKS;
1686 
1687         /* go to index block */
1688         depth--;
1689 
1690         while (depth >= 0) {
1691                 if (path[depth].p_idx !=
1692                                 EXT_LAST_INDEX(path[depth].p_hdr))
1693                         return (ext4_lblk_t)
1694                                 le32_to_cpu(path[depth].p_idx[1].ei_block);
1695                 depth--;
1696         }
1697 
1698         return EXT_MAX_BLOCKS;
1699 }
1700 
1701 /*
1702  * ext4_ext_correct_indexes:
1703  * if the leaf gets modified and the modified extent is first in the leaf,
1704  * then we have to correct all indexes above.
1705  * TODO: do we need to correct the tree in all cases?
1706  */
1707 static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
1708                                 struct ext4_ext_path *path)
1709 {
1710         struct ext4_extent_header *eh;
1711         int depth = ext_depth(inode);
1712         struct ext4_extent *ex;
1713         __le32 border;
1714         int k, err = 0;
1715 
1716         eh = path[depth].p_hdr;
1717         ex = path[depth].p_ext;
1718 
1719         if (unlikely(ex == NULL || eh == NULL)) {
1720                 EXT4_ERROR_INODE(inode,
1721                                  "ex %p == NULL or eh %p == NULL", ex, eh);
1722                 return -EFSCORRUPTED;
1723         }
1724 
1725         if (depth == 0) {
1726                 /* there is no tree at all */
1727                 return 0;
1728         }
1729 
1730         if (ex != EXT_FIRST_EXTENT(eh)) {
1731                 /* we correct the tree only if the first extent got modified */
1732                 return 0;
1733         }
1734 
1735         /*
1736          * TODO: we need correction if the border is smaller than the current one
1737          */
1738         k = depth - 1;
1739         border = path[depth].p_ext->ee_block;
1740         err = ext4_ext_get_access(handle, inode, path + k);
1741         if (err)
1742                 return err;
1743         path[k].p_idx->ei_block = border;
1744         err = ext4_ext_dirty(handle, inode, path + k);
1745         if (err)
1746                 return err;
1747 
1748         while (k--) {
1749                 /* change all left-side indexes */
1750                 if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
1751                         break;
1752                 err = ext4_ext_get_access(handle, inode, path + k);
1753                 if (err)
1754                         break;
1755                 path[k].p_idx->ei_block = border;
1756                 err = ext4_ext_dirty(handle, inode, path + k);
1757                 if (err)
1758                         break;
1759         }
1760 
1761         return err;
1762 }
1763 
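/*
 * Conceptual sketch of the correction above (hypothetical stand-in types, not
 * ext4 structures): the new starting block of the leaf's first extent is
 * copied into the parent index, and the walk continues upward only while the
 * index just fixed is itself the first entry of its index block.
 */
#include <stdbool.h>
#include <stdint.h>

struct idx_demo {
	uint32_t first_lblk;	/* logical start of the subtree below */
	bool is_first_entry;	/* first entry in its index block? */
};

static void correct_indexes_demo(struct idx_demo *path, int depth,
				 uint32_t border)
{
	int k;

	for (k = depth - 1; k >= 0; k--) {
		path[k].first_lblk = border;
		if (!path[k].is_first_entry)
			break;		/* upper levels are unaffected */
	}
}
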
1764 static int ext4_can_extents_be_merged(struct inode *inode,
1765                                       struct ext4_extent *ex1,
1766                                       struct ext4_extent *ex2)
1767 {
1768         unsigned short ext1_ee_len, ext2_ee_len;
1769 
1770         if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2))
1771                 return 0;
1772 
1773         ext1_ee_len = ext4_ext_get_actual_len(ex1);
1774         ext2_ee_len = ext4_ext_get_actual_len(ex2);
1775 
1776         if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
1777                         le32_to_cpu(ex2->ee_block))
1778                 return 0;
1779 
1780         if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
1781                 return 0;
1782 
1783         if (ext4_ext_is_unwritten(ex1) &&
1784             ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)
1785                 return 0;
1786 #ifdef AGGRESSIVE_TEST
1787         if (ext1_ee_len >= 4)
1788                 return 0;
1789 #endif
1790 
1791         if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2))
1792                 return 1;
1793         return 0;
1794 }
1795 
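/*
 * A minimal, self-contained sketch of the merge rules enforced above, using
 * simplified stand-in types (merge_demo_extent and can_be_merged_demo are
 * hypothetical, not part of ext4).  Two extents may be merged only when they
 * share the same written/unwritten state, are logically and physically
 * contiguous, and the combined length still fits in ee_len (32768 blocks for
 * written extents, 32767 for unwritten ones, matching EXT_INIT_MAX_LEN and
 * EXT_UNWRITTEN_MAX_LEN).
 */
#include <stdbool.h>
#include <stdint.h>

struct merge_demo_extent {
	uint32_t lblk;		/* first logical block */
	uint64_t pblk;		/* first physical block */
	uint16_t len;		/* length in blocks */
	bool unwritten;		/* unwritten (preallocated) extent? */
};

static bool can_be_merged_demo(const struct merge_demo_extent *a,
			       const struct merge_demo_extent *b)
{
	uint32_t max_len = a->unwritten ? 32767 : 32768;

	if (a->unwritten != b->unwritten)
		return false;			/* mixed states never merge */
	if (a->lblk + a->len != b->lblk)
		return false;			/* not logically contiguous */
	if ((uint32_t)a->len + b->len > max_len)
		return false;			/* result would overflow ee_len */
	return a->pblk + a->len == b->pblk;	/* must be physically contiguous */
}
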
1796 /*
1797  * This function tries to merge the "ex" extent with the next extent in the tree.
1798  * It always tries to merge towards the right. If you want to merge towards
1799  * the left, pass "ex - 1" as the argument instead of "ex".
1800  * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
1801  * 1 if they got merged.
1802  */
1803 static int ext4_ext_try_to_merge_right(struct inode *inode,
1804                                  struct ext4_ext_path *path,
1805                                  struct ext4_extent *ex)
1806 {
1807         struct ext4_extent_header *eh;
1808         unsigned int depth, len;
1809         int merge_done = 0, unwritten;
1810 
1811         depth = ext_depth(inode);
1812         BUG_ON(path[depth].p_hdr == NULL);
1813         eh = path[depth].p_hdr;
1814 
1815         while (ex < EXT_LAST_EXTENT(eh)) {
1816                 if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
1817                         break;
1818                 /* merge with next extent! */
1819                 unwritten = ext4_ext_is_unwritten(ex);
1820                 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1821                                 + ext4_ext_get_actual_len(ex + 1));
1822                 if (unwritten)
1823                         ext4_ext_mark_unwritten(ex);
1824 
1825                 if (ex + 1 < EXT_LAST_EXTENT(eh)) {
1826                         len = (EXT_LAST_EXTENT(eh) - ex - 1)
1827                                 * sizeof(struct ext4_extent);
1828                         memmove(ex + 1, ex + 2, len);
1829                 }
1830                 le16_add_cpu(&eh->eh_entries, -1);
1831                 merge_done = 1;
1832                 WARN_ON(eh->eh_entries == 0);
1833                 if (!eh->eh_entries)
1834                         EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
1835         }
1836 
1837         return merge_done;
1838 }
1839 
1840 /*
1841  * This function does a very simple check to see if we can collapse
1842  * an extent tree with a single extent tree leaf block into the inode.
1843  */
1844 static void ext4_ext_try_to_merge_up(handle_t *handle,
1845                                      struct inode *inode,
1846                                      struct ext4_ext_path *path)
1847 {
1848         size_t s;
1849         unsigned max_root = ext4_ext_space_root(inode, 0);
1850         ext4_fsblk_t blk;
1851 
1852         if ((path[0].p_depth != 1) ||
1853             (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) ||
1854             (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root))
1855                 return;
1856 
1857         /*
1858          * We need to modify the block allocation bitmap and the block
1859          * group descriptor to release the extent tree block.  If we
1860          * can't get the journal credits, give up.
1861          */
1862         if (ext4_journal_extend(handle, 2,
1863                         ext4_free_metadata_revoke_credits(inode->i_sb, 1)))
1864                 return;
1865 
1866         /*
1867          * Copy the extent data up to the inode
1868          */
1869         blk = ext4_idx_pblock(path[0].p_idx);
1870         s = le16_to_cpu(path[1].p_hdr->eh_entries) *
1871                 sizeof(struct ext4_extent_idx);
1872         s += sizeof(struct ext4_extent_header);
1873 
1874         path[1].p_maxdepth = path[0].p_maxdepth;
1875         memcpy(path[0].p_hdr, path[1].p_hdr, s);
1876         path[0].p_depth = 0;
1877         path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) +
1878                 (path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr));
1879         path[0].p_hdr->eh_max = cpu_to_le16(max_root);
1880 
1881         brelse(path[1].p_bh);
1882         path[1].p_bh = NULL;
1883         ext4_free_blocks(handle, inode, NULL, blk, 1,
1884                          EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
1885 }
1886 
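/*
 * The collapse is only legal for the smallest possible tree.  A back-of-the-
 * envelope sketch of that test (can_collapse_demo is a hypothetical helper;
 * max_root is typically 4, i.e. the usual 12-byte extent records plus a
 * 12-byte header packed into the 60-byte i_data area).
 */
#include <stdbool.h>

static bool can_collapse_demo(int depth, unsigned int root_entries,
			      unsigned int leaf_entries, unsigned int max_root)
{
	/* only a depth-1 tree whose single leaf fits back into the root */
	return depth == 1 && root_entries == 1 && leaf_entries <= max_root;
}
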
1887 /*
1888  * This function tries to merge the @ex extent to neighbours in the tree, then
1889  * tries to collapse the extent tree into the inode.
1890  */
1891 static void ext4_ext_try_to_merge(handle_t *handle,
1892                                   struct inode *inode,
1893                                   struct ext4_ext_path *path,
1894                                   struct ext4_extent *ex)
1895 {
1896         struct ext4_extent_header *eh;
1897         unsigned int depth;
1898         int merge_done = 0;
1899 
1900         depth = ext_depth(inode);
1901         BUG_ON(path[depth].p_hdr == NULL);
1902         eh = path[depth].p_hdr;
1903 
1904         if (ex > EXT_FIRST_EXTENT(eh))
1905                 merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
1906 
1907         if (!merge_done)
1908                 (void) ext4_ext_try_to_merge_right(inode, path, ex);
1909 
1910         ext4_ext_try_to_merge_up(handle, inode, path);
1911 }
1912 
1913 /*
1914  * check if a portion of the "newext" extent overlaps with an
1915  * existing extent.
1916  *
1917  * If an overlap is discovered, it updates the length of newext
1918  * such that there will be no overlap, and then returns 1.
1919  * If there is no overlap found, it returns 0.
1920  */
1921 static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
1922                                            struct inode *inode,
1923                                            struct ext4_extent *newext,
1924                                            struct ext4_ext_path *path)
1925 {
1926         ext4_lblk_t b1, b2;
1927         unsigned int depth, len1;
1928         unsigned int ret = 0;
1929 
1930         b1 = le32_to_cpu(newext->ee_block);
1931         len1 = ext4_ext_get_actual_len(newext);
1932         depth = ext_depth(inode);
1933         if (!path[depth].p_ext)
1934                 goto out;
1935         b2 = EXT4_LBLK_CMASK(sbi, le32_to_cpu(path[depth].p_ext->ee_block));
1936 
1937         /*
1938          * get the next allocated block if the extent in the path
1939          * is before the requested block(s)
1940          */
1941         if (b2 < b1) {
1942                 b2 = ext4_ext_next_allocated_block(path);
1943                 if (b2 == EXT_MAX_BLOCKS)
1944                         goto out;
1945                 b2 = EXT4_LBLK_CMASK(sbi, b2);
1946         }
1947 
1948         /* check for wrap through zero on the extent's logical start block */
1949         if (b1 + len1 < b1) {
1950                 len1 = EXT_MAX_BLOCKS - b1;
1951                 newext->ee_len = cpu_to_le16(len1);
1952                 ret = 1;
1953         }
1954 
1955         /* check for overlap */
1956         if (b1 + len1 > b2) {
1957                 newext->ee_len = cpu_to_le16(b2 - b1);
1958                 ret = 1;
1959         }
1960 out:
1961         return ret;
1962 }
1963 
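/*
 * A stand-alone restatement of the clamping above (hypothetical names, plain
 * C).  Given a new extent starting at b1 with len1 blocks and the start b2 of
 * the next allocation, the length is trimmed so that the new extent neither
 * wraps past the maximum logical block nor runs into b2.
 */
#include <stdint.h>

#define DEMO_MAX_BLOCKS 0xffffffffU	/* stands in for EXT_MAX_BLOCKS */

static uint32_t clamp_new_extent_len_demo(uint32_t b1, uint32_t len1,
					  uint32_t b2)
{
	if (b1 + len1 < b1)		/* wrap through zero */
		len1 = DEMO_MAX_BLOCKS - b1;
	if (b1 + len1 > b2)		/* would overlap the next allocation */
		len1 = b2 - b1;
	return len1;
}
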
1964 /*
1965  * ext4_ext_insert_extent:
1966  * tries to merge the requested extent into an existing extent or
1967  * inserts the requested extent as a new one into the tree,
1968  * creating a new leaf in the no-space case.
1969  */
1970 int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1971                                 struct ext4_ext_path **ppath,
1972                                 struct ext4_extent *newext, int gb_flags)
1973 {
1974         struct ext4_ext_path *path = *ppath;
1975         struct ext4_extent_header *eh;
1976         struct ext4_extent *ex, *fex;
1977         struct ext4_extent *nearex; /* nearest extent */
1978         struct ext4_ext_path *npath = NULL;
1979         int depth, len, err;
1980         ext4_lblk_t next;
1981         int mb_flags = 0, unwritten;
1982 
1983         if (gb_flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
1984                 mb_flags |= EXT4_MB_DELALLOC_RESERVED;
1985         if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
1986                 EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
1987                 return -EFSCORRUPTED;
1988         }
1989         depth = ext_depth(inode);
1990         ex = path[depth].p_ext;
1991         eh = path[depth].p_hdr;
1992         if (unlikely(path[depth].p_hdr == NULL)) {
1993                 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
1994                 return -EFSCORRUPTED;
1995         }
1996 
1997         /* try to insert block into found extent and return */
1998         if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) {
1999 
2000                 /*
2001                  * Try to see whether we should rather test the extent to
2002                  * the right of ex, or the one to its left. This is because
2003                  * ext4_find_extent() can return either the extent on the
2004                  * left or the one on the right of the searched position. This
2005                  * will make merging more effective.
2006                  */
2007                 if (ex < EXT_LAST_EXTENT(eh) &&
2008                     (le32_to_cpu(ex->ee_block) +
2009                     ext4_ext_get_actual_len(ex) <
2010                     le32_to_cpu(newext->ee_block))) {
2011                         ex += 1;
2012                         goto prepend;
2013                 } else if ((ex > EXT_FIRST_EXTENT(eh)) &&
2014                            (le32_to_cpu(newext->ee_block) +
2015                            ext4_ext_get_actual_len(newext) <
2016                            le32_to_cpu(ex->ee_block)))
2017                         ex -= 1;
2018 
2019                 /* Try to append newex to the ex */
2020                 if (ext4_can_extents_be_merged(inode, ex, newext)) {
2021                         ext_debug(inode, "append [%d]%d block to %u:[%d]%d"
2022                                   "(from %llu)\n",
2023                                   ext4_ext_is_unwritten(newext),
2024                                   ext4_ext_get_actual_len(newext),
2025                                   le32_to_cpu(ex->ee_block),
2026                                   ext4_ext_is_unwritten(ex),
2027                                   ext4_ext_get_actual_len(ex),
2028                                   ext4_ext_pblock(ex));
2029                         err = ext4_ext_get_access(handle, inode,
2030                                                   path + depth);
2031                         if (err)
2032                                 return err;
2033                         unwritten = ext4_ext_is_unwritten(ex);
2034                         ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
2035                                         + ext4_ext_get_actual_len(newext));
2036                         if (unwritten)
2037                                 ext4_ext_mark_unwritten(ex);
2038                         nearex = ex;
2039                         goto merge;
2040                 }
2041 
2042 prepend:
2043                 /* Try to prepend newex to the ex */
2044                 if (ext4_can_extents_be_merged(inode, newext, ex)) {
2045                         ext_debug(inode, "prepend %u[%d]%d block to %u:[%d]%d"
2046                                   "(from %llu)\n",
2047                                   le32_to_cpu(newext->ee_block),
2048                                   ext4_ext_is_unwritten(newext),
2049                                   ext4_ext_get_actual_len(newext),
2050                                   le32_to_cpu(ex->ee_block),
2051                                   ext4_ext_is_unwritten(ex),
2052                                   ext4_ext_get_actual_len(ex),
2053                                   ext4_ext_pblock(ex));
2054                         err = ext4_ext_get_access(handle, inode,
2055                                                   path + depth);
2056                         if (err)
2057                                 return err;
2058 
2059                         unwritten = ext4_ext_is_unwritten(ex);
2060                         ex->ee_block = newext->ee_block;
2061                         ext4_ext_store_pblock(ex, ext4_ext_pblock(newext));
2062                         ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
2063                                         + ext4_ext_get_actual_len(newext));
2064                         if (unwritten)
2065                                 ext4_ext_mark_unwritten(ex);
2066                         nearex = ex;
2067                         goto merge;
2068                 }
2069         }
2070 
2071         depth = ext_depth(inode);
2072         eh = path[depth].p_hdr;
2073         if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
2074                 goto has_space;
2075 
2076         /* probably next leaf has space for us? */
2077         fex = EXT_LAST_EXTENT(eh);
2078         next = EXT_MAX_BLOCKS;
2079         if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
2080                 next = ext4_ext_next_leaf_block(path);
2081         if (next != EXT_MAX_BLOCKS) {
2082                 ext_debug(inode, "next leaf block - %u\n", next);
2083                 BUG_ON(npath != NULL);
2084                 npath = ext4_find_extent(inode, next, NULL, gb_flags);
2085                 if (IS_ERR(npath))
2086                         return PTR_ERR(npath);
2087                 BUG_ON(npath->p_depth != path->p_depth);
2088                 eh = npath[depth].p_hdr;
2089                 if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
2090                         ext_debug(inode, "next leaf isn't full(%d)\n",
2091                                   le16_to_cpu(eh->eh_entries));
2092                         path = npath;
2093                         goto has_space;
2094                 }
2095                 ext_debug(inode, "next leaf has no free space(%d,%d)\n",
2096                           le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
2097         }
2098 
2099         /*
2100          * There is no free space in the found leaf.
2101          * We're going to add a new leaf to the tree.
2102          */
2103         if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
2104                 mb_flags |= EXT4_MB_USE_RESERVED;
2105         err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags,
2106                                        ppath, newext);
2107         if (err)
2108                 goto cleanup;
2109         path = *ppath;
2110         depth = ext_depth(inode);
2111         eh = path[depth].p_hdr;
2112 
2113 has_space:
2114         nearex = path[depth].p_ext;
2115 
2116         err = ext4_ext_get_access(handle, inode, path + depth);
2117         if (err)
2118                 goto cleanup;
2119 
2120         if (!nearex) {
2121                 /* there is no extent in this leaf, create first one */
2122                 ext_debug(inode, "first extent in the leaf: %u:%llu:[%d]%d\n",
2123                                 le32_to_cpu(newext->ee_block),
2124                                 ext4_ext_pblock(newext),
2125                                 ext4_ext_is_unwritten(newext),
2126                                 ext4_ext_get_actual_len(newext));
2127                 nearex = EXT_FIRST_EXTENT(eh);
2128         } else {
2129                 if (le32_to_cpu(newext->ee_block)
2130                            > le32_to_cpu(nearex->ee_block)) {
2131                         /* Insert after */
2132                         ext_debug(inode, "insert %u:%llu:[%d]%d after: "
2133                                         "nearest %p\n",
2134                                         le32_to_cpu(newext->ee_block),
2135                                         ext4_ext_pblock(newext),
2136                                         ext4_ext_is_unwritten(newext),
2137                                         ext4_ext_get_actual_len(newext),
2138                                         nearex);
2139                         nearex++;
2140                 } else {
2141                         /* Insert before */
2142                         BUG_ON(newext->ee_block == nearex->ee_block);
2143                         ext_debug(inode, "insert %u:%llu:[%d]%d before: "
2144                                         "nearest %p\n",
2145                                         le32_to_cpu(newext->ee_block),
2146                                         ext4_ext_pblock(newext),
2147                                         ext4_ext_is_unwritten(newext),
2148                                         ext4_ext_get_actual_len(newext),
2149                                         nearex);
2150                 }
2151                 len = EXT_LAST_EXTENT(eh) - nearex + 1;
2152                 if (len > 0) {
2153                         ext_debug(inode, "insert %u:%llu:[%d]%d: "
2154                                         "move %d extents from 0x%p to 0x%p\n",
2155                                         le32_to_cpu(newext->ee_block),
2156                                         ext4_ext_pblock(newext),
2157                                         ext4_ext_is_unwritten(newext),
2158                                         ext4_ext_get_actual_len(newext),
2159                                         len, nearex, nearex + 1);
2160                         memmove(nearex + 1, nearex,
2161                                 len * sizeof(struct ext4_extent));
2162                 }
2163         }
2164 
2165         le16_add_cpu(&eh->eh_entries, 1);
2166         path[depth].p_ext = nearex;
2167         nearex->ee_block = newext->ee_block;
2168         ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
2169         nearex->ee_len = newext->ee_len;
2170 
2171 merge:
2172         /* try to merge extents */
2173         if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO))
2174                 ext4_ext_try_to_merge(handle, inode, path, nearex);
2175 
2176 
2177         /* time to correct all indexes above */
2178         err = ext4_ext_correct_indexes(handle, inode, path);
2179         if (err)
2180                 goto cleanup;
2181 
2182         err = ext4_ext_dirty(handle, inode, path + path->p_depth);
2183 
2184 cleanup:
2185         ext4_free_ext_path(npath);
2186         return err;
2187 }
2188 
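/*
 * Simplified sketch of the in-leaf insertion step above (stand-in types, not
 * the on-disk format): extents in a leaf are kept sorted by logical block, so
 * inserting at position pos shifts the tail of the array one slot to the
 * right before writing the new entry.
 */
#include <stdint.h>
#include <string.h>

struct leaf_demo_extent {
	uint32_t lblk;
	uint64_t pblk;
	uint16_t len;
};

static void leaf_insert_demo(struct leaf_demo_extent *leaf, unsigned int *nr,
			     unsigned int pos,
			     const struct leaf_demo_extent *newent)
{
	/* shift entries at and after pos one slot to the right */
	memmove(&leaf[pos + 1], &leaf[pos], (*nr - pos) * sizeof(*leaf));
	leaf[pos] = *newent;
	(*nr)++;
}
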
2189 static int ext4_fill_es_cache_info(struct inode *inode,
2190                                    ext4_lblk_t block, ext4_lblk_t num,
2191                                    struct fiemap_extent_info *fieinfo)
2192 {
2193         ext4_lblk_t next, end = block + num - 1;
2194         struct extent_status es;
2195         unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
2196         unsigned int flags;
2197         int err;
2198 
2199         while (block <= end) {
2200                 next = 0;
2201                 flags = 0;
2202                 if (!ext4_es_lookup_extent(inode, block, &next, &es))
2203                         break;
2204                 if (ext4_es_is_unwritten(&es))
2205                         flags |= FIEMAP_EXTENT_UNWRITTEN;
2206                 if (ext4_es_is_delayed(&es))
2207                         flags |= (FIEMAP_EXTENT_DELALLOC |
2208                                   FIEMAP_EXTENT_UNKNOWN);
2209                 if (ext4_es_is_hole(&es))
2210                         flags |= EXT4_FIEMAP_EXTENT_HOLE;
2211                 if (next == 0)
2212                         flags |= FIEMAP_EXTENT_LAST;
2213                 if (flags & (FIEMAP_EXTENT_DELALLOC|
2214                              EXT4_FIEMAP_EXTENT_HOLE))
2215                         es.es_pblk = 0;
2216                 else
2217                         es.es_pblk = ext4_es_pblock(&es);
2218                 err = fiemap_fill_next_extent(fieinfo,
2219                                 (__u64)es.es_lblk << blksize_bits,
2220                                 (__u64)es.es_pblk << blksize_bits,
2221                                 (__u64)es.es_len << blksize_bits,
2222                                 flags);
2223                 if (next == 0)
2224                         break;
2225                 block = next;
2226                 if (err < 0)
2227                         return err;
2228                 if (err == 1)
2229                         return 0;
2230         }
2231         return 0;
2232 }
2233 
2234 
2235 /*
2236  * ext4_ext_find_hole - find hole around given block according to the given path
2237  * @inode:      inode we lookup in
2238  * @path:       path in extent tree to @lblk
2239  * @lblk:       pointer to logical block around which we want to determine hole
2240  *
2241  * Determine hole length (and start if easily possible) around given logical
2242  * block. We don't try too hard to find the beginning of the hole, but if @path
2243  * actually points to the extent before @lblk, we provide it.
2244  *
2245  * The function returns the length of a hole starting at @lblk. We update @lblk
2246  * to the beginning of the hole if we managed to find it.
2247  */
2248 static ext4_lblk_t ext4_ext_find_hole(struct inode *inode,
2249                                       struct ext4_ext_path *path,
2250                                       ext4_lblk_t *lblk)
2251 {
2252         int depth = ext_depth(inode);
2253         struct ext4_extent *ex;
2254         ext4_lblk_t len;
2255 
2256         ex = path[depth].p_ext;
2257         if (ex == NULL) {
2258                 /* there is no extent yet, so gap is [0;-] */
2259                 *lblk = 0;
2260                 len = EXT_MAX_BLOCKS;
2261         } else if (*lblk < le32_to_cpu(ex->ee_block)) {
2262                 len = le32_to_cpu(ex->ee_block) - *lblk;
2263         } else if (*lblk >= le32_to_cpu(ex->ee_block)
2264                         + ext4_ext_get_actual_len(ex)) {
2265                 ext4_lblk_t next;
2266 
2267                 *lblk = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
2268                 next = ext4_ext_next_allocated_block(path);
2269                 BUG_ON(next == *lblk);
2270                 len = next - *lblk;
2271         } else {
2272                 BUG();
2273         }
2274         return len;
2275 }
2276 
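/*
 * Minimal sketch of the three cases handled above, with simplified inputs
 * (hypothetical names, not the ext4 API).  ex_start/ex_len describe the
 * extent found in the path (ex_len == 0 means there is no extent at all),
 * next_alloc is the next allocated block to the right, and callers never
 * pass a block that falls inside the extent itself.
 */
#include <stdint.h>

#define DEMO_MAX_BLOCKS 0xffffffffU	/* stands in for EXT_MAX_BLOCKS */

/* Returns the hole length and moves *lblk back to the start of the hole. */
static uint32_t find_hole_demo(uint32_t *lblk, uint32_t ex_start,
			       uint32_t ex_len, uint32_t next_alloc)
{
	if (ex_len == 0) {	/* no extent yet: the whole file is a hole */
		*lblk = 0;
		return DEMO_MAX_BLOCKS;
	}
	if (*lblk < ex_start)	/* hole ends where the extent starts */
		return ex_start - *lblk;

	/* *lblk sits past the extent: the hole runs up to the next allocation */
	*lblk = ex_start + ex_len;
	return next_alloc - *lblk;
}
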
2277 /*
2278  * ext4_ext_rm_idx:
2279  * removes index from the index block.
2280  */
2281 static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
2282                         struct ext4_ext_path *path, int depth)
2283 {
2284         int err;
2285         ext4_fsblk_t leaf;
2286 
2287         /* free index block */
2288         depth--;
2289         path = path + depth;
2290         leaf = ext4_idx_pblock(path->p_idx);
2291         if (unlikely(path->p_hdr->eh_entries == 0)) {
2292                 EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
2293                 return -EFSCORRUPTED;
2294         }
2295         err = ext4_ext_get_access(handle, inode, path);
2296         if (err)
2297                 return err;
2298 
2299         if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) {
2300                 int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx;
2301                 len *= sizeof(struct ext4_extent_idx);
2302                 memmove(path->p_idx, path->p_idx + 1, len);
2303         }
2304 
2305         le16_add_cpu(&path->p_hdr->eh_entries, -1);
2306         err = ext4_ext_dirty(handle, inode, path);
2307         if (err)
2308                 return err;
2309         ext_debug(inode, "index is empty, remove it, free block %llu\n", leaf);
2310         trace_ext4_ext_rm_idx(inode, leaf);
2311 
2312         ext4_free_blocks(handle, inode, NULL, leaf, 1,
2313                          EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
2314 
2315         while (--depth >= 0) {
2316                 if (path->p_idx != EXT_FIRST_INDEX(path->p_hdr))
2317                         break;
2318                 path--;
2319                 err = ext4_ext_get_access(handle, inode, path);
2320                 if (err)
2321                         break;
2322                 path->p_idx->ei_block = (path+1)->p_idx->ei_block;
2323                 err = ext4_ext_dirty(handle, inode, path);
2324                 if (err)
2325                         break;
2326         }
2327         return err;
2328 }
2329 
2330 /*
2331  * ext4_ext_calc_credits_for_single_extent:
2332  * This routine returns the max. credits needed to insert an extent
2333  * into the extent tree.
2334  * When passing the actual path, the caller should calculate credits
2335  * under i_data_sem.
2336  */
2337 int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
2338                                                 struct ext4_ext_path *path)
2339 {
2340         if (path) {
2341                 int depth = ext_depth(inode);
2342                 int ret = 0;
2343 
2344                 /* probably there is space in leaf? */
2345                 if (le16_to_cpu(path[depth].p_hdr->eh_entries)
2346                                 < le16_to_cpu(path[depth].p_hdr->eh_max)) {
2347 
2348                         /*
2349                          *  There is some space in the leaf, so no
2350                          *  need to account for the leaf block credit.
2351                          *
2352                          *  Bitmaps and block group descriptor blocks
2353                          *  and other metadata blocks still need to be
2354                          *  accounted.
2355                          */
2356                         /* 1 bitmap, 1 block group descriptor */
2357                         ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
2358                         return ret;
2359                 }
2360         }
2361 
2362         return ext4_chunk_trans_blocks(inode, nrblocks);
2363 }
2364 
2365 /*
2366  * How many index/leaf blocks need to be changed/allocated to add @extents extents?
2367  *
2368  * If we add a single extent, then in the worst case, each tree level
2369  * index/leaf needs to be changed in case the tree splits.
2370  *
2371  * If more extents are inserted, they could cause the whole tree to split more
2372  * than once, but this is really rare.
2373  */
2374 int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
2375 {
2376         int index;
2377         int depth;
2378 
2379         /* If we are converting the inline data, only one block is needed here. */
2380         if (ext4_has_inline_data(inode))
2381                 return 1;
2382 
2383         depth = ext_depth(inode);
2384 
2385         if (extents <= 1)
2386                 index = depth * 2;
2387         else
2388                 index = depth * 3;
2389 
2390         return index;
2391 }
2392 
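/*
 * Worked restatement of the budget above (index_trans_blocks_demo is a
 * hypothetical helper): converting inline data needs a single block, a lone
 * extent is budgeted at two modified blocks per tree level, and a batch of
 * extents at three per level.  For a depth-2 tree that means 4 and 6 blocks
 * respectively.
 */
#include <stdbool.h>

static int index_trans_blocks_demo(int depth, int extents, bool inline_data)
{
	if (inline_data)
		return 1;
	return extents <= 1 ? depth * 2 : depth * 3;
}
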
2393 static inline int get_default_free_blocks_flags(struct inode *inode)
2394 {
2395         if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
2396             ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE))
2397                 return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
2398         else if (ext4_should_journal_data(inode))
2399                 return EXT4_FREE_BLOCKS_FORGET;
2400         return 0;
2401 }
2402 
2403 /*
2404  * ext4_rereserve_cluster - increment the reserved cluster count when
2405  *                          freeing a cluster with a pending reservation
2406  *
2407  * @inode - file containing the cluster
2408  * @lblk - logical block in cluster to be reserved
2409  *
2410  * Increments the reserved cluster count and adjusts quota in a bigalloc
2411  * file system when freeing a partial cluster containing at least one
2412  * delayed and unwritten block.  A partial cluster meeting that
2413  * requirement will have a pending reservation.  If so, the
2414  * RERESERVE_CLUSTER flag is used when calling ext4_free_blocks() to
2415  * defer reserved and allocated space accounting to a subsequent call
2416  * to this function.
2417  */
2418 static void ext4_rereserve_cluster(struct inode *inode, ext4_lblk_t lblk)
2419 {
2420         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2421         struct ext4_inode_info *ei = EXT4_I(inode);
2422 
2423         dquot_reclaim_block(inode, EXT4_C2B(sbi, 1));
2424 
2425         spin_lock(&ei->i_block_reservation_lock);
2426         ei->i_reserved_data_blocks++;
2427         percpu_counter_add(&sbi->s_dirtyclusters_counter, 1);
2428         spin_unlock(&ei->i_block_reservation_lock);
2429 
2430         percpu_counter_add(&sbi->s_freeclusters_counter, 1);
2431         ext4_remove_pending(inode, lblk);
2432 }
2433 
2434 static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2435                               struct ext4_extent *ex,
2436                               struct partial_cluster *partial,
2437                               ext4_lblk_t from, ext4_lblk_t to)
2438 {
2439         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2440         unsigned short ee_len = ext4_ext_get_actual_len(ex);
2441         ext4_fsblk_t last_pblk, pblk;
2442         ext4_lblk_t num;
2443         int flags;
2444 
2445         /* only extent tail removal is allowed */
2446         if (from < le32_to_cpu(ex->ee_block) ||
2447             to != le32_to_cpu(ex->ee_block) + ee_len - 1) {
2448                 ext4_error(sbi->s_sb,
2449                            "strange request: removal(2) %u-%u from %u:%u",
2450                            from, to, le32_to_cpu(ex->ee_block), ee_len);
2451                 return 0;
2452         }
2453 
2454 #ifdef EXTENTS_STATS
2455         spin_lock(&sbi->s_ext_stats_lock);
2456         sbi->s_ext_blocks += ee_len;
2457         sbi->s_ext_extents++;
2458         if (ee_len < sbi->s_ext_min)
2459                 sbi->s_ext_min = ee_len;
2460         if (ee_len > sbi->s_ext_max)
2461                 sbi->s_ext_max = ee_len;
2462         if (ext_depth(inode) > sbi->s_depth_max)
2463                 sbi->s_depth_max = ext_depth(inode);
2464         spin_unlock(&sbi->s_ext_stats_lock);
2465 #endif
2466 
2467         trace_ext4_remove_blocks(inode, ex, from, to, partial);
2468 
2469         /*
2470          * if we have a partial cluster, and it's different from the
2471          * cluster of the last block in the extent, we free it
2472          */
2473         last_pblk = ext4_ext_pblock(ex) + ee_len - 1;
2474 
2475         if (partial->state != initial &&
2476             partial->pclu != EXT4_B2C(sbi, last_pblk)) {
2477                 if (partial->state == tofree) {
2478                         flags = get_default_free_blocks_flags(inode);
2479                         if (ext4_is_pending(inode, partial->lblk))
2480                                 flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
2481                         ext4_free_blocks(handle, inode, NULL,
2482                                          EXT4_C2B(sbi, partial->pclu),
2483                                          sbi->s_cluster_ratio, flags);
2484                         if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
2485                                 ext4_rereserve_cluster(inode, partial->lblk);
2486                 }
2487                 partial->state = initial;
2488         }
2489 
2490         num = le32_to_cpu(ex->ee_block) + ee_len - from;
2491         pblk = ext4_ext_pblock(ex) + ee_len - num;
2492 
2493         /*
2494          * We free the partial cluster at the end of the extent (if any),
2495          * unless the cluster is used by another extent (partial_cluster
2496          * state is nofree).  If a partial cluster exists here, it must be
2497          * shared with the last block in the extent.
2498          */
2499         flags = get_default_free_blocks_flags(inode);
2500 
2501         /* partial, left end cluster aligned, right end unaligned */
2502         if ((EXT4_LBLK_COFF(sbi, to) != sbi->s_cluster_ratio - 1) &&
2503             (EXT4_LBLK_CMASK(sbi, to) >= from) &&
2504             (partial->state != nofree)) {
2505                 if (ext4_is_pending(inode, to))
2506                         flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
2507                 ext4_free_blocks(handle, inode, NULL,
2508                                  EXT4_PBLK_CMASK(sbi, last_pblk),
2509                                  sbi->s_cluster_ratio, flags);
2510                 if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
2511                         ext4_rereserve_cluster(inode, to);
2512                 partial->state = initial;
2513                 flags = get_default_free_blocks_flags(inode);
2514         }
2515 
2516         flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
2517 
2518         /*
2519          * For bigalloc file systems, we never free a partial cluster
2520          * at the beginning of the extent.  Instead, we check to see if we
2521          * need to free it on a subsequent call to ext4_remove_blocks,
2522          * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space.
2523          */
2524         flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
2525         ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
2526 
2527         /* reset the partial cluster if we've freed past it */
2528         if (partial->state != initial && partial->pclu != EXT4_B2C(sbi, pblk))
2529                 partial->state = initial;
2530 
2531         /*
2532          * If we've freed the entire extent but the beginning is not left
2533          * cluster aligned and is not marked as ineligible for freeing we
2534          * record the partial cluster at the beginning of the extent.  It
2535          * wasn't freed by the preceding ext4_free_blocks() call, and we
2536          * need to look farther to the left to determine if it's to be freed
2537          * (not shared with another extent). Else, reset the partial
2538          * cluster - we're either done freeing or the beginning of the
2539          * extent is left cluster aligned.
2540          */
2541         if (EXT4_LBLK_COFF(sbi, from) && num == ee_len) {
2542                 if (partial->state == initial) {
2543                         partial->pclu = EXT4_B2C(sbi, pblk);
2544                         partial->lblk = from;
2545                         partial->state = tofree;
2546                 }
2547         } else {
2548                 partial->state = initial;
2549         }
2550 
2551         return 0;
2552 }
2553 
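/*
 * Sketch of the bigalloc alignment helpers used below, assuming the cluster
 * ratio is a power of two (hypothetical stand-ins for EXT4_LBLK_COFF and
 * EXT4_LBLK_CMASK).  With a cluster ratio of 16, block 37 has an in-cluster
 * offset of 5 and rounds down to block 32.
 */
#include <stdint.h>

static uint32_t lblk_cluster_offset_demo(uint32_t lblk, uint32_t ratio)
{
	return lblk & (ratio - 1);	/* offset inside the cluster */
}

static uint32_t lblk_cluster_start_demo(uint32_t lblk, uint32_t ratio)
{
	return lblk & ~(ratio - 1);	/* first block of the cluster */
}
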
2554 /*
2555  * ext4_ext_rm_leaf() Removes the extents associated with the
2556  * blocks appearing between "start" and "end".  Both "start"
2557  * and "end" must appear in the same extent or EIO is returned.
2558  *
2559  * @handle: The journal handle
2560  * @inode:  The file's inode
2561  * @path:   The path to the leaf
2562  * @partial: The cluster which we'll have to free if all extents
2563  *           have been released from it.  However, if its state is
2564  *           nofree, it's a cluster just to the right of the
2565  *           punched region and it must not be freed.
2566  * @start:  The first block to remove
2567  * @end:   The last block to remove
2568  */
2569 static int
2570 ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2571                  struct ext4_ext_path *path,
2572                  struct partial_cluster *partial,
2573                  ext4_lblk_t start, ext4_lblk_t end)
2574 {
2575         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2576         int err = 0, correct_index = 0;
2577         int depth = ext_depth(inode), credits, revoke_credits;
2578         struct ext4_extent_header *eh;
2579         ext4_lblk_t a, b;
2580         unsigned num;
2581         ext4_lblk_t ex_ee_block;
2582         unsigned short ex_ee_len;
2583         unsigned unwritten = 0;
2584         struct ext4_extent *ex;
2585         ext4_fsblk_t pblk;
2586 
2587         /* the header must have been checked already in ext4_ext_remove_space() */
2588         ext_debug(inode, "truncate since %u in leaf to %u\n", start, end);
2589         if (!path[depth].p_hdr)
2590                 path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
2591         eh = path[depth].p_hdr;
2592         if (unlikely(path[depth].p_hdr == NULL)) {
2593                 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
2594                 return -EFSCORRUPTED;
2595         }
2596         /* find where to start removing */
2597         ex = path[depth].p_ext;
2598         if (!ex)
2599                 ex = EXT_LAST_EXTENT(eh);
2600 
2601         ex_ee_block = le32_to_cpu(ex->ee_block);
2602         ex_ee_len = ext4_ext_get_actual_len(ex);
2603 
2604         trace_ext4_ext_rm_leaf(inode, start, ex, partial);
2605 
2606         while (ex >= EXT_FIRST_EXTENT(eh) &&
2607                         ex_ee_block + ex_ee_len > start) {
2608 
2609                 if (ext4_ext_is_unwritten(ex))
2610                         unwritten = 1;
2611                 else
2612                         unwritten = 0;
2613 
2614                 ext_debug(inode, "remove ext %u:[%d]%d\n", ex_ee_block,
2615                           unwritten, ex_ee_len);
2616                 path[depth].p_ext = ex;
2617 
2618                 a = max(ex_ee_block, start);
2619                 b = min(ex_ee_block + ex_ee_len - 1, end);
2620 
2621                 ext_debug(inode, "  border %u:%u\n", a, b);
2622 
2623                 /* If this extent is beyond the end of the hole, skip it */
2624                 if (end < ex_ee_block) {
2625                         /*
2626                          * We're going to skip this extent and move to another,
2627                          * so note that its first cluster is in use to avoid
2628                          * freeing it when removing blocks.  Eventually, the
2629                          * right edge of the truncated/punched region will
2630                          * be just to the left.
2631                          */
2632                         if (sbi->s_cluster_ratio > 1) {
2633                                 pblk = ext4_ext_pblock(ex);
2634                                 partial->pclu = EXT4_B2C(sbi, pblk);
2635                                 partial->state = nofree;
2636                         }
2637                         ex--;
2638                         ex_ee_block = le32_to_cpu(ex->ee_block);
2639                         ex_ee_len = ext4_ext_get_actual_len(ex);
2640                         continue;
2641                 } else if (b != ex_ee_block + ex_ee_len - 1) {
2642                         EXT4_ERROR_INODE(inode,
2643                                          "can not handle truncate %u:%u "
2644                                          "on extent %u:%u",
2645                                          start, end, ex_ee_block,
2646                                          ex_ee_block + ex_ee_len - 1);
2647                         err = -EFSCORRUPTED;
2648                         goto out;
2649                 } else if (a != ex_ee_block) {
2650                         /* remove tail of the extent */
2651                         num = a - ex_ee_block;
2652                 } else {
2653                         /* remove whole extent: excellent! */
2654                         num = 0;
2655                 }
2656                 /*
2657                  * 3 for leaf, sb, and inode plus 2 (bmap and group
2658                  * descriptor) for each block group; assume two block
2659                  * groups plus ex_ee_len/blocks_per_block_group for
2660                  * the worst case
2661                  */
2662                 credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb));
2663                 if (ex == EXT_FIRST_EXTENT(eh)) {
2664                         correct_index = 1;
2665                         credits += (ext_depth(inode)) + 1;
2666                 }
2667                 credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
2668                 /*
2669                  * We may end up freeing some index blocks and data from the
2670                  * punched range. Note that partial clusters are accounted for
2671                  * by ext4_free_data_revoke_credits().
2672                  */
2673                 revoke_credits =
2674                         ext4_free_metadata_revoke_credits(inode->i_sb,
2675                                                           ext_depth(inode)) +
2676                         ext4_free_data_revoke_credits(inode, b - a + 1);
2677 
2678                 err = ext4_datasem_ensure_credits(handle, inode, credits,
2679                                                   credits, revoke_credits);
2680                 if (err) {
2681                         if (err > 0)
2682                                 err = -EAGAIN;
2683                         goto out;
2684                 }
2685 
2686                 err = ext4_ext_get_access(handle, inode, path + depth);
2687                 if (err)
2688                         goto out;
2689 
2690                 err = ext4_remove_blocks(handle, inode, ex, partial, a, b);
2691                 if (err)
2692                         goto out;
2693 
2694                 if (num == 0)
2695                         /* this extent is removed; mark slot entirely unused */
2696                         ext4_ext_store_pblock(ex, 0);
2697 
2698                 ex->ee_len = cpu_to_le16(num);
2699                 /*
2700                  * Do not mark unwritten if all the blocks in the
2701                  * extent have been removed.
2702                  */
2703                 if (unwritten && num)
2704                         ext4_ext_mark_unwritten(ex);
2705                 /*
2706                  * If the extent was completely released,
2707                  * we need to remove it from the leaf
2708                  */
2709                 if (num == 0) {
2710                         if (end != EXT_MAX_BLOCKS - 1) {
2711                                 /*
2712                                  * For hole punching, we need to scoot all the
2713                                  * extents up when an extent is removed so that
2714                                  * we don't have blank extents in the middle
2715                                  */
2716                                 memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) *
2717                                         sizeof(struct ext4_extent));
2718 
2719                                 /* Now get rid of the one at the end */
2720                                 memset(EXT_LAST_EXTENT(eh), 0,
2721                                         sizeof(struct ext4_extent));
2722                         }
2723                         le16_add_cpu(&eh->eh_entries, -1);
2724                 }
2725 
2726                 err = ext4_ext_dirty(handle, inode, path + depth);
2727                 if (err)
2728                         goto out;
2729 
2730                 ext_debug(inode, "new extent: %u:%u:%llu\n", ex_ee_block, num,
2731                                 ext4_ext_pblock(ex));
2732                 ex--;
2733                 ex_ee_block = le32_to_cpu(ex->ee_block);
2734                 ex_ee_len = ext4_ext_get_actual_len(ex);
2735         }
2736 
2737         if (correct_index && eh->eh_entries)
2738                 err = ext4_ext_correct_indexes(handle, inode, path);
2739 
2740         /*
2741          * If there's a partial cluster and at least one extent remains in
2742          * the leaf, free the partial cluster if it isn't shared with the
2743          * current extent.  If it is shared with the current extent
2744          * we reset the partial cluster because we've reached the start of the
2745          * truncated/punched region and we're done removing blocks.
2746          */
2747         if (partial->state == tofree && ex >= EXT_FIRST_EXTENT(eh)) {
2748                 pblk = ext4_ext_pblock(ex) + ex_ee_len - 1;
2749                 if (partial->pclu != EXT4_B2C(sbi, pblk)) {
2750                         int flags = get_default_free_blocks_flags(inode);
2751 
2752                         if (ext4_is_pending(inode, partial->lblk))
2753                                 flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
2754                         ext4_free_blocks(handle, inode, NULL,
2755                                          EXT4_C2B(sbi, partial->pclu),
2756                                          sbi->s_cluster_ratio, flags);
2757                         if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
2758                                 ext4_rereserve_cluster(inode, partial->lblk);
2759                 }
2760                 partial->state = initial;
2761         }
2762 
2763         /* if this leaf is free, then we should
2764          * remove it from index block above */
2765         if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
2766                 err = ext4_ext_rm_idx(handle, inode, path, depth);
2767 
2768 out:
2769         return err;
2770 }
2771 
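/*
 * Illustrative worked example (not part of the kernel source): the per-extent
 * credit estimate used in the removal loop above.  With 4KiB blocks a block
 * group holds 32768 blocks, so removing a 100000-block extent assumes
 *
 *     credits = 7 + 2 * (100000 / 32768) = 7 + 2 * 3 = 13
 *
 * plus (ext_depth(inode) + 1) when the first extent in the leaf is touched
 * (the indexes above may need correcting) and the quota credits from
 * EXT4_MAXQUOTAS_TRANS_BLOCKS(); revoke credits are estimated separately for
 * the freed metadata and data blocks.
 */
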
2772 /*
2773  * ext4_ext_more_to_rm:
2774  * returns 1 if current index has to be freed (even partial)
2775  */
2776 static int
2777 ext4_ext_more_to_rm(struct ext4_ext_path *path)
2778 {
2779         BUG_ON(path->p_idx == NULL);
2780 
2781         if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
2782                 return 0;
2783 
2784         /*
2785          * If the deeper level was truncated completely rather than partially,
2786          * eh_entries changed and the current index must be considered too.
2787          */
2788         if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block)
2789                 return 0;
2790         return 1;
2791 }
2792 
2793 int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
2794                           ext4_lblk_t end)
2795 {
2796         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2797         int depth = ext_depth(inode);
2798         struct ext4_ext_path *path = NULL;
2799         struct partial_cluster partial;
2800         handle_t *handle;
2801         int i = 0, err = 0;
2802 
2803         partial.pclu = 0;
2804         partial.lblk = 0;
2805         partial.state = initial;
2806 
2807         ext_debug(inode, "truncate since %u to %u\n", start, end);
2808 
2809         /* probably the first extent we are going to free will be the last in its block */
2810         handle = ext4_journal_start_with_revoke(inode, EXT4_HT_TRUNCATE,
2811                         depth + 1,
2812                         ext4_free_metadata_revoke_credits(inode->i_sb, depth));
2813         if (IS_ERR(handle))
2814                 return PTR_ERR(handle);
2815 
2816 again:
2817         trace_ext4_ext_remove_space(inode, start, end, depth);
2818 
2819         /*
2820          * Check if we are removing extents inside the extent tree. If that
2821          * is the case, we are going to punch a hole inside the extent tree
2822          * so we have to check whether we need to split the extent covering
2823          * the last block to remove so we can easily remove the part of it
2824          * in ext4_ext_rm_leaf().
2825          */
2826         if (end < EXT_MAX_BLOCKS - 1) {
2827                 struct ext4_extent *ex;
2828                 ext4_lblk_t ee_block, ex_end, lblk;
2829                 ext4_fsblk_t pblk;
2830 
2831                 /* find extent for or closest extent to this block */
2832                 path = ext4_find_extent(inode, end, NULL,
2833                                         EXT4_EX_NOCACHE | EXT4_EX_NOFAIL);
2834                 if (IS_ERR(path)) {
2835                         ext4_journal_stop(handle);
2836                         return PTR_ERR(path);
2837                 }
2838                 depth = ext_depth(inode);
2839                 /* Leaf may not exist only if the inode has no blocks at all */
2840                 ex = path[depth].p_ext;
2841                 if (!ex) {
2842                         if (depth) {
2843                                 EXT4_ERROR_INODE(inode,
2844                                                  "path[%d].p_hdr == NULL",
2845                                                  depth);
2846                                 err = -EFSCORRUPTED;
2847                         }
2848                         goto out;
2849                 }
2850 
2851                 ee_block = le32_to_cpu(ex->ee_block);
2852                 ex_end = ee_block + ext4_ext_get_actual_len(ex) - 1;
2853 
2854                 /*
2855                  * See if the last block is inside the extent, if so split
2856                  * the extent at 'end' block so we can easily remove the
2857                  * tail of the first part of the split extent in
2858                  * ext4_ext_rm_leaf().
2859                  */
2860                 if (end >= ee_block && end < ex_end) {
2861 
2862                         /*
2863                          * If we're going to split the extent, note that
2864                          * the cluster containing the block after 'end' is
2865                          * in use to avoid freeing it when removing blocks.
2866                          */
2867                         if (sbi->s_cluster_ratio > 1) {
2868                                 pblk = ext4_ext_pblock(ex) + end - ee_block + 1;
2869                                 partial.pclu = EXT4_B2C(sbi, pblk);
2870                                 partial.state = nofree;
2871                         }
2872 
2873                         /*
2874                          * Split the extent in two so that 'end' is the last
2875                          * block in the first new extent. Also we should not
2876                          * fail removing space due to ENOSPC so try to use
2877                          * reserved block if that happens.
2878                          */
2879                         err = ext4_force_split_extent_at(handle, inode, &path,
2880                                                          end + 1, 1);
2881                         if (err < 0)
2882                                 goto out;
2883 
2884                 } else if (sbi->s_cluster_ratio > 1 && end >= ex_end &&
2885                            partial.state == initial) {
2886                         /*
2887                          * If we're punching, there's an extent to the right.
2888                          * If the partial cluster hasn't been set, set it to
2889                          * that extent's first cluster and its state to nofree
2890                          * so it won't be freed should it contain blocks to be
2891                          * removed. If it's already set (tofree/nofree), we're
2892                          * retrying and keep the original partial cluster info
2893                          * so a cluster marked tofree as a result of earlier
2894                          * extent removal is not lost.
2895                          */
2896                         lblk = ex_end + 1;
2897                         err = ext4_ext_search_right(inode, path, &lblk, &pblk,
2898                                                     NULL);
2899                         if (err < 0)
2900                                 goto out;
2901                         if (pblk) {
2902                                 partial.pclu = EXT4_B2C(sbi, pblk);
2903                                 partial.state = nofree;
2904                         }
2905                 }
2906         }
2907         /*
2908          * We start scanning from the right side, freeing all the blocks
2909          * after i_size and walking into the tree depth-wise.
2910          */
2911         depth = ext_depth(inode);
2912         if (path) {
2913                 int k = i = depth;
2914                 while (--k > 0)
2915                         path[k].p_block =
2916                                 le16_to_cpu(path[k].p_hdr->eh_entries)+1;
2917         } else {
2918                 path = kcalloc(depth + 1, sizeof(struct ext4_ext_path),
2919                                GFP_NOFS | __GFP_NOFAIL);
2920                 if (path == NULL) {
2921                         ext4_journal_stop(handle);
2922                         return -ENOMEM;
2923                 }
2924                 path[0].p_maxdepth = path[0].p_depth = depth;
2925                 path[0].p_hdr = ext_inode_hdr(inode);
2926                 i = 0;
2927 
2928                 if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) {
2929                         err = -EFSCORRUPTED;
2930                         goto out;
2931                 }
2932         }
2933         err = 0;
2934 
2935         while (i >= 0 && err == 0) {
2936                 if (i == depth) {
2937                         /* this is leaf block */
2938                         err = ext4_ext_rm_leaf(handle, inode, path,
2939                                                &partial, start, end);
2940                         /* root level has p_bh == NULL, brelse() eats this */
2941                         brelse(path[i].p_bh);
2942                         path[i].p_bh = NULL;
2943                         i--;
2944                         continue;
2945                 }
2946 
2947                 /* this is index block */
2948                 if (!path[i].p_hdr) {
2949                         ext_debug(inode, "initialize header\n");
2950                         path[i].p_hdr = ext_block_hdr(path[i].p_bh);
2951                 }
2952 
2953                 if (!path[i].p_idx) {
2954                         /* this level hasn't been touched yet */
2955                         path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
2956                         path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
2957                         ext_debug(inode, "init index ptr: hdr 0x%p, num %d\n",
2958                                   path[i].p_hdr,
2959                                   le16_to_cpu(path[i].p_hdr->eh_entries));
2960                 } else {
2961                         /* we were already here, see at next index */
2962                         path[i].p_idx--;
2963                 }
2964 
2965                 ext_debug(inode, "level %d - index, first 0x%p, cur 0x%p\n",
2966                                 i, EXT_FIRST_INDEX(path[i].p_hdr),
2967                                 path[i].p_idx);
2968                 if (ext4_ext_more_to_rm(path + i)) {
2969                         struct buffer_head *bh;
2970                         /* go to the next level */
2971                         ext_debug(inode, "move to level %d (block %llu)\n",
2972                                   i + 1, ext4_idx_pblock(path[i].p_idx));
2973                         memset(path + i + 1, 0, sizeof(*path));
2974                         bh = read_extent_tree_block(inode, path[i].p_idx,
2975                                                     depth - i - 1,
2976                                                     EXT4_EX_NOCACHE);
2977                         if (IS_ERR(bh)) {
2978                                 /* should we reset i_size? */
2979                                 err = PTR_ERR(bh);
2980                                 break;
2981                         }
2982                         /* Yield here to deal with large extent trees.
2983                          * Should be a no-op if we did IO above. */
2984                         cond_resched();
2985                         if (WARN_ON(i + 1 > depth)) {
2986                                 err = -EFSCORRUPTED;
2987                                 break;
2988                         }
2989                         path[i + 1].p_bh = bh;
2990 
2991                         /* save actual number of indexes since this
2992                          * number is changed at the next iteration */
2993                         path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries);
2994                         i++;
2995                 } else {
2996                         /* we finished processing this index, go up */
2997                         if (path[i].p_hdr->eh_entries == 0 && i > 0) {
2998                                 /* index is empty, remove it;
2999                                  * handle must already be prepared by the
3000                                  * leaf removal done above */
3001                                 err = ext4_ext_rm_idx(handle, inode, path, i);
3002                         }
3003                         /* root level has p_bh == NULL, brelse() eats this */
3004                         brelse(path[i].p_bh);
3005                         path[i].p_bh = NULL;
3006                         i--;
3007                         ext_debug(inode, "return to level %d\n", i);
3008                 }
3009         }
3010 
3011         trace_ext4_ext_remove_space_done(inode, start, end, depth, &partial,
3012                                          path->p_hdr->eh_entries);
3013 
3014         /*
3015          * if there's a partial cluster and we have removed the first extent
3016          * in the file, then we also free the partial cluster, if any
3017          */
3018         if (partial.state == tofree && err == 0) {
3019                 int flags = get_default_free_blocks_flags(inode);
3020 
3021                 if (ext4_is_pending(inode, partial.lblk))
3022                         flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
3023                 ext4_free_blocks(handle, inode, NULL,
3024                                  EXT4_C2B(sbi, partial.pclu),
3025                                  sbi->s_cluster_ratio, flags);
3026                 if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
3027                         ext4_rereserve_cluster(inode, partial.lblk);
3028                 partial.state = initial;
3029         }
3030 
3031         /* TODO: flexible tree reduction should be here */
3032         if (path->p_hdr->eh_entries == 0) {
3033                 /*
3034                  * truncate to zero freed all the tree,
3035                  * so we need to correct eh_depth
3036                  */
3037                 err = ext4_ext_get_access(handle, inode, path);
3038                 if (err == 0) {
3039                         ext_inode_hdr(inode)->eh_depth = 0;
3040                         ext_inode_hdr(inode)->eh_max =
3041                                 cpu_to_le16(ext4_ext_space_root(inode, 0));
3042                         err = ext4_ext_dirty(handle, inode, path);
3043                 }
3044         }
3045 out:
3046         ext4_free_ext_path(path);
3047         path = NULL;
3048         if (err == -EAGAIN)
3049                 goto again;
3050         ext4_journal_stop(handle);
3051 
3052         return err;
3053 }
3054 
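/*
 * Illustrative sketch (not part of the kernel source): the [start, end]
 * range above is inclusive at both ends.  A truncate-style caller such as
 * ext4_ext_truncate() removes everything from the first block past the new
 * i_size up to the end of the logical address space, roughly:
 */
#if 0	/* example only */
static int example_remove_tail(struct inode *inode, ext4_lblk_t first_unused)
{
	/* EXT_MAX_BLOCKS - 1 is the last valid logical block number */
	return ext4_ext_remove_space(inode, first_unused, EXT_MAX_BLOCKS - 1);
}
#endif
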
3055 /*
3056  * called at mount time
3057  */
3058 void ext4_ext_init(struct super_block *sb)
3059 {
3060         /*
3061          * possible initialization would be here
3062          */
3063 
3064         if (ext4_has_feature_extents(sb)) {
3065 #if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
3066                 printk(KERN_INFO "EXT4-fs: file extents enabled"
3067 #ifdef AGGRESSIVE_TEST
3068                        ", aggressive tests"
3069 #endif
3070 #ifdef CHECK_BINSEARCH
3071                        ", check binsearch"
3072 #endif
3073 #ifdef EXTENTS_STATS
3074                        ", stats"
3075 #endif
3076                        "\n");
3077 #endif
3078 #ifdef EXTENTS_STATS
3079                 spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
3080                 EXT4_SB(sb)->s_ext_min = 1 << 30;
3081                 EXT4_SB(sb)->s_ext_max = 0;
3082 #endif
3083         }
3084 }
3085 
3086 /*
3087  * called at umount time
3088  */
3089 void ext4_ext_release(struct super_block *sb)
3090 {
3091         if (!ext4_has_feature_extents(sb))
3092                 return;
3093 
3094 #ifdef EXTENTS_STATS
3095         if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) {
3096                 struct ext4_sb_info *sbi = EXT4_SB(sb);
3097                 printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n",
3098                         sbi->s_ext_blocks, sbi->s_ext_extents,
3099                         sbi->s_ext_blocks / sbi->s_ext_extents);
3100                 printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n",
3101                         sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
3102         }
3103 #endif
3104 }
3105 
3106 static void ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex)
3107 {
3108         ext4_lblk_t  ee_block;
3109         ext4_fsblk_t ee_pblock;
3110         unsigned int ee_len;
3111 
3112         ee_block  = le32_to_cpu(ex->ee_block);
3113         ee_len    = ext4_ext_get_actual_len(ex);
3114         ee_pblock = ext4_ext_pblock(ex);
3115 
3116         if (ee_len == 0)
3117                 return;
3118 
3119         ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock,
3120                               EXTENT_STATUS_WRITTEN);
3121 }
3122 
3123 /* FIXME!! we need to try to merge to left or right after zero-out  */
3124 static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
3125 {
3126         ext4_fsblk_t ee_pblock;
3127         unsigned int ee_len;
3128 
3129         ee_len    = ext4_ext_get_actual_len(ex);
3130         ee_pblock = ext4_ext_pblock(ex);
3131         return ext4_issue_zeroout(inode, le32_to_cpu(ex->ee_block), ee_pblock,
3132                                   ee_len);
3133 }
3134 
3135 /*
3136  * ext4_split_extent_at() splits an extent at given block.
3137  *
3138  * @handle: the journal handle
3139  * @inode: the file inode
3140  * @path: the path to the extent
3141  * @split: the logical block where the extent is split.
3142  * @split_flag: indicates whether the extent can be zeroed out if the split
3143  *              fails, and the states (initialized or unwritten) of the new extents.
3144  * @flags: flags used to insert new extent to extent tree.
3145  *
3146  *
3147  * Splits extent [a, b] into two extents [a, @split) and [@split, b], states
3148  * of which are determined by split_flag.
3149  *
3150  * There are two cases:
3151  *  a> the extent is split into two extents.
3152  *  b> no split is needed, and the extent is just marked.
3153  *
3154  * return 0 on success.
3155  */
3156 static int ext4_split_extent_at(handle_t *handle,
3157                              struct inode *inode,
3158                              struct ext4_ext_path **ppath,
3159                              ext4_lblk_t split,
3160                              int split_flag,
3161                              int flags)
3162 {
3163         struct ext4_ext_path *path = *ppath;
3164         ext4_fsblk_t newblock;
3165         ext4_lblk_t ee_block;
3166         struct ext4_extent *ex, newex, orig_ex, zero_ex;
3167         struct ext4_extent *ex2 = NULL;
3168         unsigned int ee_len, depth;
3169         int err = 0;
3170 
3171         BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) ==
3172                (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2));
3173 
3174         ext_debug(inode, "logical block %llu\n", (unsigned long long)split);
3175 
3176         ext4_ext_show_leaf(inode, path);
3177 
3178         depth = ext_depth(inode);
3179         ex = path[depth].p_ext;
3180         ee_block = le32_to_cpu(ex->ee_block);
3181         ee_len = ext4_ext_get_actual_len(ex);
3182         newblock = split - ee_block + ext4_ext_pblock(ex);
3183 
3184         BUG_ON(split < ee_block || split >= (ee_block + ee_len));
3185         BUG_ON(!ext4_ext_is_unwritten(ex) &&
3186                split_flag & (EXT4_EXT_MAY_ZEROOUT |
3187                              EXT4_EXT_MARK_UNWRIT1 |
3188                              EXT4_EXT_MARK_UNWRIT2));
3189 
3190         err = ext4_ext_get_access(handle, inode, path + depth);
3191         if (err)
3192                 goto out;
3193 
3194         if (split == ee_block) {
3195                 /*
3196                  * case b: block @split is the block that the extent begins with
3197                  * then we just change the state of the extent, and splitting
3198                  * is not needed.
3199                  */
3200                 if (split_flag & EXT4_EXT_MARK_UNWRIT2)
3201                         ext4_ext_mark_unwritten(ex);
3202                 else
3203                         ext4_ext_mark_initialized(ex);
3204 
3205                 if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
3206                         ext4_ext_try_to_merge(handle, inode, path, ex);
3207 
3208                 err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3209                 goto out;
3210         }
3211 
3212         /* case a */
3213         memcpy(&orig_ex, ex, sizeof(orig_ex));
3214         ex->ee_len = cpu_to_le16(split - ee_block);
3215         if (split_flag & EXT4_EXT_MARK_UNWRIT1)
3216                 ext4_ext_mark_unwritten(ex);
3217 
3218         /*
3219          * path may lead to a new leaf, not to the original leaf any more
3220          * after ext4_ext_insert_extent() returns.
3221          */
3222         err = ext4_ext_dirty(handle, inode, path + depth);
3223         if (err)
3224                 goto fix_extent_len;
3225 
3226         ex2 = &newex;
3227         ex2->ee_block = cpu_to_le32(split);
3228         ex2->ee_len   = cpu_to_le16(ee_len - (split - ee_block));
3229         ext4_ext_store_pblock(ex2, newblock);
3230         if (split_flag & EXT4_EXT_MARK_UNWRIT2)
3231                 ext4_ext_mark_unwritten(ex2);
3232 
3233         err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags);
3234         if (err != -ENOSPC && err != -EDQUOT && err != -ENOMEM)
3235                 goto out;
3236 
3237         /*
3238          * Updating the path is required because the previous ext4_ext_insert_extent()
3239          * may have freed or reallocated it. Using EXT4_EX_NOFAIL
3240          * guarantees that ext4_find_extent() will not return -ENOMEM,
3241          * otherwise -ENOMEM will cause a retry in do_writepages(), and a
3242          * WARN_ON may be triggered in ext4_da_update_reserve_space() due to
3243          * an incorrect ee_len causing the i_reserved_data_blocks exception.
3244          */
3245         path = ext4_find_extent(inode, ee_block, ppath,
3246                                 flags | EXT4_EX_NOFAIL);
3247         if (IS_ERR(path)) {
3248                 EXT4_ERROR_INODE(inode, "Failed split extent on %u, err %ld",
3249                                  split, PTR_ERR(path));
3250                 return PTR_ERR(path);
3251         }
3252         depth = ext_depth(inode);
3253         ex = path[depth].p_ext;
3254 
3255         if (EXT4_EXT_MAY_ZEROOUT & split_flag) {
3256                 if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) {
3257                         if (split_flag & EXT4_EXT_DATA_VALID1) {
3258                                 err = ext4_ext_zeroout(inode, ex2);
3259                                 zero_ex.ee_block = ex2->ee_block;
3260                                 zero_ex.ee_len = cpu_to_le16(
3261                                                 ext4_ext_get_actual_len(ex2));
3262                                 ext4_ext_store_pblock(&zero_ex,
3263                                                       ext4_ext_pblock(ex2));
3264                         } else {
3265                                 err = ext4_ext_zeroout(inode, ex);
3266                                 zero_ex.ee_block = ex->ee_block;
3267                                 zero_ex.ee_len = cpu_to_le16(
3268                                                 ext4_ext_get_actual_len(ex));
3269                                 ext4_ext_store_pblock(&zero_ex,
3270                                                       ext4_ext_pblock(ex));
3271                         }
3272                 } else {
3273                         err = ext4_ext_zeroout(inode, &orig_ex);
3274                         zero_ex.ee_block = orig_ex.ee_block;
3275                         zero_ex.ee_len = cpu_to_le16(
3276                                                 ext4_ext_get_actual_len(&orig_ex));
3277                         ext4_ext_store_pblock(&zero_ex,
3278                                               ext4_ext_pblock(&orig_ex));
3279                 }
3280 
3281                 if (!err) {
3282                         /* update the extent length and mark as initialized */
3283                         ex->ee_len = cpu_to_le16(ee_len);
3284                         ext4_ext_try_to_merge(handle, inode, path, ex);
3285                         err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3286                         if (!err)
3287                                 /* update extent status tree */
3288                                 ext4_zeroout_es(inode, &zero_ex);
3289                         /* If we failed at this point, we don't know in which
3290                          * state the extent tree exactly is so don't try to fix
3291                          * length of the original extent as it may do even more
3292                          * damage.
3293                          */
3294                         goto out;
3295                 }
3296         }
3297 
3298 fix_extent_len:
3299         ex->ee_len = orig_ex.ee_len;
3300         /*
3301          * Ignore ext4_ext_dirty return value since we are already in error path
3302          * and err is a non-zero error code.
3303          */
3304         ext4_ext_dirty(handle, inode, path + path->p_depth);
3305         return err;
3306 out:
3307         ext4_ext_show_leaf(inode, *ppath);
3308         return err;
3309 }
3310 
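/*
 * Illustrative worked example (not part of the kernel source) of case a>
 * above: splitting an unwritten extent covering logical blocks 100..199 at
 * physical block 5000 with @split == 150 and both MARK_UNWRIT flags set
 * leaves two unwritten extents:
 *
 *     ex  : ee_block = 100, ee_len = 50, pblk = 5000
 *     ex2 : ee_block = 150, ee_len = 50, pblk = 5050  (newblock = 150 - 100 + 5000)
 *
 * If inserting ex2 fails with ENOSPC and EXT4_EXT_MAY_ZEROOUT is set, the
 * affected half (or the whole original extent) is zeroed out instead and the
 * original extent length is restored.
 */
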
3311 /*
3312  * ext4_split_extent() splits an extent and marks the extent covered
3313  * by @map as @split_flag indicates.
3314  *
3315  * It may result in splitting the extent into multiple extents (up to three)
3316  * There are three possibilities:
3317  *   a> There is no split required
3318  *   b> Splits in two extents: Split is happening at either end of the extent
3319  *   c> Splits in three extents: Someone is splitting in the middle of the extent
3320  *
3321  */
3322 static int ext4_split_extent(handle_t *handle,
3323                               struct inode *inode,
3324                               struct ext4_ext_path **ppath,
3325                               struct ext4_map_blocks *map,
3326                               int split_flag,
3327                               int flags)
3328 {
3329         struct ext4_ext_path *path = *ppath;
3330         ext4_lblk_t ee_block;
3331         struct ext4_extent *ex;
3332         unsigned int ee_len, depth;
3333         int err = 0;
3334         int unwritten;
3335         int split_flag1, flags1;
3336         int allocated = map->m_len;
3337 
3338         depth = ext_depth(inode);
3339         ex = path[depth].p_ext;
3340         ee_block = le32_to_cpu(ex->ee_block);
3341         ee_len = ext4_ext_get_actual_len(ex);
3342         unwritten = ext4_ext_is_unwritten(ex);
3343 
3344         if (map->m_lblk + map->m_len < ee_block + ee_len) {
3345                 split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT;
3346                 flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
3347                 if (unwritten)
3348                         split_flag1 |= EXT4_EXT_MARK_UNWRIT1 |
3349                                        EXT4_EXT_MARK_UNWRIT2;
3350                 if (split_flag & EXT4_EXT_DATA_VALID2)
3351                         split_flag1 |= EXT4_EXT_DATA_VALID1;
3352                 err = ext4_split_extent_at(handle, inode, ppath,
3353                                 map->m_lblk + map->m_len, split_flag1, flags1);
3354                 if (err)
3355                         goto out;
3356         } else {
3357                 allocated = ee_len - (map->m_lblk - ee_block);
3358         }
3359         /*
3360          * Updating the path is required because the previous ext4_split_extent_at()
3361          * may have split the original leaf or zeroed out the extent.
3362          */
3363         path = ext4_find_extent(inode, map->m_lblk, ppath, flags);
3364         if (IS_ERR(path))
3365                 return PTR_ERR(path);
3366         depth = ext_depth(inode);
3367         ex = path[depth].p_ext;
3368         if (!ex) {
3369                 EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
3370                                  (unsigned long) map->m_lblk);
3371                 return -EFSCORRUPTED;
3372         }
3373         unwritten = ext4_ext_is_unwritten(ex);
3374 
3375         if (map->m_lblk >= ee_block) {
3376                 split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
3377                 if (unwritten) {
3378                         split_flag1 |= EXT4_EXT_MARK_UNWRIT1;
3379                         split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT |
3380                                                      EXT4_EXT_MARK_UNWRIT2);
3381                 }
3382                 err = ext4_split_extent_at(handle, inode, ppath,
3383                                 map->m_lblk, split_flag1, flags);
3384                 if (err)
3385                         goto out;
3386         }
3387 
3388         ext4_ext_show_leaf(inode, *ppath);
3389 out:
3390         return err ? err : allocated;
3391 }
3392 
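/*
 * Illustrative worked example (not part of the kernel source) of case c>
 * above: the three-way split is done as two ext4_split_extent_at() calls.
 * For an extent covering blocks 100..199 and a map of blocks 130..149, the
 * first call splits at block 150 (the end of the mapped range) and the
 * second at block 130, leaving 100..129, 130..149 and 150..199; the middle
 * piece is the one covered by @map.
 */
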
3393 /*
3394  * This function is called by ext4_ext_map_blocks() if someone tries to write
3395  * to an unwritten extent. It may result in splitting the unwritten
3396  * extent into multiple extents (up to three - one initialized and two
3397  * unwritten).
3398  * There are three possibilities:
3399  *   a> There is no split required: Entire extent should be initialized
3400  *   b> Splits in two extents: Write is happening at either end of the extent
3401  *   c> Splits in three extents: Someone is writing in the middle of the extent
3402  *
3403  * Pre-conditions:
3404  *  - The extent pointed to by 'path' is unwritten.
3405  *  - The extent pointed to by 'path' contains a superset
3406  *    of the logical span [map->m_lblk, map->m_lblk + map->m_len).
3407  *
3408  * Post-conditions on success:
3409  *  - the returned value is the number of blocks beyond map->m_lblk
3410  *    that are allocated and initialized.
3411  *    It is guaranteed to be >= map->m_len.
3412  */
3413 static int ext4_ext_convert_to_initialized(handle_t *handle,
3414                                            struct inode *inode,
3415                                            struct ext4_map_blocks *map,
3416                                            struct ext4_ext_path **ppath,
3417                                            int flags)
3418 {
3419         struct ext4_ext_path *path = *ppath;
3420         struct ext4_sb_info *sbi;
3421         struct ext4_extent_header *eh;
3422         struct ext4_map_blocks split_map;
3423         struct ext4_extent zero_ex1, zero_ex2;
3424         struct ext4_extent *ex, *abut_ex;
3425         ext4_lblk_t ee_block, eof_block;
3426         unsigned int ee_len, depth, map_len = map->m_len;
3427         int err = 0;
3428         int split_flag = EXT4_EXT_DATA_VALID2;
3429         int allocated = 0;
3430         unsigned int max_zeroout = 0;
3431 
3432         ext_debug(inode, "logical block %llu, max_blocks %u\n",
3433                   (unsigned long long)map->m_lblk, map_len);
3434 
3435         sbi = EXT4_SB(inode->i_sb);
3436         eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1)
3437                         >> inode->i_sb->s_blocksize_bits;
3438         if (eof_block < map->m_lblk + map_len)
3439                 eof_block = map->m_lblk + map_len;
3440 
3441         depth = ext_depth(inode);
3442         eh = path[depth].p_hdr;
3443         ex = path[depth].p_ext;
3444         ee_block = le32_to_cpu(ex->ee_block);
3445         ee_len = ext4_ext_get_actual_len(ex);
3446         zero_ex1.ee_len = 0;
3447         zero_ex2.ee_len = 0;
3448 
3449         trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
3450 
3451         /* Pre-conditions */
3452         BUG_ON(!ext4_ext_is_unwritten(ex));
3453         BUG_ON(!in_range(map->m_lblk, ee_block, ee_len));
3454 
3455         /*
3456          * Attempt to transfer newly initialized blocks from the currently
3457          * unwritten extent to its neighbor. This is much cheaper
3458          * than an insertion followed by a merge as those involve costly
3459          * memmove() calls. Transferring to the left is the common case in
3460          * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE)
3461          * followed by append writes.
3462          *
3463          * Limitations of the current logic:
3464          *  - L1: we do not deal with writes covering the whole extent.
3465          *    This would require removing the extent if the transfer
3466          *    is possible.
3467          *  - L2: we only attempt to merge with an extent stored in the
3468          *    same extent tree node.
3469          */
3470         if ((map->m_lblk == ee_block) &&
3471                 /* See if we can merge left */
3472                 (map_len < ee_len) &&           /*L1*/
3473                 (ex > EXT_FIRST_EXTENT(eh))) {  /*L2*/
3474                 ext4_lblk_t prev_lblk;
3475                 ext4_fsblk_t prev_pblk, ee_pblk;
3476                 unsigned int prev_len;
3477 
3478                 abut_ex = ex - 1;
3479                 prev_lblk = le32_to_cpu(abut_ex->ee_block);
3480                 prev_len = ext4_ext_get_actual_len(abut_ex);
3481                 prev_pblk = ext4_ext_pblock(abut_ex);
3482                 ee_pblk = ext4_ext_pblock(ex);
3483 
3484                 /*
3485                  * A transfer of blocks from 'ex' to 'abut_ex' is allowed
3486                  * upon those conditions:
3487                  * - C1: abut_ex is initialized,
3488                  * - C2: abut_ex is logically abutting ex,
3489                  * - C3: abut_ex is physically abutting ex,
3490                  * - C4: abut_ex can receive the additional blocks without
3491                  *   overflowing the (initialized) length limit.
3492                  */
3493                 if ((!ext4_ext_is_unwritten(abut_ex)) &&                /*C1*/
3494                         ((prev_lblk + prev_len) == ee_block) &&         /*C2*/
3495                         ((prev_pblk + prev_len) == ee_pblk) &&          /*C3*/
3496                         (prev_len < (EXT_INIT_MAX_LEN - map_len))) {    /*C4*/
3497                         err = ext4_ext_get_access(handle, inode, path + depth);
3498                         if (err)
3499                                 goto out;
3500 
3501                         trace_ext4_ext_convert_to_initialized_fastpath(inode,
3502                                 map, ex, abut_ex);
3503 
3504                         /* Shift the start of ex by 'map_len' blocks */
3505                         ex->ee_block = cpu_to_le32(ee_block + map_len);
3506                         ext4_ext_store_pblock(ex, ee_pblk + map_len);
3507                         ex->ee_len = cpu_to_le16(ee_len - map_len);
3508                         ext4_ext_mark_unwritten(ex); /* Restore the flag */
3509 
3510                         /* Extend abut_ex by 'map_len' blocks */
3511                         abut_ex->ee_len = cpu_to_le16(prev_len + map_len);
3512 
3513                         /* Result: number of initialized blocks past m_lblk */
3514                         allocated = map_len;
3515                 }
3516         } else if (((map->m_lblk + map_len) == (ee_block + ee_len)) &&
3517                    (map_len < ee_len) &&        /*L1*/
3518                    ex < EXT_LAST_EXTENT(eh)) {  /*L2*/
3519                 /* See if we can merge right */
3520                 ext4_lblk_t next_lblk;
3521                 ext4_fsblk_t next_pblk, ee_pblk;
3522                 unsigned int next_len;
3523 
3524                 abut_ex = ex + 1;
3525                 next_lblk = le32_to_cpu(abut_ex->ee_block);
3526                 next_len = ext4_ext_get_actual_len(abut_ex);
3527                 next_pblk = ext4_ext_pblock(abut_ex);
3528                 ee_pblk = ext4_ext_pblock(ex);
3529 
3530                 /*
3531                  * A transfer of blocks from 'ex' to 'abut_ex' is allowed
3532                  * upon those conditions:
3533                  * - C1: abut_ex is initialized,
3534                  * - C2: abut_ex is logically abutting ex,
3535                  * - C3: abut_ex is physically abutting ex,
3536                  * - C4: abut_ex can receive the additional blocks without
3537                  *   overflowing the (initialized) length limit.
3538                  */
3539                 if ((!ext4_ext_is_unwritten(abut_ex)) &&                /*C1*/
3540                     ((map->m_lblk + map_len) == next_lblk) &&           /*C2*/
3541                     ((ee_pblk + ee_len) == next_pblk) &&                /*C3*/
3542                     (next_len < (EXT_INIT_MAX_LEN - map_len))) {        /*C4*/
3543                         err = ext4_ext_get_access(handle, inode, path + depth);
3544                         if (err)
3545                                 goto out;
3546 
3547                         trace_ext4_ext_convert_to_initialized_fastpath(inode,
3548                                 map, ex, abut_ex);
3549 
3550                         /* Shift the start of abut_ex by 'map_len' blocks */
3551                         abut_ex->ee_block = cpu_to_le32(next_lblk - map_len);
3552                         ext4_ext_store_pblock(abut_ex, next_pblk - map_len);
3553                         ex->ee_len = cpu_to_le16(ee_len - map_len);
3554                         ext4_ext_mark_unwritten(ex); /* Restore the flag */
3555 
3556                         /* Extend abut_ex by 'map_len' blocks */
3557                         abut_ex->ee_len = cpu_to_le16(next_len + map_len);
3558 
3559                         /* Result: number of initialized blocks past m_lblk */
3560                         allocated = map_len;
3561                 }
3562         }
3563         if (allocated) {
3564                 /* Mark the block containing both extents as dirty */
3565                 err = ext4_ext_dirty(handle, inode, path + depth);
3566 
3567                 /* Update path to point to the right extent */
3568                 path[depth].p_ext = abut_ex;
3569                 goto out;
3570         } else
3571                 allocated = ee_len - (map->m_lblk - ee_block);
3572 
3573         WARN_ON(map->m_lblk < ee_block);
3574         /*
3575          * It is safe to convert extent to initialized via explicit
3576          * zeroout only if extent is fully inside i_size or new_size.
3577          */
3578         split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
3579 
3580         if (EXT4_EXT_MAY_ZEROOUT & split_flag)
3581                 max_zeroout = sbi->s_extent_max_zeroout_kb >>
3582                         (inode->i_sb->s_blocksize_bits - 10);
3583 
3584         /*
3585          * five cases:
3586          * 1. split the extent into three extents.
3587          * 2. split the extent into two extents, zeroout the head of the first
3588          *    extent.
3589          * 3. split the extent into two extents, zeroout the tail of the second
3590          *    extent.
3591  * 4. split the extent into two extents without zeroout.
3592          * 5. no splitting needed, just possibly zeroout the head and / or the
3593          *    tail of the extent.
3594          */
3595         split_map.m_lblk = map->m_lblk;
3596         split_map.m_len = map->m_len;
3597 
3598         if (max_zeroout && (allocated > split_map.m_len)) {
3599                 if (allocated <= max_zeroout) {
3600                         /* case 3 or 5 */
3601                         zero_ex1.ee_block =
3602                                  cpu_to_le32(split_map.m_lblk +
3603                                              split_map.m_len);
3604                         zero_ex1.ee_len =
3605                                 cpu_to_le16(allocated - split_map.m_len);
3606                         ext4_ext_store_pblock(&zero_ex1,
3607                                 ext4_ext_pblock(ex) + split_map.m_lblk +
3608                                 split_map.m_len - ee_block);
3609                         err = ext4_ext_zeroout(inode, &zero_ex1);
3610                         if (err)
3611                                 goto fallback;
3612                         split_map.m_len = allocated;
3613                 }
3614                 if (split_map.m_lblk - ee_block + split_map.m_len <
3615                                                                 max_zeroout) {
3616                         /* case 2 or 5 */
3617                         if (split_map.m_lblk != ee_block) {
3618                                 zero_ex2.ee_block = ex->ee_block;
3619                                 zero_ex2.ee_len = cpu_to_le16(split_map.m_lblk -
3620                                                         ee_block);
3621                                 ext4_ext_store_pblock(&zero_ex2,
3622                                                       ext4_ext_pblock(ex));
3623                                 err = ext4_ext_zeroout(inode, &zero_ex2);
3624                                 if (err)
3625                                         goto fallback;
3626                         }
3627 
3628                         split_map.m_len += split_map.m_lblk - ee_block;
3629                         split_map.m_lblk = ee_block;
3630                         allocated = map->m_len;
3631                 }
3632         }
3633 
3634 fallback:
3635         err = ext4_split_extent(handle, inode, ppath, &split_map, split_flag,
3636                                 flags);
3637         if (err > 0)
3638                 err = 0;
3639 out:
3640         /* If we have gotten a failure, don't zero out status tree */
3641         if (!err) {
3642                 ext4_zeroout_es(inode, &zero_ex1);
3643                 ext4_zeroout_es(inode, &zero_ex2);
3644         }
3645         return err ? err : allocated;
3646 }
3647 
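/*
 * Illustrative worked example (not part of the kernel source), assuming the
 * default s_extent_max_zeroout_kb of 32 (tunable via sysfs) and a 4KiB block
 * size:
 *
 *     max_zeroout = 32 >> (12 - 10) = 8 blocks (32KiB)
 *
 * so when the unwritten blocks that would remain around the written range
 * fit within that budget, they are zeroed out (cases 2, 3 or 5 above)
 * instead of splitting the extent into smaller pieces.
 */
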
3648 /*
3649  * This function is called by ext4_ext_map_blocks() from
3650  * ext4_get_blocks_dio_write() when DIO is used to write
3651  * to an unwritten extent.
3652  *
3653  * Writing to an unwritten extent may result in splitting the unwritten
3654  * extent into multiple initialized/unwritten extents (up to three)
3655  * There are three possibilities:
3656  *   a> There is no split required: Entire extent should be unwritten
3657  *   b> Splits in two extents: Write is happening at either end of the extent
3658  *   c> Splits in three extents: Someone is writing in the middle of the extent
3659  *
3660  * This works the same way in the case of initialized -> unwritten conversion.
3661  *
3662  * One or more index blocks may be needed if the extent tree grows after
3663  * the unwritten extent is split. To prevent ENOSPC from occurring when the
3664  * IO completes, we need to split the unwritten extent before the DIO is
3665  * submitted. The unwritten extent handled here will be split into at most
3666  * three unwritten extents. After the IO completes, the part that was
3667  * written will be converted to initialized by the end_io callback
3668  * via ext4_convert_unwritten_extents().
3669  *
3670  * Returns the size of unwritten extent to be written on success.
3671  */
3672 static int ext4_split_convert_extents(handle_t *handle,
3673                                         struct inode *inode,
3674                                         struct ext4_map_blocks *map,
3675                                         struct ext4_ext_path **ppath,
3676                                         int flags)
3677 {
3678         struct ext4_ext_path *path = *ppath;
3679         ext4_lblk_t eof_block;
3680         ext4_lblk_t ee_block;
3681         struct ext4_extent *ex;
3682         unsigned int ee_len;
3683         int split_flag = 0, depth;
3684 
3685         ext_debug(inode, "logical block %llu, max_blocks %u\n",
3686                   (unsigned long long)map->m_lblk, map->m_len);
3687 
3688         eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1)
3689                         >> inode->i_sb->s_blocksize_bits;
3690         if (eof_block < map->m_lblk + map->m_len)
3691                 eof_block = map->m_lblk + map->m_len;
3692         /*
3693          * It is safe to convert extent to initialized via explicit
3694          * zeroout only if extent is fully inside i_size or new_size.
3695          */
3696         depth = ext_depth(inode);
3697         ex = path[depth].p_ext;
3698         ee_block = le32_to_cpu(ex->ee_block);
3699         ee_len = ext4_ext_get_actual_len(ex);
3700 
3701         /* Convert to unwritten */
3702         if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
3703                 split_flag |= EXT4_EXT_DATA_VALID1;
3704         /* Convert to initialized */
3705         } else if (flags & EXT4_GET_BLOCKS_CONVERT) {
3706                 split_flag |= ee_block + ee_len <= eof_block ?
3707                               EXT4_EXT_MAY_ZEROOUT : 0;
3708                 split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2);
3709         }
3710         flags |= EXT4_GET_BLOCKS_PRE_IO;
3711         return ext4_split_extent(handle, inode, ppath, map, split_flag, flags);
3712 }
3713 
3714 static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3715                                                 struct inode *inode,
3716                                                 struct ext4_map_blocks *map,
3717                                                 struct ext4_ext_path **ppath)
3718 {
3719         struct ext4_ext_path *path = *ppath;
3720         struct ext4_extent *ex;
3721         ext4_lblk_t ee_block;
3722         unsigned int ee_len;
3723         int depth;
3724         int err = 0;
3725 
3726         depth = ext_depth(inode);
3727         ex = path[depth].p_ext;
3728         ee_block = le32_to_cpu(ex->ee_block);
3729         ee_len = ext4_ext_get_actual_len(ex);
3730 
3731         ext_debug(inode, "logical block %llu, max_blocks %u\n",
3732                   (unsigned long long)ee_block, ee_len);
3733 
3734         /* If the extent is larger than requested, it is a clear sign that we
3735          * still have some extent state machine issues left, so a split is
3736          * still required.
3737          * TODO: once all related issues are fixed, this situation should be
3738          * illegal.
3739          */
3740         if (ee_block != map->m_lblk || ee_len > map->m_len) {
3741 #ifdef CONFIG_EXT4_DEBUG
3742                 ext4_warning(inode->i_sb, "Inode (%ld) finished: extent logical block %llu,"
3743                              " len %u; IO logical block %llu, len %u",
3744                              inode->i_ino, (unsigned long long)ee_block, ee_len,
3745                              (unsigned long long)map->m_lblk, map->m_len);
3746 #endif
3747                 err = ext4_split_convert_extents(handle, inode, map, ppath,
3748                                                  EXT4_GET_BLOCKS_CONVERT);
3749                 if (err < 0)
3750                         return err;
3751                 path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
3752                 if (IS_ERR(path))
3753                         return PTR_ERR(path);
3754                 depth = ext_depth(inode);
3755                 ex = path[depth].p_ext;
3756         }
3757 
3758         err = ext4_ext_get_access(handle, inode, path + depth);
3759         if (err)
3760                 goto out;
3761         /* first mark the extent as initialized */
3762         ext4_ext_mark_initialized(ex);
3763 
3764         /* note: ext4_ext_correct_indexes() isn't needed here because
3765          * borders are not changed
3766          */
3767         ext4_ext_try_to_merge(handle, inode, path, ex);
3768 
3769         /* Mark modified extent as dirty */
3770         err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3771 out:
3772         ext4_ext_show_leaf(inode, path);
3773         return err;
3774 }
3775 
3776 static int
3777 convert_initialized_extent(handle_t *handle, struct inode *inode,
3778                            struct ext4_map_blocks *map,
3779                            struct ext4_ext_path **ppath,
3780                            unsigned int *allocated)
3781 {
3782         struct ext4_ext_path *path = *ppath;
3783         struct ext4_extent *ex;
3784         ext4_lblk_t ee_block;
3785         unsigned int ee_len;
3786         int depth;
3787         int err = 0;
3788 
3789         /*
3790          * Make sure that the extent is no bigger than we support with
3791          * unwritten extent
3792          */
3793         if (map->m_len > EXT_UNWRITTEN_MAX_LEN)
3794                 map->m_len = EXT_UNWRITTEN_MAX_LEN / 2;
3795 
3796         depth = ext_depth(inode);
3797         ex = path[depth].p_ext;
3798         ee_block = le32_to_cpu(ex->ee_block);
3799         ee_len = ext4_ext_get_actual_len(ex);
3800 
3801         ext_debug(inode, "logical block %llu, max_blocks %u\n",
3802                   (unsigned long long)ee_block, ee_len);
3803 
3804         if (ee_block != map->m_lblk || ee_len > map->m_len) {
3805                 err = ext4_split_convert_extents(handle, inode, map, ppath,
3806                                 EXT4_GET_BLOCKS_CONVERT_UNWRITTEN);
3807                 if (err < 0)
3808                         return err;
3809                 path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
3810                 if (IS_ERR(path))
3811                         return PTR_ERR(path);
3812                 depth = ext_depth(inode);
3813                 ex = path[depth].p_ext;
3814                 if (!ex) {
3815                         EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
3816                                          (unsigned long) map->m_lblk);
3817                         return -EFSCORRUPTED;
3818                 }
3819         }
3820 
3821         err = ext4_ext_get_access(handle, inode, path + depth);
3822         if (err)
3823                 return err;
3824         /* first mark the extent as unwritten */
3825         ext4_ext_mark_unwritten(ex);
3826 
3827         /* note: ext4_ext_correct_indexes() isn't needed here because
3828          * borders are not changed
3829          */
3830         ext4_ext_try_to_merge(handle, inode, path, ex);
3831 
3832         /* Mark modified extent as dirty */
3833         err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3834         if (err)
3835                 return err;
3836         ext4_ext_show_leaf(inode, path);
3837 
3838         ext4_update_inode_fsync_trans(handle, inode, 1);
3839 
3840         map->m_flags |= EXT4_MAP_UNWRITTEN;
3841         if (*allocated > map->m_len)
3842                 *allocated = map->m_len;
3843         map->m_len = *allocated;
3844         return 0;
3845 }
3846 
3847 static int
3848 ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
3849                         struct ext4_map_blocks *map,
3850                         struct ext4_ext_path **ppath, int flags,
3851                         unsigned int allocated, ext4_fsblk_t newblock)
3852 {
3853         int ret = 0;
3854         int err = 0;
3855 
3856         ext_debug(inode, "logical block %llu, max_blocks %u, flags 0x%x, allocated %u\n",
3857                   (unsigned long long)map->m_lblk, map->m_len, flags,
3858                   allocated);
3859         ext4_ext_show_leaf(inode, *ppath);
3860 
3861         /*
3862          * When writing into unwritten space, we should not fail to
3863          * allocate metadata blocks for the new extent block if needed.
3864          */
3865         flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL;
3866 
3867         trace_ext4_ext_handle_unwritten_extents(inode, map, flags,
3868                                                     allocated, newblock);
3869 
3870         /* get_block() before submitting IO, split the extent */
3871         if (flags & EXT4_GET_BLOCKS_PRE_IO) {
3872                 ret = ext4_split_convert_extents(handle, inode, map, ppath,
3873                                          flags | EXT4_GET_BLOCKS_CONVERT);
3874                 if (ret < 0) {
3875                         err = ret;
3876                         goto out2;
3877                 }
3878                 /*
3879                  * shouldn't get a 0 return when splitting an extent unless
3880                  * m_len is 0 (bug) or extent has been corrupted
3881                  */
3882                 if (unlikely(ret == 0)) {
3883                         EXT4_ERROR_INODE(inode,
3884                                          "unexpected ret == 0, m_len = %u",
3885                                          map->m_len);
3886                         err = -EFSCORRUPTED;
3887                         goto out2;
3888                 }
3889                 map->m_flags |= EXT4_MAP_UNWRITTEN;
3890                 goto out;
3891         }
3892         /* IO end_io complete, convert the filled extent to written */
3893         if (flags & EXT4_GET_BLOCKS_CONVERT) {
3894                 err = ext4_convert_unwritten_extents_endio(handle, inode, map,
3895                                                            ppath);
3896                 if (err < 0)
3897                         goto out2;
3898                 ext4_update_inode_fsync_trans(handle, inode, 1);
3899                 goto map_out;
3900         }
3901         /* buffered IO cases */
3902         /*
3903          * repeat fallocate creation request
3904          * we already have an unwritten extent
3905          */
3906         if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
3907                 map->m_flags |= EXT4_MAP_UNWRITTEN;
3908                 goto map_out;
3909         }
3910 
3911         /* buffered READ or buffered write_begin() lookup */
3912         if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3913                 /*
3914                  * We have blocks reserved already.  We
3915                  * return allocated blocks so that delalloc
3916                  * won't do block reservation for us.  But
3917                  * the buffer head will be unmapped so that
3918                  * a read from the block returns 0s.
3919                  */
3920                 map->m_flags |= EXT4_MAP_UNWRITTEN;
3921                 goto out1;
3922         }
3923 
3924         /*
3925          * Default case when (flags & EXT4_GET_BLOCKS_CREATE) == 1.
3926          * For buffered writes, at writepage time, etc.  Convert a
3927          * discovered unwritten extent to written.
3928          */
3929         ret = ext4_ext_convert_to_initialized(handle, inode, map, ppath, flags);
3930         if (ret < 0) {
3931                 err = ret;
3932                 goto out2;
3933         }
3934         ext4_update_inode_fsync_trans(handle, inode, 1);
3935         /*
3936          * shouldn't get a 0 return when converting an unwritten extent
3937          * unless m_len is 0 (bug) or extent has been corrupted
3938          */
3939         if (unlikely(ret == 0)) {
3940                 EXT4_ERROR_INODE(inode, "unexpected ret == 0, m_len = %u",
3941                                  map->m_len);
3942                 err = -EFSCORRUPTED;
3943                 goto out2;
3944         }
3945 
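        /*
         * Exit paths: "out" records the split/convert result in "allocated"
         * and marks the mapping new, "map_out" additionally marks it mapped,
         * "out1" fills in the physical block and clamps the length to the
         * request, and "out2" returns either the error or the number of
         * mapped blocks.
         */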
3946 out:
3947         allocated = ret;
3948         map->m_flags |= EXT4_MAP_NEW;
3949 map_out:
3950         map->m_flags |= EXT4_MAP_MAPPED;
3951 out1:
3952         map->m_pblk = newblock;
3953         if (allocated > map->m_len)
3954                 allocated = map->m_len;
3955         map->m_len = allocated;
3956         ext4_ext_show_leaf(inode, *ppath);
3957 out2:
3958         return err ? err : allocated;
3959 }
3960 
3961 /*
3962  * get_implied_cluster_alloc - check to see if the requested
3963  * allocation (in the map structure) overlaps with a cluster already
3964  * allocated in an extent.
3965  *      @sb     The filesystem superblock structure
3966  *      @map    The requested lblk->pblk mapping
3967  *      @ex     The extent structure which might contain an implied
3968  *                      cluster allocation
3969  *
3970  * This function is called by ext4_ext_map_blocks() after we failed to
3971  * find blocks that were already in the inode's extent tree.  Hence,
3972  * we know that the beginning of the requested region cannot overlap
3973  * the extent from the inode's extent tree.  There are three cases we
3974  * want to catch.  The first is this case:
3975  *
3976  *               |--- cluster # N--|
3977  *    |--- extent ---|  |---- requested region ---|
3978  *                      |==========|
3979  *
3980  * The second case that we need to test for is this one:
3981  *
3982  *   |--------- cluster # N ----------------|
3983  *         |--- requested region --|   |------- extent ----|
3984  *         |=======================|
3985  *
3986  * The third case is when the requested region lies between two extents
3987  * within the same cluster:
3988  *          |------------- cluster # N-------------|
3989  * |----- ex -----|                  |---- ex_right ----|
3990  *                  |------ requested region ------|
3991  *                  |================|
3992  *
3993  * In each of the above cases, we need to set the map->m_pblk and
3994  * map->m_len so they correspond to the extent labelled as
3995  * "|====|" from cluster #N, since it is already in use for data in
3996  * cluster EXT4_B2C(sbi, map->m_lblk).  We will then return 1 to
3997  * signal to ext4_ext_map_blocks() that map->m_pblk should be treated
3998  * as a new "allocated" block region.  Otherwise, we will return 0 and
3999  * ext4_ext_map_blocks() will then allocate one or more new clusters
4000  * by calling ext4_mb_new_blocks().
4001  */
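/*
 * Illustrative example, assuming a bigalloc cluster ratio of 16 blocks per
 * cluster: a request at logical block 36 has c_offset 36 & 15 == 4 and lives
 * in logical cluster 36 >> 4 == 2.  If an extent already maps logical blocks
 * 32..35 to physical blocks 512..515, that extent ends in cluster 2, so the
 * cluster is implied: map->m_pblk becomes the cluster start (512) plus the
 * offset (4), i.e. 516, and map->m_len is clamped to the 12 blocks left in
 * the cluster (and further clamped against neighbouring extents below).
 */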
4002 static int get_implied_cluster_alloc(struct super_block *sb,
4003                                      struct ext4_map_blocks *map,
4004                                      struct ext4_extent *ex,
4005                                      struct ext4_ext_path *path)
4006 {
4007         struct ext4_sb_info *sbi = EXT4_SB(sb);
4008         ext4_lblk_t c_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
4009         ext4_lblk_t ex_cluster_start, ex_cluster_end;
4010         ext4_lblk_t rr_cluster_start;
4011         ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
4012         ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
4013         unsigned short ee_len = ext4_ext_get_actual_len(ex);
4014 
4015         /* The extent passed in that we are trying to match */
4016         ex_cluster_start = EXT4_B2C(sbi, ee_block);
4017         ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1);
4018 
4019         /* The requested region passed into ext4_map_blocks() */
4020         rr_cluster_start = EXT4_B2C(sbi, map->m_lblk);
4021 
4022         if ((rr_cluster_start == ex_cluster_end) ||
4023             (rr_cluster_start == ex_cluster_start)) {
4024                 if (rr_cluster_start == ex_cluster_end)
4025                         ee_start += ee_len - 1;
4026                 map->m_pblk = EXT4_PBLK_CMASK(sbi, ee_start) + c_offset;
4027                 map->m_len = min(map->m_len,
4028                                  (unsigned) sbi->s_cluster_ratio - c_offset);
4029                 /*
4030                  * Check for and handle this case:
4031                  *
4032                  *   |--------- cluster # N-------------|
4033                  *                     |------- extent ----|
4034                  *         |--- requested region ---|
4035                  *         |===========|
4036                  */
4037 
4038                 if (map->m_lblk < ee_block)
4039                         map->m_len = min(map->m_len, ee_block - map->m_lblk);
4040 
4041                 /*
4042                  * Check for the case where there is already another allocated
4043                  * block to the right of 'ex' but before the end of the cluster.
4044                  *
4045                  *          |------------- cluster # N-------------|
4046                  * |----- ex -----|                  |---- ex_right ----|
4047                  *                  |------ requested region ------|
4048                  *                  |================|
4049                  */
4050                 if (map->m_lblk > ee_block) {
4051                         ext4_lblk_t next = ext4_ext_next_allocated_block(path);
4052                         map->m_len = min(map->m_len, next - map->m_lblk);
4053                 }
4054 
4055                 trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1);
4056                 return 1;
4057         }
4058 
4059         trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0);
4060         return 0;
4061 }
4062 
4063 /*
4064  * Determine the hole length around the given logical block: first try to
4065  * locate and expand the hole from the given @path, then adjust it if it is
4066  * partially or completely covered by delayed extents, insert it into the
4067  * extent status cache if it is indeed a hole, and finally return
4068  * the length of the determined extent.
4069  */
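/*
 * For illustration: if the extent tree has a hole covering logical blocks
 * 90..149 and @lblk is 100, then a delayed extent ending at or before block
 * 99 restarts the search from block 100, a delayed extent covering block 100
 * makes the function report only the delayed length after block 100 without
 * caching anything, and a delayed extent starting after block 100 trims the
 * cached hole so that it ends where the delayed extent begins.
 */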
4070 static ext4_lblk_t ext4_ext_determine_insert_hole(struct inode *inode,
4071                                                   struct ext4_ext_path *path,
4072                                                   ext4_lblk_t lblk)
4073 {
4074         ext4_lblk_t hole_start, len;
4075         struct extent_status es;
4076 
4077         hole_start = lblk;
4078         len = ext4_ext_find_hole(inode, path, &hole_start);
4079 again:
4080         ext4_es_find_extent_range(inode, &ext4_es_is_delayed, hole_start,
4081                                   hole_start + len - 1, &es);
4082         if (!es.es_len)
4083                 goto insert_hole;
4084 
4085         /*
4086          * There's a delalloc extent in the hole; handle it whether the delalloc
4087          * extent is in front of, behind, or straddles the queried range.
4088          */
4089         if (lblk >= es.es_lblk + es.es_len) {
4090                 /*
4091                  * The delalloc extent is in front of the queried range,
4092                  * find again from the queried start block.
4093                  */
4094                 len -= lblk - hole_start;
4095                 hole_start = lblk;
4096                 goto again;
4097         } else if (in_range(lblk, es.es_lblk, es.es_len)) {
4098                 /*
4099                  * The delalloc extent contains lblk, so it must have been
4100                  * added after ext4_map_blocks() checked the extent status
4101                  * tree. We are not holding i_rwsem, and the delalloc info is
4102                  * only stabilized by the i_data_sem we are going to release
4103                  * soon. Don't modify the extent status tree or report the
4104                  * extent as a hole; just adjust the length to the part of the
4105                  * delalloc extent after lblk.
4106                  */
4107                 len = es.es_lblk + es.es_len - lblk;
4108                 return len;
4109         } else {
4110                 /*
4111                  * The delalloc extent is partially or completely behind
4112                  * the queried range, update hole length until the
4113                  * beginning of the delalloc extent.
4114                  */
4115                 len = min(es.es_lblk - hole_start, len);
4116         }
4117 
4118 insert_hole:
4119         /* Put just found gap into cache to speed up subsequent requests */
4120         ext_debug(inode, " -> %u:%u\n", hole_start, len);
4121         ext4_es_insert_extent(inode, hole_start, len, ~0, EXTENT_STATUS_HOLE);
4122 
4123         /* Update hole_len to reflect hole size after lblk */
4124         if (hole_start != lblk)
4125                 len -= lblk - hole_start;
4126 
4127         return len;
4128 }
4129 
4130 /*
4131  * Block allocation/map/preallocation routine for extents based files
4132  *
4133  *
4134  * Need to be called with
4135  * down_read(&EXT4_I(inode)->i_data_sem) if not allocating a file system block
4136  * (i.e., flags is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
4137  *
4138  * return > 0, number of blocks already mapped/allocated
4139  *          if flags doesn't contain EXT4_GET_BLOCKS_CREATE and these are pre-allocated blocks
4140  *              buffer head is unmapped
4141  *          otherwise blocks are mapped
4142  *
4143  * return = 0, if a plain lookup failed (blocks have not been allocated)
4144  *          buffer head is unmapped
4145  *
4146  * return < 0, error case.
4147  */
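/*
 * Overall flow: look up the extent covering map->m_lblk; if one exists,
 * either return the mapping, convert it as requested, or hand unwritten
 * extents to ext4_ext_handle_unwritten_extents().  If nothing is mapped and
 * the caller is not creating blocks, cache the hole and return 0.  Otherwise
 * allocate new blocks (reusing an implied bigalloc cluster when possible),
 * insert the new extent, and fix up the reserved cluster and quota
 * accounting before returning the mapped length.
 */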
4148 int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4149                         struct ext4_map_blocks *map, int flags)
4150 {
4151         struct ext4_ext_path *path = NULL;
4152         struct ext4_extent newex, *ex, ex2;
4153         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
4154         ext4_fsblk_t newblock = 0, pblk;
4155         int err = 0, depth, ret;
4156         unsigned int allocated = 0, offset = 0;
4157         unsigned int allocated_clusters = 0;
4158         struct ext4_allocation_request ar;
4159         ext4_lblk_t cluster_offset;
4160 
4161         ext_debug(inode, "blocks %u/%u requested\n", map->m_lblk, map->m_len);
4162         trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
4163 
4164         /* find extent for this block */
4165         path = ext4_find_extent(inode, map->m_lblk, NULL, 0);
4166         if (IS_ERR(path)) {
4167                 err = PTR_ERR(path);
4168                 path = NULL;
4169                 goto out;
4170         }
4171 
4172         depth = ext_depth(inode);
4173 
4174         /*
4175          * a consistent leaf must not be empty;
4176          * this situation is possible, though, _during_ tree modification;
4177          * this is why the assert can't be put in ext4_find_extent()
4178          */
4179         if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
4180                 EXT4_ERROR_INODE(inode, "bad extent address "
4181                                  "lblock: %lu, depth: %d pblock %lld",
4182                                  (unsigned long) map->m_lblk, depth,
4183                                  path[depth].p_block);
4184                 err = -EFSCORRUPTED;
4185                 goto out;
4186         }
4187 
4188         ex = path[depth].p_ext;
4189         if (ex) {
4190                 ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
4191                 ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
4192                 unsigned short ee_len;
4193 
4194 
4195                 /*
4196                  * unwritten extents are treated as holes, except that
4197                  * we split out initialized portions during a write.
4198                  */
4199                 ee_len = ext4_ext_get_actual_len(ex);
4200 
4201                 trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len);
4202 
4203                 /* if found extent covers block, simply return it */
4204                 if (in_range(map->m_lblk, ee_block, ee_len)) {
4205                         newblock = map->m_lblk - ee_block + ee_start;
4206                         /* number of remaining blocks in the extent */
4207                         allocated = ee_len - (map->m_lblk - ee_block);
4208                         ext_debug(inode, "%u fit into %u:%d -> %llu\n",
4209                                   map->m_lblk, ee_block, ee_len, newblock);
4210 
4211                         /*
4212                          * If the extent is initialized check whether the
4213                          * caller wants to convert it to unwritten.
4214                          */
4215                         if ((!ext4_ext_is_unwritten(ex)) &&
4216                             (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
4217                                 err = convert_initialized_extent(handle,
4218                                         inode, map, &path, &allocated);
4219                                 goto out;
4220                         } else if (!ext4_ext_is_unwritten(ex)) {
4221                                 map->m_flags |= EXT4_MAP_MAPPED;
4222                                 map->m_pblk = newblock;
4223                                 if (allocated > map->m_len)
4224                                         allocated = map->m_len;
4225                                 map->m_len = allocated;
4226                                 ext4_ext_show_leaf(inode, path);
4227                                 goto out;
4228                         }
4229 
4230                         ret = ext4_ext_handle_unwritten_extents(
4231                                 handle, inode, map, &path, flags,
4232                                 allocated, newblock);
4233                         if (ret < 0)
4234                                 err = ret;
4235                         else
4236                                 allocated = ret;
4237                         goto out;
4238                 }
4239         }
4240 
4241         /*
4242          * requested block isn't allocated yet;
4243          * we can't create blocks if flags doesn't contain EXT4_GET_BLOCKS_CREATE
4244          */
4245         if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
4246                 ext4_lblk_t len;
4247 
4248                 len = ext4_ext_determine_insert_hole(inode, path, map->m_lblk);
4249 
4250                 map->m_pblk = 0;
4251                 map->m_len = min_t(unsigned int, map->m_len, len);
4252                 goto out;
4253         }
4254 
4255         /*
4256          * Okay, we need to do block allocation.
4257          */
4258         newex.ee_block = cpu_to_le32(map->m_lblk);
4259         cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
4260 
4261         /*
4262          * If we are doing bigalloc, check to see if the extent returned
4263          * by ext4_find_extent() implies a cluster we can use.
4264          */
4265         if (cluster_offset && ex &&
4266             get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
4267                 ar.len = allocated = map->m_len;
4268                 newblock = map->m_pblk;
4269                 goto got_allocated_blocks;
4270         }
4271 
4272         /* find neighbour allocated blocks */
4273         ar.lleft = map->m_lblk;
4274         err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
4275         if (err)
4276                 goto out;
4277         ar.lright = map->m_lblk;
4278         err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2);
4279         if (err < 0)
4280                 goto out;
4281 
4282         /* Check if the extent after searching to the right implies a
4283          * cluster we can use. */
4284         if ((sbi->s_cluster_ratio > 1) && err &&
4285             get_implied_cluster_alloc(inode->i_sb, map, &ex2, path)) {
4286                 ar.len = allocated = map->m_len;
4287                 newblock = map->m_pblk;
4288                 goto got_allocated_blocks;
4289         }
4290 
4291         /*
4292          * See if request is beyond maximum number of blocks we can have in
4293          * a single extent. For an initialized extent this limit is
4294          * EXT_INIT_MAX_LEN and for an unwritten extent this limit is
4295          * EXT_UNWRITTEN_MAX_LEN.
4296          */
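        /*
         * (EXT_INIT_MAX_LEN is 1 << 15 == 32768 blocks; an unwritten extent
         * can hold one block less because the high bit of ee_len flags the
         * extent as unwritten.)
         */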
4297         if (map->m_len > EXT_INIT_MAX_LEN &&
4298             !(flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
4299                 map->m_len = EXT_INIT_MAX_LEN;
4300         else if (map->m_len > EXT_UNWRITTEN_MAX_LEN &&
4301                  (flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
4302                 map->m_len = EXT_UNWRITTEN_MAX_LEN;
4303 
4304         /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
4305         newex.ee_len = cpu_to_le16(map->m_len);
4306         err = ext4_ext_check_overlap(sbi, inode, &newex, path);
4307         if (err)
4308                 allocated = ext4_ext_get_actual_len(&newex);
4309         else
4310                 allocated = map->m_len;
4311 
4312         /* allocate new block */
4313         ar.inode = inode;
4314         ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
4315         ar.logical = map->m_lblk;
4316         /*
4317          * We calculate the offset from the beginning of the cluster
4318          * for the logical block number, since when we allocate a
4319          * physical cluster, the physical block should start at the
4320          * same offset from the beginning of the cluster.  This is
4321          * needed so that future calls to get_implied_cluster_alloc()
4322          * work correctly.
4323          */
4324         offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
4325         ar.len = EXT4_NUM_B2C(sbi, offset+allocated);
4326         ar.goal -= offset;
4327         ar.logical -= offset;
4328         if (S_ISREG(inode->i_mode))
4329                 ar.flags = EXT4_MB_HINT_DATA;
4330         else
4331                 /* disable in-core preallocation for non-regular files */
4332                 ar.flags = 0;
4333         if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
4334                 ar.flags |= EXT4_MB_HINT_NOPREALLOC;
4335         if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
4336                 ar.flags |= EXT4_MB_DELALLOC_RESERVED;
4337         if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
4338                 ar.flags |= EXT4_MB_USE_RESERVED;
4339         newblock = ext4_mb_new_blocks(handle, &ar, &err);
4340         if (!newblock)
4341                 goto out;
4342         allocated_clusters = ar.len;
4343         ar.len = EXT4_C2B(sbi, ar.len) - offset;
4344         ext_debug(inode, "allocate new block: goal %llu, found %llu/%u, requested %u\n",
4345                   ar.goal, newblock, ar.len, allocated);
4346         if (ar.len > allocated)
4347                 ar.len = allocated;
4348 
4349 got_allocated_blocks:
4350         /* try to insert new extent into found leaf and return */
4351         pblk = newblock + offset;
4352         ext4_ext_store_pblock(&newex, pblk);
4353         newex.ee_len = cpu_to_le16(ar.len);
4354         /* Mark unwritten */
4355         if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
4356                 ext4_ext_mark_unwritten(&newex);
4357                 map->m_flags |= EXT4_MAP_UNWRITTEN;
4358         }
4359 
4360         err = ext4_ext_insert_extent(handle, inode, &path, &newex, flags);
4361         if (err) {
4362                 if (allocated_clusters) {
4363                         int fb_flags = 0;
4364 
4365                         /*
4366                          * Free the data blocks we just allocated.
4367                          * It is not a good idea to call discard here directly,
4368                          * but otherwise we'd need to call it on every free().
4369                          */
4370                         ext4_discard_preallocations(inode);
4371                         if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
4372                                 fb_flags = EXT4_FREE_BLOCKS_NO_QUOT_UPDATE;
4373                         ext4_free_blocks(handle, inode, NULL, newblock,
4374                                          EXT4_C2B(sbi, allocated_clusters),
4375                                          fb_flags);
4376                 }
4377                 goto out;
4378         }
4379 
4380         /*
4381          * Reduce the reserved cluster count to reflect successful deferred
4382          * allocation of delayed allocated clusters or direct allocation of
4383          * clusters discovered to be delayed allocated.  Once allocated, a
4384          * cluster is not included in the reserved count.
4385          */
4386         if (test_opt(inode->i_sb, DELALLOC) && allocated_clusters) {
4387                 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
4388                         /*
4389                          * When allocating delayed allocated clusters, simply
4390                          * reduce the reserved cluster count and claim quota
4391                          */
4392                         ext4_da_update_reserve_space(inode, allocated_clusters,
4393                                                         1);
4394                 } else {
4395                         ext4_lblk_t lblk, len;
4396                         unsigned int n;
4397 
4398                         /*
4399                          * When allocating non-delayed allocated clusters
4400                          * (from fallocate, filemap, DIO, or clusters
4401                          * allocated when delalloc has been disabled by
4402                          * ext4_nonda_switch), reduce the reserved cluster
4403                          * count by the number of allocated clusters that
4404                          * have previously been delayed allocated.  Quota
4405                          * has been claimed by ext4_mb_new_blocks() above,
4406                          * so release the quota reservations made for any
4407                          * previously delayed allocated clusters.
4408                          */
4409                         lblk = EXT4_LBLK_CMASK(sbi, map->m_lblk);
4410                         len = allocated_clusters << sbi->s_cluster_bits;
4411                         n = ext4_es_delayed_clu(inode, lblk, len);
4412                         if (n > 0)
4413                                 ext4_da_update_reserve_space(inode, (int) n, 0);
4414                 }
4415         }
4416 
4417         /*
4418          * Cache the extent and update transaction to commit on fdatasync only
4419          * when it is _not_ an unwritten extent.
4420          */
4421         if ((flags & EXT4_GET_BLOCKS_UNWRIT_EXT) == 0)
4422                 ext4_update_inode_fsync_trans(handle, inode, 1);
4423         else
4424                 ext4_update_inode_fsync_trans(handle, inode, 0);
4425 
4426         map->m_flags |= (EXT4_MAP_NEW | EXT4_MAP_MAPPED);
4427         map->m_pblk = pblk;
4428         map->m_len = ar.len;
4429         allocated = map->m_len;
4430         ext4_ext_show_leaf(inode, path);
4431 out:
4432         ext4_free_ext_path(path);
4433 
4434         trace_ext4_ext_map_blocks_exit(inode, flags, map,
4435                                        err ? err : allocated);
4436         return err ? err : allocated;
4437 }
4438 
4439 int ext4_ext_truncate(handle_t *handle, struct inode *inode)
4440 {
4441         struct super_block *sb = inode->i_sb;
4442         ext4_lblk_t last_block;
4443         int err = 0;
4444 
4445         /*
4446          * TODO: optimization is possible here.
4447          * Probably we need not scan at all,
4448          * because page truncation is enough.
4449          */
4450 
4451         /* we have to know where to truncate from in crash case */
4452         EXT4_I(inode)->i_disksize = inode->i_size;
4453         err = ext4_mark_inode_dirty(handle, inode);
4454         if (err)
4455                 return err;
4456 
4457         last_block = (inode->i_size + sb->s_blocksize - 1)
4458                         >> EXT4_BLOCK_SIZE_BITS(sb);
4459         ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block);
4460 
4461 retry_remove_space:
4462         err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
4463         if (err == -ENOMEM) {
4464                 memalloc_retry_wait(GFP_ATOMIC);
4465                 goto retry_remove_space;
4466         }
4467         return err;
4468 }
4469 
4470 static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
4471                                   ext4_lblk_t len, loff_t new_size,
4472                                   int flags)
4473 {
4474         struct inode *inode = file_inode(file);
4475         handle_t *handle;
4476         int ret = 0, ret2 = 0, ret3 = 0;
4477         int retries = 0;
4478         int depth = 0;
4479         struct ext4_map_blocks map;
4480         unsigned int credits;
4481         loff_t epos;
4482 
4483         BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS));
4484         map.m_lblk = offset;
4485         map.m_len = len;
4486         /*
4487          * Don't normalize the request if it can fit in one extent so
4488          * that it doesn't get unnecessarily split into multiple
4489          * extents.
4490          */
4491         if (len <= EXT_UNWRITTEN_MAX_LEN)
4492                 flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
4493 
4494         /*
4495          * credits to insert 1 extent into extent tree
4496          */
4497         credits = ext4_chunk_trans_blocks(inode, len);
4498         depth = ext_depth(inode);
4499 
4500 retry:
4501         while (len) {
4502                 /*
4503                  * Recalculate credits when extent tree depth changes.
4504                  */
4505                 if (depth != ext_depth(inode)) {
4506                         credits = ext4_chunk_trans_blocks(inode, len);
4507                         depth = ext_depth(inode);
4508                 }
4509 
4510                 handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
4511                                             credits);
4512                 if (IS_ERR(handle)) {
4513                         ret = PTR_ERR(handle);
4514                         break;
4515                 }
4516                 ret = ext4_map_blocks(handle, inode, &map, flags);
4517                 if (ret <= 0) {
4518                         ext4_debug("inode #%lu: block %u: len %u: "
4519                                    "ext4_ext_map_blocks returned %d",
4520                                    inode->i_ino, map.m_lblk,
4521                                    map.m_len, ret);
4522                         ext4_mark_inode_dirty(handle, inode);
4523                         ext4_journal_stop(handle);
4524                         break;
4525                 }
4526                 /*
4527                  * allow a full retry cycle for any remaining allocations
4528                  */
4529                 retries = 0;
4530                 map.m_lblk += ret;
4531                 map.m_len = len = len - ret;
4532                 epos = (loff_t)map.m_lblk << inode->i_blkbits;
4533                 inode_set_ctime_current(inode);
4534                 if (new_size) {
4535                         if (epos > new_size)
4536                                 epos = new_size;
4537                         if (ext4_update_inode_size(inode, epos) & 0x1)
4538                                 inode_set_mtime_to_ts(inode,
4539                                                       inode_get_ctime(inode));
4540                 }
4541                 ret2 = ext4_mark_inode_dirty(handle, inode);
4542                 ext4_update_inode_fsync_trans(handle, inode, 1);
4543                 ret3 = ext4_journal_stop(handle);
4544                 ret2 = ret3 ? ret3 : ret2;
4545                 if (unlikely(ret2))
4546                         break;
4547         }
4548         if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
4549                 goto retry;
4550 
4551         return ret > 0 ? ret2 : ret;
4552 }
4553 
4554 static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len);
4555 
4556 static int ext4_insert_range(struct file *file, loff_t offset, loff_t len);
4557 
4558 static long ext4_zero_range(struct file *file, loff_t offset,
4559                             loff_t len, int mode)
4560 {
4561         struct inode *inode = file_inode(file);
4562         struct address_space *mapping = file->f_mapping;
4563         handle_t *handle = NULL;
4564         unsigned int max_blocks;
4565         loff_t new_size = 0;
4566         int ret = 0;
4567         int flags;
4568         int credits;
4569         int partial_begin, partial_end;
4570         loff_t start, end;
4571         ext4_lblk_t lblk;
4572         unsigned int blkbits = inode->i_blkbits;
4573 
4574         trace_ext4_zero_range(inode, offset, len, mode);
4575 
4576         /*
4577          * Round up offset. This is not a plain fallocate; we need to zero out
4578          * blocks, so convert the interior block-aligned part of the range to
4579          * unwritten and possibly manually zero out the unaligned parts of the
4580          * range. Here, start and partial_begin are inclusive, end and
4581          * partial_end are exclusive.
4582          */
4583         start = round_up(offset, 1 << blkbits);
4584         end = round_down((offset + len), 1 << blkbits);
4585 
4586         if (start < offset || end > offset + len)
4587                 return -EINVAL;
4588         partial_begin = offset & ((1 << blkbits) - 1);
4589         partial_end = (offset + len) & ((1 << blkbits) - 1);
4590 
4591         lblk = start >> blkbits;
4592         max_blocks = (end >> blkbits);
4593         if (max_blocks < lblk)
4594                 max_blocks = 0;
4595         else
4596                 max_blocks -= lblk;
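        /*
         * For illustration, with a 4096-byte block size (blkbits == 12):
         * offset == 1000 and len == 10000 give start == 4096 and end == 8192,
         * so partial_begin == 1000, partial_end == 2808 (11000 & 4095),
         * lblk == 1 and max_blocks == 1; only block 1 is converted to
         * unwritten, while the two partial edges are zeroed out explicitly
         * near the end of this function.
         */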
4597 
4598         inode_lock(inode);
4599 
4600         /*
4601          * Indirect files do not support unwritten extents
4602          */
4603         if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
4604                 ret = -EOPNOTSUPP;
4605                 goto out_mutex;
4606         }
4607 
4608         if (!(mode & FALLOC_FL_KEEP_SIZE) &&
4609             (offset + len > inode->i_size ||
4610              offset + len > EXT4_I(inode)->i_disksize)) {
4611                 new_size = offset + len;
4612                 ret = inode_newsize_ok(inode, new_size);
4613                 if (ret)
4614                         goto out_mutex;
4615         }
4616 
4617         flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
4618 
4619         /* Wait for all existing dio workers; newcomers will block on i_rwsem */
4620         inode_dio_wait(inode);
4621 
4622         ret = file_modified(file);
4623         if (ret)
4624                 goto out_mutex;
4625 
4626         /* Preallocate the range including the unaligned edges */
4627         if (partial_begin || partial_end) {
4628                 ret = ext4_alloc_file_blocks(file,
4629                                 round_down(offset, 1 << blkbits) >> blkbits,
4630                                 (round_up((offset + len), 1 << blkbits) -
4631                                  round_down(offset, 1 << blkbits)) >> blkbits,
4632                                 new_size, flags);
4633                 if (ret)
4634                         goto out_mutex;
4635 
4636         }
4637 
4638         /* Zero range excluding the unaligned edges */
4639         if (max_blocks > 0) {
4640                 flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
4641                           EXT4_EX_NOCACHE);
4642 
4643                 /*
4644                  * Prevent page faults from reinstantiating pages we have
4645                  * released from page cache.
4646                  */
4647                 filemap_invalidate_lock(mapping);
4648 
4649                 ret = ext4_break_layouts(inode);
4650                 if (ret) {
4651                         filemap_invalidate_unlock(mapping);
4652                         goto out_mutex;
4653                 }
4654 
4655                 ret = ext4_update_disksize_before_punch(inode, offset, len);
4656                 if (ret) {
4657                         filemap_invalidate_unlock(mapping);
4658                         goto out_mutex;
4659                 }
4660 
4661                 /*
4662                  * For journalled data we need to write (and checkpoint) pages
4663                  * before discarding page cache to avoid inconsistent data on
4664                  * disk in case of a crash before the zeroing transaction is committed.
4665                  */
4666                 if (ext4_should_journal_data(inode)) {
4667                         ret = filemap_write_and_wait_range(mapping, start,
4668                                                            end - 1);
4669                         if (ret) {
4670                                 filemap_invalidate_unlock(mapping);
4671                                 goto out_mutex;
4672                         }
4673                 }
4674 
4675                 /* Now release the pages and zero block aligned part of pages */
4676                 truncate_pagecache_range(inode, start, end - 1);
4677                 inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
4678 
4679                 ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
4680                                              flags);
4681                 filemap_invalidate_unlock(mapping);
4682                 if (ret)
4683                         goto out_mutex;
4684         }
4685         if (!partial_begin && !partial_end)
4686                 goto out_mutex;
4687 
4688         /*
4689          * In the worst case we have to write out two nonadjacent unwritten
4690          * blocks and update the inode
4691          */
4692         credits = (2 * ext4_ext_index_trans_blocks(inode, 2)) + 1;
4693         if (ext4_should_journal_data(inode))
4694                 credits += 2;
4695         handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
4696         if (IS_ERR(handle)) {
4697                 ret = PTR_ERR(handle);
4698                 ext4_std_error(inode->i_sb, ret);
4699                 goto out_mutex;
4700         }
4701 
4702         inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
4703         if (new_size)
4704                 ext4_update_inode_size(inode, new_size);
4705         ret = ext4_mark_inode_dirty(handle, inode);
4706         if (unlikely(ret))
4707                 goto out_handle;
4708         /* Zero out partial block at the edges of the range */
4709         ret = ext4_zero_partial_blocks(handle, inode, offset, len);
4710         if (ret >= 0)
4711                 ext4_update_inode_fsync_trans(handle, inode, 1);
4712 
4713         if (file->f_flags & O_SYNC)
4714                 ext4_handle_sync(handle);
4715 
4716 out_handle:
4717         ext4_journal_stop(handle);
4718 out_mutex:
4719         inode_unlock(inode);
4720         return ret;
4721 }
4722 
4723 /*
4724  * Preallocate space for a file. This implements ext4's fallocate file
4725  * operation, which gets called from the sys_fallocate system call.
4726  * For block-mapped files, posix_fallocate should fall back to the method
4727  * of writing zeroes to the required new blocks (the same behavior which is
4728  * expected for file systems which do not support the fallocate() system call).
4729  */
4730 long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
4731 {
4732         struct inode *inode = file_inode(file);
4733         loff_t new_size = 0;
4734         unsigned int max_blocks;
4735         int ret = 0;
4736         int flags;
4737         ext4_lblk_t lblk;
4738         unsigned int blkbits = inode->i_blkbits;
4739 
4740         /*
4741          * Encrypted inodes can't handle collapse range or insert
4742          * range since we would need to re-encrypt blocks with a
4743          * different IV or XTS tweak (which are based on the logical
4744          * block number).
4745          */
4746         if (IS_ENCRYPTED(inode) &&
4747             (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE)))
4748                 return -EOPNOTSUPP;
4749 
4750         /* Return error if mode is not supported */
4751         if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
4752                      FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
4753                      FALLOC_FL_INSERT_RANGE))
4754                 return -EOPNOTSUPP;
4755 
4756         inode_lock(inode);
4757         ret = ext4_convert_inline_data(inode);
4758         inode_unlock(inode);
4759         if (ret)
4760                 goto exit;
4761 
4762         if (mode & FALLOC_FL_PUNCH_HOLE) {
4763                 ret = ext4_punch_hole(file, offset, len);
4764                 goto exit;
4765         }
4766 
4767         if (mode & FALLOC_FL_COLLAPSE_RANGE) {
4768                 ret = ext4_collapse_range(file, offset, len);
4769                 goto exit;
4770         }
4771 
4772         if (mode & FALLOC_FL_INSERT_RANGE) {
4773                 ret = ext4_insert_range(file, offset, len);
4774                 goto exit;
4775         }
4776 
4777         if (mode & FALLOC_FL_ZERO_RANGE) {
4778                 ret = ext4_zero_range(file, offset, len, mode);
4779                 goto exit;
4780         }
4781         trace_ext4_fallocate_enter(inode, offset, len, mode);
4782         lblk = offset >> blkbits;
4783 
4784         max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);
4785         flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
4786 
4787         inode_lock(inode);
4788 
4789         /*
4790          * We only support preallocation for extent-based files
4791          */
4792         if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
4793                 ret = -EOPNOTSUPP;
4794                 goto out;
4795         }
4796 
4797         if (!(mode & FALLOC_FL_KEEP_SIZE) &&
4798             (offset + len > inode->i_size ||
4799              offset + len > EXT4_I(inode)->i_disksize)) {
4800                 new_size = offset + len;
4801                 ret = inode_newsize_ok(inode, new_size);
4802                 if (ret)
4803                         goto out;
4804         }
4805 
4806         /* Wait for all existing dio workers; newcomers will block on i_rwsem */
4807         inode_dio_wait(inode);
4808 
4809         ret = file_modified(file);
4810         if (ret)
4811                 goto out;
4812 
4813         ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags);
4814         if (ret)
4815                 goto out;
4816 
4817         if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) {
4818                 ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal,
4819                                         EXT4_I(inode)->i_sync_tid);
4820         }
4821 out:
4822         inode_unlock(inode);
4823         trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
4824 exit:
4825         return ret;
4826 }
4827 
4828 /*
4829  * This function converts a range of blocks to written extents.
4830  * The caller of this function will pass the start offset and the size.
4831  * All unwritten extents within this range will be converted to
4832  * written extents.
4833  *
4834  * This function is called from the direct IO end_io callback
4835  * function, to convert the fallocated extents after IO is completed.
4836  * Returns 0 on success.
4837  */
4838 int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
4839                                    loff_t offset, ssize_t len)
4840 {
4841         unsigned int max_blocks;
4842         int ret = 0, ret2 = 0, ret3 = 0;
4843         struct ext4_map_blocks map;
4844         unsigned int blkbits = inode->i_blkbits;
4845         unsigned int credits = 0;
4846 
4847         map.m_lblk = offset >> blkbits;
4848         max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);
4849 
4850         if (!handle) {
4851                 /*
4852                  * credits to insert 1 extent into extent tree
4853                  */
4854                 credits = ext4_chunk_trans_blocks(inode, max_blocks);
4855         }
4856         while (ret >= 0 && ret < max_blocks) {
4857                 map.m_lblk += ret;
4858                 map.m_len = (max_blocks -= ret);
4859                 if (credits) {
4860                         handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
4861                                                     credits);
4862                         if (IS_ERR(handle)) {
4863                                 ret = PTR_ERR(handle);
4864                                 break;
4865                         }
4866                 }
4867                 ret = ext4_map_blocks(handle, inode, &map,
4868                                       EXT4_GET_BLOCKS_IO_CONVERT_EXT);
4869                 if (ret <= 0)
4870                         ext4_warning(inode->i_sb,
4871                                      "inode #%lu: block %u: len %u: "
4872                                      "ext4_ext_map_blocks returned %d",
4873                                      inode->i_ino, map.m_lblk,
4874                                      map.m_len, ret);
4875                 ret2 = ext4_mark_inode_dirty(handle, inode);
4876                 if (credits) {
4877                         ret3 = ext4_journal_stop(handle);
4878                         if (unlikely(ret3))
4879                                 ret2 = ret3;
4880                 }
4881 
4882                 if (ret <= 0 || ret2)
4883                         break;
4884         }
4885         return ret > 0 ? ret2 : ret;
4886 }
4887 
4888 int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end)
4889 {
4890         int ret = 0, err = 0;
4891         struct ext4_io_end_vec *io_end_vec;
4892 
4893         /*
4894          * This is somewhat ugly but the idea is clear: when a transaction is
4895          * reserved, everything goes into it. Otherwise we would rather start
4896          * several smaller transactions, one for the conversion of each extent.
4897          */
4898         if (handle) {
4899                 handle = ext4_journal_start_reserved(handle,
4900                                                      EXT4_HT_EXT_CONVERT);
4901                 if (IS_ERR(handle))
4902                         return PTR_ERR(handle);
4903         }
4904 
4905         list_for_each_entry(io_end_vec, &io_end->list_vec, list) {
4906                 ret = ext4_convert_unwritten_extents(handle, io_end->inode,
4907                                                      io_end_vec->offset,
4908                                                      io_end_vec->size);
4909                 if (ret)
4910                         break;
4911         }
4912 
4913         if (handle)
4914                 err = ext4_journal_stop(handle);
4915 
4916         return ret < 0 ? ret : err;
4917 }
4918 
4919 static int ext4_iomap_xattr_fiemap(struct inode *inode, struct iomap *iomap)
4920 {
4921         __u64 physical = 0;
4922         __u64 length = 0;
4923         int blockbits = inode->i_sb->s_blocksize_bits;
4924         int error = 0;
4925         u16 iomap_type;
4926 
4927         /* in-inode? */
4928         if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
4929                 struct ext4_iloc iloc;
4930                 int offset;     /* offset of xattr in inode */
4931 
4932                 error = ext4_get_inode_loc(inode, &iloc);
4933                 if (error)
4934                         return error;
4935                 physical = (__u64)iloc.bh->b_blocknr << blockbits;
4936                 offset = EXT4_GOOD_OLD_INODE_SIZE +
4937                                 EXT4_I(inode)->i_extra_isize;
4938                 physical += offset;
4939                 length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
4940                 brelse(iloc.bh);
4941                 iomap_type = IOMAP_INLINE;
4942         } else if (EXT4_I(inode)->i_file_acl) { /* external block */
4943                 physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits;
4944                 length = inode->i_sb->s_blocksize;
4945                 iomap_type = IOMAP_MAPPED;
4946         } else {
4947                 /* no in-inode or external block for xattr, so return -ENOENT */
4948                 error = -ENOENT;
4949                 goto out;
4950         }
4951 
4952         iomap->addr = physical;
4953         iomap->offset = 0;
4954         iomap->length = length;
4955         iomap->type = iomap_type;
4956         iomap->flags = 0;
4957 out:
4958         return error;
4959 }
4960 
4961 static int ext4_iomap_xattr_begin(struct inode *inode, loff_t offset,
4962                                   loff_t length, unsigned flags,
4963                                   struct iomap *iomap, struct iomap *srcmap)
4964 {
4965         int error;
4966 
4967         error = ext4_iomap_xattr_fiemap(inode, iomap);
4968         if (error == 0 && (offset >= iomap->length))
4969                 error = -ENOENT;
4970         return error;
4971 }
4972 
4973 static const struct iomap_ops ext4_iomap_xattr_ops = {
4974         .iomap_begin            = ext4_iomap_xattr_begin,
4975 };
4976 
4977 static int ext4_fiemap_check_ranges(struct inode *inode, u64 start, u64 *len)
4978 {
4979         u64 maxbytes;
4980 
4981         if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
4982                 maxbytes = inode->i_sb->s_maxbytes;
4983         else
4984                 maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
4985 
4986         if (*len == 0)
4987                 return -EINVAL;
4988         if (start > maxbytes)
4989                 return -EFBIG;
4990 
4991         /*
4992          * Shrink request scope to what the fs can actually handle.
4993          */
4994         if (*len > maxbytes || (maxbytes - *len) < start)
4995                 *len = maxbytes - start;
4996         return 0;
4997 }
4998 
4999 int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
5000                 u64 start, u64 len)
5001 {
5002         int error = 0;
5003 
5004         if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
5005                 error = ext4_ext_precache(inode);
5006                 if (error)
5007                         return error;
5008                 fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE;
5009         }
5010 
5011         /*
5012          * For bitmap files the maximum size limit could be smaller than
5013          * s_maxbytes, so check len here manually instead of just relying on the
5014          * generic check.
5015          */
5016         error = ext4_fiemap_check_ranges(inode, start, &len);
5017         if (error)
5018                 return error;
5019 
5020         if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
5021                 fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR;
5022                 return iomap_fiemap(inode, fieinfo, start, len,
5023                                     &ext4_iomap_xattr_ops);
5024         }
5025 
5026         return iomap_fiemap(inode, fieinfo, start, len, &ext4_iomap_report_ops);
5027 }
5028 
5029 int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo,
5030                       __u64 start, __u64 len)
5031 {
5032         ext4_lblk_t start_blk, len_blks;
5033         __u64 last_blk;
5034         int error = 0;
5035 
5036         if (ext4_has_inline_data(inode)) {
5037                 int has_inline;
5038 
5039                 down_read(&EXT4_I(inode)->xattr_sem);
5040                 has_inline = ext4_has_inline_data(inode);
5041                 up_read(&EXT4_I(inode)->xattr_sem);
5042                 if (has_inline)
5043                         return 0;
5044         }
5045 
5046         if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
5047                 error = ext4_ext_precache(inode);
5048                 if (error)
5049                         return error;
5050                 fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE;
5051         }
5052 
5053         error = fiemap_prep(inode, fieinfo, start, &len, 0);
5054         if (error)
5055                 return error;
5056 
5057         error = ext4_fiemap_check_ranges(inode, start, &len);
5058         if (error)
5059                 return error;
5060 
5061         start_blk = start >> inode->i_sb->s_blocksize_bits;
5062         last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
5063         if (last_blk >= EXT_MAX_BLOCKS)
5064                 last_blk = EXT_MAX_BLOCKS-1;
5065         len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
5066 
5067         /*
5068          * Walk the extent tree gathering extent information
5069          * and pushing extents back to the user.
5070          */
5071         return ext4_fill_es_cache_info(inode, start_blk, len_blks, fieinfo);
5072 }
5073 
5074 /*
5075  * ext4_ext_shift_path_extents:
5076  * Shift the extents of a path structure lying between path[depth].p_ext
5077  * and EXT_LAST_EXTENT(path[depth].p_hdr), by @shift blocks. @SHIFT tells
5078  * whether it is a right-shift or a left-shift operation.
5079  */
5080 static int
5081 ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
5082                             struct inode *inode, handle_t *handle,
5083                             enum SHIFT_DIRECTION SHIFT)
5084 {
5085         int depth, err = 0;
5086         struct ext4_extent *ex_start, *ex_last;
5087         bool update = false;
5088         int credits, restart_credits;
5089         depth = path->p_depth;
5090 
5091         while (depth >= 0) {
5092                 if (depth == path->p_depth) {
5093                         ex_start = path[depth].p_ext;
5094                         if (!ex_start)
5095                                 return -EFSCORRUPTED;
5096 
5097                         ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
5098                         /* leaf + sb + inode */
5099                         credits = 3;
5100                         if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) {
5101                                 update = true;
5102                                 /* extent tree + sb + inode */
5103                                 credits = depth + 2;
5104                         }
5105 
5106                         restart_credits = ext4_writepage_trans_blocks(inode);
5107                         err = ext4_datasem_ensure_credits(handle, inode, credits,
5108                                         restart_credits, 0);
5109                         if (err) {
5110                                 if (err > 0)
5111                                         err = -EAGAIN;
5112                                 goto out;
5113                         }
5114 
5115                         err = ext4_ext_get_access(handle, inode, path + depth);
5116                         if (err)
5117                                 goto out;
5118 
5119                         while (ex_start <= ex_last) {
5120                                 if (SHIFT == SHIFT_LEFT) {
5121                                         le32_add_cpu(&ex_start->ee_block,
5122                                                 -shift);
5123                                         /* Try to merge to the left. */
5124                                         if ((ex_start >
5125                                             EXT_FIRST_EXTENT(path[depth].p_hdr))
5126                                             &&
5127                                             ext4_ext_try_to_merge_right(inode,
5128                                             path, ex_start - 1))
5129                                                 ex_last--;
5130                                         else
5131                                                 ex_start++;
5132                                 } else {
5133                                         le32_add_cpu(&ex_last->ee_block, shift);
5134                                         ext4_ext_try_to_merge_right(inode, path,
5135                                                 ex_last);
5136                                         ex_last--;
5137                                 }
5138                         }
5139                         err = ext4_ext_dirty(handle, inode, path + depth);
5140                         if (err)
5141                                 goto out;
5142 
5143                         if (--depth < 0 || !update)
5144                                 break;
5145                 }
5146 
5147                 /* Update index too */
5148                 err = ext4_ext_get_access(handle, inode, path + depth);
5149                 if (err)
5150                         goto out;
5151 
5152                 if (SHIFT == SHIFT_LEFT)
5153                         le32_add_cpu(&path[depth].p_idx->ei_block, -shift);
5154                 else
5155                         le32_add_cpu(&path[depth].p_idx->ei_block, shift);
5156                 err = ext4_ext_dirty(handle, inode, path + depth);
5157                 if (err)
5158                         goto out;
5159 
5160                 /* we are done if current index is not a starting index */
5161                 if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr))
5162                         break;
5163 
5164                 depth--;
5165         }
5166 
5167 out:
5168         return err;
5169 }
5170 
5171 /*
5172  * ext4_ext_shift_extents:
5173  * All the extents which lie in the range from @start to the last allocated
5174  * block for the @inode are shifted either to the left or right (depending
5175  * upon @SHIFT) by @shift blocks.
5176  * On success, 0 is returned, error otherwise.
5177  */
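/*
 * For example, collapsing logical blocks 100..103 results in a call with
 * start == 104, shift == 4 and SHIFT_LEFT, so every extent from block 104
 * onwards moves down to close the gap; an insert range of the same size at
 * block 100 uses SHIFT_RIGHT so that extents at and beyond the insertion
 * point move up by four blocks to open a hole.
 */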
5178 static int
5179 ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
5180                        ext4_lblk_t start, ext4_lblk_t shift,
5181                        enum SHIFT_DIRECTION SHIFT)
5182 {
5183         struct ext4_ext_path *path;
5184         int ret = 0, depth;
5185         struct ext4_extent *extent;
5186         ext4_lblk_t stop, *iterator, ex_start, ex_end;
5187         ext4_lblk_t tmp = EXT_MAX_BLOCKS;
5188 
5189         /* Let path point to the last extent */
5190         path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
5191                                 EXT4_EX_NOCACHE);
5192         if (IS_ERR(path))
5193                 return PTR_ERR(path);
5194 
5195         depth = path->p_depth;
5196         extent = path[depth].p_ext;
5197         if (!extent)
5198                 goto out;
5199 
5200         stop = le32_to_cpu(extent->ee_block);
5201 
5202         /*
5203          * For left shifts, make sure the hole on the left is big enough to
5204          * accommodate the shift.  For right shifts, make sure the last extent
5205          * won't be shifted beyond EXT_MAX_BLOCKS.
5206          */
5207         if (SHIFT == SHIFT_LEFT) {
5208                 path = ext4_find_extent(inode, start - 1, &path,
5209                                         EXT4_EX_NOCACHE);
5210                 if (IS_ERR(path))
5211                         return PTR_ERR(path);
5212                 depth = path->p_depth;
5213                 extent =  path[depth].p_ext;
5214                 if (extent) {
5215                         ex_start = le32_to_cpu(extent->ee_block);
5216                         ex_end = le32_to_cpu(extent->ee_block) +
5217                                 ext4_ext_get_actual_len(extent);
5218                 } else {
5219                         ex_start = 0;
5220                         ex_end = 0;
5221                 }
5222 
5223                 if ((start == ex_start && shift > ex_start) ||
5224                     (shift > start - ex_end)) {
5225                         ret = -EINVAL;
5226                         goto out;
5227                 }
5228         } else {
5229                 if (shift > EXT_MAX_BLOCKS -
5230                     (stop + ext4_ext_get_actual_len(extent))) {
5231                         ret = -EINVAL;
5232                         goto out;
5233                 }
5234         }
5235 
5236         /*
5237          * In case of left shift, iterator points to start and it is increased
5238          * till we reach stop. In case of right shift, iterator points to stop
5239          * and it is decreased till we reach start.
5240          */
5241 again:
5242         ret = 0;
5243         if (SHIFT == SHIFT_LEFT)
5244                 iterator = &start;
5245         else
5246                 iterator = &stop;
5247 
5248         if (tmp != EXT_MAX_BLOCKS)
5249                 *iterator = tmp;
5250 
5251         /*
5252          * It's safe to start updating extents.  Start and stop are unsigned, so
5253          * in the case of a right shift, when an extent with block 0 is reached
5254          * the iterator becomes NULL to indicate the end of the loop.
5255          */
5256         while (iterator && start <= stop) {
5257                 path = ext4_find_extent(inode, *iterator, &path,
5258                                         EXT4_EX_NOCACHE);
5259                 if (IS_ERR(path))
5260                         return PTR_ERR(path);
5261                 depth = path->p_depth;
5262                 extent = path[depth].p_ext;
5263                 if (!extent) {
5264                         EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
5265                                          (unsigned long) *iterator);
5266                         return -EFSCORRUPTED;
5267                 }
5268                 if (SHIFT == SHIFT_LEFT && *iterator >
5269                     le32_to_cpu(extent->ee_block)) {
5270                         /* Hole, move to the next extent */
5271                         if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) {
5272                                 path[depth].p_ext++;
5273                         } else {
5274                                 *iterator = ext4_ext_next_allocated_block(path);
5275                                 continue;
5276                         }
5277                 }
5278 
5279                 tmp = *iterator;
5280                 if (SHIFT == SHIFT_LEFT) {
5281                         extent = EXT_LAST_EXTENT(path[depth].p_hdr);
5282                         *iterator = le32_to_cpu(extent->ee_block) +
5283                                         ext4_ext_get_actual_len(extent);
5284                 } else {
5285                         extent = EXT_FIRST_EXTENT(path[depth].p_hdr);
5286                         if (le32_to_cpu(extent->ee_block) > start)
5287                                 *iterator = le32_to_cpu(extent->ee_block) - 1;
5288                         else if (le32_to_cpu(extent->ee_block) == start)
5289                                 iterator = NULL;
5290                         else {
5291                                 extent = EXT_LAST_EXTENT(path[depth].p_hdr);
5292                                 while (le32_to_cpu(extent->ee_block) >= start)
5293                                         extent--;
5294 
5295                                 if (extent == EXT_LAST_EXTENT(path[depth].p_hdr))
5296                                         break;
5297 
5298                                 extent++;
5299                                 iterator = NULL;
5300                         }
5301                         path[depth].p_ext = extent;
5302                 }
5303                 ret = ext4_ext_shift_path_extents(path, shift, inode,
5304                                 handle, SHIFT);
5305                 /* iterator can be NULL which means we should break */
5306                 if (ret == -EAGAIN)
5307                         goto again;
5308                 if (ret)
5309                         break;
5310         }
5311 out:
5312         ext4_free_ext_path(path);
5313         return ret;
5314 }
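
/*
 * Illustrative-only sketch (not part of extents.c): the left-shift validity
 * check in ext4_ext_shift_extents() above, rewritten as a standalone helper
 * over plain 32-bit logical block numbers.  A left shift by 'shift' blocks is
 * refused if it would push an extent that begins exactly at 'start' below
 * block 0, or if the hole [ex_end, start) left of 'start' is narrower than
 * 'shift' blocks.  Names are hypothetical; it assumes ex_end <= start, as the
 * kernel code guarantees.
 */
#include <stdbool.h>
#include <stdint.h>

static bool left_shift_fits(uint32_t start, uint32_t ex_start,
                            uint32_t ex_end, uint32_t shift)
{
        /* shifting an extent that begins exactly at 'start' below block 0 */
        if (start == ex_start && shift > ex_start)
                return false;
        /* the hole [ex_end, start) must be at least 'shift' blocks wide */
        if (shift > start - ex_end)
                return false;
        return true;
}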
5315 
5316 /*
5317  * ext4_collapse_range:
5318  * This implements fallocate's collapse-range functionality for ext4.
5319  * Returns 0 on success and non-zero on error.
5320  */
5321 static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
5322 {
5323         struct inode *inode = file_inode(file);
5324         struct super_block *sb = inode->i_sb;
5325         struct address_space *mapping = inode->i_mapping;
5326         ext4_lblk_t punch_start, punch_stop;
5327         handle_t *handle;
5328         unsigned int credits;
5329         loff_t new_size, ioffset;
5330         int ret;
5331 
5332         /*
5333          * We need to test this early because xfstests assumes that a
5334          * collapse range of (0, 1) will return EOPNOTSUPP if the file
5335          * system does not support collapse range.
5336          */
5337         if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
5338                 return -EOPNOTSUPP;
5339 
5340         /* Collapse range works only on fs cluster size aligned regions. */
5341         if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb)))
5342                 return -EINVAL;
5343 
5344         trace_ext4_collapse_range(inode, offset, len);
5345 
5346         punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
5347         punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);
5348 
5349         inode_lock(inode);
5350         /*
5351          * The collapse range must not reach or overlap EOF; in that case it
5352          * would effectively be a truncate operation.
5353          */
5354         if (offset + len >= inode->i_size) {
5355                 ret = -EINVAL;
5356                 goto out_mutex;
5357         }
5358 
5359         /* Currently just for extent based files */
5360         if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
5361                 ret = -EOPNOTSUPP;
5362                 goto out_mutex;
5363         }
5364 
5365         /* Wait for existing dio to complete */
5366         inode_dio_wait(inode);
5367 
5368         ret = file_modified(file);
5369         if (ret)
5370                 goto out_mutex;
5371 
5372         /*
5373          * Prevent page faults from reinstantiating pages we have released from
5374          * page cache.
5375          */
5376         filemap_invalidate_lock(mapping);
5377 
5378         ret = ext4_break_layouts(inode);
5379         if (ret)
5380                 goto out_mmap;
5381 
5382         /*
5383          * Need to round down offset to be aligned with page size boundary
5384          * for page size > block size.
5385          */
5386         ioffset = round_down(offset, PAGE_SIZE);
5387         /*
5388          * Write tail of the last page before removed range since it will get
5389          * removed from the page cache below.
5390          */
5391         ret = filemap_write_and_wait_range(mapping, ioffset, offset);
5392         if (ret)
5393                 goto out_mmap;
5394         /*
5395          * Write out the data that will be shifted so that it is preserved when
5396          * the page cache is discarded below. We are also protected from pages
5397          * becoming dirty by i_rwsem and invalidate_lock.
5398          */
5399         ret = filemap_write_and_wait_range(mapping, offset + len,
5400                                            LLONG_MAX);
5401         if (ret)
5402                 goto out_mmap;
5403         truncate_pagecache(inode, ioffset);
5404 
5405         credits = ext4_writepage_trans_blocks(inode);
5406         handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
5407         if (IS_ERR(handle)) {
5408                 ret = PTR_ERR(handle);
5409                 goto out_mmap;
5410         }
5411         ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle);
5412 
5413         down_write(&EXT4_I(inode)->i_data_sem);
5414         ext4_discard_preallocations(inode);
5415         ext4_es_remove_extent(inode, punch_start, EXT_MAX_BLOCKS - punch_start);
5416 
5417         ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
5418         if (ret) {
5419                 up_write(&EXT4_I(inode)->i_data_sem);
5420                 goto out_stop;
5421         }
5422         ext4_discard_preallocations(inode);
5423 
5424         ret = ext4_ext_shift_extents(inode, handle, punch_stop,
5425                                      punch_stop - punch_start, SHIFT_LEFT);
5426         if (ret) {
5427                 up_write(&EXT4_I(inode)->i_data_sem);
5428                 goto out_stop;
5429         }
5430 
5431         new_size = inode->i_size - len;
5432         i_size_write(inode, new_size);
5433         EXT4_I(inode)->i_disksize = new_size;
5434 
5435         up_write(&EXT4_I(inode)->i_data_sem);
5436         if (IS_SYNC(inode))
5437                 ext4_handle_sync(handle);
5438         inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
5439         ret = ext4_mark_inode_dirty(handle, inode);
5440         ext4_update_inode_fsync_trans(handle, inode, 1);
5441 
5442 out_stop:
5443         ext4_journal_stop(handle);
5444 out_mmap:
5445         filemap_invalidate_unlock(mapping);
5446 out_mutex:
5447         inode_unlock(inode);
5448         return ret;
5449 }
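
/*
 * Hedged userspace illustration (not part of extents.c): the collapse-range
 * path above is reached through fallocate(2) with FALLOC_FL_COLLAPSE_RANGE.
 * The file name and the 64 KiB offset/length below are arbitrary example
 * values; both offset and length must be aligned to the filesystem cluster
 * size and the range must end before EOF, otherwise the kernel returns
 * -EINVAL as seen above.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("testfile", O_RDWR);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* drop bytes [64 KiB, 128 KiB) and shift the rest of the file left */
        if (fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, 64 * 1024, 64 * 1024) < 0)
                perror("fallocate(FALLOC_FL_COLLAPSE_RANGE)");
        close(fd);
        return 0;
}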
5450 
5451 /*
5452  * ext4_insert_range:
5453  * This function implements the FALLOC_FL_INSERT_RANGE flag of fallocate.
5454  * The data blocks from @offset to EOF are shifted right by @len to create
5455  * a hole in the @inode. The inode size is increased by @len bytes.
5456  * Returns 0 on success, error otherwise.
5457  * Returns 0 on success, error otherwise.
5458  */
5459 static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
5460 {
5461         struct inode *inode = file_inode(file);
5462         struct super_block *sb = inode->i_sb;
5463         struct address_space *mapping = inode->i_mapping;
5464         handle_t *handle;
5465         struct ext4_ext_path *path;
5466         struct ext4_extent *extent;
5467         ext4_lblk_t offset_lblk, len_lblk, ee_start_lblk = 0;
5468         unsigned int credits, ee_len;
5469         int ret = 0, depth, split_flag = 0;
5470         loff_t ioffset;
5471 
5472         /*
5473          * We need to test this early because xfstests assumes that an
5474          * insert range of (0, 1) will return EOPNOTSUPP if the file
5475          * system does not support insert range.
5476          */
5477         if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
5478                 return -EOPNOTSUPP;
5479 
5480         /* Insert range works only on fs cluster size aligned regions. */
5481         if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb)))
5482                 return -EINVAL;
5483 
5484         trace_ext4_insert_range(inode, offset, len);
5485 
5486         offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb);
5487         len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb);
5488 
5489         inode_lock(inode);
5490         /* Currently just for extent based files */
5491         if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
5492                 ret = -EOPNOTSUPP;
5493                 goto out_mutex;
5494         }
5495 
5496         /* Check whether the maximum file size would be exceeded */
5497         if (len > inode->i_sb->s_maxbytes - inode->i_size) {
5498                 ret = -EFBIG;
5499                 goto out_mutex;
5500         }
5501 
5502         /* Offset must be less than i_size */
5503         if (offset >= inode->i_size) {
5504                 ret = -EINVAL;
5505                 goto out_mutex;
5506         }
5507 
5508         /* Wait for existing dio to complete */
5509         inode_dio_wait(inode);
5510 
5511         ret = file_modified(file);
5512         if (ret)
5513                 goto out_mutex;
5514 
5515         /*
5516          * Prevent page faults from reinstantiating pages we have released from
5517          * page cache.
5518          */
5519         filemap_invalidate_lock(mapping);
5520 
5521         ret = ext4_break_layouts(inode);
5522         if (ret)
5523                 goto out_mmap;
5524 
5525         /*
5526          * Need to round down to align start offset to page size boundary
5527          * for page size > block size.
5528          */
5529         ioffset = round_down(offset, PAGE_SIZE);
5530         /* Write out all dirty pages */
5531         ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
5532                         LLONG_MAX);
5533         if (ret)
5534                 goto out_mmap;
5535         truncate_pagecache(inode, ioffset);
5536 
5537         credits = ext4_writepage_trans_blocks(inode);
5538         handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
5539         if (IS_ERR(handle)) {
5540                 ret = PTR_ERR(handle);
5541                 goto out_mmap;
5542         }
5543         ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle);
5544 
5545         /* Expand the file to avoid data loss if there is an error while shifting */
5546         inode->i_size += len;
5547         EXT4_I(inode)->i_disksize += len;
5548         inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
5549         ret = ext4_mark_inode_dirty(handle, inode);
5550         if (ret)
5551                 goto out_stop;
5552 
5553         down_write(&EXT4_I(inode)->i_data_sem);
5554         ext4_discard_preallocations(inode);
5555 
5556         path = ext4_find_extent(inode, offset_lblk, NULL, 0);
5557         if (IS_ERR(path)) {
5558                 up_write(&EXT4_I(inode)->i_data_sem);
5559                 ret = PTR_ERR(path);
5560                 goto out_stop;
5561         }
5562 
5563         depth = ext_depth(inode);
5564         extent = path[depth].p_ext;
5565         if (extent) {
5566                 ee_start_lblk = le32_to_cpu(extent->ee_block);
5567                 ee_len = ext4_ext_get_actual_len(extent);
5568 
5569                 /*
5570                  * If offset_lblk is not the starting block of the extent, split
5571                  * the extent at @offset_lblk
5572                  */
5573                 if ((offset_lblk > ee_start_lblk) &&
5574                                 (offset_lblk < (ee_start_lblk + ee_len))) {
5575                         if (ext4_ext_is_unwritten(extent))
5576                                 split_flag = EXT4_EXT_MARK_UNWRIT1 |
5577                                         EXT4_EXT_MARK_UNWRIT2;
5578                         ret = ext4_split_extent_at(handle, inode, &path,
5579                                         offset_lblk, split_flag,
5580                                         EXT4_EX_NOCACHE |
5581                                         EXT4_GET_BLOCKS_PRE_IO |
5582                                         EXT4_GET_BLOCKS_METADATA_NOFAIL);
5583                 }
5584 
5585                 ext4_free_ext_path(path);
5586                 if (ret < 0) {
5587                         up_write(&EXT4_I(inode)->i_data_sem);
5588                         goto out_stop;
5589                 }
5590         } else {
5591                 ext4_free_ext_path(path);
5592         }
5593 
5594         ext4_es_remove_extent(inode, offset_lblk, EXT_MAX_BLOCKS - offset_lblk);
5595 
5596         /*
5597          * If offset_lblk lies in a hole at the start of the file, use
5598          * ee_start_lblk to shift extents.
5599          */
5600         ret = ext4_ext_shift_extents(inode, handle,
5601                 max(ee_start_lblk, offset_lblk), len_lblk, SHIFT_RIGHT);
5602 
5603         up_write(&EXT4_I(inode)->i_data_sem);
5604         if (IS_SYNC(inode))
5605                 ext4_handle_sync(handle);
5606         if (ret >= 0)
5607                 ext4_update_inode_fsync_trans(handle, inode, 1);
5608 
5609 out_stop:
5610         ext4_journal_stop(handle);
5611 out_mmap:
5612         filemap_invalidate_unlock(mapping);
5613 out_mutex:
5614         inode_unlock(inode);
5615         return ret;
5616 }
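
/*
 * Hedged userspace counterpart (not part of extents.c) for the insert-range
 * path above: FALLOC_FL_INSERT_RANGE shifts everything from 'offset' to EOF
 * right by 'len' bytes and grows the file size by 'len'.  The same cluster
 * alignment rules apply, and 'offset' must lie below the current EOF.  The
 * helper name is hypothetical.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>

/* returns 0 on success, -1 with errno set on failure */
static int insert_hole(int fd, off_t offset, off_t len)
{
        return fallocate(fd, FALLOC_FL_INSERT_RANGE, offset, len);
}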
5617 
5618 /**
5619  * ext4_swap_extents() - Swap extents between two inodes
5620  * @handle: handle for this transaction
5621  * @inode1:     First inode
5622  * @inode2:     Second inode
5623  * @lblk1:      Start block for first inode
5624  * @lblk2:      Start block for second inode
5625  * @count:      Number of blocks to swap
5626  * @unwritten: Mark second inode's extents as unwritten after swap
5627  * @erp:        Pointer to save error value
5628  *
5629  * This helper routine does exactly what its name promises: it swaps extents.
5630  * All other work, such as page-cache locking consistency, bh mapping
5631  * consistency, or copying of the extents' data, must be performed by the caller.
5632  * Locking:
5633  *              i_rwsem is held for both inodes
5634  *              i_data_sem is locked for write for both inodes
5635  * Assumptions:
5636  *              All pages from requested range are locked for both inodes
5637  */
5638 int
5639 ext4_swap_extents(handle_t *handle, struct inode *inode1,
5640                   struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2,
5641                   ext4_lblk_t count, int unwritten, int *erp)
5642 {
5643         struct ext4_ext_path *path1 = NULL;
5644         struct ext4_ext_path *path2 = NULL;
5645         int replaced_count = 0;
5646 
5647         BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem));
5648         BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem));
5649         BUG_ON(!inode_is_locked(inode1));
5650         BUG_ON(!inode_is_locked(inode2));
5651 
5652         ext4_es_remove_extent(inode1, lblk1, count);
5653         ext4_es_remove_extent(inode2, lblk2, count);
5654 
5655         while (count) {
5656                 struct ext4_extent *ex1, *ex2, tmp_ex;
5657                 ext4_lblk_t e1_blk, e2_blk;
5658                 int e1_len, e2_len, len;
5659                 int split = 0;
5660 
5661                 path1 = ext4_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE);
5662                 if (IS_ERR(path1)) {
5663                         *erp = PTR_ERR(path1);
5664                         path1 = NULL;
5665                 finish:
5666                         count = 0;
5667                         goto repeat;
5668                 }
5669                 path2 = ext4_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE);
5670                 if (IS_ERR(path2)) {
5671                         *erp = PTR_ERR(path2);
5672                         path2 = NULL;
5673                         goto finish;
5674                 }
5675                 ex1 = path1[path1->p_depth].p_ext;
5676                 ex2 = path2[path2->p_depth].p_ext;
5677                 /* Do we have something to swap ? */
5678                 if (unlikely(!ex2 || !ex1))
5679                         goto finish;
5680 
5681                 e1_blk = le32_to_cpu(ex1->ee_block);
5682                 e2_blk = le32_to_cpu(ex2->ee_block);
5683                 e1_len = ext4_ext_get_actual_len(ex1);
5684                 e2_len = ext4_ext_get_actual_len(ex2);
5685 
5686                 /* Hole handling */
5687                 if (!in_range(lblk1, e1_blk, e1_len) ||
5688                     !in_range(lblk2, e2_blk, e2_len)) {
5689                         ext4_lblk_t next1, next2;
5690 
5691                         /* if hole after extent, then go to next extent */
5692                         next1 = ext4_ext_next_allocated_block(path1);
5693                         next2 = ext4_ext_next_allocated_block(path2);
5694                         /* If hole before extent, then shift to that extent */
5695                         if (e1_blk > lblk1)
5696                                 next1 = e1_blk;
5697                         if (e2_blk > lblk2)
5698                                 next2 = e2_blk;
5699                         /* Do we have something to swap */
5700                         if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS)
5701                                 goto finish;
5702                         /* Move to the rightmost boundary */
5703                         len = next1 - lblk1;
5704                         if (len < next2 - lblk2)
5705                                 len = next2 - lblk2;
5706                         if (len > count)
5707                                 len = count;
5708                         lblk1 += len;
5709                         lblk2 += len;
5710                         count -= len;
5711                         goto repeat;
5712                 }
5713 
5714                 /* Prepare left boundary */
5715                 if (e1_blk < lblk1) {
5716                         split = 1;
5717                         *erp = ext4_force_split_extent_at(handle, inode1,
5718                                                 &path1, lblk1, 0);
5719                         if (unlikely(*erp))
5720                                 goto finish;
5721                 }
5722                 if (e2_blk < lblk2) {
5723                         split = 1;
5724                         *erp = ext4_force_split_extent_at(handle, inode2,
5725                                                 &path2,  lblk2, 0);
5726                         if (unlikely(*erp))
5727                                 goto finish;
5728                 }
5729                 /* ext4_split_extent_at() may result in a leaf extent split,
5730                  * so the path must be revalidated. */
5731                 if (split)
5732                         goto repeat;
5733 
5734                 /* Prepare right boundary */
5735                 len = count;
5736                 if (len > e1_blk + e1_len - lblk1)
5737                         len = e1_blk + e1_len - lblk1;
5738                 if (len > e2_blk + e2_len - lblk2)
5739                         len = e2_blk + e2_len - lblk2;
5740 
5741                 if (len != e1_len) {
5742                         split = 1;
5743                         *erp = ext4_force_split_extent_at(handle, inode1,
5744                                                 &path1, lblk1 + len, 0);
5745                         if (unlikely(*erp))
5746                                 goto finish;
5747                 }
5748                 if (len != e2_len) {
5749                         split = 1;
5750                         *erp = ext4_force_split_extent_at(handle, inode2,
5751                                                 &path2, lblk2 + len, 0);
5752                         if (*erp)
5753                                 goto finish;
5754                 }
5755                 /* ext4_split_extent_at() may result in a leaf extent split,
5756                  * so the path must be revalidated. */
5757                 if (split)
5758                         goto repeat;
5759 
5760                 BUG_ON(e2_len != e1_len);
5761                 *erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth);
5762                 if (unlikely(*erp))
5763                         goto finish;
5764                 *erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth);
5765                 if (unlikely(*erp))
5766                         goto finish;
5767 
5768                 /* Both extents are fully inside boundaries. Swap it now */
5769                 tmp_ex = *ex1;
5770                 ext4_ext_store_pblock(ex1, ext4_ext_pblock(ex2));
5771                 ext4_ext_store_pblock(ex2, ext4_ext_pblock(&tmp_ex));
5772                 ex1->ee_len = cpu_to_le16(e2_len);
5773                 ex2->ee_len = cpu_to_le16(e1_len);
5774                 if (unwritten)
5775                         ext4_ext_mark_unwritten(ex2);
5776                 if (ext4_ext_is_unwritten(&tmp_ex))
5777                         ext4_ext_mark_unwritten(ex1);
5778 
5779                 ext4_ext_try_to_merge(handle, inode2, path2, ex2);
5780                 ext4_ext_try_to_merge(handle, inode1, path1, ex1);
5781                 *erp = ext4_ext_dirty(handle, inode2, path2 +
5782                                       path2->p_depth);
5783                 if (unlikely(*erp))
5784                         goto finish;
5785                 *erp = ext4_ext_dirty(handle, inode1, path1 +
5786                                       path1->p_depth);
5787                 /*
5788                  * Looks scary, doesn't it? The second inode already points to
5789                  * the new blocks and was successfully dirtied. Luckily an error
5790                  * here can only be caused by a journal error, so the full
5791                  * transaction will be aborted anyway.
5792                  */
5793                 if (unlikely(*erp))
5794                         goto finish;
5795                 lblk1 += len;
5796                 lblk2 += len;
5797                 replaced_count += len;
5798                 count -= len;
5799 
5800         repeat:
5801                 ext4_free_ext_path(path1);
5802                 ext4_free_ext_path(path2);
5803                 path1 = path2 = NULL;
5804         }
5805         return replaced_count;
5806 }
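
/*
 * Illustrative-only sketch (not kernel code) of the "prepare right boundary"
 * step in ext4_swap_extents() above: each iteration swaps at most the
 * remaining requested count, clamped to whatever is left of either extent
 * past the current logical blocks.  All names are hypothetical.
 */
#include <stdint.h>

static uint32_t swap_chunk_len(uint32_t count,
                               uint32_t e1_blk, uint32_t e1_len, uint32_t lblk1,
                               uint32_t e2_blk, uint32_t e2_len, uint32_t lblk2)
{
        uint32_t len = count;

        if (len > e1_blk + e1_len - lblk1)
                len = e1_blk + e1_len - lblk1;
        if (len > e2_blk + e2_len - lblk2)
                len = e2_blk + e2_len - lblk2;
        return len;
}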
5807 
5808 /*
5809  * ext4_clu_mapped - determine whether any block in a logical cluster has
5810  *                   been mapped to a physical cluster
5811  *
5812  * @inode - file containing the logical cluster
5813  * @lclu - logical cluster of interest
5814  *
5815  * Returns 1 if any block in the logical cluster is mapped, signifying
5816  * that a physical cluster has been allocated for it.  Otherwise,
5817  * returns 0.  Can also return negative error codes.  Derived from
5818  * ext4_ext_map_blocks().
5819  */
5820 int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu)
5821 {
5822         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
5823         struct ext4_ext_path *path;
5824         int depth, mapped = 0, err = 0;
5825         struct ext4_extent *extent;
5826         ext4_lblk_t first_lblk, first_lclu, last_lclu;
5827 
5828         /*
5829          * if data can be stored inline, the logical cluster isn't
5830          * mapped - no physical clusters have been allocated, and the
5831          * file has no extents
5832          */
5833         if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) ||
5834             ext4_has_inline_data(inode))
5835                 return 0;
5836 
5837         /* search for the extent closest to the first block in the cluster */
5838         path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0);
5839         if (IS_ERR(path)) {
5840                 err = PTR_ERR(path);
5841                 path = NULL;
5842                 goto out;
5843         }
5844 
5845         depth = ext_depth(inode);
5846 
5847         /*
5848          * A consistent leaf must not be empty.  This situation is possible,
5849          * though, _during_ tree modification, and it's why an assert can't
5850          * be put in ext4_find_extent().
5851          */
5852         if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
5853                 EXT4_ERROR_INODE(inode,
5854                     "bad extent address - lblock: %lu, depth: %d, pblock: %lld",
5855                                  (unsigned long) EXT4_C2B(sbi, lclu),
5856                                  depth, path[depth].p_block);
5857                 err = -EFSCORRUPTED;
5858                 goto out;
5859         }
5860 
5861         extent = path[depth].p_ext;
5862 
5863         /* can't be mapped if the extent tree is empty */
5864         if (extent == NULL)
5865                 goto out;
5866 
5867         first_lblk = le32_to_cpu(extent->ee_block);
5868         first_lclu = EXT4_B2C(sbi, first_lblk);
5869 
5870         /*
5871          * Three possible outcomes at this point - found extent spanning
5872          * the target cluster, to the left of the target cluster, or to the
5873          * right of the target cluster.  The first two cases are handled here.
5874          * The last case indicates the target cluster is not mapped.
5875          */
5876         if (lclu >= first_lclu) {
5877                 last_lclu = EXT4_B2C(sbi, first_lblk +
5878                                      ext4_ext_get_actual_len(extent) - 1);
5879                 if (lclu <= last_lclu) {
5880                         mapped = 1;
5881                 } else {
5882                         first_lblk = ext4_ext_next_allocated_block(path);
5883                         first_lclu = EXT4_B2C(sbi, first_lblk);
5884                         if (lclu == first_lclu)
5885                                 mapped = 1;
5886                 }
5887         }
5888 
5889 out:
5890         ext4_free_ext_path(path);
5891 
5892         return err ? err : mapped;
5893 }
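
/*
 * Illustrative-only sketch (not kernel code) of the block/cluster arithmetic
 * behind ext4_clu_mapped() above.  With bigalloc the cluster-to-block ratio
 * is a power of two, so the EXT4_B2C()/EXT4_C2B() conversions reduce to
 * shifts.  Worked example: 4 KiB blocks grouped into 64 KiB clusters give
 * cluster_bits = 4, so logical block 37 lies in cluster 37 >> 4 = 2, whose
 * first block is 2 << 4 = 32.  Names below are hypothetical analogues.
 */
#include <stdint.h>

/* analogue of EXT4_B2C(): logical block to logical cluster */
static inline uint32_t blk_to_cluster(uint32_t lblk, unsigned int cluster_bits)
{
        return lblk >> cluster_bits;
}

/* analogue of EXT4_C2B(): first logical block of a logical cluster */
static inline uint32_t cluster_to_blk(uint32_t lclu, unsigned int cluster_bits)
{
        return lclu << cluster_bits;
}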
5894 
5895 /*
5896  * Updates the physical block address and unwritten status of the extent
5897  * starting at logical block @start with length @len. If such an extent
5898  * doesn't exist, this function splits the extent tree appropriately to
5899  * create one.  This function is called in the fast commit
5900  * replay path.  Returns 0 on success and error on failure.
5901  */
5902 int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start,
5903                               int len, int unwritten, ext4_fsblk_t pblk)
5904 {
5905         struct ext4_ext_path *path;
5906         struct ext4_extent *ex;
5907         int ret;
5908 
5909         path = ext4_find_extent(inode, start, NULL, 0);
5910         if (IS_ERR(path))
5911                 return PTR_ERR(path);
5912         ex = path[path->p_depth].p_ext;
5913         if (!ex) {
5914                 ret = -EFSCORRUPTED;
5915                 goto out;
5916         }
5917 
5918         if (le32_to_cpu(ex->ee_block) != start ||
5919                 ext4_ext_get_actual_len(ex) != len) {
5920                 /* We need to split this extent to match our extent first */
5921                 down_write(&EXT4_I(inode)->i_data_sem);
5922                 ret = ext4_force_split_extent_at(NULL, inode, &path, start, 1);
5923                 up_write(&EXT4_I(inode)->i_data_sem);
5924                 if (ret)
5925                         goto out;
5926 
5927                 path = ext4_find_extent(inode, start, &path, 0);
5928                 if (IS_ERR(path))
5929                         return PTR_ERR(path);
5930                 ex = path[path->p_depth].p_ext;
5931                 WARN_ON(le32_to_cpu(ex->ee_block) != start);
5932 
5933                 if (ext4_ext_get_actual_len(ex) != len) {
5934                         down_write(&EXT4_I(inode)->i_data_sem);
5935                         ret = ext4_force_split_extent_at(NULL, inode, &path,
5936                                                          start + len, 1);
5937                         up_write(&EXT4_I(inode)->i_data_sem);
5938                         if (ret)
5939                                 goto out;
5940 
5941                         path = ext4_find_extent(inode, start, &path, 0);
5942                         if (IS_ERR(path))
5943                                 return PTR_ERR(path);
5944                         ex = path[path->p_depth].p_ext;
5945                 }
5946         }
5947         if (unwritten)
5948                 ext4_ext_mark_unwritten(ex);
5949         else
5950                 ext4_ext_mark_initialized(ex);
5951         ext4_ext_store_pblock(ex, pblk);
5952         down_write(&EXT4_I(inode)->i_data_sem);
5953         ret = ext4_ext_dirty(NULL, inode, &path[path->p_depth]);
5954         up_write(&EXT4_I(inode)->i_data_sem);
5955 out:
5956         ext4_free_ext_path(path);
5957         ext4_mark_inode_dirty(NULL, inode);
5958         return ret;
5959 }
5960 
5961 /* Try to shrink the extent tree */
5962 void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end)
5963 {
5964         struct ext4_ext_path *path = NULL;
5965         struct ext4_extent *ex;
5966         ext4_lblk_t old_cur, cur = 0;
5967 
5968         while (cur < end) {
5969                 path = ext4_find_extent(inode, cur, NULL, 0);
5970                 if (IS_ERR(path))
5971                         return;
5972                 ex = path[path->p_depth].p_ext;
5973                 if (!ex) {
5974                         ext4_free_ext_path(path);
5975                         ext4_mark_inode_dirty(NULL, inode);
5976                         return;
5977                 }
5978                 old_cur = cur;
5979                 cur = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
5980                 if (cur <= old_cur)
5981                         cur = old_cur + 1;
5982                 ext4_ext_try_to_merge(NULL, inode, path, ex);
5983                 down_write(&EXT4_I(inode)->i_data_sem);
5984                 ext4_ext_dirty(NULL, inode, &path[path->p_depth]);
5985                 up_write(&EXT4_I(inode)->i_data_sem);
5986                 ext4_mark_inode_dirty(NULL, inode);
5987                 ext4_free_ext_path(path);
5988         }
5989 }
5990 
5991 /* Check if *cur is a hole and if it is, skip it */
5992 static int skip_hole(struct inode *inode, ext4_lblk_t *cur)
5993 {
5994         int ret;
5995         struct ext4_map_blocks map;
5996 
5997         map.m_lblk = *cur;
5998         map.m_len = ((inode->i_size) >> inode->i_sb->s_blocksize_bits) - *cur;
5999 
6000         ret = ext4_map_blocks(NULL, inode, &map, 0);
6001         if (ret < 0)
6002                 return ret;
6003         if (ret != 0)
6004                 return 0;
6005         *cur = *cur + map.m_len;
6006         return 0;
6007 }
6008 
6009 /* Count number of blocks used by this inode and update i_blocks */
6010 int ext4_ext_replay_set_iblocks(struct inode *inode)
6011 {
6012         struct ext4_ext_path *path = NULL, *path2 = NULL;
6013         struct ext4_extent *ex;
6014         ext4_lblk_t cur = 0, end;
6015         int numblks = 0, i, ret = 0;
6016         ext4_fsblk_t cmp1, cmp2;
6017         struct ext4_map_blocks map;
6018 
6019         /* Determine the size of the file first */
6020         path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
6021                                         EXT4_EX_NOCACHE);
6022         if (IS_ERR(path))
6023                 return PTR_ERR(path);
6024         ex = path[path->p_depth].p_ext;
6025         if (!ex) {
6026                 ext4_free_ext_path(path);
6027                 goto out;
6028         }
6029         end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
6030         ext4_free_ext_path(path);
6031 
6032         /* Count the number of data blocks */
6033         cur = 0;
6034         while (cur < end) {
6035                 map.m_lblk = cur;
6036                 map.m_len = end - cur;
6037                 ret = ext4_map_blocks(NULL, inode, &map, 0);
6038                 if (ret < 0)
6039                         break;
6040                 if (ret > 0)
6041                         numblks += ret;
6042                 cur = cur + map.m_len;
6043         }
6044 
6045         /*
6046          * Count the number of extent tree blocks. We do it by looking up
6047          * two successive extents and determining the difference between
6048          * their paths. When the paths of two successive extents differ,
6049          * we compare the blocks in the paths at each level and increment
6050          * iblocks by the total number of differences found.
6051          */
6052         cur = 0;
6053         ret = skip_hole(inode, &cur);
6054         if (ret < 0)
6055                 goto out;
6056         path = ext4_find_extent(inode, cur, NULL, 0);
6057         if (IS_ERR(path))
6058                 goto out;
6059         numblks += path->p_depth;
6060         ext4_free_ext_path(path);
6061         while (cur < end) {
6062                 path = ext4_find_extent(inode, cur, NULL, 0);
6063                 if (IS_ERR(path))
6064                         break;
6065                 ex = path[path->p_depth].p_ext;
6066                 if (!ex) {
6067                         ext4_free_ext_path(path);
6068                         return 0;
6069                 }
6070                 cur = max(cur + 1, le32_to_cpu(ex->ee_block) +
6071                                         ext4_ext_get_actual_len(ex));
6072                 ret = skip_hole(inode, &cur);
6073                 if (ret < 0) {
6074                         ext4_free_ext_path(path);
6075                         break;
6076                 }
6077                 path2 = ext4_find_extent(inode, cur, NULL, 0);
6078                 if (IS_ERR(path2)) {
6079                         ext4_free_ext_path(path);
6080                         break;
6081                 }
6082                 for (i = 0; i <= max(path->p_depth, path2->p_depth); i++) {
6083                         cmp1 = cmp2 = 0;
6084                         if (i <= path->p_depth)
6085                                 cmp1 = path[i].p_bh ?
6086                                         path[i].p_bh->b_blocknr : 0;
6087                         if (i <= path2->p_depth)
6088                                 cmp2 = path2[i].p_bh ?
6089                                         path2[i].p_bh->b_blocknr : 0;
6090                         if (cmp1 != cmp2 && cmp2 != 0)
6091                                 numblks++;
6092                 }
6093                 ext4_free_ext_path(path);
6094                 ext4_free_ext_path(path2);
6095         }
6096 
6097 out:
6098         inode->i_blocks = numblks << (inode->i_sb->s_blocksize_bits - 9);
6099         ext4_mark_inode_dirty(NULL, inode);
6100         return 0;
6101 }
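
/*
 * Illustrative-only sketch (not kernel code) of the final i_blocks update in
 * ext4_ext_replay_set_iblocks() above: i_blocks is kept in 512-byte sectors,
 * so the block count is scaled by blocksize / 512.  With 4 KiB blocks,
 * blocksize_bits is 12, the shift is 12 - 9 = 3, and e.g. 10 blocks become
 * 10 << 3 = 80 sectors.  The helper name is hypothetical.
 */
#include <stdint.h>

static inline uint64_t fs_blocks_to_sectors(uint64_t numblks,
                                            unsigned int blocksize_bits)
{
        return numblks << (blocksize_bits - 9);
}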
6102 
6103 int ext4_ext_clear_bb(struct inode *inode)
6104 {
6105         struct ext4_ext_path *path = NULL;
6106         struct ext4_extent *ex;
6107         ext4_lblk_t cur = 0, end;
6108         int j, ret = 0;
6109         struct ext4_map_blocks map;
6110 
6111         if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
6112                 return 0;
6113 
6114         /* Determine the size of the file first */
6115         path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
6116                                         EXT4_EX_NOCACHE);
6117         if (IS_ERR(path))
6118                 return PTR_ERR(path);
6119         ex = path[path->p_depth].p_ext;
6120         if (!ex) {
6121                 ext4_free_ext_path(path);
6122                 return 0;
6123         }
6124         end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
6125         ext4_free_ext_path(path);
6126 
6127         cur = 0;
6128         while (cur < end) {
6129                 map.m_lblk = cur;
6130                 map.m_len = end - cur;
6131                 ret = ext4_map_blocks(NULL, inode, &map, 0);
6132                 if (ret < 0)
6133                         break;
6134                 if (ret > 0) {
6135                         path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
6136                         if (!IS_ERR_OR_NULL(path)) {
6137                                 for (j = 0; j < path->p_depth; j++) {
6138 
6139                                         ext4_mb_mark_bb(inode->i_sb,
6140                                                         path[j].p_block, 1, false);
6141                                         ext4_fc_record_regions(inode->i_sb, inode->i_ino,
6142                                                         0, path[j].p_block, 1, 1);
6143                                 }
6144                                 ext4_free_ext_path(path);
6145                         }
6146                         ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false);
6147                         ext4_fc_record_regions(inode->i_sb, inode->i_ino,
6148                                         map.m_lblk, map.m_pblk, map.m_len, 1);
6149                 }
6150                 cur = cur + map.m_len;
6151         }
6152 
6153         return 0;
6154 }
6155 
