1 // SPDX-License-Identifier: GPL-2.0 2 3 /* 4 * fs/ext4/fast_commit.c 5 * 6 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com> 7 * 8 * Ext4 fast commits routines. 9 */ 10 #include "ext4.h" 11 #include "ext4_jbd2.h" 12 #include "ext4_extents.h" 13 #include "mballoc.h" 14 15 /* 16 * Ext4 Fast Commits 17 * ----------------- 18 * 19 * Ext4 fast commits implement fine grained journalling for Ext4. 20 * 21 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See 22 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by 23 * TLV during the recovery phase. For the scenarios for which we currently 24 * don't have replay code, fast commit falls back to full commits. 25 * Fast commits record delta in one of the following three categories. 26 * 27 * (A) Directory entry updates: 28 * 29 * - EXT4_FC_TAG_UNLINK - records directory entry unlink 30 * - EXT4_FC_TAG_LINK - records directory entry link 31 * - EXT4_FC_TAG_CREAT - records inode and directory entry creation 32 * 33 * (B) File specific data range updates: 34 * 35 * - EXT4_FC_TAG_ADD_RANGE - records addition of new blocks to an inode 36 * - EXT4_FC_TAG_DEL_RANGE - records deletion of blocks from an inode 37 * 38 * (C) Inode metadata (mtime / ctime etc): 39 * 40 * - EXT4_FC_TAG_INODE - record the inode that should be replayed 41 * during recovery. Note that iblocks field is 42 * not replayed and instead derived during 43 * replay. 44 * Commit Operation 45 * ---------------- 46 * With fast commits, we maintain all the directory entry operations in the 47 * order in which they are issued in an in-memory queue. This queue is flushed 48 * to disk during the commit operation. We also maintain a list of inodes 49 * that need to be committed during a fast commit in another in memory queue of 50 * inodes. 
During the commit operation, we commit in the following order: 51 * 52 * [1] Lock inodes for any further data updates by setting COMMITTING state 53 * [2] Submit data buffers of all the inodes 54 * [3] Wait for [2] to complete 55 * [4] Commit all the directory entry updates in the fast commit space 56 * [5] Commit all the changed inode structures 57 * [6] Write tail tag (this tag ensures the atomicity, please read the following 58 * section for more details). 59 * [7] Wait for [4], [5] and [6] to complete. 60 * 61 * All the inode updates must call ext4_fc_start_update() before starting an 62 * update. If such an ongoing update is present, fast commit waits for it to 63 * complete. The completion of such an update is marked by 64 * ext4_fc_stop_update(). 65 * 66 * Fast Commit Ineligibility 67 * ------------------------- 68 * 69 * Not all operations are supported by fast commits today (e.g extended 70 * attributes). Fast commit ineligibility is marked by calling 71 * ext4_fc_mark_ineligible(): This makes next fast commit operation to fall back 72 * to full commit. 73 * 74 * Atomicity of commits 75 * -------------------- 76 * In order to guarantee atomicity during the commit operation, fast commit 77 * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail 78 * tag contains CRC of the contents and TID of the transaction after which 79 * this fast commit should be applied. Recovery code replays fast commit 80 * logs only if there's at least 1 valid tail present. For every fast commit 81 * operation, there is 1 tail. This means, we may end up with multiple tails 82 * in the fast commit space. 
Here's an example:
 *
 *     - Create a new file A and remove existing file B
 *     - fsync()
 *     - Append contents to file A
 *     - Truncate file A
 *     - fsync()
 *
 * The fast commit space at the end of above operations would look like this:
 *   [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 *   |<---      Fast Commit 1      --->|<---      Fast Commit 2      ---->|
 *
 * Replay code should thus check for all the valid tails in the FC area.
 *
 * Fast Commit Replay Idempotence
 * ------------------------------
 *
 * Fast commit tags are idempotent in nature provided the recovery code follows
 * certain rules. The guiding principle that the commit path follows while
 * committing is that it stores the result of a particular operation instead of
 * storing the procedure.
 *
 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
 * was associated with inode 10. During fast commit, instead of storing this
 * operation as a procedure "rename a to b", we store the resulting file system
 * state as a "series" of outcomes:
 *
 * - Link dirent b to inode 10
 * - Unlink dirent a
 * - Inode <10> with valid refcount
 *
 * Now when recovery code runs, it needs to "enforce" this state on the file
 * system. This is what guarantees idempotence of fast commit replay.
 *
 * Let's take an example of a procedure that is not idempotent and see how fast
 * commits make it idempotent. Consider the following sequence of operations:
 *
 *     rm A;    mv B A;    read A
 *  (x)     (y)        (z)
 *
 * (x), (y) and (z) are the points at which we can crash. If we store this
 * sequence of operations as is then the replay is not idempotent. Let's say
 * while in replay, we crash at (z). During the second replay, file A (which was
 * actually created as a result of "mv B A" operation) would get deleted. Thus,
 * file named A would be absent when we try to read A.
So, this sequence of 127 * operations is not idempotent. However, as mentioned above, instead of storing 128 * the procedure fast commits store the outcome of each procedure. Thus the fast 129 * commit log for above procedure would be as follows: 130 * 131 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to 132 * inode 11 before the replay) 133 * 134 * [Unlink A] [Link A to inode 11] [Unlink B] [Inode 11] 135 * (w) (x) (y) (z) 136 * 137 * If we crash at (z), we will have file A linked to inode 11. During the second 138 * replay, we will remove file A (inode 11). But we will create it back and make 139 * it point to inode 11. We won't find B, so we'll just skip that step. At this 140 * point, the refcount for inode 11 is not reliable, but that gets fixed by the 141 * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled 142 * similarly. Thus, by converting a non-idempotent procedure into a series of 143 * idempotent outcomes, fast commits ensured idempotence during the replay. 144 * 145 * TODOs 146 * ----- 147 * 148 * 0) Fast commit replay path hardening: Fast commit replay code should use 149 * journal handles to make sure all the updates it does during the replay 150 * path are atomic. With that if we crash during fast commit replay, after 151 * trying to do recovery again, we will find a file system where fast commit 152 * area is invalid (because new full commit would be found). In order to deal 153 * with that, fast commit replay code should ensure that the "FC_REPLAY" 154 * superblock state is persisted before starting the replay, so that after 155 * the crash, fast commit recovery code can look at that flag and perform 156 * fast commit recovery even if that area is invalidated by later full 157 * commits. 158 * 159 * 1) Fast commit's commit path locks the entire file system during fast 160 * commit. This has significant performance penalty. 
Instead of that, we 161 * should use ext4_fc_start/stop_update functions to start inode level 162 * updates from ext4_journal_start/stop. Once we do that we can drop file 163 * system locking during commit path. 164 * 165 * 2) Handle more ineligible cases. 166 */ 167 168 #include <trace/events/ext4.h> 169 static struct kmem_cache *ext4_fc_dentry_cachep; 170 171 static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate) 172 { 173 BUFFER_TRACE(bh, ""); 174 if (uptodate) { 175 ext4_debug("%s: Block %lld up-to-date", 176 __func__, bh->b_blocknr); 177 set_buffer_uptodate(bh); 178 } else { 179 ext4_debug("%s: Block %lld not up-to-date", 180 __func__, bh->b_blocknr); 181 clear_buffer_uptodate(bh); 182 } 183 184 unlock_buffer(bh); 185 } 186 187 static inline void ext4_fc_reset_inode(struct inode *inode) 188 { 189 struct ext4_inode_info *ei = EXT4_I(inode); 190 191 ei->i_fc_lblk_start = 0; 192 ei->i_fc_lblk_len = 0; 193 } 194 195 void ext4_fc_init_inode(struct inode *inode) 196 { 197 struct ext4_inode_info *ei = EXT4_I(inode); 198 199 ext4_fc_reset_inode(inode); 200 ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING); 201 INIT_LIST_HEAD(&ei->i_fc_list); 202 INIT_LIST_HEAD(&ei->i_fc_dilist); 203 init_waitqueue_head(&ei->i_fc_wait); 204 atomic_set(&ei->i_fc_updates, 0); 205 } 206 207 /* This function must be called with sbi->s_fc_lock held. 
*/ 208 static void ext4_fc_wait_committing_inode(struct inode *inode) 209 __releases(&EXT4_SB(inode->i_sb)->s_fc_lock) 210 { 211 wait_queue_head_t *wq; 212 struct ext4_inode_info *ei = EXT4_I(inode); 213 214 #if (BITS_PER_LONG < 64) 215 DEFINE_WAIT_BIT(wait, &ei->i_state_flags, 216 EXT4_STATE_FC_COMMITTING); 217 wq = bit_waitqueue(&ei->i_state_flags, 218 EXT4_STATE_FC_COMMITTING); 219 #else 220 DEFINE_WAIT_BIT(wait, &ei->i_flags, 221 EXT4_STATE_FC_COMMITTING); 222 wq = bit_waitqueue(&ei->i_flags, 223 EXT4_STATE_FC_COMMITTING); 224 #endif 225 lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock); 226 prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); 227 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); 228 schedule(); 229 finish_wait(wq, &wait.wq_entry); 230 } 231 232 static bool ext4_fc_disabled(struct super_block *sb) 233 { 234 return (!test_opt2(sb, JOURNAL_FAST_COMMIT) || 235 (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)); 236 } 237 238 /* 239 * Inform Ext4's fast about start of an inode update 240 * 241 * This function is called by the high level call VFS callbacks before 242 * performing any inode update. This function blocks if there's an ongoing 243 * fast commit on the inode in question. 244 */ 245 void ext4_fc_start_update(struct inode *inode) 246 { 247 struct ext4_inode_info *ei = EXT4_I(inode); 248 249 if (ext4_fc_disabled(inode->i_sb)) 250 return; 251 252 restart: 253 spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock); 254 if (list_empty(&ei->i_fc_list)) 255 goto out; 256 257 if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { 258 ext4_fc_wait_committing_inode(inode); 259 goto restart; 260 } 261 out: 262 atomic_inc(&ei->i_fc_updates); 263 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); 264 } 265 266 /* 267 * Stop inode update and wake up waiting fast commits if any. 
268 */ 269 void ext4_fc_stop_update(struct inode *inode) 270 { 271 struct ext4_inode_info *ei = EXT4_I(inode); 272 273 if (ext4_fc_disabled(inode->i_sb)) 274 return; 275 276 if (atomic_dec_and_test(&ei->i_fc_updates)) 277 wake_up_all(&ei->i_fc_wait); 278 } 279 280 /* 281 * Remove inode from fast commit list. If the inode is being committed 282 * we wait until inode commit is done. 283 */ 284 void ext4_fc_del(struct inode *inode) 285 { 286 struct ext4_inode_info *ei = EXT4_I(inode); 287 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 288 struct ext4_fc_dentry_update *fc_dentry; 289 290 if (ext4_fc_disabled(inode->i_sb)) 291 return; 292 293 restart: 294 spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock); 295 if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) { 296 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); 297 return; 298 } 299 300 if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { 301 ext4_fc_wait_committing_inode(inode); 302 goto restart; 303 } 304 305 if (!list_empty(&ei->i_fc_list)) 306 list_del_init(&ei->i_fc_list); 307 308 /* 309 * Since this inode is getting removed, let's also remove all FC 310 * dentry create references, since it is not needed to log it anyways. 311 */ 312 if (list_empty(&ei->i_fc_dilist)) { 313 spin_unlock(&sbi->s_fc_lock); 314 return; 315 } 316 317 fc_dentry = list_first_entry(&ei->i_fc_dilist, struct ext4_fc_dentry_update, fcd_dilist); 318 WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT); 319 list_del_init(&fc_dentry->fcd_list); 320 list_del_init(&fc_dentry->fcd_dilist); 321 322 WARN_ON(!list_empty(&ei->i_fc_dilist)); 323 spin_unlock(&sbi->s_fc_lock); 324 325 if (fc_dentry->fcd_name.name && 326 fc_dentry->fcd_name.len > DNAME_INLINE_LEN) 327 kfree(fc_dentry->fcd_name.name); 328 kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); 329 330 return; 331 } 332 333 /* 334 * Mark file system as fast commit ineligible, and record latest 335 * ineligible transaction tid. 
This means until the recorded 336 * transaction, commit operation would result in a full jbd2 commit. 337 */ 338 void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle) 339 { 340 struct ext4_sb_info *sbi = EXT4_SB(sb); 341 tid_t tid; 342 bool has_transaction = true; 343 bool is_ineligible; 344 345 if (ext4_fc_disabled(sb)) 346 return; 347 348 if (handle && !IS_ERR(handle)) 349 tid = handle->h_transaction->t_tid; 350 else { 351 read_lock(&sbi->s_journal->j_state_lock); 352 if (sbi->s_journal->j_running_transaction) 353 tid = sbi->s_journal->j_running_transaction->t_tid; 354 else 355 has_transaction = false; 356 read_unlock(&sbi->s_journal->j_state_lock); 357 } 358 spin_lock(&sbi->s_fc_lock); 359 is_ineligible = ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); 360 if (has_transaction && 361 (!is_ineligible || 362 (is_ineligible && tid_gt(tid, sbi->s_fc_ineligible_tid)))) 363 sbi->s_fc_ineligible_tid = tid; 364 ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); 365 spin_unlock(&sbi->s_fc_lock); 366 WARN_ON(reason >= EXT4_FC_REASON_MAX); 367 sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; 368 } 369 370 /* 371 * Generic fast commit tracking function. If this is the first time this we are 372 * called after a full commit, we initialize fast commit fields and then call 373 * __fc_track_fn() with update = 0. If we have already been called after a full 374 * commit, we pass update = 1. Based on that, the track function can determine 375 * if it needs to track a field for the first time or if it needs to just 376 * update the previously tracked value. 377 * 378 * If enqueue is set, this function enqueues the inode in fast commit list. 
379 */ 380 static int ext4_fc_track_template( 381 handle_t *handle, struct inode *inode, 382 int (*__fc_track_fn)(handle_t *handle, struct inode *, void *, bool), 383 void *args, int enqueue) 384 { 385 bool update = false; 386 struct ext4_inode_info *ei = EXT4_I(inode); 387 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 388 tid_t tid = 0; 389 int ret; 390 391 tid = handle->h_transaction->t_tid; 392 mutex_lock(&ei->i_fc_lock); 393 if (tid == ei->i_sync_tid) { 394 update = true; 395 } else { 396 ext4_fc_reset_inode(inode); 397 ei->i_sync_tid = tid; 398 } 399 ret = __fc_track_fn(handle, inode, args, update); 400 mutex_unlock(&ei->i_fc_lock); 401 402 if (!enqueue) 403 return ret; 404 405 spin_lock(&sbi->s_fc_lock); 406 if (list_empty(&EXT4_I(inode)->i_fc_list)) 407 list_add_tail(&EXT4_I(inode)->i_fc_list, 408 (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || 409 sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ? 410 &sbi->s_fc_q[FC_Q_STAGING] : 411 &sbi->s_fc_q[FC_Q_MAIN]); 412 spin_unlock(&sbi->s_fc_lock); 413 414 return ret; 415 } 416 417 struct __track_dentry_update_args { 418 struct dentry *dentry; 419 int op; 420 }; 421 422 /* __track_fn for directory entry updates. Called with ei->i_fc_lock. 
*/ 423 static int __track_dentry_update(handle_t *handle, struct inode *inode, 424 void *arg, bool update) 425 { 426 struct ext4_fc_dentry_update *node; 427 struct ext4_inode_info *ei = EXT4_I(inode); 428 struct __track_dentry_update_args *dentry_update = 429 (struct __track_dentry_update_args *)arg; 430 struct dentry *dentry = dentry_update->dentry; 431 struct inode *dir = dentry->d_parent->d_inode; 432 struct super_block *sb = inode->i_sb; 433 struct ext4_sb_info *sbi = EXT4_SB(sb); 434 435 mutex_unlock(&ei->i_fc_lock); 436 437 if (IS_ENCRYPTED(dir)) { 438 ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_ENCRYPTED_FILENAME, 439 handle); 440 mutex_lock(&ei->i_fc_lock); 441 return -EOPNOTSUPP; 442 } 443 444 node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS); 445 if (!node) { 446 ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, handle); 447 mutex_lock(&ei->i_fc_lock); 448 return -ENOMEM; 449 } 450 451 node->fcd_op = dentry_update->op; 452 node->fcd_parent = dir->i_ino; 453 node->fcd_ino = inode->i_ino; 454 if (dentry->d_name.len > DNAME_INLINE_LEN) { 455 node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS); 456 if (!node->fcd_name.name) { 457 kmem_cache_free(ext4_fc_dentry_cachep, node); 458 ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, handle); 459 mutex_lock(&ei->i_fc_lock); 460 return -ENOMEM; 461 } 462 memcpy((u8 *)node->fcd_name.name, dentry->d_name.name, 463 dentry->d_name.len); 464 } else { 465 memcpy(node->fcd_iname, dentry->d_name.name, 466 dentry->d_name.len); 467 node->fcd_name.name = node->fcd_iname; 468 } 469 node->fcd_name.len = dentry->d_name.len; 470 INIT_LIST_HEAD(&node->fcd_dilist); 471 spin_lock(&sbi->s_fc_lock); 472 if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || 473 sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) 474 list_add_tail(&node->fcd_list, 475 &sbi->s_fc_dentry_q[FC_Q_STAGING]); 476 else 477 list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]); 478 479 /* 480 * This helps us keep a track of all 
fc_dentry updates which is part of 481 * this ext4 inode. So in case the inode is getting unlinked, before 482 * even we get a chance to fsync, we could remove all fc_dentry 483 * references while evicting the inode in ext4_fc_del(). 484 * Also with this, we don't need to loop over all the inodes in 485 * sbi->s_fc_q to get the corresponding inode in 486 * ext4_fc_commit_dentry_updates(). 487 */ 488 if (dentry_update->op == EXT4_FC_TAG_CREAT) { 489 WARN_ON(!list_empty(&ei->i_fc_dilist)); 490 list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist); 491 } 492 spin_unlock(&sbi->s_fc_lock); 493 mutex_lock(&ei->i_fc_lock); 494 495 return 0; 496 } 497 498 void __ext4_fc_track_unlink(handle_t *handle, 499 struct inode *inode, struct dentry *dentry) 500 { 501 struct __track_dentry_update_args args; 502 int ret; 503 504 args.dentry = dentry; 505 args.op = EXT4_FC_TAG_UNLINK; 506 507 ret = ext4_fc_track_template(handle, inode, __track_dentry_update, 508 (void *)&args, 0); 509 trace_ext4_fc_track_unlink(handle, inode, dentry, ret); 510 } 511 512 void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry) 513 { 514 struct inode *inode = d_inode(dentry); 515 516 if (ext4_fc_disabled(inode->i_sb)) 517 return; 518 519 if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) 520 return; 521 522 __ext4_fc_track_unlink(handle, inode, dentry); 523 } 524 525 void __ext4_fc_track_link(handle_t *handle, 526 struct inode *inode, struct dentry *dentry) 527 { 528 struct __track_dentry_update_args args; 529 int ret; 530 531 args.dentry = dentry; 532 args.op = EXT4_FC_TAG_LINK; 533 534 ret = ext4_fc_track_template(handle, inode, __track_dentry_update, 535 (void *)&args, 0); 536 trace_ext4_fc_track_link(handle, inode, dentry, ret); 537 } 538 539 void ext4_fc_track_link(handle_t *handle, struct dentry *dentry) 540 { 541 struct inode *inode = d_inode(dentry); 542 543 if (ext4_fc_disabled(inode->i_sb)) 544 return; 545 546 if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) 547 
return; 548 549 __ext4_fc_track_link(handle, inode, dentry); 550 } 551 552 void __ext4_fc_track_create(handle_t *handle, struct inode *inode, 553 struct dentry *dentry) 554 { 555 struct __track_dentry_update_args args; 556 int ret; 557 558 args.dentry = dentry; 559 args.op = EXT4_FC_TAG_CREAT; 560 561 ret = ext4_fc_track_template(handle, inode, __track_dentry_update, 562 (void *)&args, 0); 563 trace_ext4_fc_track_create(handle, inode, dentry, ret); 564 } 565 566 void ext4_fc_track_create(handle_t *handle, struct dentry *dentry) 567 { 568 struct inode *inode = d_inode(dentry); 569 570 if (ext4_fc_disabled(inode->i_sb)) 571 return; 572 573 if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) 574 return; 575 576 __ext4_fc_track_create(handle, inode, dentry); 577 } 578 579 /* __track_fn for inode tracking */ 580 static int __track_inode(handle_t *handle, struct inode *inode, void *arg, 581 bool update) 582 { 583 if (update) 584 return -EEXIST; 585 586 EXT4_I(inode)->i_fc_lblk_len = 0; 587 588 return 0; 589 } 590 591 void ext4_fc_track_inode(handle_t *handle, struct inode *inode) 592 { 593 int ret; 594 595 if (S_ISDIR(inode->i_mode)) 596 return; 597 598 if (ext4_fc_disabled(inode->i_sb)) 599 return; 600 601 if (ext4_should_journal_data(inode)) { 602 ext4_fc_mark_ineligible(inode->i_sb, 603 EXT4_FC_REASON_INODE_JOURNAL_DATA, handle); 604 return; 605 } 606 607 if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) 608 return; 609 610 ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1); 611 trace_ext4_fc_track_inode(handle, inode, ret); 612 } 613 614 struct __track_range_args { 615 ext4_lblk_t start, end; 616 }; 617 618 /* __track_fn for tracking data updates */ 619 static int __track_range(handle_t *handle, struct inode *inode, void *arg, 620 bool update) 621 { 622 struct ext4_inode_info *ei = EXT4_I(inode); 623 ext4_lblk_t oldstart; 624 struct __track_range_args *__arg = 625 (struct __track_range_args *)arg; 626 627 if (inode->i_ino < 
EXT4_FIRST_INO(inode->i_sb)) { 628 ext4_debug("Special inode %ld being modified\n", inode->i_ino); 629 return -ECANCELED; 630 } 631 632 oldstart = ei->i_fc_lblk_start; 633 634 if (update && ei->i_fc_lblk_len > 0) { 635 ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start); 636 ei->i_fc_lblk_len = 637 max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) - 638 ei->i_fc_lblk_start + 1; 639 } else { 640 ei->i_fc_lblk_start = __arg->start; 641 ei->i_fc_lblk_len = __arg->end - __arg->start + 1; 642 } 643 644 return 0; 645 } 646 647 void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start, 648 ext4_lblk_t end) 649 { 650 struct __track_range_args args; 651 int ret; 652 653 if (S_ISDIR(inode->i_mode)) 654 return; 655 656 if (ext4_fc_disabled(inode->i_sb)) 657 return; 658 659 if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) 660 return; 661 662 if (ext4_has_inline_data(inode)) { 663 ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, 664 handle); 665 return; 666 } 667 668 args.start = start; 669 args.end = end; 670 671 ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1); 672 673 trace_ext4_fc_track_range(handle, inode, start, end, ret); 674 } 675 676 static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail) 677 { 678 blk_opf_t write_flags = REQ_SYNC; 679 struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh; 680 681 /* Add REQ_FUA | REQ_PREFLUSH only its tail */ 682 if (test_opt(sb, BARRIER) && is_tail) 683 write_flags |= REQ_FUA | REQ_PREFLUSH; 684 lock_buffer(bh); 685 set_buffer_dirty(bh); 686 set_buffer_uptodate(bh); 687 bh->b_end_io = ext4_end_buffer_io_sync; 688 submit_bh(REQ_OP_WRITE | write_flags, bh); 689 EXT4_SB(sb)->s_fc_bh = NULL; 690 } 691 692 /* Ext4 commit path routines */ 693 694 /* 695 * Allocate len bytes on a fast commit buffer. 696 * 697 * During the commit time this function is used to manage fast commit 698 * block space. We don't split a fast commit log onto different 699 * blocks. 
So this function makes sure that if there's not enough space 700 * on the current block, the remaining space in the current block is 701 * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case, 702 * new block is from jbd2 and CRC is updated to reflect the padding 703 * we added. 704 */ 705 static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc) 706 { 707 struct ext4_fc_tl tl; 708 struct ext4_sb_info *sbi = EXT4_SB(sb); 709 struct buffer_head *bh; 710 int bsize = sbi->s_journal->j_blocksize; 711 int ret, off = sbi->s_fc_bytes % bsize; 712 int remaining; 713 u8 *dst; 714 715 /* 716 * If 'len' is too long to fit in any block alongside a PAD tlv, then we 717 * cannot fulfill the request. 718 */ 719 if (len > bsize - EXT4_FC_TAG_BASE_LEN) 720 return NULL; 721 722 if (!sbi->s_fc_bh) { 723 ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); 724 if (ret) 725 return NULL; 726 sbi->s_fc_bh = bh; 727 } 728 dst = sbi->s_fc_bh->b_data + off; 729 730 /* 731 * Allocate the bytes in the current block if we can do so while still 732 * leaving enough space for a PAD tlv. 733 */ 734 remaining = bsize - EXT4_FC_TAG_BASE_LEN - off; 735 if (len <= remaining) { 736 sbi->s_fc_bytes += len; 737 return dst; 738 } 739 740 /* 741 * Else, terminate the current block with a PAD tlv, then allocate a new 742 * block and allocate the bytes at the start of that new block. 743 */ 744 745 tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD); 746 tl.fc_len = cpu_to_le16(remaining); 747 memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); 748 memset(dst + EXT4_FC_TAG_BASE_LEN, 0, remaining); 749 *crc = ext4_chksum(sbi, *crc, sbi->s_fc_bh->b_data, bsize); 750 751 ext4_fc_submit_bh(sb, false); 752 753 ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); 754 if (ret) 755 return NULL; 756 sbi->s_fc_bh = bh; 757 sbi->s_fc_bytes += bsize - off + len; 758 return sbi->s_fc_bh->b_data; 759 } 760 761 /* 762 * Complete a fast commit by writing tail tag. 
763 * 764 * Writing tail tag marks the end of a fast commit. In order to guarantee 765 * atomicity, after writing tail tag, even if there's space remaining 766 * in the block, next commit shouldn't use it. That's why tail tag 767 * has the length as that of the remaining space on the block. 768 */ 769 static int ext4_fc_write_tail(struct super_block *sb, u32 crc) 770 { 771 struct ext4_sb_info *sbi = EXT4_SB(sb); 772 struct ext4_fc_tl tl; 773 struct ext4_fc_tail tail; 774 int off, bsize = sbi->s_journal->j_blocksize; 775 u8 *dst; 776 777 /* 778 * ext4_fc_reserve_space takes care of allocating an extra block if 779 * there's no enough space on this block for accommodating this tail. 780 */ 781 dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(tail), &crc); 782 if (!dst) 783 return -ENOSPC; 784 785 off = sbi->s_fc_bytes % bsize; 786 787 tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL); 788 tl.fc_len = cpu_to_le16(bsize - off + sizeof(struct ext4_fc_tail)); 789 sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize); 790 791 memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); 792 dst += EXT4_FC_TAG_BASE_LEN; 793 tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid); 794 memcpy(dst, &tail.fc_tid, sizeof(tail.fc_tid)); 795 dst += sizeof(tail.fc_tid); 796 crc = ext4_chksum(sbi, crc, sbi->s_fc_bh->b_data, 797 dst - (u8 *)sbi->s_fc_bh->b_data); 798 tail.fc_crc = cpu_to_le32(crc); 799 memcpy(dst, &tail.fc_crc, sizeof(tail.fc_crc)); 800 dst += sizeof(tail.fc_crc); 801 memset(dst, 0, bsize - off); /* Don't leak uninitialized memory. */ 802 803 ext4_fc_submit_bh(sb, true); 804 805 return 0; 806 } 807 808 /* 809 * Adds tag, length, value and updates CRC. Returns true if tlv was added. 810 * Returns false if there's not enough space. 
811 */ 812 static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val, 813 u32 *crc) 814 { 815 struct ext4_fc_tl tl; 816 u8 *dst; 817 818 dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + len, crc); 819 if (!dst) 820 return false; 821 822 tl.fc_tag = cpu_to_le16(tag); 823 tl.fc_len = cpu_to_le16(len); 824 825 memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); 826 memcpy(dst + EXT4_FC_TAG_BASE_LEN, val, len); 827 828 return true; 829 } 830 831 /* Same as above, but adds dentry tlv. */ 832 static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc, 833 struct ext4_fc_dentry_update *fc_dentry) 834 { 835 struct ext4_fc_dentry_info fcd; 836 struct ext4_fc_tl tl; 837 int dlen = fc_dentry->fcd_name.len; 838 u8 *dst = ext4_fc_reserve_space(sb, 839 EXT4_FC_TAG_BASE_LEN + sizeof(fcd) + dlen, crc); 840 841 if (!dst) 842 return false; 843 844 fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent); 845 fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino); 846 tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op); 847 tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen); 848 memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); 849 dst += EXT4_FC_TAG_BASE_LEN; 850 memcpy(dst, &fcd, sizeof(fcd)); 851 dst += sizeof(fcd); 852 memcpy(dst, fc_dentry->fcd_name.name, dlen); 853 854 return true; 855 } 856 857 /* 858 * Writes inode in the fast commit space under TLV with tag @tag. 859 * Returns 0 on success, error on failure. 
860 */ 861 static int ext4_fc_write_inode(struct inode *inode, u32 *crc) 862 { 863 struct ext4_inode_info *ei = EXT4_I(inode); 864 int inode_len = EXT4_GOOD_OLD_INODE_SIZE; 865 int ret; 866 struct ext4_iloc iloc; 867 struct ext4_fc_inode fc_inode; 868 struct ext4_fc_tl tl; 869 u8 *dst; 870 871 ret = ext4_get_inode_loc(inode, &iloc); 872 if (ret) 873 return ret; 874 875 if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) 876 inode_len = EXT4_INODE_SIZE(inode->i_sb); 877 else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) 878 inode_len += ei->i_extra_isize; 879 880 fc_inode.fc_ino = cpu_to_le32(inode->i_ino); 881 tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE); 882 tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino)); 883 884 ret = -ECANCELED; 885 dst = ext4_fc_reserve_space(inode->i_sb, 886 EXT4_FC_TAG_BASE_LEN + inode_len + sizeof(fc_inode.fc_ino), crc); 887 if (!dst) 888 goto err; 889 890 memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); 891 dst += EXT4_FC_TAG_BASE_LEN; 892 memcpy(dst, &fc_inode, sizeof(fc_inode)); 893 dst += sizeof(fc_inode); 894 memcpy(dst, (u8 *)ext4_raw_inode(&iloc), inode_len); 895 ret = 0; 896 err: 897 brelse(iloc.bh); 898 return ret; 899 } 900 901 /* 902 * Writes updated data ranges for the inode in question. Updates CRC. 903 * Returns 0 on success, error otherwise. 
904 */ 905 static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) 906 { 907 ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size; 908 struct ext4_inode_info *ei = EXT4_I(inode); 909 struct ext4_map_blocks map; 910 struct ext4_fc_add_range fc_ext; 911 struct ext4_fc_del_range lrange; 912 struct ext4_extent *ex; 913 int ret; 914 915 mutex_lock(&ei->i_fc_lock); 916 if (ei->i_fc_lblk_len == 0) { 917 mutex_unlock(&ei->i_fc_lock); 918 return 0; 919 } 920 old_blk_size = ei->i_fc_lblk_start; 921 new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1; 922 ei->i_fc_lblk_len = 0; 923 mutex_unlock(&ei->i_fc_lock); 924 925 cur_lblk_off = old_blk_size; 926 ext4_debug("will try writing %d to %d for inode %ld\n", 927 cur_lblk_off, new_blk_size, inode->i_ino); 928 929 while (cur_lblk_off <= new_blk_size) { 930 map.m_lblk = cur_lblk_off; 931 map.m_len = new_blk_size - cur_lblk_off + 1; 932 ret = ext4_map_blocks(NULL, inode, &map, 0); 933 if (ret < 0) 934 return -ECANCELED; 935 936 if (map.m_len == 0) { 937 cur_lblk_off++; 938 continue; 939 } 940 941 if (ret == 0) { 942 lrange.fc_ino = cpu_to_le32(inode->i_ino); 943 lrange.fc_lblk = cpu_to_le32(map.m_lblk); 944 lrange.fc_len = cpu_to_le32(map.m_len); 945 if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE, 946 sizeof(lrange), (u8 *)&lrange, crc)) 947 return -ENOSPC; 948 } else { 949 unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ? 
950 EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN; 951 952 /* Limit the number of blocks in one extent */ 953 map.m_len = min(max, map.m_len); 954 955 fc_ext.fc_ino = cpu_to_le32(inode->i_ino); 956 ex = (struct ext4_extent *)&fc_ext.fc_ex; 957 ex->ee_block = cpu_to_le32(map.m_lblk); 958 ex->ee_len = cpu_to_le16(map.m_len); 959 ext4_ext_store_pblock(ex, map.m_pblk); 960 if (map.m_flags & EXT4_MAP_UNWRITTEN) 961 ext4_ext_mark_unwritten(ex); 962 else 963 ext4_ext_mark_initialized(ex); 964 if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE, 965 sizeof(fc_ext), (u8 *)&fc_ext, crc)) 966 return -ENOSPC; 967 } 968 969 cur_lblk_off += map.m_len; 970 } 971 972 return 0; 973 } 974 975 976 /* Submit data for all the fast commit inodes */ 977 static int ext4_fc_submit_inode_data_all(journal_t *journal) 978 { 979 struct super_block *sb = journal->j_private; 980 struct ext4_sb_info *sbi = EXT4_SB(sb); 981 struct ext4_inode_info *ei; 982 int ret = 0; 983 984 spin_lock(&sbi->s_fc_lock); 985 list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { 986 ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); 987 while (atomic_read(&ei->i_fc_updates)) { 988 DEFINE_WAIT(wait); 989 990 prepare_to_wait(&ei->i_fc_wait, &wait, 991 TASK_UNINTERRUPTIBLE); 992 if (atomic_read(&ei->i_fc_updates)) { 993 spin_unlock(&sbi->s_fc_lock); 994 schedule(); 995 spin_lock(&sbi->s_fc_lock); 996 } 997 finish_wait(&ei->i_fc_wait, &wait); 998 } 999 spin_unlock(&sbi->s_fc_lock); 1000 ret = jbd2_submit_inode_data(journal, ei->jinode); 1001 if (ret) 1002 return ret; 1003 spin_lock(&sbi->s_fc_lock); 1004 } 1005 spin_unlock(&sbi->s_fc_lock); 1006 1007 return ret; 1008 } 1009 1010 /* Wait for completion of data for all the fast commit inodes */ 1011 static int ext4_fc_wait_inode_data_all(journal_t *journal) 1012 { 1013 struct super_block *sb = journal->j_private; 1014 struct ext4_sb_info *sbi = EXT4_SB(sb); 1015 struct ext4_inode_info *pos, *n; 1016 int ret = 0; 1017 1018 spin_lock(&sbi->s_fc_lock); 
1019 list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { 1020 if (!ext4_test_inode_state(&pos->vfs_inode, 1021 EXT4_STATE_FC_COMMITTING)) 1022 continue; 1023 spin_unlock(&sbi->s_fc_lock); 1024 1025 ret = jbd2_wait_inode_data(journal, pos->jinode); 1026 if (ret) 1027 return ret; 1028 spin_lock(&sbi->s_fc_lock); 1029 } 1030 spin_unlock(&sbi->s_fc_lock); 1031 1032 return 0; 1033 } 1034 1035 /* Commit all the directory entry updates */ 1036 static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc) 1037 __acquires(&sbi->s_fc_lock) 1038 __releases(&sbi->s_fc_lock) 1039 { 1040 struct super_block *sb = journal->j_private; 1041 struct ext4_sb_info *sbi = EXT4_SB(sb); 1042 struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n; 1043 struct inode *inode; 1044 struct ext4_inode_info *ei; 1045 int ret; 1046 1047 if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) 1048 return 0; 1049 list_for_each_entry_safe(fc_dentry, fc_dentry_n, 1050 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) { 1051 if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) { 1052 spin_unlock(&sbi->s_fc_lock); 1053 if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) { 1054 ret = -ENOSPC; 1055 goto lock_and_exit; 1056 } 1057 spin_lock(&sbi->s_fc_lock); 1058 continue; 1059 } 1060 /* 1061 * With fcd_dilist we need not loop in sbi->s_fc_q to get the 1062 * corresponding inode pointer 1063 */ 1064 WARN_ON(list_empty(&fc_dentry->fcd_dilist)); 1065 ei = list_first_entry(&fc_dentry->fcd_dilist, 1066 struct ext4_inode_info, i_fc_dilist); 1067 inode = &ei->vfs_inode; 1068 WARN_ON(inode->i_ino != fc_dentry->fcd_ino); 1069 1070 spin_unlock(&sbi->s_fc_lock); 1071 1072 /* 1073 * We first write the inode and then the create dirent. This 1074 * allows the recovery code to create an unnamed inode first 1075 * and then link it to a directory entry. This allows us 1076 * to use namei.c routines almost as is and simplifies 1077 * the recovery code. 
1078 */ 1079 ret = ext4_fc_write_inode(inode, crc); 1080 if (ret) 1081 goto lock_and_exit; 1082 1083 ret = ext4_fc_write_inode_data(inode, crc); 1084 if (ret) 1085 goto lock_and_exit; 1086 1087 if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) { 1088 ret = -ENOSPC; 1089 goto lock_and_exit; 1090 } 1091 1092 spin_lock(&sbi->s_fc_lock); 1093 } 1094 return 0; 1095 lock_and_exit: 1096 spin_lock(&sbi->s_fc_lock); 1097 return ret; 1098 } 1099 1100 static int ext4_fc_perform_commit(journal_t *journal) 1101 { 1102 struct super_block *sb = journal->j_private; 1103 struct ext4_sb_info *sbi = EXT4_SB(sb); 1104 struct ext4_inode_info *iter; 1105 struct ext4_fc_head head; 1106 struct inode *inode; 1107 struct blk_plug plug; 1108 int ret = 0; 1109 u32 crc = 0; 1110 1111 ret = ext4_fc_submit_inode_data_all(journal); 1112 if (ret) 1113 return ret; 1114 1115 ret = ext4_fc_wait_inode_data_all(journal); 1116 if (ret) 1117 return ret; 1118 1119 /* 1120 * If file system device is different from journal device, issue a cache 1121 * flush before we start writing fast commit blocks. 1122 */ 1123 if (journal->j_fs_dev != journal->j_dev) 1124 blkdev_issue_flush(journal->j_fs_dev); 1125 1126 blk_start_plug(&plug); 1127 if (sbi->s_fc_bytes == 0) { 1128 /* 1129 * Add a head tag only if this is the first fast commit 1130 * in this TID. 
1131 */ 1132 head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES); 1133 head.fc_tid = cpu_to_le32( 1134 sbi->s_journal->j_running_transaction->t_tid); 1135 if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head), 1136 (u8 *)&head, &crc)) { 1137 ret = -ENOSPC; 1138 goto out; 1139 } 1140 } 1141 1142 spin_lock(&sbi->s_fc_lock); 1143 ret = ext4_fc_commit_dentry_updates(journal, &crc); 1144 if (ret) { 1145 spin_unlock(&sbi->s_fc_lock); 1146 goto out; 1147 } 1148 1149 list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { 1150 inode = &iter->vfs_inode; 1151 if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) 1152 continue; 1153 1154 spin_unlock(&sbi->s_fc_lock); 1155 ret = ext4_fc_write_inode_data(inode, &crc); 1156 if (ret) 1157 goto out; 1158 ret = ext4_fc_write_inode(inode, &crc); 1159 if (ret) 1160 goto out; 1161 spin_lock(&sbi->s_fc_lock); 1162 } 1163 spin_unlock(&sbi->s_fc_lock); 1164 1165 ret = ext4_fc_write_tail(sb, crc); 1166 1167 out: 1168 blk_finish_plug(&plug); 1169 return ret; 1170 } 1171 1172 static void ext4_fc_update_stats(struct super_block *sb, int status, 1173 u64 commit_time, int nblks, tid_t commit_tid) 1174 { 1175 struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats; 1176 1177 ext4_debug("Fast commit ended with status = %d for tid %u", 1178 status, commit_tid); 1179 if (status == EXT4_FC_STATUS_OK) { 1180 stats->fc_num_commits++; 1181 stats->fc_numblks += nblks; 1182 if (likely(stats->s_fc_avg_commit_time)) 1183 stats->s_fc_avg_commit_time = 1184 (commit_time + 1185 stats->s_fc_avg_commit_time * 3) / 4; 1186 else 1187 stats->s_fc_avg_commit_time = commit_time; 1188 } else if (status == EXT4_FC_STATUS_FAILED || 1189 status == EXT4_FC_STATUS_INELIGIBLE) { 1190 if (status == EXT4_FC_STATUS_FAILED) 1191 stats->fc_failed_commits++; 1192 stats->fc_ineligible_commits++; 1193 } else { 1194 stats->fc_skipped_commits++; 1195 } 1196 trace_ext4_fc_commit_stop(sb, nblks, status, commit_tid); 1197 } 1198 1199 /* 1200 * The main commit 
entry point. Performs a fast commit for transaction 1201 * commit_tid if needed. If it's not possible to perform a fast commit 1202 * due to various reasons, we fall back to full commit. Returns 0 1203 * on success, error otherwise. 1204 */ 1205 int ext4_fc_commit(journal_t *journal, tid_t commit_tid) 1206 { 1207 struct super_block *sb = journal->j_private; 1208 struct ext4_sb_info *sbi = EXT4_SB(sb); 1209 int nblks = 0, ret, bsize = journal->j_blocksize; 1210 int subtid = atomic_read(&sbi->s_fc_subtid); 1211 int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0; 1212 ktime_t start_time, commit_time; 1213 1214 if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) 1215 return jbd2_complete_transaction(journal, commit_tid); 1216 1217 trace_ext4_fc_commit_start(sb, commit_tid); 1218 1219 start_time = ktime_get(); 1220 1221 restart_fc: 1222 ret = jbd2_fc_begin_commit(journal, commit_tid); 1223 if (ret == -EALREADY) { 1224 /* There was an ongoing commit, check if we need to restart */ 1225 if (atomic_read(&sbi->s_fc_subtid) <= subtid && 1226 tid_gt(commit_tid, journal->j_commit_sequence)) 1227 goto restart_fc; 1228 ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0, 1229 commit_tid); 1230 return 0; 1231 } else if (ret) { 1232 /* 1233 * Commit couldn't start. Just update stats and perform a 1234 * full commit. 1235 */ 1236 ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0, 1237 commit_tid); 1238 return jbd2_complete_transaction(journal, commit_tid); 1239 } 1240 1241 /* 1242 * After establishing journal barrier via jbd2_fc_begin_commit(), check 1243 * if we are fast commit ineligible. 
1244 */ 1245 if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) { 1246 status = EXT4_FC_STATUS_INELIGIBLE; 1247 goto fallback; 1248 } 1249 1250 fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize; 1251 ret = ext4_fc_perform_commit(journal); 1252 if (ret < 0) { 1253 status = EXT4_FC_STATUS_FAILED; 1254 goto fallback; 1255 } 1256 nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before; 1257 ret = jbd2_fc_wait_bufs(journal, nblks); 1258 if (ret < 0) { 1259 status = EXT4_FC_STATUS_FAILED; 1260 goto fallback; 1261 } 1262 atomic_inc(&sbi->s_fc_subtid); 1263 ret = jbd2_fc_end_commit(journal); 1264 /* 1265 * weight the commit time higher than the average time so we 1266 * don't react too strongly to vast changes in the commit time 1267 */ 1268 commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); 1269 ext4_fc_update_stats(sb, status, commit_time, nblks, commit_tid); 1270 return ret; 1271 1272 fallback: 1273 ret = jbd2_fc_end_commit_fallback(journal); 1274 ext4_fc_update_stats(sb, status, 0, 0, commit_tid); 1275 return ret; 1276 } 1277 1278 /* 1279 * Fast commit cleanup routine. This is called after every fast commit and 1280 * full commit. full is true if we are called after a full commit. 
1281 */ 1282 static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) 1283 { 1284 struct super_block *sb = journal->j_private; 1285 struct ext4_sb_info *sbi = EXT4_SB(sb); 1286 struct ext4_inode_info *iter, *iter_n; 1287 struct ext4_fc_dentry_update *fc_dentry; 1288 1289 if (full && sbi->s_fc_bh) 1290 sbi->s_fc_bh = NULL; 1291 1292 trace_ext4_fc_cleanup(journal, full, tid); 1293 jbd2_fc_release_bufs(journal); 1294 1295 spin_lock(&sbi->s_fc_lock); 1296 list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN], 1297 i_fc_list) { 1298 list_del_init(&iter->i_fc_list); 1299 ext4_clear_inode_state(&iter->vfs_inode, 1300 EXT4_STATE_FC_COMMITTING); 1301 if (tid_geq(tid, iter->i_sync_tid)) { 1302 ext4_fc_reset_inode(&iter->vfs_inode); 1303 } else if (full) { 1304 /* 1305 * We are called after a full commit, inode has been 1306 * modified while the commit was running. Re-enqueue 1307 * the inode into STAGING, which will then be splice 1308 * back into MAIN. This cannot happen during 1309 * fastcommit because the journal is locked all the 1310 * time in that case (and tid doesn't increase so 1311 * tid check above isn't reliable). 
1312 */ 1313 list_add_tail(&EXT4_I(&iter->vfs_inode)->i_fc_list, 1314 &sbi->s_fc_q[FC_Q_STAGING]); 1315 } 1316 /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */ 1317 smp_mb(); 1318 #if (BITS_PER_LONG < 64) 1319 wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING); 1320 #else 1321 wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING); 1322 #endif 1323 } 1324 1325 while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) { 1326 fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN], 1327 struct ext4_fc_dentry_update, 1328 fcd_list); 1329 list_del_init(&fc_dentry->fcd_list); 1330 list_del_init(&fc_dentry->fcd_dilist); 1331 spin_unlock(&sbi->s_fc_lock); 1332 1333 if (fc_dentry->fcd_name.name && 1334 fc_dentry->fcd_name.len > DNAME_INLINE_LEN) 1335 kfree(fc_dentry->fcd_name.name); 1336 kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); 1337 spin_lock(&sbi->s_fc_lock); 1338 } 1339 1340 list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING], 1341 &sbi->s_fc_dentry_q[FC_Q_MAIN]); 1342 list_splice_init(&sbi->s_fc_q[FC_Q_STAGING], 1343 &sbi->s_fc_q[FC_Q_MAIN]); 1344 1345 if (tid_geq(tid, sbi->s_fc_ineligible_tid)) { 1346 sbi->s_fc_ineligible_tid = 0; 1347 ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); 1348 } 1349 1350 if (full) 1351 sbi->s_fc_bytes = 0; 1352 spin_unlock(&sbi->s_fc_lock); 1353 trace_ext4_fc_stats(sb); 1354 } 1355 1356 /* Ext4 Replay Path Routines */ 1357 1358 /* Helper struct for dentry replay routines */ 1359 struct dentry_info_args { 1360 int parent_ino, dname_len, ino, inode_len; 1361 char *dname; 1362 }; 1363 1364 /* Same as struct ext4_fc_tl, but uses native endianness fields */ 1365 struct ext4_fc_tl_mem { 1366 u16 fc_tag; 1367 u16 fc_len; 1368 }; 1369 1370 static inline void tl_to_darg(struct dentry_info_args *darg, 1371 struct ext4_fc_tl_mem *tl, u8 *val) 1372 { 1373 struct ext4_fc_dentry_info fcd; 1374 1375 memcpy(&fcd, val, sizeof(fcd)); 1376 1377 darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino); 1378 darg->ino = le32_to_cpu(fcd.fc_ino); 
1379 darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname); 1380 darg->dname_len = tl->fc_len - sizeof(struct ext4_fc_dentry_info); 1381 } 1382 1383 static inline void ext4_fc_get_tl(struct ext4_fc_tl_mem *tl, u8 *val) 1384 { 1385 struct ext4_fc_tl tl_disk; 1386 1387 memcpy(&tl_disk, val, EXT4_FC_TAG_BASE_LEN); 1388 tl->fc_len = le16_to_cpu(tl_disk.fc_len); 1389 tl->fc_tag = le16_to_cpu(tl_disk.fc_tag); 1390 } 1391 1392 /* Unlink replay function */ 1393 static int ext4_fc_replay_unlink(struct super_block *sb, 1394 struct ext4_fc_tl_mem *tl, u8 *val) 1395 { 1396 struct inode *inode, *old_parent; 1397 struct qstr entry; 1398 struct dentry_info_args darg; 1399 int ret = 0; 1400 1401 tl_to_darg(&darg, tl, val); 1402 1403 trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino, 1404 darg.parent_ino, darg.dname_len); 1405 1406 entry.name = darg.dname; 1407 entry.len = darg.dname_len; 1408 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); 1409 1410 if (IS_ERR(inode)) { 1411 ext4_debug("Inode %d not found", darg.ino); 1412 return 0; 1413 } 1414 1415 old_parent = ext4_iget(sb, darg.parent_ino, 1416 EXT4_IGET_NORMAL); 1417 if (IS_ERR(old_parent)) { 1418 ext4_debug("Dir with inode %d not found", darg.parent_ino); 1419 iput(inode); 1420 return 0; 1421 } 1422 1423 ret = __ext4_unlink(old_parent, &entry, inode, NULL); 1424 /* -ENOENT ok coz it might not exist anymore. 
*/ 1425 if (ret == -ENOENT) 1426 ret = 0; 1427 iput(old_parent); 1428 iput(inode); 1429 return ret; 1430 } 1431 1432 static int ext4_fc_replay_link_internal(struct super_block *sb, 1433 struct dentry_info_args *darg, 1434 struct inode *inode) 1435 { 1436 struct inode *dir = NULL; 1437 struct dentry *dentry_dir = NULL, *dentry_inode = NULL; 1438 struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len); 1439 int ret = 0; 1440 1441 dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL); 1442 if (IS_ERR(dir)) { 1443 ext4_debug("Dir with inode %d not found.", darg->parent_ino); 1444 dir = NULL; 1445 goto out; 1446 } 1447 1448 dentry_dir = d_obtain_alias(dir); 1449 if (IS_ERR(dentry_dir)) { 1450 ext4_debug("Failed to obtain dentry"); 1451 dentry_dir = NULL; 1452 goto out; 1453 } 1454 1455 dentry_inode = d_alloc(dentry_dir, &qstr_dname); 1456 if (!dentry_inode) { 1457 ext4_debug("Inode dentry not created."); 1458 ret = -ENOMEM; 1459 goto out; 1460 } 1461 1462 ret = __ext4_link(dir, inode, dentry_inode); 1463 /* 1464 * It's possible that link already existed since data blocks 1465 * for the dir in question got persisted before we crashed OR 1466 * we replayed this tag and crashed before the entire replay 1467 * could complete. 
1468 */ 1469 if (ret && ret != -EEXIST) { 1470 ext4_debug("Failed to link\n"); 1471 goto out; 1472 } 1473 1474 ret = 0; 1475 out: 1476 if (dentry_dir) { 1477 d_drop(dentry_dir); 1478 dput(dentry_dir); 1479 } else if (dir) { 1480 iput(dir); 1481 } 1482 if (dentry_inode) { 1483 d_drop(dentry_inode); 1484 dput(dentry_inode); 1485 } 1486 1487 return ret; 1488 } 1489 1490 /* Link replay function */ 1491 static int ext4_fc_replay_link(struct super_block *sb, 1492 struct ext4_fc_tl_mem *tl, u8 *val) 1493 { 1494 struct inode *inode; 1495 struct dentry_info_args darg; 1496 int ret = 0; 1497 1498 tl_to_darg(&darg, tl, val); 1499 trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino, 1500 darg.parent_ino, darg.dname_len); 1501 1502 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); 1503 if (IS_ERR(inode)) { 1504 ext4_debug("Inode not found."); 1505 return 0; 1506 } 1507 1508 ret = ext4_fc_replay_link_internal(sb, &darg, inode); 1509 iput(inode); 1510 return ret; 1511 } 1512 1513 /* 1514 * Record all the modified inodes during replay. We use this later to setup 1515 * block bitmaps correctly. 
1516 */ 1517 static int ext4_fc_record_modified_inode(struct super_block *sb, int ino) 1518 { 1519 struct ext4_fc_replay_state *state; 1520 int i; 1521 1522 state = &EXT4_SB(sb)->s_fc_replay_state; 1523 for (i = 0; i < state->fc_modified_inodes_used; i++) 1524 if (state->fc_modified_inodes[i] == ino) 1525 return 0; 1526 if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) { 1527 int *fc_modified_inodes; 1528 1529 fc_modified_inodes = krealloc(state->fc_modified_inodes, 1530 sizeof(int) * (state->fc_modified_inodes_size + 1531 EXT4_FC_REPLAY_REALLOC_INCREMENT), 1532 GFP_KERNEL); 1533 if (!fc_modified_inodes) 1534 return -ENOMEM; 1535 state->fc_modified_inodes = fc_modified_inodes; 1536 state->fc_modified_inodes_size += 1537 EXT4_FC_REPLAY_REALLOC_INCREMENT; 1538 } 1539 state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino; 1540 return 0; 1541 } 1542 1543 /* 1544 * Inode replay function 1545 */ 1546 static int ext4_fc_replay_inode(struct super_block *sb, 1547 struct ext4_fc_tl_mem *tl, u8 *val) 1548 { 1549 struct ext4_fc_inode fc_inode; 1550 struct ext4_inode *raw_inode; 1551 struct ext4_inode *raw_fc_inode; 1552 struct inode *inode = NULL; 1553 struct ext4_iloc iloc; 1554 int inode_len, ino, ret, tag = tl->fc_tag; 1555 struct ext4_extent_header *eh; 1556 size_t off_gen = offsetof(struct ext4_inode, i_generation); 1557 1558 memcpy(&fc_inode, val, sizeof(fc_inode)); 1559 1560 ino = le32_to_cpu(fc_inode.fc_ino); 1561 trace_ext4_fc_replay(sb, tag, ino, 0, 0); 1562 1563 inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); 1564 if (!IS_ERR(inode)) { 1565 ext4_ext_clear_bb(inode); 1566 iput(inode); 1567 } 1568 inode = NULL; 1569 1570 ret = ext4_fc_record_modified_inode(sb, ino); 1571 if (ret) 1572 goto out; 1573 1574 raw_fc_inode = (struct ext4_inode *) 1575 (val + offsetof(struct ext4_fc_inode, fc_raw_inode)); 1576 ret = ext4_get_fc_inode_loc(sb, ino, &iloc); 1577 if (ret) 1578 goto out; 1579 1580 inode_len = tl->fc_len - sizeof(struct 
ext4_fc_inode); 1581 raw_inode = ext4_raw_inode(&iloc); 1582 1583 memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block)); 1584 memcpy((u8 *)raw_inode + off_gen, (u8 *)raw_fc_inode + off_gen, 1585 inode_len - off_gen); 1586 if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) { 1587 eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]); 1588 if (eh->eh_magic != EXT4_EXT_MAGIC) { 1589 memset(eh, 0, sizeof(*eh)); 1590 eh->eh_magic = EXT4_EXT_MAGIC; 1591 eh->eh_max = cpu_to_le16( 1592 (sizeof(raw_inode->i_block) - 1593 sizeof(struct ext4_extent_header)) 1594 / sizeof(struct ext4_extent)); 1595 } 1596 } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) { 1597 memcpy(raw_inode->i_block, raw_fc_inode->i_block, 1598 sizeof(raw_inode->i_block)); 1599 } 1600 1601 /* Immediately update the inode on disk. */ 1602 ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh); 1603 if (ret) 1604 goto out; 1605 ret = sync_dirty_buffer(iloc.bh); 1606 if (ret) 1607 goto out; 1608 ret = ext4_mark_inode_used(sb, ino); 1609 if (ret) 1610 goto out; 1611 1612 /* Given that we just wrote the inode on disk, this SHOULD succeed. */ 1613 inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); 1614 if (IS_ERR(inode)) { 1615 ext4_debug("Inode not found."); 1616 return -EFSCORRUPTED; 1617 } 1618 1619 /* 1620 * Our allocator could have made different decisions than before 1621 * crashing. This should be fixed but until then, we calculate 1622 * the number of blocks the inode. 
1623 */ 1624 if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) 1625 ext4_ext_replay_set_iblocks(inode); 1626 1627 inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation); 1628 ext4_reset_inode_seed(inode); 1629 1630 ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode)); 1631 ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh); 1632 sync_dirty_buffer(iloc.bh); 1633 brelse(iloc.bh); 1634 out: 1635 iput(inode); 1636 if (!ret) 1637 blkdev_issue_flush(sb->s_bdev); 1638 1639 return 0; 1640 } 1641 1642 /* 1643 * Dentry create replay function. 1644 * 1645 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the 1646 * inode for which we are trying to create a dentry here, should already have 1647 * been replayed before we start here. 1648 */ 1649 static int ext4_fc_replay_create(struct super_block *sb, 1650 struct ext4_fc_tl_mem *tl, u8 *val) 1651 { 1652 int ret = 0; 1653 struct inode *inode = NULL; 1654 struct inode *dir = NULL; 1655 struct dentry_info_args darg; 1656 1657 tl_to_darg(&darg, tl, val); 1658 1659 trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino, 1660 darg.parent_ino, darg.dname_len); 1661 1662 /* This takes care of update group descriptor and other metadata */ 1663 ret = ext4_mark_inode_used(sb, darg.ino); 1664 if (ret) 1665 goto out; 1666 1667 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); 1668 if (IS_ERR(inode)) { 1669 ext4_debug("inode %d not found.", darg.ino); 1670 inode = NULL; 1671 ret = -EINVAL; 1672 goto out; 1673 } 1674 1675 if (S_ISDIR(inode->i_mode)) { 1676 /* 1677 * If we are creating a directory, we need to make sure that the 1678 * dot and dot dot dirents are setup properly. 
1679 */ 1680 dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL); 1681 if (IS_ERR(dir)) { 1682 ext4_debug("Dir %d not found.", darg.ino); 1683 goto out; 1684 } 1685 ret = ext4_init_new_dir(NULL, dir, inode); 1686 iput(dir); 1687 if (ret) { 1688 ret = 0; 1689 goto out; 1690 } 1691 } 1692 ret = ext4_fc_replay_link_internal(sb, &darg, inode); 1693 if (ret) 1694 goto out; 1695 set_nlink(inode, 1); 1696 ext4_mark_inode_dirty(NULL, inode); 1697 out: 1698 iput(inode); 1699 return ret; 1700 } 1701 1702 /* 1703 * Record physical disk regions which are in use as per fast commit area, 1704 * and used by inodes during replay phase. Our simple replay phase 1705 * allocator excludes these regions from allocation. 1706 */ 1707 int ext4_fc_record_regions(struct super_block *sb, int ino, 1708 ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay) 1709 { 1710 struct ext4_fc_replay_state *state; 1711 struct ext4_fc_alloc_region *region; 1712 1713 state = &EXT4_SB(sb)->s_fc_replay_state; 1714 /* 1715 * during replay phase, the fc_regions_valid may not same as 1716 * fc_regions_used, update it when do new additions. 
1717 */ 1718 if (replay && state->fc_regions_used != state->fc_regions_valid) 1719 state->fc_regions_used = state->fc_regions_valid; 1720 if (state->fc_regions_used == state->fc_regions_size) { 1721 struct ext4_fc_alloc_region *fc_regions; 1722 1723 fc_regions = krealloc(state->fc_regions, 1724 sizeof(struct ext4_fc_alloc_region) * 1725 (state->fc_regions_size + 1726 EXT4_FC_REPLAY_REALLOC_INCREMENT), 1727 GFP_KERNEL); 1728 if (!fc_regions) 1729 return -ENOMEM; 1730 state->fc_regions_size += 1731 EXT4_FC_REPLAY_REALLOC_INCREMENT; 1732 state->fc_regions = fc_regions; 1733 } 1734 region = &state->fc_regions[state->fc_regions_used++]; 1735 region->ino = ino; 1736 region->lblk = lblk; 1737 region->pblk = pblk; 1738 region->len = len; 1739 1740 if (replay) 1741 state->fc_regions_valid++; 1742 1743 return 0; 1744 } 1745 1746 /* Replay add range tag */ 1747 static int ext4_fc_replay_add_range(struct super_block *sb, 1748 struct ext4_fc_tl_mem *tl, u8 *val) 1749 { 1750 struct ext4_fc_add_range fc_add_ex; 1751 struct ext4_extent newex, *ex; 1752 struct inode *inode; 1753 ext4_lblk_t start, cur; 1754 int remaining, len; 1755 ext4_fsblk_t start_pblk; 1756 struct ext4_map_blocks map; 1757 struct ext4_ext_path *path = NULL; 1758 int ret; 1759 1760 memcpy(&fc_add_ex, val, sizeof(fc_add_ex)); 1761 ex = (struct ext4_extent *)&fc_add_ex.fc_ex; 1762 1763 trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE, 1764 le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block), 1765 ext4_ext_get_actual_len(ex)); 1766 1767 inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL); 1768 if (IS_ERR(inode)) { 1769 ext4_debug("Inode not found."); 1770 return 0; 1771 } 1772 1773 ret = ext4_fc_record_modified_inode(sb, inode->i_ino); 1774 if (ret) 1775 goto out; 1776 1777 start = le32_to_cpu(ex->ee_block); 1778 start_pblk = ext4_ext_pblock(ex); 1779 len = ext4_ext_get_actual_len(ex); 1780 1781 cur = start; 1782 remaining = len; 1783 ext4_debug("ADD_RANGE, lblk %d, pblk %lld, len %d, 
unwritten %d, inode %ld\n", 1784 start, start_pblk, len, ext4_ext_is_unwritten(ex), 1785 inode->i_ino); 1786 1787 while (remaining > 0) { 1788 map.m_lblk = cur; 1789 map.m_len = remaining; 1790 map.m_pblk = 0; 1791 ret = ext4_map_blocks(NULL, inode, &map, 0); 1792 1793 if (ret < 0) 1794 goto out; 1795 1796 if (ret == 0) { 1797 /* Range is not mapped */ 1798 path = ext4_find_extent(inode, cur, NULL, 0); 1799 if (IS_ERR(path)) 1800 goto out; 1801 memset(&newex, 0, sizeof(newex)); 1802 newex.ee_block = cpu_to_le32(cur); 1803 ext4_ext_store_pblock( 1804 &newex, start_pblk + cur - start); 1805 newex.ee_len = cpu_to_le16(map.m_len); 1806 if (ext4_ext_is_unwritten(ex)) 1807 ext4_ext_mark_unwritten(&newex); 1808 down_write(&EXT4_I(inode)->i_data_sem); 1809 ret = ext4_ext_insert_extent( 1810 NULL, inode, &path, &newex, 0); 1811 up_write((&EXT4_I(inode)->i_data_sem)); 1812 ext4_free_ext_path(path); 1813 if (ret) 1814 goto out; 1815 goto next; 1816 } 1817 1818 if (start_pblk + cur - start != map.m_pblk) { 1819 /* 1820 * Logical to physical mapping changed. This can happen 1821 * if this range was removed and then reallocated to 1822 * map to new physical blocks during a fast commit. 1823 */ 1824 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, 1825 ext4_ext_is_unwritten(ex), 1826 start_pblk + cur - start); 1827 if (ret) 1828 goto out; 1829 /* 1830 * Mark the old blocks as free since they aren't used 1831 * anymore. We maintain an array of all the modified 1832 * inodes. In case these blocks are still used at either 1833 * a different logical range in the same inode or in 1834 * some different inode, we will mark them as allocated 1835 * at the end of the FC replay using our array of 1836 * modified inodes. 
1837 */ 1838 ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false); 1839 goto next; 1840 } 1841 1842 /* Range is mapped and needs a state change */ 1843 ext4_debug("Converting from %ld to %d %lld", 1844 map.m_flags & EXT4_MAP_UNWRITTEN, 1845 ext4_ext_is_unwritten(ex), map.m_pblk); 1846 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, 1847 ext4_ext_is_unwritten(ex), map.m_pblk); 1848 if (ret) 1849 goto out; 1850 /* 1851 * We may have split the extent tree while toggling the state. 1852 * Try to shrink the extent tree now. 1853 */ 1854 ext4_ext_replay_shrink_inode(inode, start + len); 1855 next: 1856 cur += map.m_len; 1857 remaining -= map.m_len; 1858 } 1859 ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >> 1860 sb->s_blocksize_bits); 1861 out: 1862 iput(inode); 1863 return 0; 1864 } 1865 1866 /* Replay DEL_RANGE tag */ 1867 static int 1868 ext4_fc_replay_del_range(struct super_block *sb, 1869 struct ext4_fc_tl_mem *tl, u8 *val) 1870 { 1871 struct inode *inode; 1872 struct ext4_fc_del_range lrange; 1873 struct ext4_map_blocks map; 1874 ext4_lblk_t cur, remaining; 1875 int ret; 1876 1877 memcpy(&lrange, val, sizeof(lrange)); 1878 cur = le32_to_cpu(lrange.fc_lblk); 1879 remaining = le32_to_cpu(lrange.fc_len); 1880 1881 trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE, 1882 le32_to_cpu(lrange.fc_ino), cur, remaining); 1883 1884 inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL); 1885 if (IS_ERR(inode)) { 1886 ext4_debug("Inode %d not found", le32_to_cpu(lrange.fc_ino)); 1887 return 0; 1888 } 1889 1890 ret = ext4_fc_record_modified_inode(sb, inode->i_ino); 1891 if (ret) 1892 goto out; 1893 1894 ext4_debug("DEL_RANGE, inode %ld, lblk %d, len %d\n", 1895 inode->i_ino, le32_to_cpu(lrange.fc_lblk), 1896 le32_to_cpu(lrange.fc_len)); 1897 while (remaining > 0) { 1898 map.m_lblk = cur; 1899 map.m_len = remaining; 1900 1901 ret = ext4_map_blocks(NULL, inode, &map, 0); 1902 if (ret < 0) 1903 goto out; 1904 if (ret > 0) { 1905 remaining -= ret; 
1906 cur += ret; 1907 ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false); 1908 } else { 1909 remaining -= map.m_len; 1910 cur += map.m_len; 1911 } 1912 } 1913 1914 down_write(&EXT4_I(inode)->i_data_sem); 1915 ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk), 1916 le32_to_cpu(lrange.fc_lblk) + 1917 le32_to_cpu(lrange.fc_len) - 1); 1918 up_write(&EXT4_I(inode)->i_data_sem); 1919 if (ret) 1920 goto out; 1921 ext4_ext_replay_shrink_inode(inode, 1922 i_size_read(inode) >> sb->s_blocksize_bits); 1923 ext4_mark_inode_dirty(NULL, inode); 1924 out: 1925 iput(inode); 1926 return 0; 1927 } 1928 1929 static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb) 1930 { 1931 struct ext4_fc_replay_state *state; 1932 struct inode *inode; 1933 struct ext4_ext_path *path = NULL; 1934 struct ext4_map_blocks map; 1935 int i, ret, j; 1936 ext4_lblk_t cur, end; 1937 1938 state = &EXT4_SB(sb)->s_fc_replay_state; 1939 for (i = 0; i < state->fc_modified_inodes_used; i++) { 1940 inode = ext4_iget(sb, state->fc_modified_inodes[i], 1941 EXT4_IGET_NORMAL); 1942 if (IS_ERR(inode)) { 1943 ext4_debug("Inode %d not found.", 1944 state->fc_modified_inodes[i]); 1945 continue; 1946 } 1947 cur = 0; 1948 end = EXT_MAX_BLOCKS; 1949 if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) { 1950 iput(inode); 1951 continue; 1952 } 1953 while (cur < end) { 1954 map.m_lblk = cur; 1955 map.m_len = end - cur; 1956 1957 ret = ext4_map_blocks(NULL, inode, &map, 0); 1958 if (ret < 0) 1959 break; 1960 1961 if (ret > 0) { 1962 path = ext4_find_extent(inode, map.m_lblk, NULL, 0); 1963 if (!IS_ERR(path)) { 1964 for (j = 0; j < path->p_depth; j++) 1965 ext4_mb_mark_bb(inode->i_sb, 1966 path[j].p_block, 1, true); 1967 ext4_free_ext_path(path); 1968 } 1969 cur += ret; 1970 ext4_mb_mark_bb(inode->i_sb, map.m_pblk, 1971 map.m_len, true); 1972 } else { 1973 cur = cur + (map.m_len ? 
map.m_len : 1); 1974 } 1975 } 1976 iput(inode); 1977 } 1978 } 1979 1980 /* 1981 * Check if block is in excluded regions for block allocation. The simple 1982 * allocator that runs during replay phase is calls this function to see 1983 * if it is okay to use a block. 1984 */ 1985 bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk) 1986 { 1987 int i; 1988 struct ext4_fc_replay_state *state; 1989 1990 state = &EXT4_SB(sb)->s_fc_replay_state; 1991 for (i = 0; i < state->fc_regions_valid; i++) { 1992 if (state->fc_regions[i].ino == 0 || 1993 state->fc_regions[i].len == 0) 1994 continue; 1995 if (in_range(blk, state->fc_regions[i].pblk, 1996 state->fc_regions[i].len)) 1997 return true; 1998 } 1999 return false; 2000 } 2001 2002 /* Cleanup function called after replay */ 2003 void ext4_fc_replay_cleanup(struct super_block *sb) 2004 { 2005 struct ext4_sb_info *sbi = EXT4_SB(sb); 2006 2007 sbi->s_mount_state &= ~EXT4_FC_REPLAY; 2008 kfree(sbi->s_fc_replay_state.fc_regions); 2009 kfree(sbi->s_fc_replay_state.fc_modified_inodes); 2010 } 2011 2012 static bool ext4_fc_value_len_isvalid(struct ext4_sb_info *sbi, 2013 int tag, int len) 2014 { 2015 switch (tag) { 2016 case EXT4_FC_TAG_ADD_RANGE: 2017 return len == sizeof(struct ext4_fc_add_range); 2018 case EXT4_FC_TAG_DEL_RANGE: 2019 return len == sizeof(struct ext4_fc_del_range); 2020 case EXT4_FC_TAG_CREAT: 2021 case EXT4_FC_TAG_LINK: 2022 case EXT4_FC_TAG_UNLINK: 2023 len -= sizeof(struct ext4_fc_dentry_info); 2024 return len >= 1 && len <= EXT4_NAME_LEN; 2025 case EXT4_FC_TAG_INODE: 2026 len -= sizeof(struct ext4_fc_inode); 2027 return len >= EXT4_GOOD_OLD_INODE_SIZE && 2028 len <= sbi->s_inode_size; 2029 case EXT4_FC_TAG_PAD: 2030 return true; /* padding can have any length */ 2031 case EXT4_FC_TAG_TAIL: 2032 return len >= sizeof(struct ext4_fc_tail); 2033 case EXT4_FC_TAG_HEAD: 2034 return len == sizeof(struct ext4_fc_head); 2035 } 2036 return false; 2037 } 2038 2039 /* 2040 * Recovery Scan phase 
handler 2041 * 2042 * This function is called during the scan phase and is responsible 2043 * for doing following things: 2044 * - Make sure the fast commit area has valid tags for replay 2045 * - Count number of tags that need to be replayed by the replay handler 2046 * - Verify CRC 2047 * - Create a list of excluded blocks for allocation during replay phase 2048 * 2049 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is 2050 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP 2051 * to indicate that scan has finished and JBD2 can now start replay phase. 2052 * It returns a negative error to indicate that there was an error. At the end 2053 * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set 2054 * to indicate the number of tags that need to replayed during the replay phase. 2055 */ 2056 static int ext4_fc_replay_scan(journal_t *journal, 2057 struct buffer_head *bh, int off, 2058 tid_t expected_tid) 2059 { 2060 struct super_block *sb = journal->j_private; 2061 struct ext4_sb_info *sbi = EXT4_SB(sb); 2062 struct ext4_fc_replay_state *state; 2063 int ret = JBD2_FC_REPLAY_CONTINUE; 2064 struct ext4_fc_add_range ext; 2065 struct ext4_fc_tl_mem tl; 2066 struct ext4_fc_tail tail; 2067 __u8 *start, *end, *cur, *val; 2068 struct ext4_fc_head head; 2069 struct ext4_extent *ex; 2070 2071 state = &sbi->s_fc_replay_state; 2072 2073 start = (u8 *)bh->b_data; 2074 end = start + journal->j_blocksize; 2075 2076 if (state->fc_replay_expected_off == 0) { 2077 state->fc_cur_tag = 0; 2078 state->fc_replay_num_tags = 0; 2079 state->fc_crc = 0; 2080 state->fc_regions = NULL; 2081 state->fc_regions_valid = state->fc_regions_used = 2082 state->fc_regions_size = 0; 2083 /* Check if we can stop early */ 2084 if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag) 2085 != EXT4_FC_TAG_HEAD) 2086 return 0; 2087 } 2088 2089 if (off != state->fc_replay_expected_off) { 2090 ret = -EFSCORRUPTED; 2091 goto out_err; 2092 } 2093 
2094 state->fc_replay_expected_off++; 2095 for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN; 2096 cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) { 2097 ext4_fc_get_tl(&tl, cur); 2098 val = cur + EXT4_FC_TAG_BASE_LEN; 2099 if (tl.fc_len > end - val || 2100 !ext4_fc_value_len_isvalid(sbi, tl.fc_tag, tl.fc_len)) { 2101 ret = state->fc_replay_num_tags ? 2102 JBD2_FC_REPLAY_STOP : -ECANCELED; 2103 goto out_err; 2104 } 2105 ext4_debug("Scan phase, tag:%s, blk %lld\n", 2106 tag2str(tl.fc_tag), bh->b_blocknr); 2107 switch (tl.fc_tag) { 2108 case EXT4_FC_TAG_ADD_RANGE: 2109 memcpy(&ext, val, sizeof(ext)); 2110 ex = (struct ext4_extent *)&ext.fc_ex; 2111 ret = ext4_fc_record_regions(sb, 2112 le32_to_cpu(ext.fc_ino), 2113 le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex), 2114 ext4_ext_get_actual_len(ex), 0); 2115 if (ret < 0) 2116 break; 2117 ret = JBD2_FC_REPLAY_CONTINUE; 2118 fallthrough; 2119 case EXT4_FC_TAG_DEL_RANGE: 2120 case EXT4_FC_TAG_LINK: 2121 case EXT4_FC_TAG_UNLINK: 2122 case EXT4_FC_TAG_CREAT: 2123 case EXT4_FC_TAG_INODE: 2124 case EXT4_FC_TAG_PAD: 2125 state->fc_cur_tag++; 2126 state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, 2127 EXT4_FC_TAG_BASE_LEN + tl.fc_len); 2128 break; 2129 case EXT4_FC_TAG_TAIL: 2130 state->fc_cur_tag++; 2131 memcpy(&tail, val, sizeof(tail)); 2132 state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, 2133 EXT4_FC_TAG_BASE_LEN + 2134 offsetof(struct ext4_fc_tail, 2135 fc_crc)); 2136 if (le32_to_cpu(tail.fc_tid) == expected_tid && 2137 le32_to_cpu(tail.fc_crc) == state->fc_crc) { 2138 state->fc_replay_num_tags = state->fc_cur_tag; 2139 state->fc_regions_valid = 2140 state->fc_regions_used; 2141 } else { 2142 ret = state->fc_replay_num_tags ? 
2143 JBD2_FC_REPLAY_STOP : -EFSBADCRC; 2144 } 2145 state->fc_crc = 0; 2146 break; 2147 case EXT4_FC_TAG_HEAD: 2148 memcpy(&head, val, sizeof(head)); 2149 if (le32_to_cpu(head.fc_features) & 2150 ~EXT4_FC_SUPPORTED_FEATURES) { 2151 ret = -EOPNOTSUPP; 2152 break; 2153 } 2154 if (le32_to_cpu(head.fc_tid) != expected_tid) { 2155 ret = JBD2_FC_REPLAY_STOP; 2156 break; 2157 } 2158 state->fc_cur_tag++; 2159 state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, 2160 EXT4_FC_TAG_BASE_LEN + tl.fc_len); 2161 break; 2162 default: 2163 ret = state->fc_replay_num_tags ? 2164 JBD2_FC_REPLAY_STOP : -ECANCELED; 2165 } 2166 if (ret < 0 || ret == JBD2_FC_REPLAY_STOP) 2167 break; 2168 } 2169 2170 out_err: 2171 trace_ext4_fc_replay_scan(sb, ret, off); 2172 return ret; 2173 } 2174 2175 /* 2176 * Main recovery path entry point. 2177 * The meaning of return codes is similar as above. 2178 */ 2179 static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh, 2180 enum passtype pass, int off, tid_t expected_tid) 2181 { 2182 struct super_block *sb = journal->j_private; 2183 struct ext4_sb_info *sbi = EXT4_SB(sb); 2184 struct ext4_fc_tl_mem tl; 2185 __u8 *start, *end, *cur, *val; 2186 int ret = JBD2_FC_REPLAY_CONTINUE; 2187 struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state; 2188 struct ext4_fc_tail tail; 2189 2190 if (pass == PASS_SCAN) { 2191 state->fc_current_pass = PASS_SCAN; 2192 return ext4_fc_replay_scan(journal, bh, off, expected_tid); 2193 } 2194 2195 if (state->fc_current_pass != pass) { 2196 state->fc_current_pass = pass; 2197 sbi->s_mount_state |= EXT4_FC_REPLAY; 2198 } 2199 if (!sbi->s_fc_replay_state.fc_replay_num_tags) { 2200 ext4_debug("Replay stops\n"); 2201 ext4_fc_set_bitmaps_and_counters(sb); 2202 return 0; 2203 } 2204 2205 #ifdef CONFIG_EXT4_DEBUG 2206 if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) { 2207 pr_warn("Dropping fc block %d because max_replay set\n", off); 2208 return JBD2_FC_REPLAY_STOP; 2209 } 2210 #endif 2211 2212 start 
= (u8 *)bh->b_data; 2213 end = start + journal->j_blocksize; 2214 2215 for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN; 2216 cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) { 2217 ext4_fc_get_tl(&tl, cur); 2218 val = cur + EXT4_FC_TAG_BASE_LEN; 2219 2220 if (state->fc_replay_num_tags == 0) { 2221 ret = JBD2_FC_REPLAY_STOP; 2222 ext4_fc_set_bitmaps_and_counters(sb); 2223 break; 2224 } 2225 2226 ext4_debug("Replay phase, tag:%s\n", tag2str(tl.fc_tag)); 2227 state->fc_replay_num_tags--; 2228 switch (tl.fc_tag) { 2229 case EXT4_FC_TAG_LINK: 2230 ret = ext4_fc_replay_link(sb, &tl, val); 2231 break; 2232 case EXT4_FC_TAG_UNLINK: 2233 ret = ext4_fc_replay_unlink(sb, &tl, val); 2234 break; 2235 case EXT4_FC_TAG_ADD_RANGE: 2236 ret = ext4_fc_replay_add_range(sb, &tl, val); 2237 break; 2238 case EXT4_FC_TAG_CREAT: 2239 ret = ext4_fc_replay_create(sb, &tl, val); 2240 break; 2241 case EXT4_FC_TAG_DEL_RANGE: 2242 ret = ext4_fc_replay_del_range(sb, &tl, val); 2243 break; 2244 case EXT4_FC_TAG_INODE: 2245 ret = ext4_fc_replay_inode(sb, &tl, val); 2246 break; 2247 case EXT4_FC_TAG_PAD: 2248 trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0, 2249 tl.fc_len, 0); 2250 break; 2251 case EXT4_FC_TAG_TAIL: 2252 trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 2253 0, tl.fc_len, 0); 2254 memcpy(&tail, val, sizeof(tail)); 2255 WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid); 2256 break; 2257 case EXT4_FC_TAG_HEAD: 2258 break; 2259 default: 2260 trace_ext4_fc_replay(sb, tl.fc_tag, 0, tl.fc_len, 0); 2261 ret = -ECANCELED; 2262 break; 2263 } 2264 if (ret < 0) 2265 break; 2266 ret = JBD2_FC_REPLAY_CONTINUE; 2267 } 2268 return ret; 2269 } 2270 2271 void ext4_fc_init(struct super_block *sb, journal_t *journal) 2272 { 2273 /* 2274 * We set replay callback even if fast commit disabled because we may 2275 * could still have fast commit blocks that need to be replayed even if 2276 * fast commit has now been turned off. 
2277 */ 2278 journal->j_fc_replay_callback = ext4_fc_replay; 2279 if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) 2280 return; 2281 journal->j_fc_cleanup_callback = ext4_fc_cleanup; 2282 } 2283 2284 static const char * const fc_ineligible_reasons[] = { 2285 [EXT4_FC_REASON_XATTR] = "Extended attributes changed", 2286 [EXT4_FC_REASON_CROSS_RENAME] = "Cross rename", 2287 [EXT4_FC_REASON_JOURNAL_FLAG_CHANGE] = "Journal flag changed", 2288 [EXT4_FC_REASON_NOMEM] = "Insufficient memory", 2289 [EXT4_FC_REASON_SWAP_BOOT] = "Swap boot", 2290 [EXT4_FC_REASON_RESIZE] = "Resize", 2291 [EXT4_FC_REASON_RENAME_DIR] = "Dir renamed", 2292 [EXT4_FC_REASON_FALLOC_RANGE] = "Falloc range op", 2293 [EXT4_FC_REASON_INODE_JOURNAL_DATA] = "Data journalling", 2294 [EXT4_FC_REASON_ENCRYPTED_FILENAME] = "Encrypted filename", 2295 }; 2296 2297 int ext4_fc_info_show(struct seq_file *seq, void *v) 2298 { 2299 struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private); 2300 struct ext4_fc_stats *stats = &sbi->s_fc_stats; 2301 int i; 2302 2303 if (v != SEQ_START_TOKEN) 2304 return 0; 2305 2306 seq_printf(seq, 2307 "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n", 2308 stats->fc_num_commits, stats->fc_ineligible_commits, 2309 stats->fc_numblks, 2310 div_u64(stats->s_fc_avg_commit_time, 1000)); 2311 seq_puts(seq, "Ineligible reasons:\n"); 2312 for (i = 0; i < EXT4_FC_REASON_MAX; i++) 2313 seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i], 2314 stats->fc_ineligible_reason_count[i]); 2315 2316 return 0; 2317 } 2318 2319 int __init ext4_fc_init_dentry_cache(void) 2320 { 2321 ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update, 2322 SLAB_RECLAIM_ACCOUNT); 2323 2324 if (ext4_fc_dentry_cachep == NULL) 2325 return -ENOMEM; 2326 2327 return 0; 2328 } 2329 2330 void ext4_fc_destroy_dentry_cache(void) 2331 { 2332 kmem_cache_destroy(ext4_fc_dentry_cachep); 2333 } 2334
Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.