1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2000-2006 Silicon Graphics, Inc. 4 * All Rights Reserved. 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_log_format.h" 11 #include "xfs_trans_resv.h" 12 #include "xfs_bit.h" 13 #include "xfs_mount.h" 14 #include "xfs_trans.h" 15 #include "xfs_buf_item.h" 16 #include "xfs_trans_priv.h" 17 #include "xfs_trace.h" 18 #include "xfs_log.h" 19 #include "xfs_log_priv.h" 20 #include "xfs_log_recover.h" 21 #include "xfs_error.h" 22 #include "xfs_inode.h" 23 #include "xfs_dir2.h" 24 #include "xfs_quota.h" 25 26 /* 27 * This is the number of entries in the l_buf_cancel_table used during 28 * recovery. 29 */ 30 #define XLOG_BC_TABLE_SIZE 64 31 32 #define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ 33 ((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE)) 34 35 /* 36 * This structure is used during recovery to record the buf log items which 37 * have been canceled and should not be replayed. 38 */ 39 struct xfs_buf_cancel { 40 xfs_daddr_t bc_blkno; 41 uint bc_len; 42 int bc_refcount; 43 struct list_head bc_list; 44 }; 45 46 static struct xfs_buf_cancel * 47 xlog_find_buffer_cancelled( 48 struct xlog *log, 49 xfs_daddr_t blkno, 50 uint len) 51 { 52 struct list_head *bucket; 53 struct xfs_buf_cancel *bcp; 54 55 if (!log->l_buf_cancel_table) 56 return NULL; 57 58 bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno); 59 list_for_each_entry(bcp, bucket, bc_list) { 60 if (bcp->bc_blkno == blkno && bcp->bc_len == len) 61 return bcp; 62 } 63 64 return NULL; 65 } 66 67 static bool 68 xlog_add_buffer_cancelled( 69 struct xlog *log, 70 xfs_daddr_t blkno, 71 uint len) 72 { 73 struct xfs_buf_cancel *bcp; 74 75 /* 76 * If we find an existing cancel record, this indicates that the buffer 77 * was cancelled multiple times. To ensure that during pass 2 we keep 78 * the record in the table until we reach its last occurrence in the 79 * log, a reference count is kept to tell how many times we expect to 80 * see this record during the second pass. 81 */ 82 bcp = xlog_find_buffer_cancelled(log, blkno, len); 83 if (bcp) { 84 bcp->bc_refcount++; 85 return false; 86 } 87 88 bcp = kmalloc(sizeof(struct xfs_buf_cancel), GFP_KERNEL | __GFP_NOFAIL); 89 bcp->bc_blkno = blkno; 90 bcp->bc_len = len; 91 bcp->bc_refcount = 1; 92 list_add_tail(&bcp->bc_list, XLOG_BUF_CANCEL_BUCKET(log, blkno)); 93 return true; 94 } 95 96 /* 97 * Check if there is and entry for blkno, len in the buffer cancel record table. 98 */ 99 bool 100 xlog_is_buffer_cancelled( 101 struct xlog *log, 102 xfs_daddr_t blkno, 103 uint len) 104 { 105 return xlog_find_buffer_cancelled(log, blkno, len) != NULL; 106 } 107 108 /* 109 * Check if there is and entry for blkno, len in the buffer cancel record table, 110 * and decremented the reference count on it if there is one. 111 * 112 * Remove the cancel record once the refcount hits zero, so that if the same 113 * buffer is re-used again after its last cancellation we actually replay the 114 * changes made at that point. 115 */ 116 static bool 117 xlog_put_buffer_cancelled( 118 struct xlog *log, 119 xfs_daddr_t blkno, 120 uint len) 121 { 122 struct xfs_buf_cancel *bcp; 123 124 bcp = xlog_find_buffer_cancelled(log, blkno, len); 125 if (!bcp) { 126 ASSERT(0); 127 return false; 128 } 129 130 if (--bcp->bc_refcount == 0) { 131 list_del(&bcp->bc_list); 132 kfree(bcp); 133 } 134 return true; 135 } 136 137 /* log buffer item recovery */ 138 139 /* 140 * Sort buffer items for log recovery. Most buffer items should end up on the 141 * buffer list and are recovered first, with the following exceptions: 142 * 143 * 1. XFS_BLF_CANCEL buffers must be processed last because some log items 144 * might depend on the incor ecancellation record, and replaying a cancelled 145 * buffer item can remove the incore record. 146 * 147 * 2. XFS_BLF_INODE_BUF buffers are handled after most regular items so that 148 * we replay di_next_unlinked only after flushing the inode 'free' state 149 * to the inode buffer. 150 * 151 * See xlog_recover_reorder_trans for more details. 152 */ 153 STATIC enum xlog_recover_reorder 154 xlog_recover_buf_reorder( 155 struct xlog_recover_item *item) 156 { 157 struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; 158 159 if (buf_f->blf_flags & XFS_BLF_CANCEL) 160 return XLOG_REORDER_CANCEL_LIST; 161 if (buf_f->blf_flags & XFS_BLF_INODE_BUF) 162 return XLOG_REORDER_INODE_BUFFER_LIST; 163 return XLOG_REORDER_BUFFER_LIST; 164 } 165 166 STATIC void 167 xlog_recover_buf_ra_pass2( 168 struct xlog *log, 169 struct xlog_recover_item *item) 170 { 171 struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; 172 173 xlog_buf_readahead(log, buf_f->blf_blkno, buf_f->blf_len, NULL); 174 } 175 176 /* 177 * Build up the table of buf cancel records so that we don't replay cancelled 178 * data in the second pass. 179 */ 180 static int 181 xlog_recover_buf_commit_pass1( 182 struct xlog *log, 183 struct xlog_recover_item *item) 184 { 185 struct xfs_buf_log_format *bf = item->ri_buf[0].i_addr; 186 187 if (!xfs_buf_log_check_iovec(&item->ri_buf[0])) { 188 xfs_err(log->l_mp, "bad buffer log item size (%d)", 189 item->ri_buf[0].i_len); 190 return -EFSCORRUPTED; 191 } 192 193 if (!(bf->blf_flags & XFS_BLF_CANCEL)) 194 trace_xfs_log_recover_buf_not_cancel(log, bf); 195 else if (xlog_add_buffer_cancelled(log, bf->blf_blkno, bf->blf_len)) 196 trace_xfs_log_recover_buf_cancel_add(log, bf); 197 else 198 trace_xfs_log_recover_buf_cancel_ref_inc(log, bf); 199 return 0; 200 } 201 202 /* 203 * Validate the recovered buffer is of the correct type and attach the 204 * appropriate buffer operations to them for writeback. Magic numbers are in a 205 * few places: 206 * the first 16 bits of the buffer (inode buffer, dquot buffer), 207 * the first 32 bits of the buffer (most blocks), 208 * inside a struct xfs_da_blkinfo at the start of the buffer. 209 */ 210 static void 211 xlog_recover_validate_buf_type( 212 struct xfs_mount *mp, 213 struct xfs_buf *bp, 214 struct xfs_buf_log_format *buf_f, 215 xfs_lsn_t current_lsn) 216 { 217 struct xfs_da_blkinfo *info = bp->b_addr; 218 uint32_t magic32; 219 uint16_t magic16; 220 uint16_t magicda; 221 char *warnmsg = NULL; 222 223 /* 224 * We can only do post recovery validation on items on CRC enabled 225 * fielsystems as we need to know when the buffer was written to be able 226 * to determine if we should have replayed the item. If we replay old 227 * metadata over a newer buffer, then it will enter a temporarily 228 * inconsistent state resulting in verification failures. Hence for now 229 * just avoid the verification stage for non-crc filesystems 230 */ 231 if (!xfs_has_crc(mp)) 232 return; 233 234 magic32 = be32_to_cpu(*(__be32 *)bp->b_addr); 235 magic16 = be16_to_cpu(*(__be16*)bp->b_addr); 236 magicda = be16_to_cpu(info->magic); 237 switch (xfs_blft_from_flags(buf_f)) { 238 case XFS_BLFT_BTREE_BUF: 239 switch (magic32) { 240 case XFS_ABTB_CRC_MAGIC: 241 case XFS_ABTB_MAGIC: 242 bp->b_ops = &xfs_bnobt_buf_ops; 243 break; 244 case XFS_ABTC_CRC_MAGIC: 245 case XFS_ABTC_MAGIC: 246 bp->b_ops = &xfs_cntbt_buf_ops; 247 break; 248 case XFS_IBT_CRC_MAGIC: 249 case XFS_IBT_MAGIC: 250 bp->b_ops = &xfs_inobt_buf_ops; 251 break; 252 case XFS_FIBT_CRC_MAGIC: 253 case XFS_FIBT_MAGIC: 254 bp->b_ops = &xfs_finobt_buf_ops; 255 break; 256 case XFS_BMAP_CRC_MAGIC: 257 case XFS_BMAP_MAGIC: 258 bp->b_ops = &xfs_bmbt_buf_ops; 259 break; 260 case XFS_RMAP_CRC_MAGIC: 261 bp->b_ops = &xfs_rmapbt_buf_ops; 262 break; 263 case XFS_REFC_CRC_MAGIC: 264 bp->b_ops = &xfs_refcountbt_buf_ops; 265 break; 266 default: 267 warnmsg = "Bad btree block magic!"; 268 break; 269 } 270 break; 271 case XFS_BLFT_AGF_BUF: 272 if (magic32 != XFS_AGF_MAGIC) { 273 warnmsg = "Bad AGF block magic!"; 274 break; 275 } 276 bp->b_ops = &xfs_agf_buf_ops; 277 break; 278 case XFS_BLFT_AGFL_BUF: 279 if (magic32 != XFS_AGFL_MAGIC) { 280 warnmsg = "Bad AGFL block magic!"; 281 break; 282 } 283 bp->b_ops = &xfs_agfl_buf_ops; 284 break; 285 case XFS_BLFT_AGI_BUF: 286 if (magic32 != XFS_AGI_MAGIC) { 287 warnmsg = "Bad AGI block magic!"; 288 break; 289 } 290 bp->b_ops = &xfs_agi_buf_ops; 291 break; 292 case XFS_BLFT_UDQUOT_BUF: 293 case XFS_BLFT_PDQUOT_BUF: 294 case XFS_BLFT_GDQUOT_BUF: 295 #ifdef CONFIG_XFS_QUOTA 296 if (magic16 != XFS_DQUOT_MAGIC) { 297 warnmsg = "Bad DQUOT block magic!"; 298 break; 299 } 300 bp->b_ops = &xfs_dquot_buf_ops; 301 #else 302 xfs_alert(mp, 303 "Trying to recover dquots without QUOTA support built in!"); 304 ASSERT(0); 305 #endif 306 break; 307 case XFS_BLFT_DINO_BUF: 308 if (magic16 != XFS_DINODE_MAGIC) { 309 warnmsg = "Bad INODE block magic!"; 310 break; 311 } 312 bp->b_ops = &xfs_inode_buf_ops; 313 break; 314 case XFS_BLFT_SYMLINK_BUF: 315 if (magic32 != XFS_SYMLINK_MAGIC) { 316 warnmsg = "Bad symlink block magic!"; 317 break; 318 } 319 bp->b_ops = &xfs_symlink_buf_ops; 320 break; 321 case XFS_BLFT_DIR_BLOCK_BUF: 322 if (magic32 != XFS_DIR2_BLOCK_MAGIC && 323 magic32 != XFS_DIR3_BLOCK_MAGIC) { 324 warnmsg = "Bad dir block magic!"; 325 break; 326 } 327 bp->b_ops = &xfs_dir3_block_buf_ops; 328 break; 329 case XFS_BLFT_DIR_DATA_BUF: 330 if (magic32 != XFS_DIR2_DATA_MAGIC && 331 magic32 != XFS_DIR3_DATA_MAGIC) { 332 warnmsg = "Bad dir data magic!"; 333 break; 334 } 335 bp->b_ops = &xfs_dir3_data_buf_ops; 336 break; 337 case XFS_BLFT_DIR_FREE_BUF: 338 if (magic32 != XFS_DIR2_FREE_MAGIC && 339 magic32 != XFS_DIR3_FREE_MAGIC) { 340 warnmsg = "Bad dir3 free magic!"; 341 break; 342 } 343 bp->b_ops = &xfs_dir3_free_buf_ops; 344 break; 345 case XFS_BLFT_DIR_LEAF1_BUF: 346 if (magicda != XFS_DIR2_LEAF1_MAGIC && 347 magicda != XFS_DIR3_LEAF1_MAGIC) { 348 warnmsg = "Bad dir leaf1 magic!"; 349 break; 350 } 351 bp->b_ops = &xfs_dir3_leaf1_buf_ops; 352 break; 353 case XFS_BLFT_DIR_LEAFN_BUF: 354 if (magicda != XFS_DIR2_LEAFN_MAGIC && 355 magicda != XFS_DIR3_LEAFN_MAGIC) { 356 warnmsg = "Bad dir leafn magic!"; 357 break; 358 } 359 bp->b_ops = &xfs_dir3_leafn_buf_ops; 360 break; 361 case XFS_BLFT_DA_NODE_BUF: 362 if (magicda != XFS_DA_NODE_MAGIC && 363 magicda != XFS_DA3_NODE_MAGIC) { 364 warnmsg = "Bad da node magic!"; 365 break; 366 } 367 bp->b_ops = &xfs_da3_node_buf_ops; 368 break; 369 case XFS_BLFT_ATTR_LEAF_BUF: 370 if (magicda != XFS_ATTR_LEAF_MAGIC && 371 magicda != XFS_ATTR3_LEAF_MAGIC) { 372 warnmsg = "Bad attr leaf magic!"; 373 break; 374 } 375 bp->b_ops = &xfs_attr3_leaf_buf_ops; 376 break; 377 case XFS_BLFT_ATTR_RMT_BUF: 378 if (magic32 != XFS_ATTR3_RMT_MAGIC) { 379 warnmsg = "Bad attr remote magic!"; 380 break; 381 } 382 bp->b_ops = &xfs_attr3_rmt_buf_ops; 383 break; 384 case XFS_BLFT_SB_BUF: 385 if (magic32 != XFS_SB_MAGIC) { 386 warnmsg = "Bad SB block magic!"; 387 break; 388 } 389 bp->b_ops = &xfs_sb_buf_ops; 390 break; 391 #ifdef CONFIG_XFS_RT 392 case XFS_BLFT_RTBITMAP_BUF: 393 case XFS_BLFT_RTSUMMARY_BUF: 394 /* no magic numbers for verification of RT buffers */ 395 bp->b_ops = &xfs_rtbuf_ops; 396 break; 397 #endif /* CONFIG_XFS_RT */ 398 default: 399 xfs_warn(mp, "Unknown buffer type %d!", 400 xfs_blft_from_flags(buf_f)); 401 break; 402 } 403 404 /* 405 * Nothing else to do in the case of a NULL current LSN as this means 406 * the buffer is more recent than the change in the log and will be 407 * skipped. 408 */ 409 if (current_lsn == NULLCOMMITLSN) 410 return; 411 412 if (warnmsg) { 413 xfs_warn(mp, warnmsg); 414 ASSERT(0); 415 } 416 417 /* 418 * We must update the metadata LSN of the buffer as it is written out to 419 * ensure that older transactions never replay over this one and corrupt 420 * the buffer. This can occur if log recovery is interrupted at some 421 * point after the current transaction completes, at which point a 422 * subsequent mount starts recovery from the beginning. 423 * 424 * Write verifiers update the metadata LSN from log items attached to 425 * the buffer. Therefore, initialize a bli purely to carry the LSN to 426 * the verifier. 427 */ 428 if (bp->b_ops) { 429 struct xfs_buf_log_item *bip; 430 431 bp->b_flags |= _XBF_LOGRECOVERY; 432 xfs_buf_item_init(bp, mp); 433 bip = bp->b_log_item; 434 bip->bli_item.li_lsn = current_lsn; 435 } 436 } 437 438 /* 439 * Perform a 'normal' buffer recovery. Each logged region of the 440 * buffer should be copied over the corresponding region in the 441 * given buffer. The bitmap in the buf log format structure indicates 442 * where to place the logged data. 443 */ 444 STATIC void 445 xlog_recover_do_reg_buffer( 446 struct xfs_mount *mp, 447 struct xlog_recover_item *item, 448 struct xfs_buf *bp, 449 struct xfs_buf_log_format *buf_f, 450 xfs_lsn_t current_lsn) 451 { 452 int i; 453 int bit; 454 int nbits; 455 xfs_failaddr_t fa; 456 const size_t size_disk_dquot = sizeof(struct xfs_disk_dquot); 457 458 trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f); 459 460 bit = 0; 461 i = 1; /* 0 is the buf format structure */ 462 while (1) { 463 bit = xfs_next_bit(buf_f->blf_data_map, 464 buf_f->blf_map_size, bit); 465 if (bit == -1) 466 break; 467 nbits = xfs_contig_bits(buf_f->blf_data_map, 468 buf_f->blf_map_size, bit); 469 ASSERT(nbits > 0); 470 ASSERT(item->ri_buf[i].i_addr != NULL); 471 ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); 472 ASSERT(BBTOB(bp->b_length) >= 473 ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT)); 474 475 /* 476 * The dirty regions logged in the buffer, even though 477 * contiguous, may span multiple chunks. This is because the 478 * dirty region may span a physical page boundary in a buffer 479 * and hence be split into two separate vectors for writing into 480 * the log. Hence we need to trim nbits back to the length of 481 * the current region being copied out of the log. 482 */ 483 if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT)) 484 nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT; 485 486 /* 487 * Do a sanity check if this is a dquot buffer. Just checking 488 * the first dquot in the buffer should do. XXXThis is 489 * probably a good thing to do for other buf types also. 490 */ 491 fa = NULL; 492 if (buf_f->blf_flags & 493 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { 494 if (item->ri_buf[i].i_addr == NULL) { 495 xfs_alert(mp, 496 "XFS: NULL dquot in %s.", __func__); 497 goto next; 498 } 499 if (item->ri_buf[i].i_len < size_disk_dquot) { 500 xfs_alert(mp, 501 "XFS: dquot too small (%d) in %s.", 502 item->ri_buf[i].i_len, __func__); 503 goto next; 504 } 505 fa = xfs_dquot_verify(mp, item->ri_buf[i].i_addr, -1); 506 if (fa) { 507 xfs_alert(mp, 508 "dquot corrupt at %pS trying to replay into block 0x%llx", 509 fa, xfs_buf_daddr(bp)); 510 goto next; 511 } 512 } 513 514 memcpy(xfs_buf_offset(bp, 515 (uint)bit << XFS_BLF_SHIFT), /* dest */ 516 item->ri_buf[i].i_addr, /* source */ 517 nbits<<XFS_BLF_SHIFT); /* length */ 518 next: 519 i++; 520 bit += nbits; 521 } 522 523 /* Shouldn't be any more regions */ 524 ASSERT(i == item->ri_total); 525 526 xlog_recover_validate_buf_type(mp, bp, buf_f, current_lsn); 527 } 528 529 /* 530 * Perform a dquot buffer recovery. 531 * Simple algorithm: if we have found a QUOTAOFF log item of the same type 532 * (ie. USR or GRP), then just toss this buffer away; don't recover it. 533 * Else, treat it as a regular buffer and do recovery. 534 * 535 * Return false if the buffer was tossed and true if we recovered the buffer to 536 * indicate to the caller if the buffer needs writing. 537 */ 538 STATIC bool 539 xlog_recover_do_dquot_buffer( 540 struct xfs_mount *mp, 541 struct xlog *log, 542 struct xlog_recover_item *item, 543 struct xfs_buf *bp, 544 struct xfs_buf_log_format *buf_f) 545 { 546 uint type; 547 548 trace_xfs_log_recover_buf_dquot_buf(log, buf_f); 549 550 /* 551 * Filesystems are required to send in quota flags at mount time. 552 */ 553 if (!mp->m_qflags) 554 return false; 555 556 type = 0; 557 if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF) 558 type |= XFS_DQTYPE_USER; 559 if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF) 560 type |= XFS_DQTYPE_PROJ; 561 if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF) 562 type |= XFS_DQTYPE_GROUP; 563 /* 564 * This type of quotas was turned off, so ignore this buffer 565 */ 566 if (log->l_quotaoffs_flag & type) 567 return false; 568 569 xlog_recover_do_reg_buffer(mp, item, bp, buf_f, NULLCOMMITLSN); 570 return true; 571 } 572 573 /* 574 * Perform recovery for a buffer full of inodes. In these buffers, the only 575 * data which should be recovered is that which corresponds to the 576 * di_next_unlinked pointers in the on disk inode structures. The rest of the 577 * data for the inodes is always logged through the inodes themselves rather 578 * than the inode buffer and is recovered in xlog_recover_inode_pass2(). 579 * 580 * The only time when buffers full of inodes are fully recovered is when the 581 * buffer is full of newly allocated inodes. In this case the buffer will 582 * not be marked as an inode buffer and so will be sent to 583 * xlog_recover_do_reg_buffer() below during recovery. 584 */ 585 STATIC int 586 xlog_recover_do_inode_buffer( 587 struct xfs_mount *mp, 588 struct xlog_recover_item *item, 589 struct xfs_buf *bp, 590 struct xfs_buf_log_format *buf_f) 591 { 592 int i; 593 int item_index = 0; 594 int bit = 0; 595 int nbits = 0; 596 int reg_buf_offset = 0; 597 int reg_buf_bytes = 0; 598 int next_unlinked_offset; 599 int inodes_per_buf; 600 xfs_agino_t *logged_nextp; 601 xfs_agino_t *buffer_nextp; 602 603 trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); 604 605 /* 606 * Post recovery validation only works properly on CRC enabled 607 * filesystems. 608 */ 609 if (xfs_has_crc(mp)) 610 bp->b_ops = &xfs_inode_buf_ops; 611 612 inodes_per_buf = BBTOB(bp->b_length) >> mp->m_sb.sb_inodelog; 613 for (i = 0; i < inodes_per_buf; i++) { 614 next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + 615 offsetof(struct xfs_dinode, di_next_unlinked); 616 617 while (next_unlinked_offset >= 618 (reg_buf_offset + reg_buf_bytes)) { 619 /* 620 * The next di_next_unlinked field is beyond 621 * the current logged region. Find the next 622 * logged region that contains or is beyond 623 * the current di_next_unlinked field. 624 */ 625 bit += nbits; 626 bit = xfs_next_bit(buf_f->blf_data_map, 627 buf_f->blf_map_size, bit); 628 629 /* 630 * If there are no more logged regions in the 631 * buffer, then we're done. 632 */ 633 if (bit == -1) 634 return 0; 635 636 nbits = xfs_contig_bits(buf_f->blf_data_map, 637 buf_f->blf_map_size, bit); 638 ASSERT(nbits > 0); 639 reg_buf_offset = bit << XFS_BLF_SHIFT; 640 reg_buf_bytes = nbits << XFS_BLF_SHIFT; 641 item_index++; 642 } 643 644 /* 645 * If the current logged region starts after the current 646 * di_next_unlinked field, then move on to the next 647 * di_next_unlinked field. 648 */ 649 if (next_unlinked_offset < reg_buf_offset) 650 continue; 651 652 ASSERT(item->ri_buf[item_index].i_addr != NULL); 653 ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); 654 ASSERT((reg_buf_offset + reg_buf_bytes) <= BBTOB(bp->b_length)); 655 656 /* 657 * The current logged region contains a copy of the 658 * current di_next_unlinked field. Extract its value 659 * and copy it to the buffer copy. 660 */ 661 logged_nextp = item->ri_buf[item_index].i_addr + 662 next_unlinked_offset - reg_buf_offset; 663 if (XFS_IS_CORRUPT(mp, *logged_nextp == 0)) { 664 xfs_alert(mp, 665 "Bad inode buffer log record (ptr = "PTR_FMT", bp = "PTR_FMT"). " 666 "Trying to replay bad (0) inode di_next_unlinked field.", 667 item, bp); 668 return -EFSCORRUPTED; 669 } 670 671 buffer_nextp = xfs_buf_offset(bp, next_unlinked_offset); 672 *buffer_nextp = *logged_nextp; 673 674 /* 675 * If necessary, recalculate the CRC in the on-disk inode. We 676 * have to leave the inode in a consistent state for whoever 677 * reads it next.... 678 */ 679 xfs_dinode_calc_crc(mp, 680 xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize)); 681 682 } 683 684 return 0; 685 } 686 687 /* 688 * V5 filesystems know the age of the buffer on disk being recovered. We can 689 * have newer objects on disk than we are replaying, and so for these cases we 690 * don't want to replay the current change as that will make the buffer contents 691 * temporarily invalid on disk. 692 * 693 * The magic number might not match the buffer type we are going to recover 694 * (e.g. reallocated blocks), so we ignore the xfs_buf_log_format flags. Hence 695 * extract the LSN of the existing object in the buffer based on it's current 696 * magic number. If we don't recognise the magic number in the buffer, then 697 * return a LSN of -1 so that the caller knows it was an unrecognised block and 698 * so can recover the buffer. 699 * 700 * Note: we cannot rely solely on magic number matches to determine that the 701 * buffer has a valid LSN - we also need to verify that it belongs to this 702 * filesystem, so we need to extract the object's LSN and compare it to that 703 * which we read from the superblock. If the UUIDs don't match, then we've got a 704 * stale metadata block from an old filesystem instance that we need to recover 705 * over the top of. 706 */ 707 static xfs_lsn_t 708 xlog_recover_get_buf_lsn( 709 struct xfs_mount *mp, 710 struct xfs_buf *bp, 711 struct xfs_buf_log_format *buf_f) 712 { 713 uint32_t magic32; 714 uint16_t magic16; 715 uint16_t magicda; 716 void *blk = bp->b_addr; 717 uuid_t *uuid; 718 xfs_lsn_t lsn = -1; 719 uint16_t blft; 720 721 /* v4 filesystems always recover immediately */ 722 if (!xfs_has_crc(mp)) 723 goto recover_immediately; 724 725 /* 726 * realtime bitmap and summary file blocks do not have magic numbers or 727 * UUIDs, so we must recover them immediately. 728 */ 729 blft = xfs_blft_from_flags(buf_f); 730 if (blft == XFS_BLFT_RTBITMAP_BUF || blft == XFS_BLFT_RTSUMMARY_BUF) 731 goto recover_immediately; 732 733 magic32 = be32_to_cpu(*(__be32 *)blk); 734 switch (magic32) { 735 case XFS_ABTB_CRC_MAGIC: 736 case XFS_ABTC_CRC_MAGIC: 737 case XFS_ABTB_MAGIC: 738 case XFS_ABTC_MAGIC: 739 case XFS_RMAP_CRC_MAGIC: 740 case XFS_REFC_CRC_MAGIC: 741 case XFS_FIBT_CRC_MAGIC: 742 case XFS_FIBT_MAGIC: 743 case XFS_IBT_CRC_MAGIC: 744 case XFS_IBT_MAGIC: { 745 struct xfs_btree_block *btb = blk; 746 747 lsn = be64_to_cpu(btb->bb_u.s.bb_lsn); 748 uuid = &btb->bb_u.s.bb_uuid; 749 break; 750 } 751 case XFS_BMAP_CRC_MAGIC: 752 case XFS_BMAP_MAGIC: { 753 struct xfs_btree_block *btb = blk; 754 755 lsn = be64_to_cpu(btb->bb_u.l.bb_lsn); 756 uuid = &btb->bb_u.l.bb_uuid; 757 break; 758 } 759 case XFS_AGF_MAGIC: 760 lsn = be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn); 761 uuid = &((struct xfs_agf *)blk)->agf_uuid; 762 break; 763 case XFS_AGFL_MAGIC: 764 lsn = be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn); 765 uuid = &((struct xfs_agfl *)blk)->agfl_uuid; 766 break; 767 case XFS_AGI_MAGIC: 768 lsn = be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn); 769 uuid = &((struct xfs_agi *)blk)->agi_uuid; 770 break; 771 case XFS_SYMLINK_MAGIC: 772 lsn = be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn); 773 uuid = &((struct xfs_dsymlink_hdr *)blk)->sl_uuid; 774 break; 775 case XFS_DIR3_BLOCK_MAGIC: 776 case XFS_DIR3_DATA_MAGIC: 777 case XFS_DIR3_FREE_MAGIC: 778 lsn = be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn); 779 uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid; 780 break; 781 case XFS_ATTR3_RMT_MAGIC: 782 /* 783 * Remote attr blocks are written synchronously, rather than 784 * being logged. That means they do not contain a valid LSN 785 * (i.e. transactionally ordered) in them, and hence any time we 786 * see a buffer to replay over the top of a remote attribute 787 * block we should simply do so. 788 */ 789 goto recover_immediately; 790 case XFS_SB_MAGIC: 791 /* 792 * superblock uuids are magic. We may or may not have a 793 * sb_meta_uuid on disk, but it will be set in the in-core 794 * superblock. We set the uuid pointer for verification 795 * according to the superblock feature mask to ensure we check 796 * the relevant UUID in the superblock. 797 */ 798 lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn); 799 if (xfs_has_metauuid(mp)) 800 uuid = &((struct xfs_dsb *)blk)->sb_meta_uuid; 801 else 802 uuid = &((struct xfs_dsb *)blk)->sb_uuid; 803 break; 804 default: 805 break; 806 } 807 808 if (lsn != (xfs_lsn_t)-1) { 809 if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid)) 810 goto recover_immediately; 811 return lsn; 812 } 813 814 magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic); 815 switch (magicda) { 816 case XFS_DIR3_LEAF1_MAGIC: 817 case XFS_DIR3_LEAFN_MAGIC: 818 case XFS_ATTR3_LEAF_MAGIC: 819 case XFS_DA3_NODE_MAGIC: 820 lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn); 821 uuid = &((struct xfs_da3_blkinfo *)blk)->uuid; 822 break; 823 default: 824 break; 825 } 826 827 if (lsn != (xfs_lsn_t)-1) { 828 if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid)) 829 goto recover_immediately; 830 return lsn; 831 } 832 833 /* 834 * We do individual object checks on dquot and inode buffers as they 835 * have their own individual LSN records. Also, we could have a stale 836 * buffer here, so we have to at least recognise these buffer types. 837 * 838 * A notd complexity here is inode unlinked list processing - it logs 839 * the inode directly in the buffer, but we don't know which inodes have 840 * been modified, and there is no global buffer LSN. Hence we need to 841 * recover all inode buffer types immediately. This problem will be 842 * fixed by logical logging of the unlinked list modifications. 843 */ 844 magic16 = be16_to_cpu(*(__be16 *)blk); 845 switch (magic16) { 846 case XFS_DQUOT_MAGIC: 847 case XFS_DINODE_MAGIC: 848 goto recover_immediately; 849 default: 850 break; 851 } 852 853 /* unknown buffer contents, recover immediately */ 854 855 recover_immediately: 856 return (xfs_lsn_t)-1; 857 858 } 859 860 /* 861 * This routine replays a modification made to a buffer at runtime. 862 * There are actually two types of buffer, regular and inode, which 863 * are handled differently. Inode buffers are handled differently 864 * in that we only recover a specific set of data from them, namely 865 * the inode di_next_unlinked fields. This is because all other inode 866 * data is actually logged via inode records and any data we replay 867 * here which overlaps that may be stale. 868 * 869 * When meta-data buffers are freed at run time we log a buffer item 870 * with the XFS_BLF_CANCEL bit set to indicate that previous copies 871 * of the buffer in the log should not be replayed at recovery time. 872 * This is so that if the blocks covered by the buffer are reused for 873 * file data before we crash we don't end up replaying old, freed 874 * meta-data into a user's file. 875 * 876 * To handle the cancellation of buffer log items, we make two passes 877 * over the log during recovery. During the first we build a table of 878 * those buffers which have been cancelled, and during the second we 879 * only replay those buffers which do not have corresponding cancel 880 * records in the table. See xlog_recover_buf_pass[1,2] above 881 * for more details on the implementation of the table of cancel records. 882 */ 883 STATIC int 884 xlog_recover_buf_commit_pass2( 885 struct xlog *log, 886 struct list_head *buffer_list, 887 struct xlog_recover_item *item, 888 xfs_lsn_t current_lsn) 889 { 890 struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; 891 struct xfs_mount *mp = log->l_mp; 892 struct xfs_buf *bp; 893 int error; 894 uint buf_flags; 895 xfs_lsn_t lsn; 896 897 /* 898 * In this pass we only want to recover all the buffers which have 899 * not been cancelled and are not cancellation buffers themselves. 900 */ 901 if (buf_f->blf_flags & XFS_BLF_CANCEL) { 902 if (xlog_put_buffer_cancelled(log, buf_f->blf_blkno, 903 buf_f->blf_len)) 904 goto cancelled; 905 } else { 906 907 if (xlog_is_buffer_cancelled(log, buf_f->blf_blkno, 908 buf_f->blf_len)) 909 goto cancelled; 910 } 911 912 trace_xfs_log_recover_buf_recover(log, buf_f); 913 914 buf_flags = 0; 915 if (buf_f->blf_flags & XFS_BLF_INODE_BUF) 916 buf_flags |= XBF_UNMAPPED; 917 918 error = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, 919 buf_flags, &bp, NULL); 920 if (error) 921 return error; 922 923 /* 924 * Recover the buffer only if we get an LSN from it and it's less than 925 * the lsn of the transaction we are replaying. 926 * 927 * Note that we have to be extremely careful of readahead here. 928 * Readahead does not attach verfiers to the buffers so if we don't 929 * actually do any replay after readahead because of the LSN we found 930 * in the buffer if more recent than that current transaction then we 931 * need to attach the verifier directly. Failure to do so can lead to 932 * future recovery actions (e.g. EFI and unlinked list recovery) can 933 * operate on the buffers and they won't get the verifier attached. This 934 * can lead to blocks on disk having the correct content but a stale 935 * CRC. 936 * 937 * It is safe to assume these clean buffers are currently up to date. 938 * If the buffer is dirtied by a later transaction being replayed, then 939 * the verifier will be reset to match whatever recover turns that 940 * buffer into. 941 */ 942 lsn = xlog_recover_get_buf_lsn(mp, bp, buf_f); 943 if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { 944 trace_xfs_log_recover_buf_skip(log, buf_f); 945 xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN); 946 947 /* 948 * We're skipping replay of this buffer log item due to the log 949 * item LSN being behind the ondisk buffer. Verify the buffer 950 * contents since we aren't going to run the write verifier. 951 */ 952 if (bp->b_ops) { 953 bp->b_ops->verify_read(bp); 954 error = bp->b_error; 955 } 956 goto out_release; 957 } 958 959 if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { 960 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); 961 if (error) 962 goto out_release; 963 } else if (buf_f->blf_flags & 964 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { 965 bool dirty; 966 967 dirty = xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); 968 if (!dirty) 969 goto out_release; 970 } else { 971 xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn); 972 } 973 974 /* 975 * Perform delayed write on the buffer. Asynchronous writes will be 976 * slower when taking into account all the buffers to be flushed. 977 * 978 * Also make sure that only inode buffers with good sizes stay in 979 * the buffer cache. The kernel moves inodes in buffers of 1 block 980 * or inode_cluster_size bytes, whichever is bigger. The inode 981 * buffers in the log can be a different size if the log was generated 982 * by an older kernel using unclustered inode buffers or a newer kernel 983 * running with a different inode cluster size. Regardless, if 984 * the inode buffer size isn't max(blocksize, inode_cluster_size) 985 * for *our* value of inode_cluster_size, then we need to keep 986 * the buffer out of the buffer cache so that the buffer won't 987 * overlap with future reads of those inodes. 988 */ 989 if (XFS_DINODE_MAGIC == 990 be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && 991 (BBTOB(bp->b_length) != M_IGEO(log->l_mp)->inode_cluster_size)) { 992 xfs_buf_stale(bp); 993 error = xfs_bwrite(bp); 994 } else { 995 ASSERT(bp->b_mount == mp); 996 bp->b_flags |= _XBF_LOGRECOVERY; 997 xfs_buf_delwri_queue(bp, buffer_list); 998 } 999 1000 out_release: 1001 xfs_buf_relse(bp); 1002 return error; 1003 cancelled: 1004 trace_xfs_log_recover_buf_cancel(log, buf_f); 1005 return 0; 1006 } 1007 1008 const struct xlog_recover_item_ops xlog_buf_item_ops = { 1009 .item_type = XFS_LI_BUF, 1010 .reorder = xlog_recover_buf_reorder, 1011 .ra_pass2 = xlog_recover_buf_ra_pass2, 1012 .commit_pass1 = xlog_recover_buf_commit_pass1, 1013 .commit_pass2 = xlog_recover_buf_commit_pass2, 1014 }; 1015 1016 #ifdef DEBUG 1017 void 1018 xlog_check_buf_cancel_table( 1019 struct xlog *log) 1020 { 1021 int i; 1022 1023 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) 1024 ASSERT(list_empty(&log->l_buf_cancel_table[i])); 1025 } 1026 #endif 1027 1028 int 1029 xlog_alloc_buf_cancel_table( 1030 struct xlog *log) 1031 { 1032 void *p; 1033 int i; 1034 1035 ASSERT(log->l_buf_cancel_table == NULL); 1036 1037 p = kmalloc_array(XLOG_BC_TABLE_SIZE, sizeof(struct list_head), 1038 GFP_KERNEL); 1039 if (!p) 1040 return -ENOMEM; 1041 1042 log->l_buf_cancel_table = p; 1043 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) 1044 INIT_LIST_HEAD(&log->l_buf_cancel_table[i]); 1045 1046 return 0; 1047 } 1048 1049 void 1050 xlog_free_buf_cancel_table( 1051 struct xlog *log) 1052 { 1053 int i; 1054 1055 if (!log->l_buf_cancel_table) 1056 return; 1057 1058 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) { 1059 struct xfs_buf_cancel *bc; 1060 1061 while ((bc = list_first_entry_or_null( 1062 &log->l_buf_cancel_table[i], 1063 struct xfs_buf_cancel, bc_list))) { 1064 list_del(&bc->bc_list); 1065 kfree(bc); 1066 } 1067 } 1068 1069 kfree(log->l_buf_cancel_table); 1070 log->l_buf_cancel_table = NULL; 1071 } 1072
Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.