
TOMOYO Linux Cross Reference
Linux/fs/bcachefs/recovery.c


Diff markup

Differences between /fs/bcachefs/recovery.c (Version linux-6.12-rc7) and /fs/bcachefs/recovery.c (Version linux-6.8.12)


  1 // SPDX-License-Identifier: GPL-2.0                 1 // SPDX-License-Identifier: GPL-2.0
  2                                                     2 
  3 #include "bcachefs.h"                               3 #include "bcachefs.h"
  4 #include "alloc_background.h"                  !!   4 #include "backpointers.h"
  5 #include "bkey_buf.h"                               5 #include "bkey_buf.h"
                                                   >>   6 #include "alloc_background.h"
                                                   >>   7 #include "btree_gc.h"
  6 #include "btree_journal_iter.h"                     8 #include "btree_journal_iter.h"
  7 #include "btree_node_scan.h"                   << 
  8 #include "btree_update.h"                           9 #include "btree_update.h"
  9 #include "btree_update_interior.h"                 10 #include "btree_update_interior.h"
 10 #include "btree_io.h"                              11 #include "btree_io.h"
 11 #include "buckets.h"                               12 #include "buckets.h"
 12 #include "dirent.h"                                13 #include "dirent.h"
 13 #include "disk_accounting.h"                   !!  14 #include "ec.h"
 14 #include "errcode.h"                               15 #include "errcode.h"
 15 #include "error.h"                                 16 #include "error.h"
 16 #include "fs-common.h"                             17 #include "fs-common.h"
                                                   >>  18 #include "fsck.h"
 17 #include "journal_io.h"                            19 #include "journal_io.h"
 18 #include "journal_reclaim.h"                       20 #include "journal_reclaim.h"
 19 #include "journal_seq_blacklist.h"                 21 #include "journal_seq_blacklist.h"
                                                   >>  22 #include "lru.h"
 20 #include "logged_ops.h"                            23 #include "logged_ops.h"
 21 #include "move.h"                                  24 #include "move.h"
 22 #include "quota.h"                                 25 #include "quota.h"
 23 #include "rebalance.h"                             26 #include "rebalance.h"
 24 #include "recovery.h"                              27 #include "recovery.h"
 25 #include "recovery_passes.h"                   << 
 26 #include "replicas.h"                              28 #include "replicas.h"
 27 #include "sb-clean.h"                              29 #include "sb-clean.h"
 28 #include "sb-downgrade.h"                          30 #include "sb-downgrade.h"
 29 #include "snapshot.h"                              31 #include "snapshot.h"
                                                   >>  32 #include "subvolume.h"
 30 #include "super-io.h"                              33 #include "super-io.h"
 31                                                    34 
 32 #include <linux/sort.h>                            35 #include <linux/sort.h>
 33 #include <linux/stat.h>                            36 #include <linux/stat.h>
 34                                                    37 
 35 #define QSTR(n) { { { .len = strlen(n) } }, .n     38 #define QSTR(n) { { { .len = strlen(n) } }, .name = n }
 36                                                    39 
 37 void bch2_btree_lost_data(struct bch_fs *c, en !!  40 static bool btree_id_is_alloc(enum btree_id id)
 38 {                                                  41 {
 39         if (btree >= BTREE_ID_NR_MAX)          !!  42         switch (id) {
 40                 return;                        !!  43         case BTREE_ID_alloc:
 41                                                !!  44         case BTREE_ID_backpointers:
 42         u64 b = BIT_ULL(btree);                !!  45         case BTREE_ID_need_discard:
 43                                                !!  46         case BTREE_ID_freespace:
 44         if (!(c->sb.btrees_lost_data & b)) {   !!  47         case BTREE_ID_bucket_gens:
 45                 bch_err(c, "flagging btree %s  !!  48                 return true;
 46                                                !!  49         default:
 47                 mutex_lock(&c->sb_lock);       !!  50                 return false;
 48                 bch2_sb_field_get(c->disk_sb.s << 
 49                 bch2_write_super(c);           << 
 50                 mutex_unlock(&c->sb_lock);     << 
 51         }                                          51         }
 52 }                                                  52 }
 53                                                    53 
 54 /* for -o reconstruct_alloc: */                    54 /* for -o reconstruct_alloc: */
 55 static void bch2_reconstruct_alloc(struct bch_ !!  55 static void drop_alloc_keys(struct journal_keys *keys)
 56 {                                                  56 {
 57         bch2_journal_log_msg(c, "dropping allo !!  57         size_t src, dst;
 58         bch_info(c, "dropping and reconstructi << 
 59                                                    58 
 60         mutex_lock(&c->sb_lock);               !!  59         for (src = 0, dst = 0; src < keys->nr; src++)
 61         struct bch_sb_field_ext *ext = bch2_sb !!  60                 if (!btree_id_is_alloc(keys->d[src].btree_id))
 62                                                !!  61                         keys->d[dst++] = keys->d[src];
 63         __set_bit_le64(BCH_RECOVERY_PASS_STABL << 
 64         __set_bit_le64(BCH_RECOVERY_PASS_STABL << 
 65         __set_bit_le64(BCH_RECOVERY_PASS_STABL << 
 66         __set_bit_le64(BCH_RECOVERY_PASS_STABL << 
 67         __set_bit_le64(BCH_RECOVERY_PASS_STABL << 
 68                                                << 
 69         __set_bit_le64(BCH_FSCK_ERR_ptr_to_mis << 
 70         __set_bit_le64(BCH_FSCK_ERR_ptr_gen_ne << 
 71         __set_bit_le64(BCH_FSCK_ERR_stale_dirt << 
 72                                                << 
 73         __set_bit_le64(BCH_FSCK_ERR_dev_usage_ << 
 74         __set_bit_le64(BCH_FSCK_ERR_dev_usage_ << 
 75         __set_bit_le64(BCH_FSCK_ERR_dev_usage_ << 
 76                                                << 
 77         __set_bit_le64(BCH_FSCK_ERR_fs_usage_b << 
 78         __set_bit_le64(BCH_FSCK_ERR_fs_usage_c << 
 79         __set_bit_le64(BCH_FSCK_ERR_fs_usage_p << 
 80         __set_bit_le64(BCH_FSCK_ERR_fs_usage_r << 
 81                                                << 
 82         __set_bit_le64(BCH_FSCK_ERR_alloc_key_ << 
 83         __set_bit_le64(BCH_FSCK_ERR_alloc_key_ << 
 84         __set_bit_le64(BCH_FSCK_ERR_alloc_key_ << 
 85         __set_bit_le64(BCH_FSCK_ERR_alloc_key_ << 
 86         __set_bit_le64(BCH_FSCK_ERR_alloc_key_ << 
 87         __set_bit_le64(BCH_FSCK_ERR_alloc_key_ << 
 88         __set_bit_le64(BCH_FSCK_ERR_need_disca << 
 89         __set_bit_le64(BCH_FSCK_ERR_freespace_ << 
 90         __set_bit_le64(BCH_FSCK_ERR_bucket_gen << 
 91         __set_bit_le64(BCH_FSCK_ERR_freespace_ << 
 92         __set_bit_le64(BCH_FSCK_ERR_ptr_to_mis << 
 93         __set_bit_le64(BCH_FSCK_ERR_lru_entry_ << 
 94         __set_bit_le64(BCH_FSCK_ERR_accounting << 
 95         c->sb.compat &= ~(1ULL << BCH_COMPAT_a << 
 96                                                << 
 97         c->opts.recovery_passes |= bch2_recove << 
 98                                                << 
 99         bch2_write_super(c);                   << 
100         mutex_unlock(&c->sb_lock);             << 
101                                                    62 
102         bch2_shoot_down_journal_keys(c, BTREE_ !!  63         keys->nr = dst;
103                                      0, BTREE_ << 
104         bch2_shoot_down_journal_keys(c, BTREE_ << 
105                                      0, BTREE_ << 
106         bch2_shoot_down_journal_keys(c, BTREE_ << 
107                                      0, BTREE_ << 
108         bch2_shoot_down_journal_keys(c, BTREE_ << 
109                                      0, BTREE_ << 
110         bch2_shoot_down_journal_keys(c, BTREE_ << 
111                                      0, BTREE_ << 
112 }                                                  64 }
113                                                    65 
114 /*                                                 66 /*
115  * Btree node pointers have a field to stack a     67  * Btree node pointers have a field to stack a pointer to the in memory btree
116  * node; we need to zero out this field when r     68  * node; we need to zero out this field when reading in btree nodes, or when
117  * reading in keys from the journal:               69  * reading in keys from the journal:
118  */                                                70  */
119 static void zero_out_btree_mem_ptr(struct jour     71 static void zero_out_btree_mem_ptr(struct journal_keys *keys)
120 {                                                  72 {
121         darray_for_each(*keys, i)              !!  73         struct journal_key *i;
                                                   >>  74 
                                                   >>  75         for (i = keys->d; i < keys->d + keys->nr; i++)
122                 if (i->k->k.type == KEY_TYPE_b     76                 if (i->k->k.type == KEY_TYPE_btree_ptr_v2)
123                         bkey_i_to_btree_ptr_v2     77                         bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0;
124 }                                                  78 }
125                                                    79 
126 /* journal replay: */                              80 /* journal replay: */
127                                                    81 
128 static void replay_now_at(struct journal *j, u     82 static void replay_now_at(struct journal *j, u64 seq)
129 {                                                  83 {
130         BUG_ON(seq < j->replay_journal_seq);       84         BUG_ON(seq < j->replay_journal_seq);
131                                                    85 
132         seq = min(seq, j->replay_journal_seq_e     86         seq = min(seq, j->replay_journal_seq_end);
133                                                    87 
134         while (j->replay_journal_seq < seq)        88         while (j->replay_journal_seq < seq)
135                 bch2_journal_pin_put(j, j->rep     89                 bch2_journal_pin_put(j, j->replay_journal_seq++);
136 }                                                  90 }
137                                                    91 
138 static int bch2_journal_replay_accounting_key( << 
139                                                << 
140 {                                              << 
141         struct btree_iter iter;                << 
142         bch2_trans_node_iter_init(trans, &iter << 
143                                   BTREE_MAX_DE << 
144                                   BTREE_ITER_i << 
145         int ret = bch2_btree_iter_traverse(&it << 
146         if (ret)                               << 
147                 goto out;                      << 
148                                                << 
149         struct bkey u;                         << 
150         struct bkey_s_c old = bch2_btree_path_ << 
151                                                << 
152         /* Has this delta already been applied << 
153         if (bversion_cmp(old.k->bversion, k->k << 
154                 ret = 0;                       << 
155                 goto out;                      << 
156         }                                      << 
157                                                << 
158         struct bkey_i *new = k->k;             << 
159         if (old.k->type == KEY_TYPE_accounting << 
160                 new = bch2_bkey_make_mut_noupd << 
161                 ret = PTR_ERR_OR_ZERO(new);    << 
162                 if (ret)                       << 
163                         goto out;              << 
164                                                << 
165                 bch2_accounting_accumulate(bke << 
166                                            bke << 
167         }                                      << 
168                                                << 
169         trans->journal_res.seq = k->journal_se << 
170                                                << 
171         ret = bch2_trans_update(trans, &iter,  << 
172 out:                                           << 
173         bch2_trans_iter_exit(trans, &iter);    << 
174         return ret;                            << 
175 }                                              << 
176                                                << 
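The new bch2_journal_replay_accounting_key() (left column, largely truncated in this view) asks "has this delta already been applied?" by comparing key versions before accumulating anything. A minimal sketch of that version-gated, idempotent replay idea, using hypothetical types rather than the bcachefs API:

/* Illustrative only: a delta from the journal is folded in only when its
 * version is strictly newer than the version already stored, so replay
 * can be interrupted and rerun without double-counting. */
struct acct { unsigned long long version; long long counter; };

static void replay_acct_delta(struct acct *stored, const struct acct *delta)
{
        if (delta->version <= stored->version)
                return;                         /* delta already applied */

        stored->counter += delta->counter;      /* accumulate the delta */
        stored->version  = delta->version;      /* remember how far replay got */
}
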
177 static int bch2_journal_replay_key(struct btre     92 static int bch2_journal_replay_key(struct btree_trans *trans,
178                                    struct jour     93                                    struct journal_key *k)
179 {                                                  94 {
180         struct btree_iter iter;                    95         struct btree_iter iter;
181         unsigned iter_flags =                      96         unsigned iter_flags =
182                 BTREE_ITER_intent|             !!  97                 BTREE_ITER_INTENT|
183                 BTREE_ITER_not_extents;        !!  98                 BTREE_ITER_NOT_EXTENTS;
184         unsigned update_flags = BTREE_TRIGGER_ !!  99         unsigned update_flags = BTREE_TRIGGER_NORUN;
185         int ret;                                  100         int ret;
186                                                   101 
187         if (k->overwritten)                       102         if (k->overwritten)
188                 return 0;                         103                 return 0;
189                                                   104 
190         trans->journal_res.seq = k->journal_se    105         trans->journal_res.seq = k->journal_seq;
191                                                   106 
192         /*                                        107         /*
193          * BTREE_UPDATE_key_cache_reclaim disa !! 108          * BTREE_UPDATE_KEY_CACHE_RECLAIM disables key cache lookup/update to
194          * keep the key cache coherent with th    109          * keep the key cache coherent with the underlying btree. Nothing
195          * besides the allocator is doing upda    110          * besides the allocator is doing updates yet so we don't need key cache
196          * coherency for non-alloc btrees, and    111          * coherency for non-alloc btrees, and key cache fills for snapshots
197          * btrees use BTREE_ITER_filter_snapsh !! 112          * btrees use BTREE_ITER_FILTER_SNAPSHOTS, which isn't available until
198          * the snapshots recovery pass runs.      113          * the snapshots recovery pass runs.
199          */                                       114          */
200         if (!k->level && k->btree_id == BTREE_    115         if (!k->level && k->btree_id == BTREE_ID_alloc)
201                 iter_flags |= BTREE_ITER_cache !! 116                 iter_flags |= BTREE_ITER_CACHED;
202         else                                      117         else
203                 update_flags |= BTREE_UPDATE_k !! 118                 update_flags |= BTREE_UPDATE_KEY_CACHE_RECLAIM;
204                                                   119 
205         bch2_trans_node_iter_init(trans, &iter    120         bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
206                                   BTREE_MAX_DE    121                                   BTREE_MAX_DEPTH, k->level,
207                                   iter_flags);    122                                   iter_flags);
208         ret = bch2_btree_iter_traverse(&iter);    123         ret = bch2_btree_iter_traverse(&iter);
209         if (ret)                                  124         if (ret)
210                 goto out;                         125                 goto out;
211                                                   126 
212         struct btree_path *path = btree_iter_p << 
213         if (unlikely(!btree_path_node(path, k- << 
214                 bch2_trans_iter_exit(trans, &i << 
215                 bch2_trans_node_iter_init(tran << 
216                                           BTRE << 
217                 ret =   bch2_btree_iter_traver << 
218                         bch2_btree_increase_de << 
219                         -BCH_ERR_transaction_r << 
220                 goto out;                      << 
221         }                                      << 
222                                                << 
223         /* Must be checked with btree locked:     127         /* Must be checked with btree locked: */
224         if (k->overwritten)                       128         if (k->overwritten)
225                 goto out;                         129                 goto out;
226                                                   130 
227         if (k->k->k.type == KEY_TYPE_accountin << 
228                 ret = bch2_trans_update_buffer << 
229                 goto out;                      << 
230         }                                      << 
231                                                << 
232         ret = bch2_trans_update(trans, &iter,     131         ret = bch2_trans_update(trans, &iter, k->k, update_flags);
233 out:                                              132 out:
234         bch2_trans_iter_exit(trans, &iter);       133         bch2_trans_iter_exit(trans, &iter);
235         return ret;                               134         return ret;
236 }                                                 135 }
237                                                   136 
238 static int journal_sort_seq_cmp(const void *_l    137 static int journal_sort_seq_cmp(const void *_l, const void *_r)
239 {                                                 138 {
240         const struct journal_key *l = *((const    139         const struct journal_key *l = *((const struct journal_key **)_l);
241         const struct journal_key *r = *((const    140         const struct journal_key *r = *((const struct journal_key **)_r);
242                                                   141 
243         /*                                     !! 142         return cmp_int(l->journal_seq, r->journal_seq);
244          * Map 0 to U64_MAX, so that keys with << 
245          *                                     << 
246          * journal_seq == 0 means that the key << 
247          * should be inserted last so as to av << 
248          */                                    << 
249         return cmp_int(l->journal_seq - 1, r-> << 
250 }                                                 143 }
251                                                   144 
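The new comparator sorts on journal_seq - 1 rather than journal_seq: since journal_seq is a u64, subtracting 1 from 0 wraps to U64_MAX, which is what pushes keys that did not come from the journal (journal_seq == 0) to the end of the sort, as the comment above describes. A standalone sketch of the wraparound, illustrative only:

#include <stdint.h>
#include <stdio.h>

/* same idea as the kernel's cmp_int(): returns -1, 0 or 1 */
static int cmp_u64(uint64_t l, uint64_t r)
{
        return (l > r) - (l < r);
}

int main(void)
{
        uint64_t repair_key_seq  = 0;   /* key not read from the journal */
        uint64_t journal_key_seq = 42;  /* key from journal entry 42 */

        /* 0 - 1 wraps to UINT64_MAX, so the repair key sorts after every
         * key that carries a real journal sequence number */
        printf("%d\n", cmp_u64(repair_key_seq - 1, journal_key_seq - 1));  /* prints 1 */
        return 0;
}
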
252 int bch2_journal_replay(struct bch_fs *c)      !! 145 static int bch2_journal_replay(struct bch_fs *c)
253 {                                                 146 {
254         struct journal_keys *keys = &c->journa    147         struct journal_keys *keys = &c->journal_keys;
255         DARRAY(struct journal_key *) keys_sort    148         DARRAY(struct journal_key *) keys_sorted = { 0 };
256         struct journal *j = &c->journal;          149         struct journal *j = &c->journal;
257         u64 start_seq   = c->journal_replay_se    150         u64 start_seq   = c->journal_replay_seq_start;
258         u64 end_seq     = c->journal_replay_se    151         u64 end_seq     = c->journal_replay_seq_start;
259         struct btree_trans *trans = NULL;      !! 152         struct btree_trans *trans = bch2_trans_get(c);
260         bool immediate_flush = false;          << 
261         int ret = 0;                              153         int ret = 0;
262                                                   154 
263         if (keys->nr) {                           155         if (keys->nr) {
264                 ret = bch2_journal_log_msg(c,     156                 ret = bch2_journal_log_msg(c, "Starting journal replay (%zu keys in entries %llu-%llu)",
265                                            key    157                                            keys->nr, start_seq, end_seq);
266                 if (ret)                          158                 if (ret)
267                         goto err;                 159                         goto err;
268         }                                         160         }
269                                                   161 
270         BUG_ON(!atomic_read(&keys->ref));         162         BUG_ON(!atomic_read(&keys->ref));
271                                                   163 
272         move_gap(keys, keys->nr);              << 
273         trans = bch2_trans_get(c);             << 
274                                                << 
275         /*                                     << 
276          * Replay accounting keys first: we ca << 
277          * flush accounting keys until we're d << 
278          */                                    << 
279         darray_for_each(*keys, k) {            << 
280                 if (!(k->k->k.type == KEY_TYPE << 
281                         continue;              << 
282                                                << 
283                 cond_resched();                << 
284                                                << 
285                 ret = commit_do(trans, NULL, N << 
286                                 BCH_TRANS_COMM << 
287                                 BCH_TRANS_COMM << 
288                                 BCH_TRANS_COMM << 
289                                 BCH_TRANS_COMM << 
290                                 BCH_WATERMARK_ << 
291                              bch2_journal_repl << 
292                 if (bch2_fs_fatal_err_on(ret,  << 
293                         goto err;              << 
294                                                << 
295                 k->overwritten = true;         << 
296         }                                      << 
297                                                << 
298         set_bit(BCH_FS_accounting_replay_done, << 
299                                                << 
300         /*                                        164         /*
301          * First, attempt to replay keys in so    165          * First, attempt to replay keys in sorted order. This is more
302          * efficient - better locality of btre    166          * efficient - better locality of btree access -  but some might fail if
303          * that would cause a journal deadlock    167          * that would cause a journal deadlock.
304          */                                       168          */
305         darray_for_each(*keys, k) {            !! 169         for (size_t i = 0; i < keys->nr; i++) {
306                 cond_resched();                   170                 cond_resched();
307                                                   171 
308                 /*                             !! 172                 struct journal_key *k = keys->d + i;
309                  * k->allocated means the key  << 
310                  * rather it was from early re << 
311                  */                            << 
312                 if (k->allocated)              << 
313                         immediate_flush = true << 
314                                                   173 
315                 /* Skip fastpath if we're low     174                 /* Skip fastpath if we're low on space in the journal */
316                 ret = c->journal.watermark ? -    175                 ret = c->journal.watermark ? -1 :
317                         commit_do(trans, NULL,    176                         commit_do(trans, NULL, NULL,
318                                   BCH_TRANS_CO    177                                   BCH_TRANS_COMMIT_no_enospc|
319                                   BCH_TRANS_CO    178                                   BCH_TRANS_COMMIT_journal_reclaim|
320                                   BCH_TRANS_CO << 
321                                   (!k->allocat    179                                   (!k->allocated ? BCH_TRANS_COMMIT_no_journal_res : 0),
322                              bch2_journal_repl    180                              bch2_journal_replay_key(trans, k));
323                 BUG_ON(!ret && !k->overwritten !! 181                 BUG_ON(!ret && !k->overwritten);
324                 if (ret) {                        182                 if (ret) {
325                         ret = darray_push(&key    183                         ret = darray_push(&keys_sorted, k);
326                         if (ret)                  184                         if (ret)
327                                 goto err;         185                                 goto err;
328                 }                                 186                 }
329         }                                         187         }
330                                                   188 
331         bch2_trans_unlock_long(trans);         << 
332         /*                                        189         /*
333          * Now, replay any remaining keys in t    190          * Now, replay any remaining keys in the order in which they appear in
334          * the journal, unpinning those journa    191          * the journal, unpinning those journal entries as we go:
335          */                                       192          */
336         sort(keys_sorted.data, keys_sorted.nr,    193         sort(keys_sorted.data, keys_sorted.nr,
337              sizeof(keys_sorted.data[0]),         194              sizeof(keys_sorted.data[0]),
338              journal_sort_seq_cmp, NULL);         195              journal_sort_seq_cmp, NULL);
339                                                   196 
340         darray_for_each(keys_sorted, kp) {        197         darray_for_each(keys_sorted, kp) {
341                 cond_resched();                   198                 cond_resched();
342                                                   199 
343                 struct journal_key *k = *kp;      200                 struct journal_key *k = *kp;
344                                                   201 
345                 if (k->journal_seq)            !! 202                 replay_now_at(j, k->journal_seq);
346                         replay_now_at(j, k->jo << 
347                 else                           << 
348                         replay_now_at(j, j->re << 
349                                                   203 
350                 ret = commit_do(trans, NULL, N    204                 ret = commit_do(trans, NULL, NULL,
351                                 BCH_TRANS_COMM    205                                 BCH_TRANS_COMMIT_no_enospc|
352                                 BCH_TRANS_COMM << 
353                                 (!k->allocated    206                                 (!k->allocated
354                                  ? BCH_TRANS_C    207                                  ? BCH_TRANS_COMMIT_no_journal_res|BCH_WATERMARK_reclaim
355                                  : 0),            208                                  : 0),
356                              bch2_journal_repl    209                              bch2_journal_replay_key(trans, k));
357                 bch_err_msg(c, ret, "while rep    210                 bch_err_msg(c, ret, "while replaying key at btree %s level %u:",
358                             bch2_btree_id_str(    211                             bch2_btree_id_str(k->btree_id), k->level);
359                 if (ret)                          212                 if (ret)
360                         goto err;                 213                         goto err;
361                                                   214 
362                 BUG_ON(k->btree_id != BTREE_ID !! 215                 BUG_ON(!k->overwritten);
363         }                                         216         }
364                                                   217 
365         /*                                        218         /*
366          * We need to put our btree_trans befo    219          * We need to put our btree_trans before calling flush_all_pins(), since
367          * that will use a btree_trans interna    220          * that will use a btree_trans internally
368          */                                       221          */
369         bch2_trans_put(trans);                    222         bch2_trans_put(trans);
370         trans = NULL;                             223         trans = NULL;
371                                                   224 
372         if (!c->opts.retain_recovery_info &&   !! 225         if (!c->opts.keep_journal)
373             c->recovery_pass_done >= BCH_RECOV << 
374                 bch2_journal_keys_put_initial(    226                 bch2_journal_keys_put_initial(c);
375                                                   227 
376         replay_now_at(j, j->replay_journal_seq    228         replay_now_at(j, j->replay_journal_seq_end);
377         j->replay_journal_seq = 0;                229         j->replay_journal_seq = 0;
378                                                   230 
379         bch2_journal_set_replay_done(j);          231         bch2_journal_set_replay_done(j);
380                                                   232 
381         /* if we did any repair, flush it imme << 
382         if (immediate_flush) {                 << 
383                 bch2_journal_flush_all_pins(&c << 
384                 ret = bch2_journal_meta(&c->jo << 
385         }                                      << 
386                                                << 
387         if (keys->nr)                             233         if (keys->nr)
388                 bch2_journal_log_msg(c, "journ    234                 bch2_journal_log_msg(c, "journal replay finished");
389 err:                                              235 err:
390         if (trans)                                236         if (trans)
391                 bch2_trans_put(trans);            237                 bch2_trans_put(trans);
392         darray_exit(&keys_sorted);                238         darray_exit(&keys_sorted);
393         bch_err_fn(c, ret);                       239         bch_err_fn(c, ret);
394         return ret;                               240         return ret;
395 }                                                 241 }
396                                                   242 
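bch2_journal_replay() makes two passes, as its comments describe: first every key is tried in the order it is already stored (better btree locality), with failures collected into keys_sorted; the failures are then sorted by journal sequence and replayed in journal order so journal entries can be unpinned as replay advances. A compact sketch of that two-phase pattern, with a stand-in replay_one() instead of the real transaction commit:

#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>

struct key { unsigned long long journal_seq; };

/* stand-in for the real commit, which can fail under journal pressure */
static bool replay_one(struct key *k)
{
        (void) k;
        return true;
}

static int cmp_seq(const void *l, const void *r)
{
        const struct key *a = *(const struct key * const *) l;
        const struct key *b = *(const struct key * const *) r;

        return (a->journal_seq > b->journal_seq) - (a->journal_seq < b->journal_seq);
}

static void replay_two_phase(struct key *keys, size_t nr, struct key **retry)
{
        size_t nr_retry = 0;

        /* pass 1: storage order, for btree locality; collect failures */
        for (size_t i = 0; i < nr; i++)
                if (!replay_one(&keys[i]))
                        retry[nr_retry++] = &keys[i];

        /* pass 2: journal order, so pins can be dropped as seq advances */
        qsort(retry, nr_retry, sizeof(retry[0]), cmp_seq);
        for (size_t i = 0; i < nr_retry; i++)
                replay_one(retry[i]);
}
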
397 /* journal replay early: */                       243 /* journal replay early: */
398                                                   244 
399 static int journal_replay_entry_early(struct b    245 static int journal_replay_entry_early(struct bch_fs *c,
400                                       struct j    246                                       struct jset_entry *entry)
401 {                                                 247 {
402         int ret = 0;                              248         int ret = 0;
403                                                   249 
404         switch (entry->type) {                    250         switch (entry->type) {
405         case BCH_JSET_ENTRY_btree_root: {         251         case BCH_JSET_ENTRY_btree_root: {
406                 struct btree_root *r;             252                 struct btree_root *r;
407                                                   253 
408                 if (fsck_err_on(entry->btree_i << 
409                                 c, invalid_btr << 
410                                 "invalid btree << 
411                                 entry->btree_i << 
412                         return 0;              << 
413                                                << 
414                 while (entry->btree_id >= c->b    254                 while (entry->btree_id >= c->btree_roots_extra.nr + BTREE_ID_NR) {
415                         ret = darray_push(&c->    255                         ret = darray_push(&c->btree_roots_extra, (struct btree_root) { NULL });
416                         if (ret)                  256                         if (ret)
417                                 return ret;       257                                 return ret;
418                 }                                 258                 }
419                                                   259 
420                 r = bch2_btree_id_root(c, entr    260                 r = bch2_btree_id_root(c, entry->btree_id);
421                                                   261 
422                 if (entry->u64s) {                262                 if (entry->u64s) {
423                         r->level = entry->leve    263                         r->level = entry->level;
424                         bkey_copy(&r->key, (st    264                         bkey_copy(&r->key, (struct bkey_i *) entry->start);
425                         r->error = 0;             265                         r->error = 0;
426                 } else {                          266                 } else {
427                         r->error = -BCH_ERR_bt !! 267                         r->error = -EIO;
428                 }                                 268                 }
429                 r->alive = true;                  269                 r->alive = true;
430                 break;                            270                 break;
431         }                                         271         }
432         case BCH_JSET_ENTRY_usage: {              272         case BCH_JSET_ENTRY_usage: {
433                 struct jset_entry_usage *u =      273                 struct jset_entry_usage *u =
434                         container_of(entry, st    274                         container_of(entry, struct jset_entry_usage, entry);
435                                                   275 
436                 switch (entry->btree_id) {        276                 switch (entry->btree_id) {
                                                   >> 277                 case BCH_FS_USAGE_reserved:
                                                   >> 278                         if (entry->level < BCH_REPLICAS_MAX)
                                                   >> 279                                 c->usage_base->persistent_reserved[entry->level] =
                                                   >> 280                                         le64_to_cpu(u->v);
                                                   >> 281                         break;
                                                   >> 282                 case BCH_FS_USAGE_inodes:
                                                   >> 283                         c->usage_base->b.nr_inodes = le64_to_cpu(u->v);
                                                   >> 284                         break;
437                 case BCH_FS_USAGE_key_version:    285                 case BCH_FS_USAGE_key_version:
438                         atomic64_set(&c->key_v !! 286                         atomic64_set(&c->key_version,
                                                   >> 287                                      le64_to_cpu(u->v));
439                         break;                    288                         break;
440                 }                                 289                 }
                                                   >> 290 
                                                   >> 291                 break;
                                                   >> 292         }
                                                   >> 293         case BCH_JSET_ENTRY_data_usage: {
                                                   >> 294                 struct jset_entry_data_usage *u =
                                                   >> 295                         container_of(entry, struct jset_entry_data_usage, entry);
                                                   >> 296 
                                                   >> 297                 ret = bch2_replicas_set_usage(c, &u->r,
                                                   >> 298                                               le64_to_cpu(u->v));
                                                   >> 299                 break;
                                                   >> 300         }
                                                   >> 301         case BCH_JSET_ENTRY_dev_usage: {
                                                   >> 302                 struct jset_entry_dev_usage *u =
                                                   >> 303                         container_of(entry, struct jset_entry_dev_usage, entry);
                                                   >> 304                 struct bch_dev *ca = bch_dev_bkey_exists(c, le32_to_cpu(u->dev));
                                                   >> 305                 unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);
                                                   >> 306 
                                                   >> 307                 for (i = 0; i < min_t(unsigned, nr_types, BCH_DATA_NR); i++) {
                                                   >> 308                         ca->usage_base->d[i].buckets    = le64_to_cpu(u->d[i].buckets);
                                                   >> 309                         ca->usage_base->d[i].sectors    = le64_to_cpu(u->d[i].sectors);
                                                   >> 310                         ca->usage_base->d[i].fragmented = le64_to_cpu(u->d[i].fragmented);
                                                   >> 311                 }
                                                   >> 312 
441                 break;                            313                 break;
442         }                                         314         }
443         case BCH_JSET_ENTRY_blacklist: {          315         case BCH_JSET_ENTRY_blacklist: {
444                 struct jset_entry_blacklist *b    316                 struct jset_entry_blacklist *bl_entry =
445                         container_of(entry, st    317                         container_of(entry, struct jset_entry_blacklist, entry);
446                                                   318 
447                 ret = bch2_journal_seq_blackli    319                 ret = bch2_journal_seq_blacklist_add(c,
448                                 le64_to_cpu(bl    320                                 le64_to_cpu(bl_entry->seq),
449                                 le64_to_cpu(bl    321                                 le64_to_cpu(bl_entry->seq) + 1);
450                 break;                            322                 break;
451         }                                         323         }
452         case BCH_JSET_ENTRY_blacklist_v2: {       324         case BCH_JSET_ENTRY_blacklist_v2: {
453                 struct jset_entry_blacklist_v2    325                 struct jset_entry_blacklist_v2 *bl_entry =
454                         container_of(entry, st    326                         container_of(entry, struct jset_entry_blacklist_v2, entry);
455                                                   327 
456                 ret = bch2_journal_seq_blackli    328                 ret = bch2_journal_seq_blacklist_add(c,
457                                 le64_to_cpu(bl    329                                 le64_to_cpu(bl_entry->start),
458                                 le64_to_cpu(bl    330                                 le64_to_cpu(bl_entry->end) + 1);
459                 break;                            331                 break;
460         }                                         332         }
461         case BCH_JSET_ENTRY_clock: {              333         case BCH_JSET_ENTRY_clock: {
462                 struct jset_entry_clock *clock    334                 struct jset_entry_clock *clock =
463                         container_of(entry, st    335                         container_of(entry, struct jset_entry_clock, entry);
464                                                   336 
465                 atomic64_set(&c->io_clock[cloc    337                 atomic64_set(&c->io_clock[clock->rw].now, le64_to_cpu(clock->time));
466         }                                         338         }
467         }                                         339         }
468 fsck_err:                                      !! 340 
469         return ret;                               341         return ret;
470 }                                                 342 }
471                                                   343 
472 static int journal_replay_early(struct bch_fs     344 static int journal_replay_early(struct bch_fs *c,
473                                 struct bch_sb_    345                                 struct bch_sb_field_clean *clean)
474 {                                                 346 {
475         if (clean) {                              347         if (clean) {
476                 for (struct jset_entry *entry     348                 for (struct jset_entry *entry = clean->start;
477                      entry != vstruct_end(&cle    349                      entry != vstruct_end(&clean->field);
478                      entry = vstruct_next(entr    350                      entry = vstruct_next(entry)) {
479                         int ret = journal_repl    351                         int ret = journal_replay_entry_early(c, entry);
480                         if (ret)                  352                         if (ret)
481                                 return ret;       353                                 return ret;
482                 }                                 354                 }
483         } else {                                  355         } else {
484                 struct genradix_iter iter;        356                 struct genradix_iter iter;
485                 struct journal_replay *i, **_i    357                 struct journal_replay *i, **_i;
486                                                   358 
487                 genradix_for_each(&c->journal_    359                 genradix_for_each(&c->journal_entries, iter, _i) {
488                         i = *_i;                  360                         i = *_i;
489                                                   361 
490                         if (journal_replay_ign !! 362                         if (!i || i->ignore)
491                                 continue;         363                                 continue;
492                                                   364 
493                         vstruct_for_each(&i->j    365                         vstruct_for_each(&i->j, entry) {
494                                 int ret = jour    366                                 int ret = journal_replay_entry_early(c, entry);
495                                 if (ret)          367                                 if (ret)
496                                         return    368                                         return ret;
497                         }                         369                         }
498                 }                                 370                 }
499         }                                         371         }
500                                                   372 
                                                   >> 373         bch2_fs_usage_initialize(c);
                                                   >> 374 
501         return 0;                                 375         return 0;
502 }                                                 376 }
503                                                   377 
504 /* sb clean section: */                           378 /* sb clean section: */
505                                                   379 
506 static int read_btree_roots(struct bch_fs *c)     380 static int read_btree_roots(struct bch_fs *c)
507 {                                                 381 {
                                                   >> 382         unsigned i;
508         int ret = 0;                              383         int ret = 0;
509                                                   384 
510         for (unsigned i = 0; i < btree_id_nr_a !! 385         for (i = 0; i < btree_id_nr_alive(c); i++) {
511                 struct btree_root *r = bch2_bt    386                 struct btree_root *r = bch2_btree_id_root(c, i);
512                                                   387 
513                 if (!r->alive)                    388                 if (!r->alive)
514                         continue;                 389                         continue;
515                                                   390 
516                 if (btree_id_is_alloc(i) && c- !! 391                 if (btree_id_is_alloc(i) &&
                                                   >> 392                     c->opts.reconstruct_alloc) {
                                                   >> 393                         c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
517                         continue;                 394                         continue;
                                                   >> 395                 }
518                                                   396 
519                 if (mustfix_fsck_err_on((ret = !! 397                 if (r->error) {
520                                         c, btr !! 398                         __fsck_err(c,
521                                         "inval !! 399                                    btree_id_is_alloc(i)
522                                         bch2_b !! 400                                    ? FSCK_CAN_IGNORE : 0,
523                     mustfix_fsck_err_on((ret = !! 401                                    btree_root_bkey_invalid,
524                                         c, btr !! 402                                    "invalid btree root %s",
525                                         "error !! 403                                    bch2_btree_id_str(i));
526                                         bch2_b !! 404                         if (i == BTREE_ID_alloc)
527                         if (btree_id_is_alloc( << 
528                                 c->opts.recove << 
529                                 c->opts.recove << 
530                                 c->opts.recove << 
531                                 c->opts.recove << 
532                                 c->opts.recove << 
533                                 c->sb.compat &    405                                 c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
534                                 r->error = 0;  !! 406                 }
535                         } else if (!(c->opts.r << 
536                                 bch_info(c, "w << 
537                                 c->opts.recove << 
538                                 c->opts.recove << 
539                         }                      << 
540                                                   407 
                                                   >> 408                 ret = bch2_btree_root_read(c, i, &r->key, r->level);
                                                   >> 409                 if (ret) {
                                                   >> 410                         fsck_err(c,
                                                   >> 411                                  btree_root_read_error,
                                                   >> 412                                  "error reading btree root %s",
                                                   >> 413                                  bch2_btree_id_str(i));
                                                   >> 414                         if (btree_id_is_alloc(i))
                                                   >> 415                                 c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
541                         ret = 0;                  416                         ret = 0;
542                         bch2_btree_lost_data(c << 
543                 }                                 417                 }
544         }                                         418         }
545                                                   419 
546         for (unsigned i = 0; i < BTREE_ID_NR;  !! 420         for (i = 0; i < BTREE_ID_NR; i++) {
547                 struct btree_root *r = bch2_bt    421                 struct btree_root *r = bch2_btree_id_root(c, i);
548                                                   422 
549                 if (!r->b && !r->error) {      !! 423                 if (!r->b) {
550                         r->alive = false;         424                         r->alive = false;
551                         r->level = 0;             425                         r->level = 0;
552                         bch2_btree_root_alloc_ !! 426                         bch2_btree_root_alloc(c, i);
553                 }                                 427                 }
554         }                                         428         }
555 fsck_err:                                         429 fsck_err:
556         return ret;                               430         return ret;
557 }                                                 431 }
558                                                   432 
                                                   >> 433 static int bch2_initialize_subvolumes(struct bch_fs *c)
                                                   >> 434 {
                                                   >> 435         struct bkey_i_snapshot_tree     root_tree;
                                                   >> 436         struct bkey_i_snapshot          root_snapshot;
                                                   >> 437         struct bkey_i_subvolume         root_volume;
                                                   >> 438         int ret;
                                                   >> 439 
                                                   >> 440         bkey_snapshot_tree_init(&root_tree.k_i);
                                                   >> 441         root_tree.k.p.offset            = 1;
                                                   >> 442         root_tree.v.master_subvol       = cpu_to_le32(1);
                                                   >> 443         root_tree.v.root_snapshot       = cpu_to_le32(U32_MAX);
                                                   >> 444 
                                                   >> 445         bkey_snapshot_init(&root_snapshot.k_i);
                                                   >> 446         root_snapshot.k.p.offset = U32_MAX;
                                                   >> 447         root_snapshot.v.flags   = 0;
                                                   >> 448         root_snapshot.v.parent  = 0;
                                                   >> 449         root_snapshot.v.subvol  = cpu_to_le32(BCACHEFS_ROOT_SUBVOL);
                                                   >> 450         root_snapshot.v.tree    = cpu_to_le32(1);
                                                   >> 451         SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true);
                                                   >> 452 
                                                   >> 453         bkey_subvolume_init(&root_volume.k_i);
                                                   >> 454         root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL;
                                                   >> 455         root_volume.v.flags     = 0;
                                                   >> 456         root_volume.v.snapshot  = cpu_to_le32(U32_MAX);
                                                   >> 457         root_volume.v.inode     = cpu_to_le64(BCACHEFS_ROOT_INO);
                                                   >> 458 
                                                   >> 459         ret =   bch2_btree_insert(c, BTREE_ID_snapshot_trees,   &root_tree.k_i, NULL, 0) ?:
                                                   >> 460                 bch2_btree_insert(c, BTREE_ID_snapshots,        &root_snapshot.k_i, NULL, 0) ?:
                                                   >> 461                 bch2_btree_insert(c, BTREE_ID_subvolumes,       &root_volume.k_i, NULL, 0);
                                                   >> 462         bch_err_fn(c, ret);
                                                   >> 463         return ret;
                                                   >> 464 }
                                                   >> 465 
                                                   >> 466 static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans)
                                                   >> 467 {
                                                   >> 468         struct btree_iter iter;
                                                   >> 469         struct bkey_s_c k;
                                                   >> 470         struct bch_inode_unpacked inode;
                                                   >> 471         int ret;
                                                   >> 472 
                                                   >> 473         k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
                                                   >> 474                                SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0);
                                                   >> 475         ret = bkey_err(k);
                                                   >> 476         if (ret)
                                                   >> 477                 return ret;
                                                   >> 478 
                                                   >> 479         if (!bkey_is_inode(k.k)) {
                                                   >> 480                 bch_err(trans->c, "root inode not found");
                                                   >> 481                 ret = -BCH_ERR_ENOENT_inode;
                                                   >> 482                 goto err;
                                                   >> 483         }
                                                   >> 484 
                                                   >> 485         ret = bch2_inode_unpack(k, &inode);
                                                   >> 486         BUG_ON(ret);
                                                   >> 487 
                                                   >> 488         inode.bi_subvol = BCACHEFS_ROOT_SUBVOL;
                                                   >> 489 
                                                   >> 490         ret = bch2_inode_write(trans, &iter, &inode);
                                                   >> 491 err:
                                                   >> 492         bch2_trans_iter_exit(trans, &iter);
                                                   >> 493         return ret;
                                                   >> 494 }
                                                   >> 495 
                                                   >> 496 /* set bi_subvol on root inode */
                                                   >> 497 noinline_for_stack
                                                   >> 498 static int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c)
                                                   >> 499 {
                                                   >> 500         int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw,
                                                   >> 501                                 __bch2_fs_upgrade_for_subvolumes(trans));
                                                   >> 502         bch_err_fn(c, ret);
                                                   >> 503         return ret;
                                                   >> 504 }
                                                   >> 505 
                                                   >> 506 const char * const bch2_recovery_passes[] = {
                                                   >> 507 #define x(_fn, ...)     #_fn,
                                                   >> 508         BCH_RECOVERY_PASSES()
                                                   >> 509 #undef x
                                                   >> 510         NULL
                                                   >> 511 };
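/*
 * Illustrative sketch, generic rather than the bcachefs printbuf code: the
 * NULL terminator above is what bounds the name table, which is why the
 * prt_bitflags() calls later in this file can be handed bch2_recovery_passes
 * and a mask with no element count.  A standalone equivalent with made-up
 * pass names:
 */
#include <stdio.h>
#include <stdint.h>

static void print_bitflags(const char * const *names, uint64_t mask)
{
	const char *sep = "";

	for (unsigned i = 0; names[i]; i++)
		if (mask & (1ULL << i)) {
			printf("%s%s", sep, names[i]);
			sep = ",";
		}
	printf("\n");
}

int main(void)
{
	static const char * const names[] = {
		"alloc_read", "journal_replay", "check_inodes", NULL
	};

	print_bitflags(names, (1ULL << 0) | (1ULL << 2));	/* alloc_read,check_inodes */
	return 0;
}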
                                                   >> 512 
                                                   >> 513 static int bch2_check_allocations(struct bch_fs *c)
                                                   >> 514 {
                                                   >> 515         return bch2_gc(c, true, c->opts.norecovery);
                                                   >> 516 }
                                                   >> 517 
                                                   >> 518 static int bch2_set_may_go_rw(struct bch_fs *c)
                                                   >> 519 {
                                                   >> 520         struct journal_keys *keys = &c->journal_keys;
                                                   >> 521 
                                                   >> 522         /*
                                                   >> 523          * After we go RW, the journal keys buffer can't be modified (except for
                                                   >> 524          * setting journal_key->overwritten): it will be accessed by multiple
                                                   >> 525          * threads.
                                                   >> 526          */
                                                   >> 527         move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
                                                   >> 528         keys->gap = keys->nr;
                                                   >> 529 
                                                   >> 530         set_bit(BCH_FS_may_go_rw, &c->flags);
                                                   >> 531 
                                                   >> 532         if (keys->nr || c->opts.fsck || !c->sb.clean)
                                                   >> 533                 return bch2_fs_read_write_early(c);
                                                   >> 534         return 0;
                                                   >> 535 }
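/*
 * Illustrative sketch, not the bcachefs implementation: the journal keys
 * above are kept in a gap buffer -- a sorted array with one unused "gap"
 * region so insertion near the gap is cheap.  The move_gap() call above
 * pushes the gap past the last element before going read-write, leaving all
 * live entries contiguous in d[0..nr) so concurrent readers never have to
 * know where the gap was.  A generic version of the operation:
 */
#include <stddef.h>
#include <string.h>

struct gap_buf {
	int	*d;	/* backing array with 'size' slots */
	size_t	nr;	/* live elements */
	size_t	size;	/* total slots: live + gap */
	size_t	gap;	/* index where the gap starts */
};

/* Slide the gap so it starts at dst, without reordering live elements. */
static void gap_buf_move_gap(struct gap_buf *buf, size_t dst)
{
	size_t gap_len = buf->size - buf->nr;

	if (dst < buf->gap)
		/* live elements in [dst, gap) move right, past the gap */
		memmove(buf->d + dst + gap_len, buf->d + dst,
			(buf->gap - dst) * sizeof(buf->d[0]));
	else if (dst > buf->gap)
		/* live elements in [gap + gap_len, dst + gap_len) move left */
		memmove(buf->d + buf->gap, buf->d + buf->gap + gap_len,
			(dst - buf->gap) * sizeof(buf->d[0]));

	buf->gap = dst;
}
/*
 * gap_buf_move_gap(&buf, buf.nr) matches the call above: afterwards the gap
 * sits at the end and d[0..nr) can be read without further coordination, as
 * long as nobody inserts.
 */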
                                                   >> 536 
                                                   >> 537 struct recovery_pass_fn {
                                                   >> 538         int             (*fn)(struct bch_fs *);
                                                   >> 539         unsigned        when;
                                                   >> 540 };
                                                   >> 541 
                                                   >> 542 static struct recovery_pass_fn recovery_pass_fns[] = {
                                                   >> 543 #define x(_fn, _id, _when)      { .fn = bch2_##_fn, .when = _when },
                                                   >> 544         BCH_RECOVERY_PASSES()
                                                   >> 545 #undef x
                                                   >> 546 };
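/*
 * Illustrative sketch with a hypothetical pass list: bch2_recovery_passes[]
 * and recovery_pass_fns[] above are both stamped out of the single
 * BCH_RECOVERY_PASSES() x-macro, so the pass ids, name strings and
 * function/flag table can never drift out of sync.  The same pattern in
 * miniature:
 */
#include <stdio.h>

#define PASS_ALWAYS	(1U << 0)
#define PASS_FSCK	(1U << 1)

#define EXAMPLE_PASSES()			\
	x(check_widgets,  0, PASS_FSCK)		\
	x(replay_journal, 1, PASS_ALWAYS)

static int run_check_widgets(void)  { return 0; }
static int run_replay_journal(void) { return 0; }

enum example_pass {
#define x(_fn, _id, _when)	EXAMPLE_PASS_##_fn = _id,
	EXAMPLE_PASSES()
#undef x
};

static const struct {
	const char	*name;
	int		(*fn)(void);
	unsigned	when;
} example_passes[] = {
#define x(_fn, _id, _when)	{ .name = #_fn, .fn = run_##_fn, .when = _when },
	EXAMPLE_PASSES()
#undef x
};

int main(void)
{
	for (unsigned i = 0; i < sizeof(example_passes) / sizeof(example_passes[0]); i++)
		printf("%s: when=%#x ret=%d\n", example_passes[i].name,
		       example_passes[i].when, example_passes[i].fn());
	return 0;
}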
                                                   >> 547 
                                                   >> 548 u64 bch2_recovery_passes_to_stable(u64 v)
                                                   >> 549 {
                                                   >> 550         static const u8 map[] = {
                                                   >> 551 #define x(n, id, ...)   [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n,
                                                   >> 552         BCH_RECOVERY_PASSES()
                                                   >> 553 #undef x
                                                   >> 554         };
                                                   >> 555 
                                                   >> 556         u64 ret = 0;
                                                   >> 557         for (unsigned i = 0; i < ARRAY_SIZE(map); i++)
                                                   >> 558                 if (v & BIT_ULL(i))
                                                   >> 559                         ret |= BIT_ULL(map[i]);
                                                   >> 560         return ret;
                                                   >> 561 }
                                                   >> 562 
                                                   >> 563 u64 bch2_recovery_passes_from_stable(u64 v)
                                                   >> 564 {
                                                   >> 565         static const u8 map[] = {
                                                   >> 566 #define x(n, id, ...)   [BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n,
                                                   >> 567         BCH_RECOVERY_PASSES()
                                                   >> 568 #undef x
                                                   >> 569         };
                                                   >> 570 
                                                   >> 571         u64 ret = 0;
                                                   >> 572         for (unsigned i = 0; i < ARRAY_SIZE(map); i++)
                                                   >> 573                 if (v & BIT_ULL(i))
                                                   >> 574                         ret |= BIT_ULL(map[i]);
                                                   >> 575         return ret;
                                                   >> 576 }
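/*
 * Illustrative sketch with made-up numbering: both helpers above remap a
 * bitmask one bit at a time through a byte lookup table, which lets the
 * in-memory pass enum be reordered without disturbing the on-disk ("stable")
 * bit positions.  Minimal standalone form plus a round-trip check:
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t remap_bits(uint64_t v, const uint8_t *map, unsigned nr)
{
	uint64_t ret = 0;

	for (unsigned i = 0; i < nr; i++)
		if (v & (1ULL << i))
			ret |= 1ULL << map[i];
	return ret;
}

int main(void)
{
	/* hypothetical mapping: memory bit 0 <-> stable bit 2, 1 <-> 0, 2 <-> 1 */
	static const uint8_t to_stable[]   = { 2, 0, 1 };
	static const uint8_t from_stable[] = { 1, 2, 0 };

	uint64_t mem  = (1ULL << 0) | (1ULL << 2);
	uint64_t disk = remap_bits(mem, to_stable, 3);

	printf("mem=%#llx disk=%#llx back=%#llx\n",
	       (unsigned long long)mem, (unsigned long long)disk,
	       (unsigned long long)remap_bits(disk, from_stable, 3));
	return 0;
}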
                                                   >> 577 
559 static bool check_version_upgrade(struct bch_f    578 static bool check_version_upgrade(struct bch_fs *c)
560 {                                                 579 {
561         unsigned latest_version = bcachefs_met    580         unsigned latest_version = bcachefs_metadata_version_current;
562         unsigned latest_compatible = min(lates    581         unsigned latest_compatible = min(latest_version,
563                                          bch2_    582                                          bch2_latest_compatible_version(c->sb.version));
564         unsigned old_version = c->sb.version_u    583         unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version;
565         unsigned new_version = 0;                 584         unsigned new_version = 0;
566                                                   585 
567         if (old_version < bcachefs_metadata_re    586         if (old_version < bcachefs_metadata_required_upgrade_below) {
568                 if (c->opts.version_upgrade ==    587                 if (c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible ||
569                     latest_compatible < bcache    588                     latest_compatible < bcachefs_metadata_required_upgrade_below)
570                         new_version = latest_v    589                         new_version = latest_version;
571                 else                              590                 else
572                         new_version = latest_c    591                         new_version = latest_compatible;
573         } else {                                  592         } else {
574                 switch (c->opts.version_upgrad    593                 switch (c->opts.version_upgrade) {
575                 case BCH_VERSION_UPGRADE_compa    594                 case BCH_VERSION_UPGRADE_compatible:
576                         new_version = latest_c    595                         new_version = latest_compatible;
577                         break;                    596                         break;
578                 case BCH_VERSION_UPGRADE_incom    597                 case BCH_VERSION_UPGRADE_incompatible:
579                         new_version = latest_v    598                         new_version = latest_version;
580                         break;                    599                         break;
581                 case BCH_VERSION_UPGRADE_none:    600                 case BCH_VERSION_UPGRADE_none:
582                         new_version = min(old_    601                         new_version = min(old_version, latest_version);
583                         break;                    602                         break;
584                 }                                 603                 }
585         }                                         604         }
586                                                   605 
587         if (new_version > old_version) {          606         if (new_version > old_version) {
588                 struct printbuf buf = PRINTBUF    607                 struct printbuf buf = PRINTBUF;
589                                                   608 
590                 if (old_version < bcachefs_met    609                 if (old_version < bcachefs_metadata_required_upgrade_below)
591                         prt_str(&buf, "Version    610                         prt_str(&buf, "Version upgrade required:\n");
592                                                   611 
593                 if (old_version != c->sb.versi    612                 if (old_version != c->sb.version) {
594                         prt_str(&buf, "Version    613                         prt_str(&buf, "Version upgrade from ");
595                         bch2_version_to_text(&    614                         bch2_version_to_text(&buf, c->sb.version_upgrade_complete);
596                         prt_str(&buf, " to ");    615                         prt_str(&buf, " to ");
597                         bch2_version_to_text(&    616                         bch2_version_to_text(&buf, c->sb.version);
598                         prt_str(&buf, " incomp    617                         prt_str(&buf, " incomplete\n");
599                 }                                 618                 }
600                                                   619 
601                 prt_printf(&buf, "Doing %s ver    620                 prt_printf(&buf, "Doing %s version upgrade from ",
602                            BCH_VERSION_MAJOR(o    621                            BCH_VERSION_MAJOR(old_version) != BCH_VERSION_MAJOR(new_version)
603                            ? "incompatible" :     622                            ? "incompatible" : "compatible");
604                 bch2_version_to_text(&buf, old    623                 bch2_version_to_text(&buf, old_version);
605                 prt_str(&buf, " to ");            624                 prt_str(&buf, " to ");
606                 bch2_version_to_text(&buf, new    625                 bch2_version_to_text(&buf, new_version);
607                 prt_newline(&buf);                626                 prt_newline(&buf);
608                                                   627 
609                 struct bch_sb_field_ext *ext =    628                 struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
610                 __le64 passes = ext->recovery_    629                 __le64 passes = ext->recovery_passes_required[0];
611                 bch2_sb_set_upgrade(c, old_ver    630                 bch2_sb_set_upgrade(c, old_version, new_version);
612                 passes = ext->recovery_passes_    631                 passes = ext->recovery_passes_required[0] & ~passes;
613                                                   632 
614                 if (passes) {                     633                 if (passes) {
615                         prt_str(&buf, "  runni    634                         prt_str(&buf, "  running recovery passes: ");
616                         prt_bitflags(&buf, bch    635                         prt_bitflags(&buf, bch2_recovery_passes,
617                                      bch2_reco    636                                      bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
618                 }                                 637                 }
619                                                   638 
620                 bch_info(c, "%s", buf.buf);       639                 bch_info(c, "%s", buf.buf);
621                                                   640 
622                 bch2_sb_upgrade(c, new_version    641                 bch2_sb_upgrade(c, new_version);
623                                                   642 
624                 printbuf_exit(&buf);              643                 printbuf_exit(&buf);
625                 return true;                      644                 return true;
626         }                                         645         }
627                                                   646 
628         return false;                             647         return false;
629 }                                                 648 }
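/*
 * Illustrative restatement, not kernel code: the nested decision above,
 * pulled out into a standalone function over plain integers.  The enum is a
 * made-up stand-in for the BCH_VERSION_UPGRADE_* options, required_below
 * stands in for bcachefs_metadata_required_upgrade_below, and
 * check_version_upgrade() only acts when the result exceeds old_version.
 */
enum upgrade_opt { UPGRADE_compatible, UPGRADE_incompatible, UPGRADE_none };

static unsigned pick_new_version(unsigned old_version, unsigned latest_version,
				 unsigned latest_compatible, unsigned required_below,
				 enum upgrade_opt opt)
{
	/* too old to use as-is: upgrade at least far enough to be mountable */
	if (old_version < required_below)
		return (opt == UPGRADE_incompatible ||
			latest_compatible < required_below)
			? latest_version : latest_compatible;

	switch (opt) {
	case UPGRADE_compatible:	return latest_compatible;
	case UPGRADE_incompatible:	return latest_version;
	case UPGRADE_none:		return old_version < latest_version
						? old_version : latest_version;
	}
	return old_version;	/* unreachable with the three options above */
}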
630                                                   649 
                                                   >> 650 u64 bch2_fsck_recovery_passes(void)
                                                   >> 651 {
                                                   >> 652         u64 ret = 0;
                                                   >> 653 
                                                   >> 654         for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++)
                                                   >> 655                 if (recovery_pass_fns[i].when & PASS_FSCK)
                                                   >> 656                         ret |= BIT_ULL(i);
                                                   >> 657         return ret;
                                                   >> 658 }
                                                   >> 659 
                                                   >> 660 static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
                                                   >> 661 {
                                                   >> 662         struct recovery_pass_fn *p = recovery_pass_fns + pass;
                                                   >> 663 
                                                   >> 664         if (c->opts.norecovery && pass > BCH_RECOVERY_PASS_snapshots_read)
                                                   >> 665                 return false;
                                                   >> 666         if (c->recovery_passes_explicit & BIT_ULL(pass))
                                                   >> 667                 return true;
                                                   >> 668         if ((p->when & PASS_FSCK) && c->opts.fsck)
                                                   >> 669                 return true;
                                                   >> 670         if ((p->when & PASS_UNCLEAN) && !c->sb.clean)
                                                   >> 671                 return true;
                                                   >> 672         if (p->when & PASS_ALWAYS)
                                                   >> 673                 return true;
                                                   >> 674         return false;
                                                   >> 675 }
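/*
 * Reading the checks above in order: norecovery vetoes every pass after
 * snapshots_read; otherwise a pass runs if it was explicitly requested via
 * recovery_passes_explicit, if it is a PASS_FSCK pass and fsck was asked for,
 * if it is a PASS_UNCLEAN pass and the superblock is not marked clean, or if
 * it is flagged PASS_ALWAYS.
 */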
                                                   >> 676 
                                                   >> 677 static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
                                                   >> 678 {
                                                   >> 679         struct recovery_pass_fn *p = recovery_pass_fns + pass;
                                                   >> 680         int ret;
                                                   >> 681 
                                                   >> 682         if (!(p->when & PASS_SILENT))
                                                   >> 683                 bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."),
                                                   >> 684                            bch2_recovery_passes[pass]);
                                                   >> 685         ret = p->fn(c);
                                                   >> 686         if (ret)
                                                   >> 687                 return ret;
                                                   >> 688         if (!(p->when & PASS_SILENT))
                                                   >> 689                 bch2_print(c, KERN_CONT " done\n");
                                                   >> 690 
                                                   >> 691         return 0;
                                                   >> 692 }
                                                   >> 693 
                                                   >> 694 static int bch2_run_recovery_passes(struct bch_fs *c)
                                                   >> 695 {
                                                   >> 696         int ret = 0;
                                                   >> 697 
                                                   >> 698         while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) {
                                                   >> 699                 if (should_run_recovery_pass(c, c->curr_recovery_pass)) {
                                                   >> 700                         unsigned pass = c->curr_recovery_pass;
                                                   >> 701 
                                                   >> 702                         ret = bch2_run_recovery_pass(c, c->curr_recovery_pass);
                                                   >> 703                         if (bch2_err_matches(ret, BCH_ERR_restart_recovery) ||
                                                   >> 704                             (ret && c->curr_recovery_pass < pass))
                                                   >> 705                                 continue;
                                                   >> 706                         if (ret)
                                                   >> 707                                 break;
                                                   >> 708 
                                                   >> 709                         c->recovery_passes_complete |= BIT_ULL(c->curr_recovery_pass);
                                                   >> 710                 }
                                                   >> 711                 c->curr_recovery_pass++;
                                                   >> 712                 c->recovery_pass_done = max(c->recovery_pass_done, c->curr_recovery_pass);
                                                   >> 713         }
                                                   >> 714 
                                                   >> 715         return ret;
                                                   >> 716 }
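/*
 * Illustrative sketch with hypothetical passes: the loop above lets a pass
 * rewind recovery -- when a pass fails after pointing curr_recovery_pass at
 * an earlier pass, the loop continues from that earlier pass instead of
 * giving up.  The same control flow in miniature:
 */
#include <stdio.h>

#define NR_PASSES	3

static unsigned curr_pass;
static int ran_fixup;

static int pass_a(void) { return 0; }

static int pass_b(void)
{
	if (!ran_fixup) {
		ran_fixup = 1;
		curr_pass = 0;	/* rewind: rerun from pass_a first */
		return -1;	/* nonzero return takes the rewind path below */
	}
	return 0;
}

static int pass_c(void) { return 0; }

static int (* const passes[NR_PASSES])(void) = { pass_a, pass_b, pass_c };

int main(void)
{
	while (curr_pass < NR_PASSES) {
		unsigned pass = curr_pass;
		int ret = passes[curr_pass]();

		printf("pass %u -> %d\n", pass, ret);

		if (ret && curr_pass < pass)
			continue;	/* the pass rewound us; resume from there */
		if (ret)
			return 1;	/* genuine failure */
		curr_pass++;
	}
	return 0;
}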
                                                   >> 717 
                                                   >> 718 int bch2_run_online_recovery_passes(struct bch_fs *c)
                                                   >> 719 {
                                                   >> 720         int ret = 0;
                                                   >> 721 
                                                   >> 722         for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) {
                                                   >> 723                 struct recovery_pass_fn *p = recovery_pass_fns + i;
                                                   >> 724 
                                                   >> 725                 if (!(p->when & PASS_ONLINE))
                                                   >> 726                         continue;
                                                   >> 727 
                                                   >> 728                 ret = bch2_run_recovery_pass(c, i);
                                                   >> 729                 if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) {
                                                   >> 730                         i = c->curr_recovery_pass;
                                                   >> 731                         continue;
                                                   >> 732                 }
                                                   >> 733                 if (ret)
                                                   >> 734                         break;
                                                   >> 735         }
                                                   >> 736 
                                                   >> 737         return ret;
                                                   >> 738 }
                                                   >> 739 
631 int bch2_fs_recovery(struct bch_fs *c)            740 int bch2_fs_recovery(struct bch_fs *c)
632 {                                                 741 {
633         struct bch_sb_field_clean *clean = NUL    742         struct bch_sb_field_clean *clean = NULL;
634         struct jset *last_journal_entry = NULL    743         struct jset *last_journal_entry = NULL;
635         u64 last_seq = 0, blacklist_seq, journ    744         u64 last_seq = 0, blacklist_seq, journal_seq;
636         int ret = 0;                              745         int ret = 0;
637                                                   746 
638         if (c->sb.clean) {                        747         if (c->sb.clean) {
639                 clean = bch2_read_superblock_c    748                 clean = bch2_read_superblock_clean(c);
640                 ret = PTR_ERR_OR_ZERO(clean);     749                 ret = PTR_ERR_OR_ZERO(clean);
641                 if (ret)                          750                 if (ret)
642                         goto err;                 751                         goto err;
643                                                   752 
644                 bch_info(c, "recovering from c    753                 bch_info(c, "recovering from clean shutdown, journal seq %llu",
645                          le64_to_cpu(clean->jo    754                          le64_to_cpu(clean->journal_seq));
646         } else {                                  755         } else {
647                 bch_info(c, "recovering from u    756                 bch_info(c, "recovering from unclean shutdown");
648         }                                         757         }
649                                                   758 
650         if (!(c->sb.features & (1ULL << BCH_FE    759         if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) {
651                 bch_err(c, "feature new_extent    760                 bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported");
652                 ret = -EINVAL;                    761                 ret = -EINVAL;
653                 goto err;                         762                 goto err;
654         }                                         763         }
655                                                   764 
656         if (!c->sb.clean &&                       765         if (!c->sb.clean &&
657             !(c->sb.features & (1ULL << BCH_FE    766             !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) {
658                 bch_err(c, "filesystem needs r    767                 bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix");
659                 ret = -EINVAL;                    768                 ret = -EINVAL;
660                 goto err;                         769                 goto err;
661         }                                         770         }
662                                                   771 
663         if (c->opts.norecovery)                !! 772         if (c->opts.fsck && c->opts.norecovery) {
664                 c->opts.recovery_pass_last = B !! 773                 bch_err(c, "cannot select both norecovery and fsck");
665                                                !! 774                 ret = -EINVAL;
666         mutex_lock(&c->sb_lock);               !! 775                 goto err;
667         struct bch_sb_field_ext *ext = bch2_sb << 
668         bool write_sb = false;                 << 
669                                                << 
670         if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk << 
671                 ext->recovery_passes_required[ << 
672                         cpu_to_le64(bch2_recov << 
673                 write_sb = true;               << 
674         }                                         776         }
675                                                   777 
676         u64 sb_passes = bch2_recovery_passes_f !! 778         if (!c->opts.nochanges) {
677         if (sb_passes) {                       !! 779                 mutex_lock(&c->sb_lock);
678                 struct printbuf buf = PRINTBUF !! 780                 bool write_sb = false;
679                 prt_str(&buf, "superblock requ << 
680                 prt_bitflags(&buf, bch2_recove << 
681                 bch_info(c, "%s", buf.buf);    << 
682                 printbuf_exit(&buf);           << 
683         }                                      << 
684                                                   781 
685         if (bch2_check_version_downgrade(c)) { !! 782                 struct bch_sb_field_ext *ext =
686                 struct printbuf buf = PRINTBUF !! 783                         bch2_sb_field_get_minsize(&c->disk_sb, ext, sizeof(*ext) / sizeof(u64));
                                                   >> 784                 if (!ext) {
                                                   >> 785                         ret = -BCH_ERR_ENOSPC_sb;
                                                   >> 786                         mutex_unlock(&c->sb_lock);
                                                   >> 787                         goto err;
                                                   >> 788                 }
687                                                   789 
688                 prt_str(&buf, "Version downgra !! 790                 if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) {
                                                   >> 791                         ext->recovery_passes_required[0] |=
                                                   >> 792                                 cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(BCH_RECOVERY_PASS_check_topology)));
                                                   >> 793                         write_sb = true;
                                                   >> 794                 }
689                                                   795 
690                 __le64 passes = ext->recovery_ !! 796                 u64 sb_passes = bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
691                 bch2_sb_set_downgrade(c,       !! 797                 if (sb_passes) {
692                                       BCH_VERS !! 798                         struct printbuf buf = PRINTBUF;
693                                       BCH_VERS !! 799                         prt_str(&buf, "superblock requires following recovery passes to be run:\n  ");
694                 passes = ext->recovery_passes_ !! 800                         prt_bitflags(&buf, bch2_recovery_passes, sb_passes);
695                 if (passes) {                  !! 801                         bch_info(c, "%s", buf.buf);
696                         prt_str(&buf, "\n  run !! 802                         printbuf_exit(&buf);
697                         prt_bitflags(&buf, bch << 
698                                      bch2_reco << 
699                 }                                 803                 }
700                                                   804 
701                 bch_info(c, "%s", buf.buf);    !! 805                 if (bch2_check_version_downgrade(c)) {
702                 printbuf_exit(&buf);           !! 806                         struct printbuf buf = PRINTBUF;
703                 write_sb = true;               << 
704         }                                      << 
705                                                   807 
706         if (check_version_upgrade(c))          !! 808                         prt_str(&buf, "Version downgrade required:");
707                 write_sb = true;               << 
708                                                   809 
709         c->opts.recovery_passes |= bch2_recove !! 810                         __le64 passes = ext->recovery_passes_required[0];
                                                   >> 811                         bch2_sb_set_downgrade(c,
                                                   >> 812                                         BCH_VERSION_MINOR(bcachefs_metadata_version_current),
                                                   >> 813                                         BCH_VERSION_MINOR(c->sb.version));
                                                   >> 814                         passes = ext->recovery_passes_required[0] & ~passes;
                                                   >> 815                         if (passes) {
                                                   >> 816                                 prt_str(&buf, "\n  running recovery passes: ");
                                                   >> 817                                 prt_bitflags(&buf, bch2_recovery_passes,
                                                   >> 818                                              bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
                                                   >> 819                         }
710                                                   820 
711         if (write_sb)                          !! 821                         bch_info(c, "%s", buf.buf);
712                 bch2_write_super(c);           !! 822                         printbuf_exit(&buf);
713         mutex_unlock(&c->sb_lock);             !! 823                         write_sb = true;
                                                   >> 824                 }
                                                   >> 825 
                                                   >> 826                 if (check_version_upgrade(c))
                                                   >> 827                         write_sb = true;
                                                   >> 828 
                                                   >> 829                 if (write_sb)
                                                   >> 830                         bch2_write_super(c);
                                                   >> 831 
                                                   >> 832                 c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
                                                   >> 833                 mutex_unlock(&c->sb_lock);
                                                   >> 834         }
714                                                   835 
715         if (c->opts.fsck && IS_ENABLED(CONFIG_    836         if (c->opts.fsck && IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
716                 c->opts.recovery_passes |= BIT !! 837                 c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology);
717                                                   838 
718         if (c->opts.fsck)                         839         if (c->opts.fsck)
719                 set_bit(BCH_FS_fsck_running, &    840                 set_bit(BCH_FS_fsck_running, &c->flags);
720         if (c->sb.clean)                       << 
721                 set_bit(BCH_FS_clean_recovery, << 
722                                                   841 
723         ret = bch2_blacklist_table_initialize(    842         ret = bch2_blacklist_table_initialize(c);
724         if (ret) {                                843         if (ret) {
725                 bch_err(c, "error initializing    844                 bch_err(c, "error initializing blacklist table");
726                 goto err;                         845                 goto err;
727         }                                         846         }
728                                                   847 
729         bch2_journal_pos_from_member_info_resu !! 848         if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) {
730                                                << 
731         if (!c->sb.clean || c->opts.retain_rec << 
732                 struct genradix_iter iter;        849                 struct genradix_iter iter;
733                 struct journal_replay **i;        850                 struct journal_replay **i;
734                                                   851 
735                 bch_verbose(c, "starting journ    852                 bch_verbose(c, "starting journal read");
736                 ret = bch2_journal_read(c, &la    853                 ret = bch2_journal_read(c, &last_seq, &blacklist_seq, &journal_seq);
737                 if (ret)                          854                 if (ret)
738                         goto err;                 855                         goto err;
739                                                   856 
740                 /*                                857                 /*
741                  * note: cmd_list_journal need    858                  * note: cmd_list_journal needs the blacklist table fully up to date so
742                  * it can asterisk ignored jou    859                  * it can asterisk ignored journal entries:
743                  */                               860                  */
744                 if (c->opts.read_journal_only)    861                 if (c->opts.read_journal_only)
745                         goto out;                 862                         goto out;
746                                                   863 
747                 genradix_for_each_reverse(&c->    864                 genradix_for_each_reverse(&c->journal_entries, iter, i)
748                         if (!journal_replay_ig !! 865                         if (*i && !(*i)->ignore) {
749                                 last_journal_e    866                                 last_journal_entry = &(*i)->j;
750                                 break;            867                                 break;
751                         }                         868                         }
752                                                   869 
753                 if (mustfix_fsck_err_on(c->sb.    870                 if (mustfix_fsck_err_on(c->sb.clean &&
754                                         last_j    871                                         last_journal_entry &&
755                                         !journ    872                                         !journal_entry_empty(last_journal_entry), c,
756                                 clean_but_jour    873                                 clean_but_journal_not_empty,
757                                 "filesystem ma    874                                 "filesystem marked clean but journal not empty")) {
758                         c->sb.compat &= ~(1ULL    875                         c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
759                         SET_BCH_SB_CLEAN(c->di    876                         SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
760                         c->sb.clean = false;      877                         c->sb.clean = false;
761                 }                                 878                 }
762                                                   879 
763                 if (!last_journal_entry) {        880                 if (!last_journal_entry) {
764                         fsck_err_on(!c->sb.cle    881                         fsck_err_on(!c->sb.clean, c,
765                                     dirty_but_    882                                     dirty_but_no_journal_entries,
766                                     "no journa    883                                     "no journal entries found");
767                         if (clean)                884                         if (clean)
768                                 goto use_clean    885                                 goto use_clean;
769                                                   886 
770                         genradix_for_each_reve    887                         genradix_for_each_reverse(&c->journal_entries, iter, i)
771                                 if (*i) {         888                                 if (*i) {
772                                         last_j    889                                         last_journal_entry = &(*i)->j;
773                                         (*i)-> !! 890                                         (*i)->ignore = false;
774                                         (*i)-> << 
775                                         /*        891                                         /*
776                                          * Thi    892                                          * This was probably a NO_FLUSH entry,
777                                          * so     893                                          * so last_seq was garbage - but we know
778                                          * we'    894                                          * we're only using a single journal
779                                          * ent    895                                          * entry, set it here:
780                                          */       896                                          */
781                                         (*i)->    897                                         (*i)->j.last_seq = (*i)->j.seq;
782                                         break;    898                                         break;
783                                 }                 899                                 }
784                 }                                 900                 }
785                                                   901 
786                 ret = bch2_journal_keys_sort(c    902                 ret = bch2_journal_keys_sort(c);
787                 if (ret)                          903                 if (ret)
788                         goto err;                 904                         goto err;
789                                                   905 
790                 if (c->sb.clean && last_journa    906                 if (c->sb.clean && last_journal_entry) {
791                         ret = bch2_verify_supe    907                         ret = bch2_verify_superblock_clean(c, &clean,
792                                                   908                                                       last_journal_entry);
793                         if (ret)                  909                         if (ret)
794                                 goto err;         910                                 goto err;
795                 }                                 911                 }
796         } else {                                  912         } else {
797 use_clean:                                        913 use_clean:
798                 if (!clean) {                     914                 if (!clean) {
799                         bch_err(c, "no superbl    915                         bch_err(c, "no superblock clean section found");
800                         ret = -BCH_ERR_fsck_re    916                         ret = -BCH_ERR_fsck_repair_impossible;
801                         goto err;                 917                         goto err;
802                                                   918 
803                 }                                 919                 }
804                 blacklist_seq = journal_seq =     920                 blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1;
805         }                                         921         }
806                                                   922 
807         c->journal_replay_seq_start     = last    923         c->journal_replay_seq_start     = last_seq;
808         c->journal_replay_seq_end       = blac    924         c->journal_replay_seq_end       = blacklist_seq - 1;
809                                                   925 
810         if (c->opts.reconstruct_alloc)         !! 926         if (c->opts.reconstruct_alloc) {
811                 bch2_reconstruct_alloc(c);     !! 927                 c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
                                                   >> 928                 drop_alloc_keys(&c->journal_keys);
                                                   >> 929         }
812                                                   930 
813         zero_out_btree_mem_ptr(&c->journal_key    931         zero_out_btree_mem_ptr(&c->journal_keys);
814                                                   932 
815         ret = journal_replay_early(c, clean);     933         ret = journal_replay_early(c, clean);
816         if (ret)                                  934         if (ret)
817                 goto err;                         935                 goto err;
818                                                   936 
819         /*                                        937         /*
820          * After an unclean shutdown, skip the    938          * After an unclean shutdown, skip the next few journal sequence
821          * numbers as they may have been refer    939          * numbers as they may have been referenced by btree writes that
822          * happened before their corresponding    940          * happened before their corresponding journal writes - those btree
823          * writes need to be ignored, by skipp    941          * writes need to be ignored, by skipping and blacklisting the next few
824          * journal sequence numbers:              942          * journal sequence numbers:
825          */                                       943          */
826         if (!c->sb.clean)                         944         if (!c->sb.clean)
827                 journal_seq += 8;                 945                 journal_seq += 8;
828                                                   946 
829         if (blacklist_seq != journal_seq) {       947         if (blacklist_seq != journal_seq) {
830                 ret =   bch2_journal_log_msg(c    948                 ret =   bch2_journal_log_msg(c, "blacklisting entries %llu-%llu",
831                                              b    949                                              blacklist_seq, journal_seq) ?:
832                         bch2_journal_seq_black    950                         bch2_journal_seq_blacklist_add(c,
833                                         blackl    951                                         blacklist_seq, journal_seq);
834                 if (ret) {                        952                 if (ret) {
835                         bch_err_msg(c, ret, "e !! 953                         bch_err(c, "error creating new journal seq blacklist entry");
836                         goto err;                 954                         goto err;
837                 }                                 955                 }
838         }                                         956         }
839                                                   957 
840         ret =   bch2_journal_log_msg(c, "start    958         ret =   bch2_journal_log_msg(c, "starting journal at entry %llu, replaying %llu-%llu",
841                                      journal_s    959                                      journal_seq, last_seq, blacklist_seq - 1) ?:
842                 bch2_fs_journal_start(&c->jour    960                 bch2_fs_journal_start(&c->journal, journal_seq);
843         if (ret)                                  961         if (ret)
844                 goto err;                         962                 goto err;
845                                                   963 
                                                   >> 964         if (c->opts.reconstruct_alloc)
                                                   >> 965                 bch2_journal_log_msg(c, "dropping alloc info");
                                                   >> 966 
846         /*                                        967         /*
847          * Skip past versions that might have     968          * Skip past versions that might have possibly been used (as nonces),
848          * but hadn't had their pointers writt    969          * but hadn't had their pointers written:
849          */                                       970          */
850         if (c->sb.encryption_type && !c->sb.cl    971         if (c->sb.encryption_type && !c->sb.clean)
851                 atomic64_add(1 << 16, &c->key_    972                 atomic64_add(1 << 16, &c->key_version);
852                                                   973 
853         ret = read_btree_roots(c);                974         ret = read_btree_roots(c);
854         if (ret)                                  975         if (ret)
855                 goto err;                         976                 goto err;
856                                                   977 
857         set_bit(BCH_FS_btree_running, &c->flag << 
858                                                << 
859         ret = bch2_sb_set_upgrade_extra(c);    << 
860                                                << 
861         ret = bch2_run_recovery_passes(c);        978         ret = bch2_run_recovery_passes(c);
862         if (ret)                                  979         if (ret)
863                 goto err;                         980                 goto err;
864                                                   981 
865         /*                                     << 
866          * Normally set by the appropriate rec << 
867          * indicates we're in early recovery a << 
868          * being applied to the journal replay << 
869          * multithreaded use:                  << 
870          */                                    << 
871         set_bit(BCH_FS_may_go_rw, &c->flags);  << 
872         clear_bit(BCH_FS_fsck_running, &c->fla    982         clear_bit(BCH_FS_fsck_running, &c->flags);
873                                                   983 
874         /* in case we don't run journal replay << 
875         set_bit(BCH_FS_accounting_replay_done, << 
876                                                << 
877         /* fsync if we fixed errors */         << 
878         if (test_bit(BCH_FS_errors_fixed, &c-> << 
879             bch2_write_ref_tryget(c, BCH_WRITE << 
880                 bch2_journal_flush_all_pins(&c << 
881                 bch2_journal_meta(&c->journal) << 
882                 bch2_write_ref_put(c, BCH_WRIT << 
883         }                                      << 
884                                                << 
885         /* If we fixed errors, verify that fs     984         /* If we fixed errors, verify that fs is actually clean now: */
886         if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)     985         if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
887             test_bit(BCH_FS_errors_fixed, &c->    986             test_bit(BCH_FS_errors_fixed, &c->flags) &&
888             !test_bit(BCH_FS_errors_not_fixed,    987             !test_bit(BCH_FS_errors_not_fixed, &c->flags) &&
889             !test_bit(BCH_FS_error, &c->flags)    988             !test_bit(BCH_FS_error, &c->flags)) {
890                 bch2_flush_fsck_errs(c);          989                 bch2_flush_fsck_errs(c);
891                                                   990 
892                 bch_info(c, "Fixed errors, run    991                 bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean");
893                 clear_bit(BCH_FS_errors_fixed,    992                 clear_bit(BCH_FS_errors_fixed, &c->flags);
894                                                   993 
895                 c->curr_recovery_pass = BCH_RE    994                 c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info;
896                                                   995 
897                 ret = bch2_run_recovery_passes    996                 ret = bch2_run_recovery_passes(c);
898                 if (ret)                          997                 if (ret)
899                         goto err;                 998                         goto err;
900                                                   999 
901                 if (test_bit(BCH_FS_errors_fix    1000                 if (test_bit(BCH_FS_errors_fixed, &c->flags) ||
902                     test_bit(BCH_FS_errors_not    1001                     test_bit(BCH_FS_errors_not_fixed, &c->flags)) {
903                         bch_err(c, "Second fsc    1002                         bch_err(c, "Second fsck run was not clean");
904                         set_bit(BCH_FS_errors_    1003                         set_bit(BCH_FS_errors_not_fixed, &c->flags);
905                 }                                 1004                 }
906                                                   1005 
907                 set_bit(BCH_FS_errors_fixed, &    1006                 set_bit(BCH_FS_errors_fixed, &c->flags);
908         }                                         1007         }
909                                                   1008 
910         if (enabled_qtypes(c)) {                  1009         if (enabled_qtypes(c)) {
911                 bch_verbose(c, "reading quotas    1010                 bch_verbose(c, "reading quotas");
912                 ret = bch2_fs_quota_read(c);      1011                 ret = bch2_fs_quota_read(c);
913                 if (ret)                          1012                 if (ret)
914                         goto err;                 1013                         goto err;
915                 bch_verbose(c, "quotas done");    1014                 bch_verbose(c, "quotas done");
916         }                                         1015         }
917                                                   1016 
918         mutex_lock(&c->sb_lock);                  1017         mutex_lock(&c->sb_lock);
919         ext = bch2_sb_field_get(c->disk_sb.sb, !! 1018         bool write_sb = false;
920         write_sb = false;                      << 
921                                                   1019 
922         if (BCH_SB_VERSION_UPGRADE_COMPLETE(c-    1020         if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) != le16_to_cpu(c->disk_sb.sb->version)) {
923                 SET_BCH_SB_VERSION_UPGRADE_COM    1021                 SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, le16_to_cpu(c->disk_sb.sb->version));
924                 write_sb = true;                  1022                 write_sb = true;
925         }                                         1023         }
926                                                   1024 
927         if (!test_bit(BCH_FS_error, &c->flags)    1025         if (!test_bit(BCH_FS_error, &c->flags) &&
928             !(c->disk_sb.sb->compat[0] & cpu_t    1026             !(c->disk_sb.sb->compat[0] & cpu_to_le64(1ULL << BCH_COMPAT_alloc_info))) {
929                 c->disk_sb.sb->compat[0] |= cp    1027                 c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info);
930                 write_sb = true;                  1028                 write_sb = true;
931         }                                         1029         }
932                                                   1030 
933         if (!test_bit(BCH_FS_error, &c->flags) !! 1031         if (!test_bit(BCH_FS_error, &c->flags)) {
934             !bch2_is_zero(ext->errors_silent,  !! 1032                 struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
935                 memset(ext->errors_silent, 0,  !! 1033                 if (ext &&
936                 write_sb = true;               !! 1034                     (!bch2_is_zero(ext->recovery_passes_required, sizeof(ext->recovery_passes_required)) ||
937         }                                      !! 1035                      !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent)))) {
938                                                !! 1036                         memset(ext->recovery_passes_required, 0, sizeof(ext->recovery_passes_required));
939         if (c->opts.fsck &&                    !! 1037                         memset(ext->errors_silent, 0, sizeof(ext->errors_silent));
940             !test_bit(BCH_FS_error, &c->flags) !! 1038                         write_sb = true;
941             c->recovery_pass_done == BCH_RECOV !! 1039                 }
942             ext->btrees_lost_data) {           << 
943                 ext->btrees_lost_data = 0;     << 
944                 write_sb = true;               << 
945         }                                         1040         }
946                                                   1041 
947         if (c->opts.fsck &&                       1042         if (c->opts.fsck &&
948             !test_bit(BCH_FS_error, &c->flags)    1043             !test_bit(BCH_FS_error, &c->flags) &&
949             !test_bit(BCH_FS_errors_not_fixed,    1044             !test_bit(BCH_FS_errors_not_fixed, &c->flags)) {
950                 SET_BCH_SB_HAS_ERRORS(c->disk_    1045                 SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0);
951                 SET_BCH_SB_HAS_TOPOLOGY_ERRORS    1046                 SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 0);
952                 write_sb = true;                  1047                 write_sb = true;
953         }                                         1048         }
954                                                   1049 
955         if (bch2_blacklist_entries_gc(c))      << 
956                 write_sb = true;               << 
957                                                << 
958         if (write_sb)                             1050         if (write_sb)
959                 bch2_write_super(c);              1051                 bch2_write_super(c);
960         mutex_unlock(&c->sb_lock);                1052         mutex_unlock(&c->sb_lock);
961                                                   1053 
962         if (!(c->sb.compat & (1ULL << BCH_COMP    1054         if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) ||
963             c->sb.version_min < bcachefs_metad    1055             c->sb.version_min < bcachefs_metadata_version_btree_ptr_sectors_written) {
964                 struct bch_move_stats stats;      1056                 struct bch_move_stats stats;
965                                                   1057 
966                 bch2_move_stats_init(&stats, "    1058                 bch2_move_stats_init(&stats, "recovery");
967                                                   1059 
968                 struct printbuf buf = PRINTBUF    1060                 struct printbuf buf = PRINTBUF;
969                 bch2_version_to_text(&buf, c->    1061                 bch2_version_to_text(&buf, c->sb.version_min);
970                 bch_info(c, "scanning for old     1062                 bch_info(c, "scanning for old btree nodes: min_version %s", buf.buf);
971                 printbuf_exit(&buf);              1063                 printbuf_exit(&buf);
972                                                   1064 
973                 ret =   bch2_fs_read_write_ear    1065                 ret =   bch2_fs_read_write_early(c) ?:
974                         bch2_scan_old_btree_no    1066                         bch2_scan_old_btree_nodes(c, &stats);
975                 if (ret)                          1067                 if (ret)
976                         goto err;                 1068                         goto err;
977                 bch_info(c, "scanning for old     1069                 bch_info(c, "scanning for old btree nodes done");
978         }                                         1070         }
979                                                   1071 
                                                   >> 1072         if (c->journal_seq_blacklist_table &&
                                                   >> 1073             c->journal_seq_blacklist_table->nr > 128)
                                                   >> 1074                 queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work);
                                                   >> 1075 
980         ret = 0;                                  1076         ret = 0;
981 out:                                              1077 out:
982         bch2_flush_fsck_errs(c);                  1078         bch2_flush_fsck_errs(c);
983                                                   1079 
984         if (!c->opts.retain_recovery_info) {   !! 1080         if (!c->opts.keep_journal &&
                                                   >> 1081             test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
985                 bch2_journal_keys_put_initial(    1082                 bch2_journal_keys_put_initial(c);
986                 bch2_find_btree_nodes_exit(&c->found_btree_nodes);            !! 1083         kfree(clean);
987         }                                      << 
988         if (!IS_ERR(clean))                    << 
989                 kfree(clean);                  << 
990                                                   1084 
991         if (!ret &&                               1085         if (!ret &&
992             test_bit(BCH_FS_need_delete_dead_s    1086             test_bit(BCH_FS_need_delete_dead_snapshots, &c->flags) &&
993             !c->opts.nochanges) {                 1087             !c->opts.nochanges) {
994                 bch2_fs_read_write_early(c);      1088                 bch2_fs_read_write_early(c);
995                 bch2_delete_dead_snapshots_asy    1089                 bch2_delete_dead_snapshots_async(c);
996         }                                         1090         }
997                                                   1091 
998         bch_err_fn(c, ret);                       1092         bch_err_fn(c, ret);
999         return ret;                               1093         return ret;
1000 err:                                             1094 err:
1001 fsck_err:                                        1095 fsck_err:
1002         bch2_fs_emergency_read_only(c);          1096         bch2_fs_emergency_read_only(c);
1003         goto out;                                1097         goto out;
1004 }                                                1098 }
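
        The recovery path above batches its superblock fixups: each conditional block only
        edits fields and sets write_sb, and bch2_write_super() runs at most once before
        sb_lock is dropped. Below is a minimal userspace sketch of that pattern, assuming
        hypothetical stand-ins (sb_state, sb_write, is_zero) rather than the bcachefs API.

        #include <pthread.h>
        #include <stdbool.h>
        #include <stdio.h>
        #include <string.h>

        struct sb_state {
                pthread_mutex_t lock;
                unsigned char   errors_silent[8];
                bool            has_errors;
        };

        /* stand-in for bch2_write_super(): one write per batch of edits */
        static void sb_write(struct sb_state *sb)
        {
                (void) sb;
                printf("superblock written\n");
        }

        static bool is_zero(const void *p, size_t n)
        {
                static const unsigned char zeroes[64];
                return !memcmp(p, zeroes, n);
        }

        static void sb_clear_transient_state(struct sb_state *sb, bool fs_clean)
        {
                bool write_sb = false;

                pthread_mutex_lock(&sb->lock);

                /* each cleanup only marks that a write is needed */
                if (fs_clean && !is_zero(sb->errors_silent, sizeof(sb->errors_silent))) {
                        memset(sb->errors_silent, 0, sizeof(sb->errors_silent));
                        write_sb = true;
                }

                if (fs_clean && sb->has_errors) {
                        sb->has_errors = false;
                        write_sb = true;
                }

                /* ...and the write itself happens at most once, under the lock */
                if (write_sb)
                        sb_write(sb);
                pthread_mutex_unlock(&sb->lock);
        }

        int main(void)
        {
                struct sb_state sb = {
                        .lock           = PTHREAD_MUTEX_INITIALIZER,
                        .errors_silent  = { 1 },
                        .has_errors     = true,
                };

                sb_clear_transient_state(&sb, true);    /* one write */
                sb_clear_transient_state(&sb, true);    /* nothing left to clear: no write */
                return 0;
        }

        The point of the pattern is that repeated, possibly redundant cleanups stay cheap:
        if nothing actually changed, no superblock write is issued at all.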
1005                                                  1099 
1006 int bch2_fs_initialize(struct bch_fs *c)         1100 int bch2_fs_initialize(struct bch_fs *c)
1007 {                                                1101 {
1008         struct bch_inode_unpacked root_inode,    1102         struct bch_inode_unpacked root_inode, lostfound_inode;
1009         struct bkey_inode_buf packed_inode;      1103         struct bkey_inode_buf packed_inode;
1010         struct qstr lostfound = QSTR("lost+fo    1104         struct qstr lostfound = QSTR("lost+found");
1011         struct bch_member *m;                 << 
1012         int ret;                                 1105         int ret;
1013                                                  1106 
1014         bch_notice(c, "initializing new files    1107         bch_notice(c, "initializing new filesystem");
1015         set_bit(BCH_FS_new_fs, &c->flags);    << 
1016                                                  1108 
1017         mutex_lock(&c->sb_lock);                 1109         mutex_lock(&c->sb_lock);
1018         c->disk_sb.sb->compat[0] |= cpu_to_le    1110         c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
1019         c->disk_sb.sb->compat[0] |= cpu_to_le    1111         c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);
1020                                                  1112 
1021         bch2_check_version_downgrade(c);         1113         bch2_check_version_downgrade(c);
1022                                                  1114 
1023         if (c->opts.version_upgrade != BCH_VE    1115         if (c->opts.version_upgrade != BCH_VERSION_UPGRADE_none) {
1024                 bch2_sb_upgrade(c, bcachefs_m    1116                 bch2_sb_upgrade(c, bcachefs_metadata_version_current);
1025                 SET_BCH_SB_VERSION_UPGRADE_CO    1117                 SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current);
1026                 bch2_write_super(c);             1118                 bch2_write_super(c);
1027         }                                        1119         }
1028                                               << 
1029         for_each_member_device(c, ca) {       << 
1030                 m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);      <<
1031                 SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, false);               <<
1032                 ca->mi = bch2_mi_to_cpu(m);   << 
1033         }                                     << 
1034                                               << 
1035         bch2_write_super(c);                  << 
1036         mutex_unlock(&c->sb_lock);               1120         mutex_unlock(&c->sb_lock);
1037                                                  1121 
1038         c->curr_recovery_pass = BCH_RECOVERY_PASS_NR;                         !! 1122         c->curr_recovery_pass = ARRAY_SIZE(recovery_pass_fns);
1039         set_bit(BCH_FS_btree_running, &c->flags);                             <<
1040         set_bit(BCH_FS_may_go_rw, &c->flags);    1123         set_bit(BCH_FS_may_go_rw, &c->flags);
1041                                                  1124 
1042         for (unsigned i = 0; i < BTREE_ID_NR;    1125         for (unsigned i = 0; i < BTREE_ID_NR; i++)
1043                 bch2_btree_root_alloc_fake(c, i, 0);                          !! 1126                 bch2_btree_root_alloc(c, i);
                                                   >> 1127 
                                                   >> 1128         for_each_member_device(c, ca)
                                                   >> 1129                 bch2_dev_usage_init(ca);
1044                                                  1130 
1045         ret = bch2_fs_journal_alloc(c);          1131         ret = bch2_fs_journal_alloc(c);
1046         if (ret)                                 1132         if (ret)
1047                 goto err;                        1133                 goto err;
1048                                                  1134 
1049         /*                                       1135         /*
1050          * journal_res_get() will crash if ca    1136          * journal_res_get() will crash if called before this has
1051          * set up the journal.pin FIFO and jo    1137          * set up the journal.pin FIFO and journal.cur pointer:
1052          */                                      1138          */
1053         bch2_fs_journal_start(&c->journal, 1)    1139         bch2_fs_journal_start(&c->journal, 1);
1054         set_bit(BCH_FS_accounting_replay_done, &c->flags);                    <<
1055         bch2_journal_set_replay_done(&c->jour    1140         bch2_journal_set_replay_done(&c->journal);
1056                                                  1141 
1057         ret = bch2_fs_read_write_early(c);       1142         ret = bch2_fs_read_write_early(c);
1058         if (ret)                                 1143         if (ret)
1059                 goto err;                        1144                 goto err;
1060                                                  1145 
1061         for_each_member_device(c, ca) {       << 
1062                 ret = bch2_dev_usage_init(ca, false);                         <<
1063                 if (ret) {                    << 
1064                         bch2_dev_put(ca);     << 
1065                         goto err;             << 
1066                 }                             << 
1067         }                                     << 
1068                                               << 
1069         /*                                       1146         /*
1070          * Write out the superblock and journ    1147          * Write out the superblock and journal buckets, now that we can do
1071          * btree updates                         1148          * btree updates
1072          */                                      1149          */
1073         bch_verbose(c, "marking superblocks")    1150         bch_verbose(c, "marking superblocks");
1074         ret = bch2_trans_mark_dev_sbs(c);        1151         ret = bch2_trans_mark_dev_sbs(c);
1075         bch_err_msg(c, ret, "marking superblo    1152         bch_err_msg(c, ret, "marking superblocks");
1076         if (ret)                                 1153         if (ret)
1077                 goto err;                        1154                 goto err;
1078                                                  1155 
1079         for_each_online_member(c, ca)            1156         for_each_online_member(c, ca)
1080                 ca->new_fs_bucket_idx = 0;       1157                 ca->new_fs_bucket_idx = 0;
1081                                                  1158 
1082         ret = bch2_fs_freespace_init(c);         1159         ret = bch2_fs_freespace_init(c);
1083         if (ret)                                 1160         if (ret)
1084                 goto err;                        1161                 goto err;
1085                                                  1162 
1086         ret = bch2_initialize_subvolumes(c);     1163         ret = bch2_initialize_subvolumes(c);
1087         if (ret)                                 1164         if (ret)
1088                 goto err;                        1165                 goto err;
1089                                                  1166 
1090         bch_verbose(c, "reading snapshots tab    1167         bch_verbose(c, "reading snapshots table");
1091         ret = bch2_snapshots_read(c);            1168         ret = bch2_snapshots_read(c);
1092         if (ret)                                 1169         if (ret)
1093                 goto err;                        1170                 goto err;
1094         bch_verbose(c, "reading snapshots don    1171         bch_verbose(c, "reading snapshots done");
1095                                                  1172 
1096         bch2_inode_init(c, &root_inode, 0, 0,    1173         bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, 0, NULL);
1097         root_inode.bi_inum      = BCACHEFS_RO    1174         root_inode.bi_inum      = BCACHEFS_ROOT_INO;
1098         root_inode.bi_subvol    = BCACHEFS_RO    1175         root_inode.bi_subvol    = BCACHEFS_ROOT_SUBVOL;
1099         bch2_inode_pack(&packed_inode, &root_    1176         bch2_inode_pack(&packed_inode, &root_inode);
1100         packed_inode.inode.k.p.snapshot = U32    1177         packed_inode.inode.k.p.snapshot = U32_MAX;
1101                                                  1178 
1102         ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, 0, 0); !! 1179         ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, 0);
1103         bch_err_msg(c, ret, "creating root di    1180         bch_err_msg(c, ret, "creating root directory");
1104         if (ret)                                 1181         if (ret)
1105                 goto err;                        1182                 goto err;
1106                                                  1183 
1107         bch2_inode_init_early(c, &lostfound_i    1184         bch2_inode_init_early(c, &lostfound_inode);
1108                                                  1185 
1109         ret = bch2_trans_commit_do(c, NULL, NULL, 0,                          !! 1186         ret = bch2_trans_do(c, NULL, NULL, 0,
1110                 bch2_create_trans(trans,         1187                 bch2_create_trans(trans,
1111                                   BCACHEFS_RO    1188                                   BCACHEFS_ROOT_SUBVOL_INUM,
1112                                   &root_inode    1189                                   &root_inode, &lostfound_inode,
1113                                   &lostfound,    1190                                   &lostfound,
1114                                   0, 0, S_IFD    1191                                   0, 0, S_IFDIR|0700, 0,
1115                                   NULL, NULL,    1192                                   NULL, NULL, (subvol_inum) { 0 }, 0));
1116         bch_err_msg(c, ret, "creating lost+fo    1193         bch_err_msg(c, ret, "creating lost+found");
1117         if (ret)                                 1194         if (ret)
1118                 goto err;                        1195                 goto err;
1119                                                  1196 
1120         c->recovery_pass_done = BCH_RECOVERY_PASS_NR - 1;                     !! 1197         c->recovery_pass_done = ARRAY_SIZE(recovery_pass_fns) - 1;
1121                                                  1198 
1122         if (enabled_qtypes(c)) {                 1199         if (enabled_qtypes(c)) {
1123                 ret = bch2_fs_quota_read(c);     1200                 ret = bch2_fs_quota_read(c);
1124                 if (ret)                         1201                 if (ret)
1125                         goto err;                1202                         goto err;
1126         }                                        1203         }
1127                                                  1204 
1128         ret = bch2_journal_flush(&c->journal)    1205         ret = bch2_journal_flush(&c->journal);
1129         bch_err_msg(c, ret, "writing first jo    1206         bch_err_msg(c, ret, "writing first journal entry");
1130         if (ret)                                 1207         if (ret)
1131                 goto err;                        1208                 goto err;
1132                                                  1209 
1133         mutex_lock(&c->sb_lock);                 1210         mutex_lock(&c->sb_lock);
1134         SET_BCH_SB_INITIALIZED(c->disk_sb.sb,    1211         SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true);
1135         SET_BCH_SB_CLEAN(c->disk_sb.sb, false    1212         SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
1136                                                  1213 
1137         bch2_write_super(c);                     1214         bch2_write_super(c);
1138         mutex_unlock(&c->sb_lock);               1215         mutex_unlock(&c->sb_lock);
1139                                                  1216 
1140         return 0;                                1217         return 0;
1141 err:                                             1218 err:
1142         bch_err_fn(c, ret);                      1219         bch_err_fn(c, ret);
1143         return ret;                              1220         return ret;
1144 }                                                1221 }
1145                                                  1222 
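        A recurring difference between the two versions is how "all recovery passes" is
        spelled: linux-6.8.12 counts the table with ARRAY_SIZE(recovery_pass_fns), while
        linux-6.12-rc7 uses the enum sentinel BCH_RECOVERY_PASS_NR. The sketch below shows
        the general x-macro technique that keeps such a sentinel and a function table in
        sync; the pass names and helpers here are invented for illustration and are not
        bcachefs's actual pass list.

        #include <stdio.h>

        #define RECOVERY_PASSES()       \
                x(check_allocations)    \
                x(check_inodes)         \
                x(check_dirents)

        enum recovery_pass {
        #define x(name) PASS_##name,
                RECOVERY_PASSES()
        #undef x
                PASS_NR,                /* sentinel: one past the last real pass */
        };

        static int check_allocations(void) { printf("check_allocations\n"); return 0; }
        static int check_inodes(void)      { printf("check_inodes\n");      return 0; }
        static int check_dirents(void)     { printf("check_dirents\n");     return 0; }

        static int (*const pass_fns[])(void) = {
        #define x(name) [PASS_##name] = name,
                RECOVERY_PASSES()
        #undef x
        };

        int main(void)
        {
                /* PASS_NR always equals the number of entries in pass_fns */
                for (int p = 0; p < PASS_NR; p++)
                        if (pass_fns[p]())
                                return 1;
                return 0;
        }

        Because both the enum and the table expand from the same RECOVERY_PASSES() list,
        adding a pass in one place grows the sentinel and the table together, which is what
        makes a sentinel such as BCH_RECOVERY_PASS_NR safe to use interchangeably with
        ARRAY_SIZE of the pass table.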
