TOMOYO Linux Cross Reference
Linux/fs/bcachefs/alloc_background.c

  1 // SPDX-License-Identifier: GPL-2.0
  2 #include "bcachefs.h"
  3 #include "alloc_background.h"
  4 #include "alloc_foreground.h"
  5 #include "backpointers.h"
  6 #include "bkey_buf.h"
  7 #include "btree_cache.h"
  8 #include "btree_io.h"
  9 #include "btree_key_cache.h"
 10 #include "btree_update.h"
 11 #include "btree_update_interior.h"
 12 #include "btree_gc.h"
 13 #include "btree_write_buffer.h"
 14 #include "buckets.h"
 15 #include "buckets_waiting_for_journal.h"
 16 #include "clock.h"
 17 #include "debug.h"
 18 #include "disk_accounting.h"
 19 #include "ec.h"
 20 #include "error.h"
 21 #include "lru.h"
 22 #include "recovery.h"
 23 #include "trace.h"
 24 #include "varint.h"
 25 
 26 #include <linux/kthread.h>
 27 #include <linux/math64.h>
 28 #include <linux/random.h>
 29 #include <linux/rculist.h>
 30 #include <linux/rcupdate.h>
 31 #include <linux/sched/task.h>
 32 #include <linux/sort.h>
 33 
 34 static void bch2_discard_one_bucket_fast(struct bch_dev *, u64);
 35 
 36 /* Persistent alloc info: */
 37 
 38 static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
 39 #define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8,
 40         BCH_ALLOC_FIELDS_V1()
 41 #undef x
 42 };
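     /*
      * Illustrative expansion (a sketch, not from the tree; assumes
      * BCH_ALLOC_FIELDS_V1() contains an entry like x(read_time, 16)):
      *
      *     static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
      *             [BCH_ALLOC_FIELD_V1_read_time] = 16 / 8,
      *             ...
      *     };
      */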
 43 
 44 struct bkey_alloc_unpacked {
 45         u64             journal_seq;
 46         u8              gen;
 47         u8              oldest_gen;
 48         u8              data_type;
 49         bool            need_discard:1;
 50         bool            need_inc_gen:1;
 51 #define x(_name, _bits) u##_bits _name;
 52         BCH_ALLOC_FIELDS_V2()
 53 #undef  x
 54 };
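     /*
      * In-memory representation covering every on-disk alloc key version;
      * bch2_alloc_unpack() below zero-initializes it, so fields a given
      * version doesn't carry simply read as 0.
      */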
 55 
 56 static inline u64 alloc_field_v1_get(const struct bch_alloc *a,
 57                                      const void **p, unsigned field)
 58 {
 59         unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field];
 60         u64 v;
 61 
 62         if (!(a->fields & (1 << field)))
 63                 return 0;
 64 
 65         switch (bytes) {
 66         case 1:
 67                 v = *((const u8 *) *p);
 68                 break;
 69         case 2:
 70                 v = le16_to_cpup(*p);
 71                 break;
 72         case 4:
 73                 v = le32_to_cpup(*p);
 74                 break;
 75         case 8:
 76                 v = le64_to_cpup(*p);
 77                 break;
 78         default:
 79                 BUG();
 80         }
 81 
 82         *p += bytes;
 83         return v;
 84 }
 85 
 86 static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out,
 87                                  struct bkey_s_c k)
 88 {
 89         const struct bch_alloc *in = bkey_s_c_to_alloc(k).v;
 90         const void *d = in->data;
 91         unsigned idx = 0;
 92 
 93         out->gen = in->gen;
 94 
 95 #define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++);
 96         BCH_ALLOC_FIELDS_V1()
 97 #undef  x
 98 }
 99 
100 static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out,
101                                 struct bkey_s_c k)
102 {
103         struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k);
104         const u8 *in = a.v->data;
105         const u8 *end = bkey_val_end(a);
106         unsigned fieldnr = 0;
107         int ret;
108         u64 v;
109 
110         out->gen        = a.v->gen;
111         out->oldest_gen = a.v->oldest_gen;
112         out->data_type  = a.v->data_type;
113 
114 #define x(_name, _bits)                                                 \
115         if (fieldnr < a.v->nr_fields) {                                 \
116                 ret = bch2_varint_decode_fast(in, end, &v);             \
117                 if (ret < 0)                                            \
118                         return ret;                                     \
119                 in += ret;                                              \
120         } else {                                                        \
121                 v = 0;                                                  \
122         }                                                               \
123         out->_name = v;                                                 \
124         if (v != out->_name)                                            \
125                 return -1;                                              \
126         fieldnr++;
127 
128         BCH_ALLOC_FIELDS_V2()
129 #undef  x
130         return 0;
131 }
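     /*
      * Note on the macro above: out->_name may be narrower than u64, so
      * the store-then-compare (out->_name = v; v != out->_name) rejects
      * values that don't fit the unpacked field. The v3 unpacker below
      * repeats the same pattern.
      */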
132 
133 static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out,
134                                 struct bkey_s_c k)
135 {
136         struct bkey_s_c_alloc_v3 a = bkey_s_c_to_alloc_v3(k);
137         const u8 *in = a.v->data;
138         const u8 *end = bkey_val_end(a);
139         unsigned fieldnr = 0;
140         int ret;
141         u64 v;
142 
143         out->gen        = a.v->gen;
144         out->oldest_gen = a.v->oldest_gen;
145         out->data_type  = a.v->data_type;
146         out->need_discard = BCH_ALLOC_V3_NEED_DISCARD(a.v);
147         out->need_inc_gen = BCH_ALLOC_V3_NEED_INC_GEN(a.v);
148         out->journal_seq = le64_to_cpu(a.v->journal_seq);
149 
150 #define x(_name, _bits)                                                 \
151         if (fieldnr < a.v->nr_fields) {                                 \
152                 ret = bch2_varint_decode_fast(in, end, &v);             \
153                 if (ret < 0)                                            \
154                         return ret;                                     \
155                 in += ret;                                              \
156         } else {                                                        \
157                 v = 0;                                                  \
158         }                                                               \
159         out->_name = v;                                                 \
160         if (v != out->_name)                                            \
161                 return -1;                                              \
162         fieldnr++;
163 
164         BCH_ALLOC_FIELDS_V2()
165 #undef  x
166         return 0;
167 }
168 
169 static struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
170 {
171         struct bkey_alloc_unpacked ret = { .gen = 0 };
172 
173         switch (k.k->type) {
174         case KEY_TYPE_alloc:
175                 bch2_alloc_unpack_v1(&ret, k);
176                 break;
177         case KEY_TYPE_alloc_v2:
178                 bch2_alloc_unpack_v2(&ret, k);
179                 break;
180         case KEY_TYPE_alloc_v3:
181                 bch2_alloc_unpack_v3(&ret, k);
182                 break;
183         }
184 
185         return ret;
186 }
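     /*
      * Any other key type (including deleted keys; alloc_v4 is handled
      * separately in __bch2_alloc_to_v4()) falls through the switch and
      * yields an all-zero unpacked struct.
      */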
187 
188 static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
189 {
190         unsigned i, bytes = offsetof(struct bch_alloc, data);
191 
192         for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++)
193                 if (a->fields & (1 << i))
194                         bytes += BCH_ALLOC_V1_FIELD_BYTES[i];
195 
196         return DIV_ROUND_UP(bytes, sizeof(u64));
197 }
198 
199 int bch2_alloc_v1_invalid(struct bch_fs *c, struct bkey_s_c k,
200                           enum bch_validate_flags flags,
201                           struct printbuf *err)
202 {
203         struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
204         int ret = 0;
205 
206         /* allow for unknown fields */
207         bkey_fsck_err_on(bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v), c, err,
208                          alloc_v1_val_size_bad,
209                          "incorrect value size (%zu < %u)",
210                          bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v));
211 fsck_err:
212         return ret;
213 }
214 
215 int bch2_alloc_v2_invalid(struct bch_fs *c, struct bkey_s_c k,
216                           enum bch_validate_flags flags,
217                           struct printbuf *err)
218 {
219         struct bkey_alloc_unpacked u;
220         int ret = 0;
221 
222         bkey_fsck_err_on(bch2_alloc_unpack_v2(&u, k), c, err,
223                          alloc_v2_unpack_error,
224                          "unpack error");
225 fsck_err:
226         return ret;
227 }
228 
229 int bch2_alloc_v3_invalid(struct bch_fs *c, struct bkey_s_c k,
230                           enum bch_validate_flags flags,
231                           struct printbuf *err)
232 {
233         struct bkey_alloc_unpacked u;
234         int ret = 0;
235 
236         bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k), c, err,
237                          alloc_v2_unpack_error,
238                          "unpack error");
239 fsck_err:
240         return ret;
241 }
242 
243 int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k,
244                           enum bch_validate_flags flags, struct printbuf *err)
245 {
246         struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k);
247         int ret = 0;
248 
249         bkey_fsck_err_on(alloc_v4_u64s_noerror(a.v) > bkey_val_u64s(k.k), c, err,
250                          alloc_v4_val_size_bad,
251                          "bad val size (%u > %zu)",
252                          alloc_v4_u64s_noerror(a.v), bkey_val_u64s(k.k));
253 
254         bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) &&
255                          BCH_ALLOC_V4_NR_BACKPOINTERS(a.v), c, err,
256                          alloc_v4_backpointers_start_bad,
257                          "invalid backpointers_start");
258 
259         bkey_fsck_err_on(alloc_data_type(*a.v, a.v->data_type) != a.v->data_type, c, err,
260                          alloc_key_data_type_bad,
261                          "invalid data type (got %u should be %u)",
262                          a.v->data_type, alloc_data_type(*a.v, a.v->data_type));
263 
264         for (unsigned i = 0; i < 2; i++)
265                 bkey_fsck_err_on(a.v->io_time[i] > LRU_TIME_MAX,
266                                  c, err,
267                                  alloc_key_io_time_bad,
268                                  "invalid io_time[%s]: %llu, max %llu",
269                                  i == READ ? "read" : "write",
270                                  a.v->io_time[i], LRU_TIME_MAX);
271 
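             /*
              * stripe_sectors is only present on disk when the backpointers
              * region starts past its offset; treat it as 0 otherwise:
              */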
272         unsigned stripe_sectors = BCH_ALLOC_V4_BACKPOINTERS_START(a.v) * sizeof(u64) >
273                 offsetof(struct bch_alloc_v4, stripe_sectors)
274                 ? a.v->stripe_sectors
275                 : 0;
276 
277         switch (a.v->data_type) {
278         case BCH_DATA_free:
279         case BCH_DATA_need_gc_gens:
280         case BCH_DATA_need_discard:
281                 bkey_fsck_err_on(stripe_sectors ||
282                                  a.v->dirty_sectors ||
283                                  a.v->cached_sectors ||
284                                  a.v->stripe,
285                                  c, err, alloc_key_empty_but_have_data,
286                                  "empty data type free but have data %u.%u.%u %u",
287                                  stripe_sectors,
288                                  a.v->dirty_sectors,
289                                  a.v->cached_sectors,
290                                  a.v->stripe);
291                 break;
292         case BCH_DATA_sb:
293         case BCH_DATA_journal:
294         case BCH_DATA_btree:
295         case BCH_DATA_user:
296         case BCH_DATA_parity:
297                 bkey_fsck_err_on(!a.v->dirty_sectors &&
298                                  !stripe_sectors,
299                                  c, err, alloc_key_dirty_sectors_0,
300                                  "data_type %s but dirty_sectors==0",
301                                  bch2_data_type_str(a.v->data_type));
302                 break;
303         case BCH_DATA_cached:
304                 bkey_fsck_err_on(!a.v->cached_sectors ||
305                                  a.v->dirty_sectors ||
306                                  stripe_sectors ||
307                                  a.v->stripe,
308                                  c, err, alloc_key_cached_inconsistency,
309                                  "data type inconsistency");
310 
311                 bkey_fsck_err_on(!a.v->io_time[READ] &&
312                                  c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs,
313                                  c, err, alloc_key_cached_but_read_time_zero,
314                                  "cached bucket with read_time == 0");
315                 break;
316         case BCH_DATA_stripe:
317                 break;
318         }
319 fsck_err:
320         return ret;
321 }
322 
323 void bch2_alloc_v4_swab(struct bkey_s k)
324 {
325         struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v;
326         struct bch_backpointer *bp, *bps;
327 
328         a->journal_seq          = swab64(a->journal_seq);
329         a->flags                = swab32(a->flags);
330         a->dirty_sectors        = swab32(a->dirty_sectors);
331         a->cached_sectors       = swab32(a->cached_sectors);
332         a->io_time[0]           = swab64(a->io_time[0]);
333         a->io_time[1]           = swab64(a->io_time[1]);
334         a->stripe               = swab32(a->stripe);
335         a->nr_external_backpointers = swab32(a->nr_external_backpointers);
336         a->fragmentation_lru    = swab64(a->fragmentation_lru);
337         a->stripe_sectors       = swab32(a->stripe_sectors);
338 
339         bps = alloc_v4_backpointers(a);
340         for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) {
341                 bp->bucket_offset       = swab40(bp->bucket_offset);
342                 bp->bucket_len          = swab32(bp->bucket_len);
343                 bch2_bpos_swab(&bp->pos);
344         }
345 }
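     /*
      * The single-byte fields (gen, oldest_gen, data_type) need no
      * swabbing; the 40-bit bucket_offset bitfield in each backpointer
      * gets the dedicated swab40() above.
      */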
346 
347 void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
348 {
349         struct bch_alloc_v4 _a;
350         const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
351 
352         prt_newline(out);
353         printbuf_indent_add(out, 2);
354 
355         prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen);
356         bch2_prt_data_type(out, a->data_type);
357         prt_newline(out);
358         prt_printf(out, "journal_seq       %llu\n",     a->journal_seq);
359         prt_printf(out, "need_discard      %llu\n",     BCH_ALLOC_V4_NEED_DISCARD(a));
360         prt_printf(out, "need_inc_gen      %llu\n",     BCH_ALLOC_V4_NEED_INC_GEN(a));
361         prt_printf(out, "dirty_sectors     %u\n",       a->dirty_sectors);
362         prt_printf(out, "stripe_sectors    %u\n",       a->stripe_sectors);
363         prt_printf(out, "cached_sectors    %u\n",       a->cached_sectors);
364         prt_printf(out, "stripe            %u\n",       a->stripe);
365         prt_printf(out, "stripe_redundancy %u\n",       a->stripe_redundancy);
366         prt_printf(out, "io_time[READ]     %llu\n",     a->io_time[READ]);
367         prt_printf(out, "io_time[WRITE]    %llu\n",     a->io_time[WRITE]);
368         prt_printf(out, "fragmentation     %llu\n",     a->fragmentation_lru);
369         prt_printf(out, "bp_start          %llu\n", BCH_ALLOC_V4_BACKPOINTERS_START(a));
370         printbuf_indent_sub(out, 2);
371 }
372 
373 void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out)
374 {
375         if (k.k->type == KEY_TYPE_alloc_v4) {
376                 void *src, *dst;
377 
378                 *out = *bkey_s_c_to_alloc_v4(k).v;
379 
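                     /*
                      * Keys written when struct bch_alloc_v4 was smaller have
                      * backpointers starting earlier: bump backpointers_start
                      * up to the current struct size and zero the bytes in
                      * between, so newer fields read as 0.
                      */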
380                 src = alloc_v4_backpointers(out);
381                 SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s);
382                 dst = alloc_v4_backpointers(out);
383 
384                 if (src < dst)
385                         memset(src, 0, dst - src);
386 
387                 SET_BCH_ALLOC_V4_NR_BACKPOINTERS(out, 0);
388         } else {
389                 struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);
390 
391                 *out = (struct bch_alloc_v4) {
392                         .journal_seq            = u.journal_seq,
393                         .flags                  = u.need_discard,
394                         .gen                    = u.gen,
395                         .oldest_gen             = u.oldest_gen,
396                         .data_type              = u.data_type,
397                         .stripe_redundancy      = u.stripe_redundancy,
398                         .dirty_sectors          = u.dirty_sectors,
399                         .cached_sectors         = u.cached_sectors,
400                         .io_time[READ]          = u.read_time,
401                         .io_time[WRITE]         = u.write_time,
402                         .stripe                 = u.stripe,
403                 };
404 
405                 SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s);
406         }
407 }
408 
409 static noinline struct bkey_i_alloc_v4 *
410 __bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
411 {
412         struct bkey_i_alloc_v4 *ret;
413 
414         ret = bch2_trans_kmalloc(trans, max(bkey_bytes(k.k), sizeof(struct bkey_i_alloc_v4)));
415         if (IS_ERR(ret))
416                 return ret;
417 
418         if (k.k->type == KEY_TYPE_alloc_v4) {
419                 void *src, *dst;
420 
421                 bkey_reassemble(&ret->k_i, k);
422 
423                 src = alloc_v4_backpointers(&ret->v);
424                 SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s);
425                 dst = alloc_v4_backpointers(&ret->v);
426 
427                 if (src < dst)
428                         memset(src, 0, dst - src);
429 
430                 SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v, 0);
431                 set_alloc_v4_u64s(ret);
432         } else {
433                 bkey_alloc_v4_init(&ret->k_i);
434                 ret->k.p = k.k->p;
435                 bch2_alloc_to_v4(k, &ret->v);
436         }
437         return ret;
438 }
439 
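     /*
      * Fast path: an alloc_v4 key with no backpointers can be copied
      * verbatim; anything else takes the out-of-line conversion above,
      * which allocates room for the full current struct.
      */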
440 static inline struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut_inlined(struct btree_trans *trans, struct bkey_s_c k)
441 {
442         struct bkey_s_c_alloc_v4 a;
443 
444         if (likely(k.k->type == KEY_TYPE_alloc_v4) &&
445             ((a = bkey_s_c_to_alloc_v4(k), true) &&
446              BCH_ALLOC_V4_NR_BACKPOINTERS(a.v) == 0))
447                 return bch2_bkey_make_mut_noupdate_typed(trans, k, alloc_v4);
448 
449         return __bch2_alloc_to_v4_mut(trans, k);
450 }
451 
452 struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
453 {
454         return bch2_alloc_to_v4_mut_inlined(trans, k);
455 }
456 
457 struct bkey_i_alloc_v4 *
458 bch2_trans_start_alloc_update_noupdate(struct btree_trans *trans, struct btree_iter *iter,
459                                        struct bpos pos)
460 {
461         struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_alloc, pos,
462                                                BTREE_ITER_with_updates|
463                                                BTREE_ITER_cached|
464                                                BTREE_ITER_intent);
465         int ret = bkey_err(k);
466         if (unlikely(ret))
467                 return ERR_PTR(ret);
468 
469         struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut_inlined(trans, k);
470         ret = PTR_ERR_OR_ZERO(a);
471         if (unlikely(ret))
472                 goto err;
473         return a;
474 err:
475         bch2_trans_iter_exit(trans, iter);
476         return ERR_PTR(ret);
477 }
478 
479 __flatten
480 struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans, struct bpos pos,
481                                                       enum btree_iter_update_trigger_flags flags)
482 {
483         struct btree_iter iter;
484         struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update_noupdate(trans, &iter, pos);
485         int ret = PTR_ERR_OR_ZERO(a);
486         if (ret)
487                 return ERR_PTR(ret);
488 
489         ret = bch2_trans_update(trans, &iter, &a->k_i, flags);
490         bch2_trans_iter_exit(trans, &iter);
491         return unlikely(ret) ? ERR_PTR(ret) : a;
492 }
493 
494 static struct bpos alloc_gens_pos(struct bpos pos, unsigned *offset)
495 {
496         *offset = pos.offset & KEY_TYPE_BUCKET_GENS_MASK;
497 
498         pos.offset >>= KEY_TYPE_BUCKET_GENS_BITS;
499         return pos;
500 }
501 
502 static struct bpos bucket_gens_pos_to_alloc(struct bpos pos, unsigned offset)
503 {
504         pos.offset <<= KEY_TYPE_BUCKET_GENS_BITS;
505         pos.offset += offset;
506         return pos;
507 }
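     /*
      * Worked example (a sketch; assumes KEY_TYPE_BUCKET_GENS_BITS == 8,
      * i.e. 256 gens per bucket_gens key): bucket 0:1000 maps to
      * bucket_gens key 0:3, offset 1000 & 255 = 232, and
      * bucket_gens_pos_to_alloc(POS(0, 3), 232) maps back to 0:1000.
      */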
508 
509 static unsigned alloc_gen(struct bkey_s_c k, unsigned offset)
510 {
511         return k.k->type == KEY_TYPE_bucket_gens
512                 ? bkey_s_c_to_bucket_gens(k).v->gens[offset]
513                 : 0;
514 }
515 
516 int bch2_bucket_gens_invalid(struct bch_fs *c, struct bkey_s_c k,
517                              enum bch_validate_flags flags,
518                              struct printbuf *err)
519 {
520         int ret = 0;
521 
522         bkey_fsck_err_on(bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens), c, err,
523                          bucket_gens_val_size_bad,
524                          "bad val size (%zu != %zu)",
525                          bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens));
526 fsck_err:
527         return ret;
528 }
529 
530 void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
531 {
532         struct bkey_s_c_bucket_gens g = bkey_s_c_to_bucket_gens(k);
533         unsigned i;
534 
535         for (i = 0; i < ARRAY_SIZE(g.v->gens); i++) {
536                 if (i)
537                         prt_char(out, ' ');
538                 prt_printf(out, "%u", g.v->gens[i]);
539         }
540 }
541 
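     /*
      * Builds the bucket_gens btree from the alloc btree: gens are
      * accumulated into one bucket_gens key at a time and committed
      * whenever the scan crosses into the next bucket_gens position; the
      * final partial key is committed after the loop.
      */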
542 int bch2_bucket_gens_init(struct bch_fs *c)
543 {
544         struct btree_trans *trans = bch2_trans_get(c);
545         struct bkey_i_bucket_gens g;
546         bool have_bucket_gens_key = false;
547         int ret;
548 
549         ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
550                                  BTREE_ITER_prefetch, k, ({
551                 /*
552                  * Not a fsck error because this is checked/repaired by
553                  * bch2_check_alloc_key() which runs later:
554                  */
555                 if (!bch2_dev_bucket_exists(c, k.k->p))
556                         continue;
557 
558                 struct bch_alloc_v4 a;
559                 u8 gen = bch2_alloc_to_v4(k, &a)->gen;
560                 unsigned offset;
561                 struct bpos pos = alloc_gens_pos(iter.pos, &offset);
562                 int ret2 = 0;
563 
564                 if (have_bucket_gens_key && bkey_cmp(iter.pos, pos)) {
565                         ret2 =  bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0) ?:
566                                 bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
567                         if (ret2)
568                                 goto iter_err;
569                         have_bucket_gens_key = false;
570                 }
571 
572                 if (!have_bucket_gens_key) {
573                         bkey_bucket_gens_init(&g.k_i);
574                         g.k.p = pos;
575                         have_bucket_gens_key = true;
576                 }
577 
578                 g.v.gens[offset] = gen;
579 iter_err:
580                 ret2;
581         }));
582 
583         if (have_bucket_gens_key && !ret)
584                 ret = commit_do(trans, NULL, NULL,
585                                 BCH_TRANS_COMMIT_no_enospc,
586                         bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));
587 
588         bch2_trans_put(trans);
589 
590         bch_err_fn(c, ret);
591         return ret;
592 }
593 
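     /*
      * Populates the in-memory bucket gens at startup: from the dense
      * bucket_gens btree when the superblock says it exists, otherwise by
      * scanning one alloc key per bucket.
      */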
594 int bch2_alloc_read(struct bch_fs *c)
595 {
596         struct btree_trans *trans = bch2_trans_get(c);
597         struct bch_dev *ca = NULL;
598         int ret;
599 
600         if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) {
601                 ret = for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN,
602                                          BTREE_ITER_prefetch, k, ({
603                         u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
604                         u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;
605 
606                         if (k.k->type != KEY_TYPE_bucket_gens)
607                                 continue;
608 
609                         ca = bch2_dev_iterate(c, ca, k.k->p.inode);
610                         /*
611                          * Not a fsck error because this is checked/repaired by
612                          * bch2_check_alloc_key() which runs later:
613                          */
614                         if (!ca) {
615                                 bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
616                                 continue;
617                         }
618 
619                         const struct bch_bucket_gens *g = bkey_s_c_to_bucket_gens(k).v;
620 
621                         for (u64 b = max_t(u64, ca->mi.first_bucket, start);
622                              b < min_t(u64, ca->mi.nbuckets, end);
623                              b++)
624                                 *bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK];
625                         0;
626                 }));
627         } else {
628                 ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
629                                          BTREE_ITER_prefetch, k, ({
630                         ca = bch2_dev_iterate(c, ca, k.k->p.inode);
631                         /*
632                          * Not a fsck error because this is checked/repaired by
633                          * bch2_check_alloc_key() which runs later:
634                          */
635                         if (!ca) {
636                                 bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
637                                 continue;
638                         }
639 
640                         struct bch_alloc_v4 a;
641                         *bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen;
642                         0;
643                 }));
644         }
645 
646         bch2_dev_put(ca);
647         bch2_trans_put(trans);
648 
649         bch_err_fn(c, ret);
650         return ret;
651 }
652 
653 /* Free space/discard btree: */
654 
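     /*
      * Mirror a bucket's alloc state into the freespace or need_discard
      * btree: buckets in those states get a KEY_TYPE_set key in the
      * corresponding index; @set controls whether the index key is being
      * added or removed.
      */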
655 static int bch2_bucket_do_index(struct btree_trans *trans,
656                                 struct bch_dev *ca,
657                                 struct bkey_s_c alloc_k,
658                                 const struct bch_alloc_v4 *a,
659                                 bool set)
660 {
661         struct bch_fs *c = trans->c;
662         struct btree_iter iter;
663         struct bkey_s_c old;
664         struct bkey_i *k;
665         enum btree_id btree;
666         enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted;
667         enum bch_bkey_type new_type =  set ? KEY_TYPE_set : KEY_TYPE_deleted;
668         struct printbuf buf = PRINTBUF;
669         int ret;
670 
671         if (a->data_type != BCH_DATA_free &&
672             a->data_type != BCH_DATA_need_discard)
673                 return 0;
674 
675         k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k));
676         if (IS_ERR(k))
677                 return PTR_ERR(k);
678 
679         bkey_init(&k->k);
680         k->k.type = new_type;
681 
682         switch (a->data_type) {
683         case BCH_DATA_free:
684                 btree = BTREE_ID_freespace;
685                 k->k.p = alloc_freespace_pos(alloc_k.k->p, *a);
686                 bch2_key_resize(&k->k, 1);
687                 break;
688         case BCH_DATA_need_discard:
689                 btree = BTREE_ID_need_discard;
690                 k->k.p = alloc_k.k->p;
691                 break;
692         default:
693                 return 0;
694         }
695 
696         old = bch2_bkey_get_iter(trans, &iter, btree,
697                              bkey_start_pos(&k->k),
698                              BTREE_ITER_intent);
699         ret = bkey_err(old);
700         if (ret)
701                 return ret;
702 
703         if (ca->mi.freespace_initialized &&
704             c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info &&
705             bch2_trans_inconsistent_on(old.k->type != old_type, trans,
706                         "incorrect key when %s %s:%llu:%llu:0 (got %s should be %s)\n"
707                         "  for %s",
708                         set ? "setting" : "clearing",
709                         bch2_btree_id_str(btree),
710                         iter.pos.inode,
711                         iter.pos.offset,
712                         bch2_bkey_types[old.k->type],
713                         bch2_bkey_types[old_type],
714                         (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
715                 ret = -EIO;
716                 goto err;
717         }
718 
719         ret = bch2_trans_update(trans, &iter, k, 0);
720 err:
721         bch2_trans_iter_exit(trans, &iter);
722         printbuf_exit(&buf);
723         return ret;
724 }
725 
726 static noinline int bch2_bucket_gen_update(struct btree_trans *trans,
727                                            struct bpos bucket, u8 gen)
728 {
729         struct btree_iter iter;
730         unsigned offset;
731         struct bpos pos = alloc_gens_pos(bucket, &offset);
732         struct bkey_i_bucket_gens *g;
733         struct bkey_s_c k;
734         int ret;
735 
736         g = bch2_trans_kmalloc(trans, sizeof(*g));
737         ret = PTR_ERR_OR_ZERO(g);
738         if (ret)
739                 return ret;
740 
741         k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_bucket_gens, pos,
742                                BTREE_ITER_intent|
743                                BTREE_ITER_with_updates);
744         ret = bkey_err(k);
745         if (ret)
746                 return ret;
747 
748         if (k.k->type != KEY_TYPE_bucket_gens) {
749                 bkey_bucket_gens_init(&g->k_i);
750                 g->k.p = iter.pos;
751         } else {
752                 bkey_reassemble(&g->k_i, k);
753         }
754 
755         g->v.gens[offset] = gen;
756 
757         ret = bch2_trans_update(trans, &iter, &g->k_i, 0);
758         bch2_trans_iter_exit(trans, &iter);
759         return ret;
760 }
761 
762 static inline int bch2_dev_data_type_accounting_mod(struct btree_trans *trans, struct bch_dev *ca,
763                                                     enum bch_data_type data_type,
764                                                     s64 delta_buckets,
765                                                     s64 delta_sectors,
766                                                     s64 delta_fragmented, unsigned flags)
767 {
768         struct disk_accounting_pos acc = {
769                 .type = BCH_DISK_ACCOUNTING_dev_data_type,
770                 .dev_data_type.dev              = ca->dev_idx,
771                 .dev_data_type.data_type        = data_type,
772         };
773         s64 d[3] = { delta_buckets, delta_sectors, delta_fragmented };
774 
775         return bch2_disk_accounting_mod(trans, &acc, d, 3, flags & BTREE_TRIGGER_gc);
776 }
777 
778 int bch2_alloc_key_to_dev_counters(struct btree_trans *trans, struct bch_dev *ca,
779                                    const struct bch_alloc_v4 *old,
780                                    const struct bch_alloc_v4 *new,
781                                    unsigned flags)
782 {
783         s64 old_sectors = bch2_bucket_sectors(*old);
784         s64 new_sectors = bch2_bucket_sectors(*new);
785         if (old->data_type != new->data_type) {
786                 int ret = bch2_dev_data_type_accounting_mod(trans, ca, new->data_type,
787                                  1,  new_sectors,  bch2_bucket_sectors_fragmented(ca, *new), flags) ?:
788                           bch2_dev_data_type_accounting_mod(trans, ca, old->data_type,
789                                 -1, -old_sectors, -bch2_bucket_sectors_fragmented(ca, *old), flags);
790                 if (ret)
791                         return ret;
792         } else if (old_sectors != new_sectors) {
793                 int ret = bch2_dev_data_type_accounting_mod(trans, ca, new->data_type,
794                                          0,
795                                          new_sectors - old_sectors,
796                                          bch2_bucket_sectors_fragmented(ca, *new) -
797                                          bch2_bucket_sectors_fragmented(ca, *old), flags);
798                 if (ret)
799                         return ret;
800         }
801 
802         s64 old_unstriped = bch2_bucket_sectors_unstriped(*old);
803         s64 new_unstriped = bch2_bucket_sectors_unstriped(*new);
804         if (old_unstriped != new_unstriped) {
805                 int ret = bch2_dev_data_type_accounting_mod(trans, ca, BCH_DATA_unstriped,
806                                          !!new_unstriped - !!old_unstriped,
807                                          new_unstriped - old_unstriped,
808                                          0,
809                                          flags);
810                 if (ret)
811                         return ret;
812         }
813 
814         return 0;
815 }
816 
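     /*
      * The alloc trigger runs in three modes: transactional (keeps the
      * freespace/need_discard indexes, LRUs, bucket_gens and disk
      * accounting in sync with the new alloc state), atomic (updates
      * in-memory gens and kicks off background work once the update has a
      * journal sequence number), and gc (updates the gc copy of bucket
      * state).
      */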
817 int bch2_trigger_alloc(struct btree_trans *trans,
818                        enum btree_id btree, unsigned level,
819                        struct bkey_s_c old, struct bkey_s new,
820                        enum btree_iter_update_trigger_flags flags)
821 {
822         struct bch_fs *c = trans->c;
823         struct printbuf buf = PRINTBUF;
824         int ret = 0;
825 
826         struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p);
827         if (!ca)
828                 return -EIO;
829 
830         struct bch_alloc_v4 old_a_convert;
831         const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert);
832         struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v;
833 
834         if (flags & BTREE_TRIGGER_transactional) {
835                 alloc_data_type_set(new_a, new_a->data_type);
836 
837                 if (bch2_bucket_sectors_total(*new_a) > bch2_bucket_sectors_total(*old_a)) {
838                         new_a->io_time[READ] = bch2_current_io_time(c, READ);
 839                         new_a->io_time[WRITE] = bch2_current_io_time(c, WRITE);
840                         SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
841                         SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true);
842                 }
843 
844                 if (data_type_is_empty(new_a->data_type) &&
845                     BCH_ALLOC_V4_NEED_INC_GEN(new_a) &&
846                     !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) {
847                         new_a->gen++;
848                         SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false);
849                         alloc_data_type_set(new_a, new_a->data_type);
850                 }
851 
852                 if (old_a->data_type != new_a->data_type ||
853                     (new_a->data_type == BCH_DATA_free &&
854                      alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) {
855                         ret =   bch2_bucket_do_index(trans, ca, old, old_a, false) ?:
856                                 bch2_bucket_do_index(trans, ca, new.s_c, new_a, true);
857                         if (ret)
858                                 goto err;
859                 }
860 
861                 if (new_a->data_type == BCH_DATA_cached &&
862                     !new_a->io_time[READ])
863                         new_a->io_time[READ] = bch2_current_io_time(c, READ);
864 
865                 u64 old_lru = alloc_lru_idx_read(*old_a);
866                 u64 new_lru = alloc_lru_idx_read(*new_a);
867                 if (old_lru != new_lru) {
868                         ret = bch2_lru_change(trans, new.k->p.inode,
869                                               bucket_to_u64(new.k->p),
870                                               old_lru, new_lru);
871                         if (ret)
872                                 goto err;
873                 }
874 
875                 new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a, ca);
876                 if (old_a->fragmentation_lru != new_a->fragmentation_lru) {
877                         ret = bch2_lru_change(trans,
878                                         BCH_LRU_FRAGMENTATION_START,
879                                         bucket_to_u64(new.k->p),
880                                         old_a->fragmentation_lru, new_a->fragmentation_lru);
881                         if (ret)
882                                 goto err;
883                 }
884 
885                 if (old_a->gen != new_a->gen) {
886                         ret = bch2_bucket_gen_update(trans, new.k->p, new_a->gen);
887                         if (ret)
888                                 goto err;
889                 }
890 
891                 if ((flags & BTREE_TRIGGER_bucket_invalidate) &&
892                     old_a->cached_sectors) {
893                         ret = bch2_mod_dev_cached_sectors(trans, ca->dev_idx,
894                                          -((s64) old_a->cached_sectors),
895                                          flags & BTREE_TRIGGER_gc);
896                         if (ret)
897                                 goto err;
898                 }
899 
900                 ret = bch2_alloc_key_to_dev_counters(trans, ca, old_a, new_a, flags);
901                 if (ret)
902                         goto err;
903         }
904 
905         if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) {
906                 u64 journal_seq = trans->journal_res.seq;
907                 u64 bucket_journal_seq = new_a->journal_seq;
908 
909                 if ((flags & BTREE_TRIGGER_insert) &&
910                     data_type_is_empty(old_a->data_type) !=
911                     data_type_is_empty(new_a->data_type) &&
912                     new.k->type == KEY_TYPE_alloc_v4) {
913                         struct bch_alloc_v4 *v = bkey_s_to_alloc_v4(new).v;
914 
915                         /*
916                          * If the btree updates referring to a bucket weren't flushed
 917                          * before the bucket became empty again, then we don't have
918                          * to wait on a journal flush before we can reuse the bucket:
919                          */
920                         v->journal_seq = bucket_journal_seq =
921                                 data_type_is_empty(new_a->data_type) &&
922                                 (journal_seq == v->journal_seq ||
923                                  bch2_journal_noflush_seq(&c->journal, v->journal_seq))
924                                 ? 0 : journal_seq;
925                 }
926 
927                 if (!data_type_is_empty(old_a->data_type) &&
928                     data_type_is_empty(new_a->data_type) &&
929                     bucket_journal_seq) {
930                         ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
931                                         c->journal.flushed_seq_ondisk,
932                                         new.k->p.inode, new.k->p.offset,
933                                         bucket_journal_seq);
934                         if (bch2_fs_fatal_err_on(ret, c,
935                                         "setting bucket_needs_journal_commit: %s", bch2_err_str(ret)))
936                                 goto err;
937                 }
938 
939                 if (new_a->gen != old_a->gen) {
940                         rcu_read_lock();
941                         u8 *gen = bucket_gen(ca, new.k->p.offset);
942                         if (unlikely(!gen)) {
943                                 rcu_read_unlock();
944                                 goto invalid_bucket;
945                         }
946                         *gen = new_a->gen;
947                         rcu_read_unlock();
948                 }
949 
950 #define eval_state(_a, expr)            ({ const struct bch_alloc_v4 *a = _a; expr; })
951 #define statechange(expr)               !eval_state(old_a, expr) && eval_state(new_a, expr)
952 #define bucket_flushed(a)               (!a->journal_seq || a->journal_seq <= c->journal.flushed_seq_ondisk)
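                 /* statechange(expr): expr was false for old_a and is true for new_a */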
953 
954                 if (statechange(a->data_type == BCH_DATA_free) &&
955                     bucket_flushed(new_a))
956                         closure_wake_up(&c->freelist_wait);
957 
958                 if (statechange(a->data_type == BCH_DATA_need_discard) &&
959                     !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset) &&
960                     bucket_flushed(new_a))
961                         bch2_discard_one_bucket_fast(ca, new.k->p.offset);
962 
963                 if (statechange(a->data_type == BCH_DATA_cached) &&
964                     !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) &&
965                     should_invalidate_buckets(ca, bch2_dev_usage_read(ca)))
966                         bch2_dev_do_invalidates(ca);
967 
968                 if (statechange(a->data_type == BCH_DATA_need_gc_gens))
969                         bch2_gc_gens_async(c);
970         }
971 
972         if ((flags & BTREE_TRIGGER_gc) && (flags & BTREE_TRIGGER_insert)) {
973                 rcu_read_lock();
974                 struct bucket *g = gc_bucket(ca, new.k->p.offset);
975                 if (unlikely(!g)) {
976                         rcu_read_unlock();
977                         goto invalid_bucket;
978                 }
979                 g->gen_valid    = 1;
980                 g->gen          = new_a->gen;
981                 rcu_read_unlock();
982         }
983 err:
984         printbuf_exit(&buf);
985         bch2_dev_put(ca);
986         return ret;
987 invalid_bucket:
988         bch2_fs_inconsistent(c, "reference to invalid bucket\n  %s",
989                              (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf));
990         ret = -EIO;
991         goto err;
992 }
993 
994 /*
995  * This synthesizes deleted extents for holes, similar to BTREE_ITER_slots for
 996  * extent-style btrees, but works on non-extents btrees:
997  */
998 static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole)
999 {
1000         struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
1001 
1002         if (bkey_err(k))
1003                 return k;
1004 
1005         if (k.k->type) {
1006                 return k;
1007         } else {
1008                 struct btree_iter iter2;
1009                 struct bpos next;
1010 
1011                 bch2_trans_copy_iter(&iter2, iter);
1012 
1013                 struct btree_path *path = btree_iter_path(iter->trans, iter);
1014                 if (!bpos_eq(path->l[0].b->key.k.p, SPOS_MAX))
1015                         end = bkey_min(end, bpos_nosnap_successor(path->l[0].b->key.k.p));
1016 
1017                 end = bkey_min(end, POS(iter->pos.inode, iter->pos.offset + U32_MAX - 1));
1018 
1019                 /*
1020                  * btree node min/max is a closed interval, while peek_upto()
1021                  * takes a half-open interval:
1022                  */
1023                 k = bch2_btree_iter_peek_upto(&iter2, end);
1024                 next = iter2.pos;
1025                 bch2_trans_iter_exit(iter->trans, &iter2);
1026 
1027                 BUG_ON(next.offset >= iter->pos.offset + U32_MAX);
1028 
1029                 if (bkey_err(k))
1030                         return k;
1031 
1032                 bkey_init(hole);
1033                 hole->p = iter->pos;
1034 
1035                 bch2_key_resize(hole, next.offset - iter->pos.offset);
1036                 return (struct bkey_s_c) { hole, NULL };
1037         }
1038 }
1039 
1040 static bool next_bucket(struct bch_fs *c, struct bch_dev **ca, struct bpos *bucket)
1041 {
1042         if (*ca) {
1043                 if (bucket->offset < (*ca)->mi.first_bucket)
1044                         bucket->offset = (*ca)->mi.first_bucket;
1045 
1046                 if (bucket->offset < (*ca)->mi.nbuckets)
1047                         return true;
1048 
1049                 bch2_dev_put(*ca);
1050                 *ca = NULL;
1051                 bucket->inode++;
1052                 bucket->offset = 0;
1053         }
1054 
1055         rcu_read_lock();
1056         *ca = __bch2_next_dev_idx(c, bucket->inode, NULL);
1057         if (*ca) {
1058                 *bucket = POS((*ca)->dev_idx, (*ca)->mi.first_bucket);
1059                 bch2_dev_get(*ca);
1060         }
1061         rcu_read_unlock();
1062 
1063         return *ca != NULL;
1064 }
1065 
1066 static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter,
1067                                         struct bch_dev **ca, struct bkey *hole)
1068 {
1069         struct bch_fs *c = iter->trans->c;
1070         struct bkey_s_c k;
1071 again:
1072         k = bch2_get_key_or_hole(iter, POS_MAX, hole);
1073         if (bkey_err(k))
1074                 return k;
1075 
1076         *ca = bch2_dev_iterate_noerror(c, *ca, k.k->p.inode);
1077 
1078         if (!k.k->type) {
1079                 struct bpos hole_start = bkey_start_pos(k.k);
1080 
1081                 if (!*ca || !bucket_valid(*ca, hole_start.offset)) {
1082                         if (!next_bucket(c, ca, &hole_start))
1083                                 return bkey_s_c_null;
1084 
1085                         bch2_btree_iter_set_pos(iter, hole_start);
1086                         goto again;
1087                 }
1088 
1089                 if (k.k->p.offset > (*ca)->mi.nbuckets)
1090                         bch2_key_resize(hole, (*ca)->mi.nbuckets - hole_start.offset);
1091         }
1092 
1093         return k;
1094 }
1095 
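     /*
      * fsck: cross-check one alloc key against the need_discard,
      * freespace and bucket_gens btrees, rewriting index entries that
      * don't match the alloc key's state.
      */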
1096 static noinline_for_stack
1097 int bch2_check_alloc_key(struct btree_trans *trans,
1098                          struct bkey_s_c alloc_k,
1099                          struct btree_iter *alloc_iter,
1100                          struct btree_iter *discard_iter,
1101                          struct btree_iter *freespace_iter,
1102                          struct btree_iter *bucket_gens_iter)
1103 {
1104         struct bch_fs *c = trans->c;
1105         struct bch_alloc_v4 a_convert;
1106         const struct bch_alloc_v4 *a;
1107         unsigned discard_key_type, freespace_key_type;
1108         unsigned gens_offset;
1109         struct bkey_s_c k;
1110         struct printbuf buf = PRINTBUF;
1111         int ret = 0;
1112 
1113         struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, alloc_k.k->p);
1114         if (fsck_err_on(!ca,
1115                         trans, alloc_key_to_missing_dev_bucket,
1116                         "alloc key for invalid device:bucket %llu:%llu",
1117                         alloc_k.k->p.inode, alloc_k.k->p.offset))
1118                 ret = bch2_btree_delete_at(trans, alloc_iter, 0);
1119         if (!ca)
1120                 return ret;
1121 
1122         if (!ca->mi.freespace_initialized)
1123                 goto out;
1124 
1125         a = bch2_alloc_to_v4(alloc_k, &a_convert);
1126 
1127         discard_key_type = a->data_type == BCH_DATA_need_discard ? KEY_TYPE_set : 0;
1128         bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p);
1129         k = bch2_btree_iter_peek_slot(discard_iter);
1130         ret = bkey_err(k);
1131         if (ret)
1132                 goto err;
1133 
1134         if (fsck_err_on(k.k->type != discard_key_type,
1135                         trans, need_discard_key_wrong,
1136                         "incorrect key in need_discard btree (got %s should be %s)\n"
1137                         "  %s",
1138                         bch2_bkey_types[k.k->type],
1139                         bch2_bkey_types[discard_key_type],
1140                         (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
1141                 struct bkey_i *update =
1142                         bch2_trans_kmalloc(trans, sizeof(*update));
1143 
1144                 ret = PTR_ERR_OR_ZERO(update);
1145                 if (ret)
1146                         goto err;
1147 
1148                 bkey_init(&update->k);
1149                 update->k.type  = discard_key_type;
1150                 update->k.p     = discard_iter->pos;
1151 
1152                 ret = bch2_trans_update(trans, discard_iter, update, 0);
1153                 if (ret)
1154                         goto err;
1155         }
1156 
1157         freespace_key_type = a->data_type == BCH_DATA_free ? KEY_TYPE_set : 0;
1158         bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a));
1159         k = bch2_btree_iter_peek_slot(freespace_iter);
1160         ret = bkey_err(k);
1161         if (ret)
1162                 goto err;
1163 
1164         if (fsck_err_on(k.k->type != freespace_key_type,
1165                         trans, freespace_key_wrong,
1166                         "incorrect key in freespace btree (got %s should be %s)\n"
1167                         "  %s",
1168                         bch2_bkey_types[k.k->type],
1169                         bch2_bkey_types[freespace_key_type],
1170                         (printbuf_reset(&buf),
1171                          bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
1172                 struct bkey_i *update =
1173                         bch2_trans_kmalloc(trans, sizeof(*update));
1174 
1175                 ret = PTR_ERR_OR_ZERO(update);
1176                 if (ret)
1177                         goto err;
1178 
1179                 bkey_init(&update->k);
1180                 update->k.type  = freespace_key_type;
1181                 update->k.p     = freespace_iter->pos;
1182                 bch2_key_resize(&update->k, 1);
1183 
1184                 ret = bch2_trans_update(trans, freespace_iter, update, 0);
1185                 if (ret)
1186                         goto err;
1187         }
1188 
1189         bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset));
1190         k = bch2_btree_iter_peek_slot(bucket_gens_iter);
1191         ret = bkey_err(k);
1192         if (ret)
1193                 goto err;
1194 
1195         if (fsck_err_on(a->gen != alloc_gen(k, gens_offset),
1196                         trans, bucket_gens_key_wrong,
1197                         "incorrect gen in bucket_gens btree (got %u should be %u)\n"
1198                         "  %s",
1199                         alloc_gen(k, gens_offset), a->gen,
1200                         (printbuf_reset(&buf),
1201                          bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
1202                 struct bkey_i_bucket_gens *g =
1203                         bch2_trans_kmalloc(trans, sizeof(*g));
1204 
1205                 ret = PTR_ERR_OR_ZERO(g);
1206                 if (ret)
1207                         goto err;
1208 
1209                 if (k.k->type == KEY_TYPE_bucket_gens) {
1210                         bkey_reassemble(&g->k_i, k);
1211                 } else {
1212                         bkey_bucket_gens_init(&g->k_i);
1213                         g->k.p = alloc_gens_pos(alloc_k.k->p, &gens_offset);
1214                 }
1215 
1216                 g->v.gens[gens_offset] = a->gen;
1217 
1218                 ret = bch2_trans_update(trans, bucket_gens_iter, &g->k_i, 0);
1219                 if (ret)
1220                         goto err;
1221         }
1222 out:
1223 err:
1224 fsck_err:
1225         bch2_dev_put(ca);
1226         printbuf_exit(&buf);
1227         return ret;
1228 }
1229 
1230 static noinline_for_stack
1231 int bch2_check_alloc_hole_freespace(struct btree_trans *trans,
1232                                     struct bch_dev *ca,
1233                                     struct bpos start,
1234                                     struct bpos *end,
1235                                     struct btree_iter *freespace_iter)
1236 {
1237         struct bkey_s_c k;
1238         struct printbuf buf = PRINTBUF;
1239         int ret;
1240 
1241         if (!ca->mi.freespace_initialized)
1242                 return 0;
1243 
1244         bch2_btree_iter_set_pos(freespace_iter, start);
1245 
1246         k = bch2_btree_iter_peek_slot(freespace_iter);
1247         ret = bkey_err(k);
1248         if (ret)
1249                 goto err;
1250 
1251         *end = bkey_min(k.k->p, *end);
1252 
1253         if (fsck_err_on(k.k->type != KEY_TYPE_set,
1254                         trans, freespace_hole_missing,
1255                         "hole in alloc btree missing in freespace btree\n"
1256                         "  device %llu buckets %llu-%llu",
1257                         freespace_iter->pos.inode,
1258                         freespace_iter->pos.offset,
1259                         end->offset)) {
1260                 struct bkey_i *update =
1261                         bch2_trans_kmalloc(trans, sizeof(*update));
1262 
1263                 ret = PTR_ERR_OR_ZERO(update);
1264                 if (ret)
1265                         goto err;
1266 
1267                 bkey_init(&update->k);
1268                 update->k.type  = KEY_TYPE_set;
1269                 update->k.p     = freespace_iter->pos;
1270                 bch2_key_resize(&update->k,
1271                                 min_t(u64, U32_MAX, end->offset -
1272                                       freespace_iter->pos.offset));
1273 
1274                 ret = bch2_trans_update(trans, freespace_iter, update, 0);
1275                 if (ret)
1276                         goto err;
1277         }
1278 err:
1279 fsck_err:
1280         printbuf_exit(&buf);
1281         return ret;
1282 }
1283 
1284 static noinline_for_stack
1285 int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans,
1286                                       struct bpos start,
1287                                       struct bpos *end,
1288                                       struct btree_iter *bucket_gens_iter)
1289 {
1290         struct bkey_s_c k;
1291         struct printbuf buf = PRINTBUF;
1292         unsigned i, gens_offset, gens_end_offset;
1293         int ret;
1294 
1295         bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(start, &gens_offset));
1296 
1297         k = bch2_btree_iter_peek_slot(bucket_gens_iter);
1298         ret = bkey_err(k);
1299         if (ret)
1300                 goto err;
1301 
1302         if (bkey_cmp(alloc_gens_pos(start, &gens_offset),
1303                      alloc_gens_pos(*end,  &gens_end_offset)))
1304                 gens_end_offset = KEY_TYPE_BUCKET_GENS_NR;
1305 
1306         if (k.k->type == KEY_TYPE_bucket_gens) {
1307                 struct bkey_i_bucket_gens g;
1308                 bool need_update = false;
1309 
1310                 bkey_reassemble(&g.k_i, k);
1311 
1312                 for (i = gens_offset; i < gens_end_offset; i++) {
1313                         if (fsck_err_on(g.v.gens[i], trans,
1314                                         bucket_gens_hole_wrong,
1315                                         "hole in alloc btree at %llu:%llu with nonzero gen in bucket_gens btree (%u)",
1316                                         bucket_gens_pos_to_alloc(k.k->p, i).inode,
1317                                         bucket_gens_pos_to_alloc(k.k->p, i).offset,
1318                                         g.v.gens[i])) {
1319                                 g.v.gens[i] = 0;
1320                                 need_update = true;
1321                         }
1322                 }
1323 
1324                 if (need_update) {
1325                         struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g));
1326 
1327                         ret = PTR_ERR_OR_ZERO(u);
1328                         if (ret)
1329                                 goto err;
1330 
1331                         memcpy(u, &g, sizeof(g));
1332 
1333                         ret = bch2_trans_update(trans, bucket_gens_iter, u, 0);
1334                         if (ret)
1335                                 goto err;
1336                 }
1337         }
1338 
1339         *end = bkey_min(*end, bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0));
1340 err:
1341 fsck_err:
1342         printbuf_exit(&buf);
1343         return ret;
1344 }
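
/*
 * Editor's note -- an illustrative sketch, not part of the original file.
 * alloc_gens_pos() and bucket_gens_pos_to_alloc() translate between
 * alloc-btree positions (one key per bucket) and bucket_gens-btree
 * positions (one key holding a small array of gens). Assuming each
 * bucket_gens key covers 2^BITS buckets (BITS appears to be 8 in
 * current code, but treat that as an assumption here), the mapping is
 * a shift/mask pair:
 */
#include <stdint.h>
#include <assert.h>

#define SKETCH_GENS_BITS 8
#define SKETCH_GENS_NR   (1U << SKETCH_GENS_BITS)
#define SKETCH_GENS_MASK (SKETCH_GENS_NR - 1)

static uint64_t sketch_alloc_to_gens(uint64_t bucket, unsigned *offset)
{
	*offset = bucket & SKETCH_GENS_MASK;	/* slot within the key */
	return bucket >> SKETCH_GENS_BITS;	/* bucket_gens key pos */
}

static uint64_t sketch_gens_to_alloc(uint64_t gens_pos, unsigned offset)
{
	return (gens_pos << SKETCH_GENS_BITS) | offset;
}

int main(void)
{
	unsigned off;
	uint64_t g = sketch_alloc_to_gens(1000, &off);

	assert(sketch_gens_to_alloc(g, off) == 1000);
	return 0;
}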
1345 
1346 static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_trans *trans,
1347                                               struct btree_iter *iter)
1348 {
1349         struct bch_fs *c = trans->c;
1350         struct btree_iter alloc_iter;
1351         struct bkey_s_c alloc_k;
1352         struct bch_alloc_v4 a_convert;
1353         const struct bch_alloc_v4 *a;
1354         u64 genbits;
1355         struct bpos pos;
1356         enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard
1357                 ? BCH_DATA_need_discard
1358                 : BCH_DATA_free;
1359         struct printbuf buf = PRINTBUF;
1360         int ret;
1361 
1362         pos = iter->pos;
1363         pos.offset &= ~(~0ULL << 56);
1364         genbits = iter->pos.offset & (~0ULL << 56);
1365 
1366         alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, pos, 0);
1367         ret = bkey_err(alloc_k);
1368         if (ret)
1369                 return ret;
1370 
1371         if (fsck_err_on(!bch2_dev_bucket_exists(c, pos),
1372                         trans, need_discard_freespace_key_to_invalid_dev_bucket,
1373                         "entry in %s btree for nonexistent dev:bucket %llu:%llu",
1374                         bch2_btree_id_str(iter->btree_id), pos.inode, pos.offset))
1375                 goto delete;
1376 
1377         a = bch2_alloc_to_v4(alloc_k, &a_convert);
1378 
1379         if (fsck_err_on(a->data_type != state ||
1380                         (state == BCH_DATA_free &&
1381                          genbits != alloc_freespace_genbits(*a)),
1382                         trans, need_discard_freespace_key_bad,
1383                         "%s\n  incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)",
1384                         (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
1385                         bch2_btree_id_str(iter->btree_id),
1386                         iter->pos.inode,
1387                         iter->pos.offset,
1388                         a->data_type == state,
1389                         genbits >> 56, alloc_freespace_genbits(*a) >> 56))
1390                 goto delete;
1391 out:
1392 fsck_err:
1393         bch2_set_btree_iter_dontneed(&alloc_iter);
1394         bch2_trans_iter_exit(trans, &alloc_iter);
1395         printbuf_exit(&buf);
1396         return ret;
1397 delete:
1398         ret =   bch2_btree_delete_extent_at(trans, iter,
1399                         iter->btree_id == BTREE_ID_freespace ? 1 : 0, 0) ?:
1400                 bch2_trans_commit(trans, NULL, NULL,
1401                         BCH_TRANS_COMMIT_no_enospc);
1402         goto out;
1403 }
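
/*
 * Editor's note -- an illustrative sketch, not part of the original file.
 * The two mask operations at the top of this function split a
 * freespace-btree offset into the bucket number (low 56 bits) and the
 * generation bits (high 8 bits, compared against
 * alloc_freespace_genbits()). The same packing, modelled standalone:
 */
#include <stdint.h>
#include <assert.h>

#define SKETCH_GENBITS_SHIFT 56

static uint64_t sketch_pack(uint64_t bucket, uint8_t genbits)
{
	return ((uint64_t) genbits << SKETCH_GENBITS_SHIFT) |
		(bucket & ~(~0ULL << SKETCH_GENBITS_SHIFT));
}

static void sketch_unpack(uint64_t offset, uint64_t *bucket, uint8_t *genbits)
{
	*bucket  = offset & ~(~0ULL << SKETCH_GENBITS_SHIFT);
	*genbits = offset >> SKETCH_GENBITS_SHIFT;
}

int main(void)
{
	uint64_t bucket;
	uint8_t genbits;

	sketch_unpack(sketch_pack(12345, 3), &bucket, &genbits);
	assert(bucket == 12345 && genbits == 3);
	return 0;
}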
1404 
1405 /*
1406  * We've already checked that generation numbers in the bucket_gens btree are
1407  * valid for buckets that exist; this just checks for keys for nonexistent
1408  * buckets.
1409  */
1410 static noinline_for_stack
1411 int bch2_check_bucket_gens_key(struct btree_trans *trans,
1412                                struct btree_iter *iter,
1413                                struct bkey_s_c k)
1414 {
1415         struct bch_fs *c = trans->c;
1416         struct bkey_i_bucket_gens g;
1417         u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
1418         u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;
1419         u64 b;
1420         bool need_update = false;
1421         struct printbuf buf = PRINTBUF;
1422         int ret = 0;
1423 
1424         BUG_ON(k.k->type != KEY_TYPE_bucket_gens);
1425         bkey_reassemble(&g.k_i, k);
1426 
1427         struct bch_dev *ca = bch2_dev_tryget_noerror(c, k.k->p.inode);
1428         if (!ca) {
1429                 if (fsck_err(trans, bucket_gens_to_invalid_dev,
1430                              "bucket_gens key for invalid device:\n  %s",
1431                              (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1432                         ret = bch2_btree_delete_at(trans, iter, 0);
1433                 goto out;
1434         }
1435 
1436         if (fsck_err_on(end <= ca->mi.first_bucket ||
1437                         start >= ca->mi.nbuckets,
1438                         trans, bucket_gens_to_invalid_buckets,
1439                         "bucket_gens key for invalid buckets:\n  %s",
1440                         (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1441                 ret = bch2_btree_delete_at(trans, iter, 0);
1442                 goto out;
1443         }
1444 
1445         for (b = start; b < ca->mi.first_bucket; b++)
1446                 if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK],
1447                                 trans, bucket_gens_nonzero_for_invalid_buckets,
1448                                 "bucket_gens key has nonzero gen for invalid bucket")) {
1449                         g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
1450                         need_update = true;
1451                 }
1452 
1453         for (b = ca->mi.nbuckets; b < end; b++)
1454                 if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK],
1455                                 trans, bucket_gens_nonzero_for_invalid_buckets,
1456                                 "bucket_gens key has nonzero gen for invalid bucket")) {
1457                         g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
1458                         need_update = true;
1459                 }
1460 
1461         if (need_update) {
1462                 struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g));
1463 
1464                 ret = PTR_ERR_OR_ZERO(u);
1465                 if (ret)
1466                         goto out;
1467 
1468                 memcpy(u, &g, sizeof(g));
1469                 ret = bch2_trans_update(trans, iter, u, 0);
1470         }
1471 out:
1472 fsck_err:
1473         bch2_dev_put(ca);
1474         printbuf_exit(&buf);
1475         return ret;
1476 }
1477 
1478 int bch2_check_alloc_info(struct bch_fs *c)
1479 {
1480         struct btree_trans *trans = bch2_trans_get(c);
1481         struct btree_iter iter, discard_iter, freespace_iter, bucket_gens_iter;
1482         struct bch_dev *ca = NULL;
1483         struct bkey hole;
1484         struct bkey_s_c k;
1485         int ret = 0;
1486 
1487         bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS_MIN,
1488                              BTREE_ITER_prefetch);
1489         bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, POS_MIN,
1490                              BTREE_ITER_prefetch);
1491         bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace, POS_MIN,
1492                              BTREE_ITER_prefetch);
1493         bch2_trans_iter_init(trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN,
1494                              BTREE_ITER_prefetch);
1495 
1496         while (1) {
1497                 struct bpos next;
1498 
1499                 bch2_trans_begin(trans);
1500 
1501                 k = bch2_get_key_or_real_bucket_hole(&iter, &ca, &hole);
1502                 ret = bkey_err(k);
1503                 if (ret)
1504                         goto bkey_err;
1505 
1506                 if (!k.k)
1507                         break;
1508 
1509                 if (k.k->type) {
1510                         next = bpos_nosnap_successor(k.k->p);
1511 
1512                         ret = bch2_check_alloc_key(trans,
1513                                                    k, &iter,
1514                                                    &discard_iter,
1515                                                    &freespace_iter,
1516                                                    &bucket_gens_iter);
1517                         if (ret)
1518                                 goto bkey_err;
1519                 } else {
1520                         next = k.k->p;
1521 
1522                         ret = bch2_check_alloc_hole_freespace(trans, ca,
1523                                                     bkey_start_pos(k.k),
1524                                                     &next,
1525                                                     &freespace_iter) ?:
1526                                 bch2_check_alloc_hole_bucket_gens(trans,
1527                                                     bkey_start_pos(k.k),
1528                                                     &next,
1529                                                     &bucket_gens_iter);
1530                         if (ret)
1531                                 goto bkey_err;
1532                 }
1533 
1534                 ret = bch2_trans_commit(trans, NULL, NULL,
1535                                         BCH_TRANS_COMMIT_no_enospc);
1536                 if (ret)
1537                         goto bkey_err;
1538 
1539                 bch2_btree_iter_set_pos(&iter, next);
1540 bkey_err:
1541                 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1542                         continue;
1543                 if (ret)
1544                         break;
1545         }
1546         bch2_trans_iter_exit(trans, &bucket_gens_iter);
1547         bch2_trans_iter_exit(trans, &freespace_iter);
1548         bch2_trans_iter_exit(trans, &discard_iter);
1549         bch2_trans_iter_exit(trans, &iter);
1550         bch2_dev_put(ca);
1551         ca = NULL;
1552 
1553         if (ret < 0)
1554                 goto err;
1555 
1556         ret = for_each_btree_key(trans, iter,
1557                         BTREE_ID_need_discard, POS_MIN,
1558                         BTREE_ITER_prefetch, k,
1559                 bch2_check_discard_freespace_key(trans, &iter));
1560         if (ret)
1561                 goto err;
1562 
1563         bch2_trans_iter_init(trans, &iter, BTREE_ID_freespace, POS_MIN,
1564                              BTREE_ITER_prefetch);
1565         while (1) {
1566                 bch2_trans_begin(trans);
1567                 k = bch2_btree_iter_peek(&iter);
1568                 if (!k.k)
1569                         break;
1570 
1571                 ret = bkey_err(k) ?:
1572                         bch2_check_discard_freespace_key(trans, &iter);
1573                 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
1574                         ret = 0;
1575                         continue;
1576                 }
1577                 if (ret) {
1578                         struct printbuf buf = PRINTBUF;
1579                         bch2_bkey_val_to_text(&buf, c, k);
1580 
1581                         bch_err(c, "while checking %s", buf.buf);
1582                         printbuf_exit(&buf);
1583                         break;
1584                 }
1585 
1586                 bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos));
1587         }
1588         bch2_trans_iter_exit(trans, &iter);
1589         if (ret)
1590                 goto err;
1591 
1592         ret = for_each_btree_key_commit(trans, iter,
1593                         BTREE_ID_bucket_gens, POS_MIN,
1594                         BTREE_ITER_prefetch, k,
1595                         NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
1596                 bch2_check_bucket_gens_key(trans, &iter, k));
1597 err:
1598         bch2_trans_put(trans);
1599         bch_err_fn(c, ret);
1600         return ret;
1601 }
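
/*
 * Editor's note -- an illustrative sketch, not part of the original file.
 * bch2_check_alloc_info() above uses the usual bcachefs retry shape:
 * begin the transaction, do the work, commit, and on a
 * transaction-restart error loop again without advancing the iterator.
 * A self-contained userspace model of that control flow (everything
 * named sketch_* is invented for illustration):
 */
#include <stdio.h>

#define SKETCH_ERR_RESTART (-1000)

static int attempts;

static void sketch_trans_begin(void) { }

/* pretend the first two attempts race with another thread: */
static int sketch_do_work(unsigned long *pos)
{
	if (attempts++ < 2)
		return SKETCH_ERR_RESTART;
	(*pos)++;
	return 0;
}

static int sketch_commit(void) { return 0; }

int main(void)
{
	unsigned long pos = 0;
	int ret = 0;

	while (pos < 3) {
		sketch_trans_begin();

		ret = sketch_do_work(&pos) ?: sketch_commit();
		if (ret == SKETCH_ERR_RESTART)
			continue;	/* redo the whole iteration */
		if (ret)
			break;		/* real error: give up */
	}
	printf("finished at pos %lu, ret %d\n", pos, ret);
	return 0;
}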
1602 
1603 static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
1604                                        struct btree_iter *alloc_iter,
1605                                        struct bkey_buf *last_flushed)
1606 {
1607         struct bch_fs *c = trans->c;
1608         struct bch_alloc_v4 a_convert;
1609         const struct bch_alloc_v4 *a;
1610         struct bkey_s_c alloc_k;
1611         struct printbuf buf = PRINTBUF;
1612         int ret;
1613 
1614         alloc_k = bch2_btree_iter_peek(alloc_iter);
1615         if (!alloc_k.k)
1616                 return 0;
1617 
1618         ret = bkey_err(alloc_k);
1619         if (ret)
1620                 return ret;
1621 
1622         a = bch2_alloc_to_v4(alloc_k, &a_convert);
1623 
1624         if (a->fragmentation_lru) {
1625                 ret = bch2_lru_check_set(trans, BCH_LRU_FRAGMENTATION_START,
1626                                          a->fragmentation_lru,
1627                                          alloc_k, last_flushed);
1628                 if (ret)
1629                         return ret;
1630         }
1631 
1632         if (a->data_type != BCH_DATA_cached)
1633                 return 0;
1634 
1635         if (fsck_err_on(!a->io_time[READ],
1636                         trans, alloc_key_cached_but_read_time_zero,
1637                         "cached bucket with read_time 0\n"
1638                         "  %s",
1639                 (printbuf_reset(&buf),
1640                  bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
1641                 struct bkey_i_alloc_v4 *a_mut =
1642                         bch2_alloc_to_v4_mut(trans, alloc_k);
1643                 ret = PTR_ERR_OR_ZERO(a_mut);
1644                 if (ret)
1645                         goto err;
1646 
1647                 a_mut->v.io_time[READ] = bch2_current_io_time(c, READ);
1648                 ret = bch2_trans_update(trans, alloc_iter,
1649                                         &a_mut->k_i, BTREE_TRIGGER_norun);
1650                 if (ret)
1651                         goto err;
1652 
1653                 a = &a_mut->v;
1654         }
1655 
1656         ret = bch2_lru_check_set(trans, alloc_k.k->p.inode, a->io_time[READ],
1657                                  alloc_k, last_flushed);
1658         if (ret)
1659                 goto err;
1660 err:
1661 fsck_err:
1662         printbuf_exit(&buf);
1663         return ret;
1664 }
1665 
1666 int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
1667 {
1668         struct bkey_buf last_flushed;
1669 
1670         bch2_bkey_buf_init(&last_flushed);
1671         bkey_init(&last_flushed.k->k);
1672 
1673         int ret = bch2_trans_run(c,
1674                 for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
1675                                 POS_MIN, BTREE_ITER_prefetch, k,
1676                                 NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
1677                         bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed)));
1678 
1679         bch2_bkey_buf_exit(&last_flushed, c);
1680         bch_err_fn(c, ret);
1681         return ret;
1682 }
1683 
1684 static int discard_in_flight_add(struct bch_dev *ca, u64 bucket, bool in_progress)
1685 {
1686         int ret;
1687 
1688         mutex_lock(&ca->discard_buckets_in_flight_lock);
1689         darray_for_each(ca->discard_buckets_in_flight, i)
1690                 if (i->bucket == bucket) {
1691                         ret = -BCH_ERR_EEXIST_discard_in_flight_add;
1692                         goto out;
1693                 }
1694 
1695         ret = darray_push(&ca->discard_buckets_in_flight, ((struct discard_in_flight) {
1696                            .in_progress = in_progress,
1697                            .bucket      = bucket,
1698         }));
1699 out:
1700         mutex_unlock(&ca->discard_buckets_in_flight_lock);
1701         return ret;
1702 }
1703 
1704 static void discard_in_flight_remove(struct bch_dev *ca, u64 bucket)
1705 {
1706         mutex_lock(&ca->discard_buckets_in_flight_lock);
1707         darray_for_each(ca->discard_buckets_in_flight, i)
1708                 if (i->bucket == bucket) {
1709                         BUG_ON(!i->in_progress);
1710                         darray_remove_item(&ca->discard_buckets_in_flight, i);
1711                         goto found;
1712                 }
1713         BUG();
1714 found:
1715         mutex_unlock(&ca->discard_buckets_in_flight_lock);
1716 }
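
/*
 * Editor's note -- an illustrative sketch, not part of the original file.
 * discard_in_flight_add()/remove() implement a small mutex-protected
 * set: adding a bucket that is already queued fails with EEXIST, which
 * is what lets the main discard worker and the fast path share one
 * dedup list. A minimal userspace model using pthreads and a
 * fixed-size array instead of a darray:
 */
#include <pthread.h>
#include <errno.h>
#include <stdint.h>

#define SKETCH_MAX_IN_FLIGHT 64

static pthread_mutex_t sketch_lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t sketch_buckets[SKETCH_MAX_IN_FLIGHT];
static unsigned sketch_nr;

static int sketch_in_flight_add(uint64_t bucket)
{
	int ret = 0;

	pthread_mutex_lock(&sketch_lock);
	for (unsigned i = 0; i < sketch_nr; i++)
		if (sketch_buckets[i] == bucket) {
			ret = -EEXIST;	/* already queued: caller skips it */
			goto out;
		}
	if (sketch_nr == SKETCH_MAX_IN_FLIGHT) {
		ret = -ENOMEM;
		goto out;
	}
	sketch_buckets[sketch_nr++] = bucket;
out:
	pthread_mutex_unlock(&sketch_lock);
	return ret;
}

static void sketch_in_flight_remove(uint64_t bucket)
{
	pthread_mutex_lock(&sketch_lock);
	for (unsigned i = 0; i < sketch_nr; i++)
		if (sketch_buckets[i] == bucket) {
			/* swap-remove, order doesn't matter here: */
			sketch_buckets[i] = sketch_buckets[--sketch_nr];
			break;
		}
	pthread_mutex_unlock(&sketch_lock);
}

int main(void)
{
	sketch_in_flight_add(7);
	sketch_in_flight_add(7);	/* returns -EEXIST */
	sketch_in_flight_remove(7);
	return 0;
}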
1717 
1718 struct discard_buckets_state {
1719         u64             seen;
1720         u64             open;
1721         u64             need_journal_commit;
1722         u64             discarded;
1723         u64             need_journal_commit_this_dev;
1724 };
1725 
1726 static int bch2_discard_one_bucket(struct btree_trans *trans,
1727                                    struct bch_dev *ca,
1728                                    struct btree_iter *need_discard_iter,
1729                                    struct bpos *discard_pos_done,
1730                                    struct discard_buckets_state *s)
1731 {
1732         struct bch_fs *c = trans->c;
1733         struct bpos pos = need_discard_iter->pos;
1734         struct btree_iter iter = { NULL };
1735         struct bkey_s_c k;
1736         struct bkey_i_alloc_v4 *a;
1737         struct printbuf buf = PRINTBUF;
1738         bool discard_locked = false;
1739         int ret = 0;
1740 
1741         if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) {
1742                 s->open++;
1743                 goto out;
1744         }
1745 
1746         if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
1747                         c->journal.flushed_seq_ondisk,
1748                         pos.inode, pos.offset)) {
1749                 s->need_journal_commit++;
1750                 s->need_journal_commit_this_dev++;
1751                 goto out;
1752         }
1753 
1754         k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
1755                                need_discard_iter->pos,
1756                                BTREE_ITER_cached);
1757         ret = bkey_err(k);
1758         if (ret)
1759                 goto out;
1760 
1761         a = bch2_alloc_to_v4_mut(trans, k);
1762         ret = PTR_ERR_OR_ZERO(a);
1763         if (ret)
1764                 goto out;
1765 
1766         if (bch2_bucket_sectors_total(a->v)) {
1767                 if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
1768                                                trans, "attempting to discard bucket with dirty data\n%s",
1769                                                (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1770                         ret = -EIO;
1771                 goto out;
1772         }
1773 
1774         if (a->v.data_type != BCH_DATA_need_discard) {
1775                 if (data_type_is_empty(a->v.data_type) &&
1776                     BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) {
1777                         a->v.gen++;
1778                         SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
1779                         goto write;
1780                 }
1781 
1782                 if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
1783                                                trans, "bucket incorrectly set in need_discard btree\n"
1784                                                "%s",
1785                                                (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1786                         ret = -EIO;
1787                 goto out;
1788         }
1789 
1790         if (a->v.journal_seq > c->journal.flushed_seq_ondisk) {
1791                 if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
1792                                                trans, "clearing need_discard but journal_seq %llu > flushed_seq %llu\n%s",
1793                                                a->v.journal_seq,
1794                                                c->journal.flushed_seq_ondisk,
1795                                                (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1796                         ret = -EIO;
1797                 goto out;
1798         }
1799 
1800         if (discard_in_flight_add(ca, iter.pos.offset, true))
1801                 goto out;
1802 
1803         discard_locked = true;
1804 
1805         if (!bkey_eq(*discard_pos_done, iter.pos) &&
1806             ca->mi.discard && !c->opts.nochanges) {
1807                 /*
1808                  * This works without any other locks because this is the only
1809                  * thread that removes items from the need_discard tree
1810                  */
1811                 bch2_trans_unlock_long(trans);
1812                 blkdev_issue_discard(ca->disk_sb.bdev,
1813                                      k.k->p.offset * ca->mi.bucket_size,
1814                                      ca->mi.bucket_size,
1815                                      GFP_KERNEL);
1816                 *discard_pos_done = iter.pos;
1817 
1818                 ret = bch2_trans_relock_notrace(trans);
1819                 if (ret)
1820                         goto out;
1821         }
1822 
1823         SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
1824 write:
1825         alloc_data_type_set(&a->v, a->v.data_type);
1826 
1827         ret =   bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
1828                 bch2_trans_commit(trans, NULL, NULL,
1829                                   BCH_WATERMARK_btree|
1830                                   BCH_TRANS_COMMIT_no_enospc);
1831         if (ret)
1832                 goto out;
1833 
1834         count_event(c, bucket_discard);
1835         s->discarded++;
1836 out:
1837         if (discard_locked)
1838                 discard_in_flight_remove(ca, iter.pos.offset);
1839         s->seen++;
1840         bch2_trans_iter_exit(trans, &iter);
1841         printbuf_exit(&buf);
1842         return ret;
1843 }
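
/*
 * Editor's note -- an illustrative sketch, not part of the original file.
 * The blkdev_issue_discard() call above covers exactly one bucket: the
 * device-relative start sector is bucket_number * bucket_size (with
 * bucket_size in 512-byte sectors) and the length is one bucket_size.
 * Worked arithmetic, with an assumed 1 MiB bucket:
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t bucket_size = 2048;	/* sectors: 2048 * 512 B = 1 MiB */
	uint64_t bucket = 100;

	uint64_t start_sector = bucket * bucket_size;	/* 204800 */
	uint64_t nr_sectors   = bucket_size;		/* 2048   */

	printf("discard sectors [%llu, %llu)\n",
	       (unsigned long long) start_sector,
	       (unsigned long long) (start_sector + nr_sectors));
	return 0;
}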
1844 
1845 static void bch2_do_discards_work(struct work_struct *work)
1846 {
1847         struct bch_dev *ca = container_of(work, struct bch_dev, discard_work);
1848         struct bch_fs *c = ca->fs;
1849         struct discard_buckets_state s = {};
1850         struct bpos discard_pos_done = POS_MAX;
1851         int ret;
1852 
1853         /*
1854          * We're doing the commit in bch2_discard_one_bucket instead of using
1855          * for_each_btree_key_commit() so that we can increment counters after
1856          * successful commit:
1857          */
1858         ret = bch2_trans_run(c,
1859                 for_each_btree_key_upto(trans, iter,
1860                                    BTREE_ID_need_discard,
1861                                    POS(ca->dev_idx, 0),
1862                                    POS(ca->dev_idx, U64_MAX), 0, k,
1863                         bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s)));
1864 
1865         trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded,
1866                               bch2_err_str(ret));
1867 
1868         bch2_write_ref_put(c, BCH_WRITE_REF_discard);
1869         percpu_ref_put(&ca->io_ref);
1870 }
1871 
1872 void bch2_dev_do_discards(struct bch_dev *ca)
1873 {
1874         struct bch_fs *c = ca->fs;
1875 
1876         if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
1877                 return;
1878 
1879         if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard))
1880                 goto put_ioref;
1881 
1882         if (queue_work(c->write_ref_wq, &ca->discard_work))
1883                 return;
1884 
1885         bch2_write_ref_put(c, BCH_WRITE_REF_discard);
1886 put_ioref:
1887         percpu_ref_put(&ca->io_ref);
1888 }
1889 
1890 void bch2_do_discards(struct bch_fs *c)
1891 {
1892         for_each_member_device(c, ca)
1893                 bch2_dev_do_discards(ca);
1894 }
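
/*
 * Editor's note -- an illustrative sketch, not part of the original file.
 * bch2_dev_do_discards() above shows the reference pattern all three
 * background workers in this file use: take the device io ref, then
 * the filesystem write ref, then queue the work item; on any failure
 * unwind exactly the refs already taken, and on success the worker
 * drops both refs when it finishes. Skeleton with stubbed-out refs
 * (all sketch_* names invented for illustration):
 */
#include <stdbool.h>

static bool sketch_get_ioref(void)        { return true; }
static bool sketch_tryget_write_ref(void) { return true; }
static bool sketch_queue_work(void)       { return true; }
static void sketch_put_write_ref(void)    { }
static void sketch_put_ioref(void)        { }

static void sketch_kick_worker(void)
{
	if (!sketch_get_ioref())
		return;

	if (!sketch_tryget_write_ref())
		goto put_ioref;

	if (sketch_queue_work())
		return;		/* the worker now owns both refs */

	/* queueing failed (work already pending): drop what we took */
	sketch_put_write_ref();
put_ioref:
	sketch_put_ioref();
}

int main(void)
{
	sketch_kick_worker();
	return 0;
}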
1895 
1896 static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpos bucket)
1897 {
1898         struct btree_iter iter;
1899         bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_intent);
1900         struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
1901         int ret = bkey_err(k);
1902         if (ret)
1903                 goto err;
1904 
1905         struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut(trans, k);
1906         ret = PTR_ERR_OR_ZERO(a);
1907         if (ret)
1908                 goto err;
1909 
1910         BUG_ON(a->v.dirty_sectors);
1911         SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
1912         alloc_data_type_set(&a->v, a->v.data_type);
1913 
1914         ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
1915 err:
1916         bch2_trans_iter_exit(trans, &iter);
1917         return ret;
1918 }
1919 
1920 static void bch2_do_discards_fast_work(struct work_struct *work)
1921 {
1922         struct bch_dev *ca = container_of(work, struct bch_dev, discard_fast_work);
1923         struct bch_fs *c = ca->fs;
1924 
1925         while (1) {
1926                 bool got_bucket = false;
1927                 u64 bucket;
1928 
1929                 mutex_lock(&ca->discard_buckets_in_flight_lock);
1930                 darray_for_each(ca->discard_buckets_in_flight, i) {
1931                         if (i->in_progress)
1932                                 continue;
1933 
1934                         got_bucket = true;
1935                         bucket = i->bucket;
1936                         i->in_progress = true;
1937                         break;
1938                 }
1939                 mutex_unlock(&ca->discard_buckets_in_flight_lock);
1940 
1941                 if (!got_bucket)
1942                         break;
1943 
1944                 if (ca->mi.discard && !c->opts.nochanges)
1945                         blkdev_issue_discard(ca->disk_sb.bdev,
1946                                              bucket_to_sector(ca, bucket),
1947                                              ca->mi.bucket_size,
1948                                              GFP_KERNEL);
1949 
1950                 int ret = bch2_trans_do(c, NULL, NULL,
1951                         BCH_WATERMARK_btree|
1952                         BCH_TRANS_COMMIT_no_enospc,
1953                         bch2_clear_bucket_needs_discard(trans, POS(ca->dev_idx, bucket)));
1954                 bch_err_fn(c, ret);
1955 
1956                 discard_in_flight_remove(ca, bucket);
1957 
1958                 if (ret)
1959                         break;
1960         }
1961 
1962         bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
1963         percpu_ref_put(&ca->io_ref);
1964 }
1965 
1966 static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket)
1967 {
1968         struct bch_fs *c = ca->fs;
1969 
1970         if (discard_in_flight_add(ca, bucket, false))
1971                 return;
1972 
1973         if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
1974                 return;
1975 
1976         if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast))
1977                 goto put_ioref;
1978 
1979         if (queue_work(c->write_ref_wq, &ca->discard_fast_work))
1980                 return;
1981 
1982         bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
1983 put_ioref:
1984         percpu_ref_put(&ca->io_ref);
1985 }
1986 
1987 static int invalidate_one_bucket(struct btree_trans *trans,
1988                                  struct btree_iter *lru_iter,
1989                                  struct bkey_s_c lru_k,
1990                                  s64 *nr_to_invalidate)
1991 {
1992         struct bch_fs *c = trans->c;
1993         struct bkey_i_alloc_v4 *a = NULL;
1994         struct printbuf buf = PRINTBUF;
1995         struct bpos bucket = u64_to_bucket(lru_k.k->p.offset);
1996         unsigned cached_sectors;
1997         int ret = 0;
1998 
1999         if (*nr_to_invalidate <= 0)
2000                 return 1;
2001 
2002         if (!bch2_dev_bucket_exists(c, bucket)) {
2003                 prt_str(&buf, "lru entry points to invalid bucket");
2004                 goto err;
2005         }
2006 
2007         if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset))
2008                 return 0;
2009 
2010         a = bch2_trans_start_alloc_update(trans, bucket, BTREE_TRIGGER_bucket_invalidate);
2011         ret = PTR_ERR_OR_ZERO(a);
2012         if (ret)
2013                 goto out;
2014 
2015         /* We expect harmless races here due to the btree write buffer: */
2016         if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(a->v))
2017                 goto out;
2018 
2019         BUG_ON(a->v.data_type != BCH_DATA_cached);
2020         BUG_ON(a->v.dirty_sectors);
2021 
2022         if (!a->v.cached_sectors)
2023                 bch_err(c, "invalidating empty bucket, confused");
2024 
2025         cached_sectors = a->v.cached_sectors;
2026 
2027         SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
2028         a->v.gen++;
2029         a->v.data_type          = 0;
2030         a->v.dirty_sectors      = 0;
2031         a->v.stripe_sectors     = 0;
2032         a->v.cached_sectors     = 0;
2033         a->v.io_time[READ]      = bch2_current_io_time(c, READ);
2034         a->v.io_time[WRITE]     = bch2_current_io_time(c, WRITE);
2035 
2036         ret = bch2_trans_commit(trans, NULL, NULL,
2037                                 BCH_WATERMARK_btree|
2038                                 BCH_TRANS_COMMIT_no_enospc);
2039         if (ret)
2040                 goto out;
2041 
2042         trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors);
2043         --*nr_to_invalidate;
2044 out:
2045         printbuf_exit(&buf);
2046         return ret;
2047 err:
2048         prt_str(&buf, "\n  lru key: ");
2049         bch2_bkey_val_to_text(&buf, c, lru_k);
2050 
2051         prt_str(&buf, "\n  lru entry: ");
2052         bch2_lru_pos_to_text(&buf, lru_iter->pos);
2053 
2054         prt_str(&buf, "\n  alloc key: ");
2055         if (!a)
2056                 bch2_bpos_to_text(&buf, bucket);
2057         else
2058                 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i));
2059 
2060         bch_err(c, "%s", buf.buf);
2061         if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_lrus) {
2062                 bch2_inconsistent_error(c);
2063                 ret = -EINVAL;
2064         }
2065 
2066         goto out;
2067 }
2068 
2069 static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter *iter,
2070                                     struct bch_dev *ca, bool *wrapped)
2071 {
2072         struct bkey_s_c k;
2073 again:
2074         k = bch2_btree_iter_peek_upto(iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX));
2075         if (!k.k && !*wrapped) {
2076                 bch2_btree_iter_set_pos(iter, lru_pos(ca->dev_idx, 0, 0));
2077                 *wrapped = true;
2078                 goto again;
2079         }
2080 
2081         return k;
2082 }
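
/*
 * Editor's note -- an illustrative sketch, not part of the original file.
 * next_lru_key() scans the LRU from a midpoint (the "clock hand") and,
 * if it falls off the end before finding anything, wraps once back to
 * position 0. The same shape over a plain array; unlike the kernel
 * version, this sketch stops the wrapped pass at the original start
 * rather than rescanning the whole range:
 */
#include <stdbool.h>
#include <stdio.h>

static int sketch_next(const int *lru, unsigned nr, unsigned start,
		       unsigned *pos, bool *wrapped)
{
again:
	while (*pos < (*wrapped ? start : nr)) {
		if (lru[*pos])
			return lru[(*pos)++];
		(*pos)++;
	}
	if (!*wrapped) {
		*pos = 0;
		*wrapped = true;
		goto again;
	}
	return 0;	/* nothing left to visit */
}

int main(void)
{
	int lru[] = { 5, 0, 7, 0, 9 };
	unsigned start = 3, pos = start;
	bool wrapped = false;
	int v;

	while ((v = sketch_next(lru, 5, start, &pos, &wrapped)))
		printf("%d\n", v);	/* 9, then 5 and 7 after wrapping */
	return 0;
}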
2083 
2084 static void bch2_do_invalidates_work(struct work_struct *work)
2085 {
2086         struct bch_dev *ca = container_of(work, struct bch_dev, invalidate_work);
2087         struct bch_fs *c = ca->fs;
2088         struct btree_trans *trans = bch2_trans_get(c);
2089         int ret = 0;
2090 
2091         ret = bch2_btree_write_buffer_tryflush(trans);
2092         if (ret)
2093                 goto err;
2094 
2095         s64 nr_to_invalidate =
2096                 should_invalidate_buckets(ca, bch2_dev_usage_read(ca));
2097         struct btree_iter iter;
2098         bool wrapped = false;
2099 
2100         bch2_trans_iter_init(trans, &iter, BTREE_ID_lru,
2101                              lru_pos(ca->dev_idx, 0,
2102                                      ((bch2_current_io_time(c, READ) + U32_MAX) &
2103                                       LRU_TIME_MAX)), 0);
2104 
2105         while (true) {
2106                 bch2_trans_begin(trans);
2107 
2108                 struct bkey_s_c k = next_lru_key(trans, &iter, ca, &wrapped);
2109                 ret = bkey_err(k);
2110                 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
2111                         continue;
2112                 if (ret)
2113                         break;
2114                 if (!k.k)
2115                         break;
2116 
2117                 ret = invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate);
2118                 if (ret)
2119                         break;
2120 
2121                 bch2_btree_iter_advance(&iter);
2122         }
2123         bch2_trans_iter_exit(trans, &iter);
2124 err:
2125         bch2_trans_put(trans);
2126         bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
2127         percpu_ref_put(&ca->io_ref);
2128 }
2129 
2130 void bch2_dev_do_invalidates(struct bch_dev *ca)
2131 {
2132         struct bch_fs *c = ca->fs;
2133 
2134         if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
2135                 return;
2136 
2137         if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate))
2138                 goto put_ioref;
2139 
2140         if (queue_work(c->write_ref_wq, &ca->invalidate_work))
2141                 return;
2142 
2143         bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
2144 put_ioref:
2145         percpu_ref_put(&ca->io_ref);
2146 }
2147 
2148 void bch2_do_invalidates(struct bch_fs *c)
2149 {
2150         for_each_member_device(c, ca)
2151                 bch2_dev_do_invalidates(ca);
2152 }
2153 
2154 int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
2155                             u64 bucket_start, u64 bucket_end)
2156 {
2157         struct btree_trans *trans = bch2_trans_get(c);
2158         struct btree_iter iter;
2159         struct bkey_s_c k;
2160         struct bkey hole;
2161         struct bpos end = POS(ca->dev_idx, bucket_end);
2162         struct bch_member *m;
2163         unsigned long last_updated = jiffies;
2164         int ret;
2165 
2166         BUG_ON(bucket_start > bucket_end);
2167         BUG_ON(bucket_end > ca->mi.nbuckets);
2168 
2169         bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
2170                 POS(ca->dev_idx, max_t(u64, ca->mi.first_bucket, bucket_start)),
2171                 BTREE_ITER_prefetch);
2172         /*
2173          * Scan the alloc btree for every bucket on @ca, and add buckets to the
2174          * freespace/need_discard/need_gc_gens btrees as needed:
2175          */
2176         while (1) {
2177                 if (last_updated + HZ * 10 < jiffies) {
2178                         bch_info(ca, "%s: currently at %llu/%llu",
2179                                  __func__, iter.pos.offset, ca->mi.nbuckets);
2180                         last_updated = jiffies;
2181                 }
2182 
2183                 bch2_trans_begin(trans);
2184 
2185                 if (bkey_ge(iter.pos, end)) {
2186                         ret = 0;
2187                         break;
2188                 }
2189 
2190                 k = bch2_get_key_or_hole(&iter, end, &hole);
2191                 ret = bkey_err(k);
2192                 if (ret)
2193                         goto bkey_err;
2194 
2195                 if (k.k->type) {
2196                         /*
2197                          * We process live keys in the alloc btree one at a
2198                          * time:
2199                          */
2200                         struct bch_alloc_v4 a_convert;
2201                         const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
2202 
2203                         ret =   bch2_bucket_do_index(trans, ca, k, a, true) ?:
2204                                 bch2_trans_commit(trans, NULL, NULL,
2205                                                   BCH_TRANS_COMMIT_no_enospc);
2206                         if (ret)
2207                                 goto bkey_err;
2208 
2209                         bch2_btree_iter_advance(&iter);
2210                 } else {
2211                         struct bkey_i *freespace;
2212 
2213                         freespace = bch2_trans_kmalloc(trans, sizeof(*freespace));
2214                         ret = PTR_ERR_OR_ZERO(freespace);
2215                         if (ret)
2216                                 goto bkey_err;
2217 
2218                         bkey_init(&freespace->k);
2219                         freespace->k.type       = KEY_TYPE_set;
2220                         freespace->k.p          = k.k->p;
2221                         freespace->k.size       = k.k->size;
2222 
2223                         ret = bch2_btree_insert_trans(trans, BTREE_ID_freespace, freespace, 0) ?:
2224                                 bch2_trans_commit(trans, NULL, NULL,
2225                                                   BCH_TRANS_COMMIT_no_enospc);
2226                         if (ret)
2227                                 goto bkey_err;
2228 
2229                         bch2_btree_iter_set_pos(&iter, k.k->p);
2230                 }
2231 bkey_err:
2232                 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
2233                         continue;
2234                 if (ret)
2235                         break;
2236         }
2237 
2238         bch2_trans_iter_exit(trans, &iter);
2239         bch2_trans_put(trans);
2240 
2241         if (ret < 0) {
2242                 bch_err_msg(ca, ret, "initializing free space");
2243                 return ret;
2244         }
2245 
2246         mutex_lock(&c->sb_lock);
2247         m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
2248         SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true);
2249         mutex_unlock(&c->sb_lock);
2250 
2251         return 0;
2252 }
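
/*
 * Editor's note -- an illustrative sketch, not part of the original file.
 * The jiffies check at the top of the loop above rate-limits progress
 * messages to one every ten seconds no matter how fast the loop spins.
 * The same pattern with wall-clock time:
 */
#include <stdio.h>
#include <time.h>

int main(void)
{
	time_t last_updated = time(NULL);

	for (unsigned long i = 0; i < 100000000UL; i++) {
		if (time(NULL) >= last_updated + 10) {
			printf("currently at %lu\n", i);
			last_updated = time(NULL);
		}
		/* ... real per-iteration work would go here ... */
	}
	return 0;
}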
2253 
2254 int bch2_fs_freespace_init(struct bch_fs *c)
2255 {
2256         int ret = 0;
2257         bool doing_init = false;
2258 
2259         /*
2260          * We can crash during the device add path, so we need to check this on
2261          * every mount:
2262          */
2263 
2264         for_each_member_device(c, ca) {
2265                 if (ca->mi.freespace_initialized)
2266                         continue;
2267 
2268                 if (!doing_init) {
2269                         bch_info(c, "initializing freespace");
2270                         doing_init = true;
2271                 }
2272 
2273                 ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
2274                 if (ret) {
2275                         bch2_dev_put(ca);
2276                         bch_err_fn(c, ret);
2277                         return ret;
2278                 }
2279         }
2280 
2281         if (doing_init) {
2282                 mutex_lock(&c->sb_lock);
2283                 bch2_write_super(c);
2284                 mutex_unlock(&c->sb_lock);
2285                 bch_verbose(c, "done initializing freespace");
2286         }
2287 
2288         return 0;
2289 }
2290 
2291 /* Bucket IO clocks: */
2292 
2293 int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
2294                               size_t bucket_nr, int rw)
2295 {
2296         struct bch_fs *c = trans->c;
2297         struct btree_iter iter;
2298         struct bkey_i_alloc_v4 *a;
2299         u64 now;
2300         int ret = 0;
2301 
2302         if (bch2_trans_relock(trans))
2303                 bch2_trans_begin(trans);
2304 
2305         a = bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(dev, bucket_nr));
2306         ret = PTR_ERR_OR_ZERO(a);
2307         if (ret)
2308                 return ret;
2309 
2310         now = bch2_current_io_time(c, rw);
2311         if (a->v.io_time[rw] == now)
2312                 goto out;
2313 
2314         a->v.io_time[rw] = now;
2315 
2316         ret   = bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
2317                 bch2_trans_commit(trans, NULL, NULL, 0);
2318 out:
2319         bch2_trans_iter_exit(trans, &iter);
2320         return ret;
2321 }
2322 
2323 /* Startup/shutdown (ro/rw): */
2324 
2325 void bch2_recalc_capacity(struct bch_fs *c)
2326 {
2327         u64 capacity = 0, reserved_sectors = 0, gc_reserve;
2328         unsigned bucket_size_max = 0;
2329         unsigned long ra_pages = 0;
2330 
2331         lockdep_assert_held(&c->state_lock);
2332 
2333         for_each_online_member(c, ca) {
2334                 struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi;
2335 
2336                 ra_pages += bdi->ra_pages;
2337         }
2338 
2339         bch2_set_ra_pages(c, ra_pages);
2340 
2341         for_each_rw_member(c, ca) {
2342                 u64 dev_reserve = 0;
2343 
2344                 /*
2345                  * We need to reserve buckets (from the number
2346                  * of currently available buckets) against
2347                  * foreground writes so that mainly copygc can
2348                  * make forward progress.
2349                  *
2350                  * We need enough to refill the various reserves
2351                  * from scratch - copygc will use its entire
2352                  * reserve all at once, then run again when
2353                  * its reserve is refilled (from the formerly
2354                  * available buckets).
2355                  *
2356                  * This reserve is just used when considering if
2357                  * allocations for foreground writes must wait -
2358                  * not -ENOSPC calculations.
2359                  */
2360 
2361                 dev_reserve += ca->nr_btree_reserve * 2;
2362                 dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */
2363 
2364                 dev_reserve += 1;       /* btree write point */
2365                 dev_reserve += 1;       /* copygc write point */
2366                 dev_reserve += 1;       /* rebalance write point */
2367 
2368                 dev_reserve *= ca->mi.bucket_size;
2369 
2370                 capacity += bucket_to_sector(ca, ca->mi.nbuckets -
2371                                              ca->mi.first_bucket);
2372 
2373                 reserved_sectors += dev_reserve * 2;
2374 
2375                 bucket_size_max = max_t(unsigned, bucket_size_max,
2376                                         ca->mi.bucket_size);
2377         }
2378 
2379         gc_reserve = c->opts.gc_reserve_bytes
2380                 ? c->opts.gc_reserve_bytes >> 9
2381                 : div64_u64(capacity * c->opts.gc_reserve_percent, 100);
2382 
2383         reserved_sectors = max(gc_reserve, reserved_sectors);
2384 
2385         reserved_sectors = min(reserved_sectors, capacity);
2386 
2387         c->reserved = reserved_sectors;
2388         c->capacity = capacity - reserved_sectors;
2389 
2390         c->bucket_size_max = bucket_size_max;
2391 
2392         /* Wake up in case someone was waiting for buckets */
2393         closure_wake_up(&c->freelist_wait);
2394 }
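
/*
 * Editor's note -- an illustrative sketch, not part of the original file.
 * A worked example of the per-device reserve computed above, assuming
 * a device with 1,000,000 buckets, a bucket_size of 2048 sectors, and
 * an assumed nr_btree_reserve of 512 (that value is invented for the
 * example):
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t nbuckets         = 1000000;
	uint64_t bucket_size      = 2048;	/* sectors */
	uint64_t nr_btree_reserve = 512;	/* assumption */

	uint64_t dev_reserve = 0;

	dev_reserve += nr_btree_reserve * 2;	/* 1024 buckets */
	dev_reserve += nbuckets >> 6;		/* copygc: 15625 buckets */
	dev_reserve += 3;			/* btree/copygc/rebalance write points */
	dev_reserve *= bucket_size;		/* -> 34103296 sectors */

	/* reserved_sectors then accumulates dev_reserve * 2 per device */
	printf("dev_reserve = %llu sectors (%llu MiB)\n",
	       (unsigned long long) dev_reserve,
	       (unsigned long long) (dev_reserve * 512 >> 20));
	return 0;
}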
2395 
2396 u64 bch2_min_rw_member_capacity(struct bch_fs *c)
2397 {
2398         u64 ret = U64_MAX;
2399 
2400         for_each_rw_member(c, ca)
2401                 ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size);
2402         return ret;
2403 }
2404 
2405 static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
2406 {
2407         struct open_bucket *ob;
2408         bool ret = false;
2409 
2410         for (ob = c->open_buckets;
2411              ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
2412              ob++) {
2413                 spin_lock(&ob->lock);
2414                 if (ob->valid && !ob->on_partial_list &&
2415                     ob->dev == ca->dev_idx)
2416                         ret = true;
2417                 spin_unlock(&ob->lock);
2418         }
2419 
2420         return ret;
2421 }
2422 
2423 /* device goes ro: */
2424 void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
2425 {
2426         unsigned i;
2427 
2428         /* First, remove device from allocation groups: */
2429 
2430         for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
2431                 clear_bit(ca->dev_idx, c->rw_devs[i].d);
2432 
2433         /*
2434          * Capacity is calculated based on the devices in allocation groups:
2435          */
2436         bch2_recalc_capacity(c);
2437 
2438         bch2_open_buckets_stop(c, ca, false);
2439 
2440         /*
2441          * Wake up threads that were blocked on allocation, so they can notice
2442          * the device can no longer be removed and the capacity has changed:
2443          */
2444         closure_wake_up(&c->freelist_wait);
2445 
2446         /*
2447          * journal_res_get() can block waiting for free space in the journal -
2448          * it needs to notice there may not be devices to allocate from anymore:
2449          */
2450         wake_up(&c->journal.wait);
2451 
2452         /* Now wait for any in flight writes: */
2453 
2454         closure_wait_event(&c->open_buckets_wait,
2455                            !bch2_dev_has_open_write_point(c, ca));
2456 }
2457 
2458 /* device goes rw: */
2459 void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
2460 {
2461         unsigned i;
2462 
2463         for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
2464                 if (ca->mi.data_allowed & (1 << i))
2465                         set_bit(ca->dev_idx, c->rw_devs[i].d);
2466 }
2467 
2468 void bch2_dev_allocator_background_exit(struct bch_dev *ca)
2469 {
2470         darray_exit(&ca->discard_buckets_in_flight);
2471 }
2472 
2473 void bch2_dev_allocator_background_init(struct bch_dev *ca)
2474 {
2475         mutex_init(&ca->discard_buckets_in_flight_lock);
2476         INIT_WORK(&ca->discard_work, bch2_do_discards_work);
2477         INIT_WORK(&ca->discard_fast_work, bch2_do_discards_fast_work);
2478         INIT_WORK(&ca->invalidate_work, bch2_do_invalidates_work);
2479 }
2480 
2481 void bch2_fs_allocator_background_init(struct bch_fs *c)
2482 {
2483         spin_lock_init(&c->freelist_lock);
2484 }
2485 
