Linux/fs/bcachefs/ec.c

  1 // SPDX-License-Identifier: GPL-2.0
  2 
  3 /* erasure coding */
  4 
  5 #include "bcachefs.h"
  6 #include "alloc_background.h"
  7 #include "alloc_foreground.h"
  8 #include "backpointers.h"
  9 #include "bkey_buf.h"
 10 #include "bset.h"
 11 #include "btree_gc.h"
 12 #include "btree_update.h"
 13 #include "btree_write_buffer.h"
 14 #include "buckets.h"
 15 #include "checksum.h"
 16 #include "disk_accounting.h"
 17 #include "disk_groups.h"
 18 #include "ec.h"
 19 #include "error.h"
 20 #include "io_read.h"
 21 #include "keylist.h"
 22 #include "recovery.h"
 23 #include "replicas.h"
 24 #include "super-io.h"
 25 #include "util.h"
 26 
 27 #include <linux/sort.h>
 28 
 29 #ifdef __KERNEL__
 30 
 31 #include <linux/raid/pq.h>
 32 #include <linux/raid/xor.h>
 33 
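/*
 * Recover a single failed block by XOR: data[failed_idx] is rebuilt as the
 * XOR of all the other blocks. The same routine doubles as P-parity
 * generation when failed_idx points at the (not yet computed) parity block.
 */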
 34 static void raid5_recov(unsigned disks, unsigned failed_idx,
 35                         size_t size, void **data)
 36 {
 37         unsigned i = 2, nr;
 38 
 39         BUG_ON(failed_idx >= disks);
 40 
 41         swap(data[0], data[failed_idx]);
 42         memcpy(data[0], data[1], size);
 43 
 44         while (i < disks) {
 45                 nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS);
 46                 xor_blocks(nr, size, data[0], data + i);
 47                 i += nr;
 48         }
 49 
 50         swap(data[0], data[failed_idx]);
 51 }
 52 
 53 static void raid_gen(int nd, int np, size_t size, void **v)
 54 {
 55         if (np >= 1)
 56                 raid5_recov(nd + np, nd, size, v);
 57         if (np >= 2)
 58                 raid6_call.gen_syndrome(nd + np, size, v);
 59         BUG_ON(np > 2);
 60 }
 61 
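/*
 * Reconstruct up to two failed blocks. @ir holds the failed block indices in
 * ascending order; indices >= nd refer to parity blocks, which are simply
 * regenerated rather than recovered.
 */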
 62 static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v)
 63 {
 64         switch (nr) {
 65         case 0:
 66                 break;
 67         case 1:
 68                 if (ir[0] < nd + 1)
 69                         raid5_recov(nd + 1, ir[0], size, v);
 70                 else
 71                         raid6_call.gen_syndrome(nd + np, size, v);
 72                 break;
 73         case 2:
 74                 if (ir[1] < nd) {
 75                         /* data+data failure. */
 76                         raid6_2data_recov(nd + np, size, ir[0], ir[1], v);
 77                 } else if (ir[0] < nd) {
 78                         /* data + p/q failure */
 79 
 80                         if (ir[1] == nd) /* data + p failure */
 81                                 raid6_datap_recov(nd + np, size, ir[0], v);
 82                         else { /* data + q failure */
 83                                 raid5_recov(nd + 1, ir[0], size, v);
 84                                 raid6_call.gen_syndrome(nd + np, size, v);
 85                         }
 86                 } else {
 87                         raid_gen(nd, np, size, v);
 88                 }
 89                 break;
 90         default:
 91                 BUG();
 92         }
 93 }
 94 
 95 #else
 96 
 97 #include <raid/raid.h>
 98 
 99 #endif
100 
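/*
 * Per-device bio wrapper: each block of a stripe buf is read or written with
 * one or more of these, so the completion handler knows which block's valid
 * bit to clear on error.
 */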
101 struct ec_bio {
102         struct bch_dev          *ca;
103         struct ec_stripe_buf    *buf;
104         size_t                  idx;
105         struct bio              bio;
106 };
107 
108 /* Stripes btree keys: */
109 
110 int bch2_stripe_validate(struct bch_fs *c, struct bkey_s_c k,
111                          enum bch_validate_flags flags)
112 {
113         const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
114         int ret = 0;
115 
116         bkey_fsck_err_on(bkey_eq(k.k->p, POS_MIN) ||
117                          bpos_gt(k.k->p, POS(0, U32_MAX)),
118                          c, stripe_pos_bad,
119                          "stripe at bad pos");
120 
121         bkey_fsck_err_on(bkey_val_u64s(k.k) < stripe_val_u64s(s),
122                          c, stripe_val_size_bad,
123                          "incorrect value size (%zu < %u)",
124                          bkey_val_u64s(k.k), stripe_val_u64s(s));
125 
126         ret = bch2_bkey_ptrs_validate(c, k, flags);
127 fsck_err:
128         return ret;
129 }
130 
131 void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
132                          struct bkey_s_c k)
133 {
134         const struct bch_stripe *sp = bkey_s_c_to_stripe(k).v;
135         struct bch_stripe s = {};
136 
137         memcpy(&s, sp, min(sizeof(s), bkey_val_bytes(k.k)));
138 
139         unsigned nr_data = s.nr_blocks - s.nr_redundant;
140 
141         prt_printf(out, "algo %u sectors %u blocks %u:%u csum ",
142                    s.algorithm,
143                    le16_to_cpu(s.sectors),
144                    nr_data,
145                    s.nr_redundant);
146         bch2_prt_csum_type(out, s.csum_type);
147         prt_printf(out, " gran %u", 1U << s.csum_granularity_bits);
148 
149         for (unsigned i = 0; i < s.nr_blocks; i++) {
150                 const struct bch_extent_ptr *ptr = sp->ptrs + i;
151 
152                 if ((void *) ptr >= bkey_val_end(k))
153                         break;
154 
155                 bch2_extent_ptr_to_text(out, c, ptr);
156 
157                 if (s.csum_type < BCH_CSUM_NR &&
158                     i < nr_data &&
159                     stripe_blockcount_offset(&s, i) < bkey_val_bytes(k.k))
 160                         prt_printf(out, "#%u", stripe_blockcount_get(sp, i));
161         }
162 }
163 
164 /* Triggers: */
165 
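/*
 * Update a single bucket's alloc info to reflect stripe creation or deletion:
 * sanity check the existing alloc state, adjust dirty_sectors for parity
 * buckets, and set or clear the bucket's stripe backpointer fields.
 */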
166 static int __mark_stripe_bucket(struct btree_trans *trans,
167                                 struct bch_dev *ca,
168                                 struct bkey_s_c_stripe s,
169                                 unsigned ptr_idx, bool deleting,
170                                 struct bpos bucket,
171                                 struct bch_alloc_v4 *a,
172                                 enum btree_iter_update_trigger_flags flags)
173 {
174         const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx;
175         unsigned nr_data = s.v->nr_blocks - s.v->nr_redundant;
176         bool parity = ptr_idx >= nr_data;
177         enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe;
178         s64 sectors = parity ? le16_to_cpu(s.v->sectors) : 0;
179         struct printbuf buf = PRINTBUF;
180         int ret = 0;
181 
182         struct bch_fs *c = trans->c;
183         if (deleting)
184                 sectors = -sectors;
185 
186         if (!deleting) {
187                 if (bch2_trans_inconsistent_on(a->stripe ||
188                                                a->stripe_redundancy, trans,
189                                 "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)\n%s",
190                                 bucket.inode, bucket.offset, a->gen,
191                                 bch2_data_type_str(a->data_type),
192                                 a->dirty_sectors,
193                                 a->stripe, s.k->p.offset,
194                                 (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
195                         ret = -EIO;
196                         goto err;
197                 }
198 
199                 if (bch2_trans_inconsistent_on(parity && bch2_bucket_sectors_total(*a), trans,
200                                 "bucket %llu:%llu gen %u data type %s dirty_sectors %u cached_sectors %u: data already in parity bucket\n%s",
201                                 bucket.inode, bucket.offset, a->gen,
202                                 bch2_data_type_str(a->data_type),
203                                 a->dirty_sectors,
204                                 a->cached_sectors,
205                                 (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
206                         ret = -EIO;
207                         goto err;
208                 }
209         } else {
210                 if (bch2_trans_inconsistent_on(a->stripe != s.k->p.offset ||
211                                                a->stripe_redundancy != s.v->nr_redundant, trans,
212                                 "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe (got %u)\n%s",
213                                 bucket.inode, bucket.offset, a->gen,
214                                 a->stripe,
215                                 (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
216                         ret = -EIO;
217                         goto err;
218                 }
219 
220                 if (bch2_trans_inconsistent_on(a->data_type != data_type, trans,
221                                 "bucket %llu:%llu gen %u data type %s: wrong data type when stripe, should be %s\n%s",
222                                 bucket.inode, bucket.offset, a->gen,
223                                 bch2_data_type_str(a->data_type),
224                                 bch2_data_type_str(data_type),
225                                 (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
226                         ret = -EIO;
227                         goto err;
228                 }
229 
230                 if (bch2_trans_inconsistent_on(parity &&
231                                                (a->dirty_sectors != -sectors ||
232                                                 a->cached_sectors), trans,
233                                 "bucket %llu:%llu gen %u dirty_sectors %u cached_sectors %u: wrong sectors when deleting parity block of stripe\n%s",
234                                 bucket.inode, bucket.offset, a->gen,
235                                 a->dirty_sectors,
236                                 a->cached_sectors,
237                                 (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
238                         ret = -EIO;
239                         goto err;
240                 }
241         }
242 
243         if (sectors) {
244                 ret = bch2_bucket_ref_update(trans, ca, s.s_c, ptr, sectors, data_type,
245                                              a->gen, a->data_type, &a->dirty_sectors);
246                 if (ret)
247                         goto err;
248         }
249 
250         if (!deleting) {
251                 a->stripe               = s.k->p.offset;
252                 a->stripe_redundancy    = s.v->nr_redundant;
253         } else {
254                 a->stripe               = 0;
255                 a->stripe_redundancy    = 0;
256         }
257 
258         alloc_data_type_set(a, data_type);
259 err:
260         printbuf_exit(&buf);
261         return ret;
262 }
263 
264 static int mark_stripe_bucket(struct btree_trans *trans,
265                               struct bkey_s_c_stripe s,
266                               unsigned ptr_idx, bool deleting,
267                               enum btree_iter_update_trigger_flags flags)
268 {
269         struct bch_fs *c = trans->c;
270         const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx;
271         struct printbuf buf = PRINTBUF;
272         int ret = 0;
273 
274         struct bch_dev *ca = bch2_dev_tryget(c, ptr->dev);
275         if (unlikely(!ca)) {
276                 if (!(flags & BTREE_TRIGGER_overwrite))
277                         ret = -EIO;
278                 goto err;
279         }
280 
281         struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
282 
283         if (flags & BTREE_TRIGGER_transactional) {
284                 struct bkey_i_alloc_v4 *a =
285                         bch2_trans_start_alloc_update(trans, bucket, 0);
286                 ret = PTR_ERR_OR_ZERO(a) ?:
287                         __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags);
288         }
289 
290         if (flags & BTREE_TRIGGER_gc) {
291                 percpu_down_read(&c->mark_lock);
292                 struct bucket *g = gc_bucket(ca, bucket.offset);
293                 if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n  %s",
294                                             ptr->dev,
295                                             (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
296                         ret = -EIO;
297                         goto err_unlock;
298                 }
299 
300                 bucket_lock(g);
301                 struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old;
302                 ret = __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &new, flags);
303                 alloc_to_bucket(g, new);
304                 bucket_unlock(g);
305 err_unlock:
306                 percpu_up_read(&c->mark_lock);
307                 if (!ret)
308                         ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags);
309         }
310 err:
311         bch2_dev_put(ca);
312         printbuf_exit(&buf);
313         return ret;
314 }
315 
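/*
 * Walk the blocks of the old and new stripe keys, (un)marking the buckets
 * whose pointers actually changed.
 */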
316 static int mark_stripe_buckets(struct btree_trans *trans,
317                                struct bkey_s_c old, struct bkey_s_c new,
318                                enum btree_iter_update_trigger_flags flags)
319 {
320         const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
321                 ? bkey_s_c_to_stripe(old).v : NULL;
322         const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
323                 ? bkey_s_c_to_stripe(new).v : NULL;
324 
325         BUG_ON(old_s && new_s && old_s->nr_blocks != new_s->nr_blocks);
326 
327         unsigned nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks;
328 
329         for (unsigned i = 0; i < nr_blocks; i++) {
330                 if (new_s && old_s &&
331                     !memcmp(&new_s->ptrs[i],
332                             &old_s->ptrs[i],
333                             sizeof(new_s->ptrs[i])))
334                         continue;
335 
336                 if (new_s) {
337                         int ret = mark_stripe_bucket(trans,
338                                         bkey_s_c_to_stripe(new), i, false, flags);
339                         if (ret)
340                                 return ret;
341                 }
342 
343                 if (old_s) {
344                         int ret = mark_stripe_bucket(trans,
345                                         bkey_s_c_to_stripe(old), i, true, flags);
346                         if (ret)
347                                 return ret;
348                 }
349         }
350 
351         return 0;
352 }
353 
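/*
 * Trigger for stripe keys, run on every update to the stripes btree. Roughly:
 *  - transactional/gc: update replicas accounting for the parity sectors and
 *    (un)mark the constituent buckets; in gc, also refresh the gc_stripes
 *    radix tree entry
 *  - atomic: keep the in-memory stripes radix tree and the blocks_nonempty
 *    heap in sync with the new key
 */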
354 int bch2_trigger_stripe(struct btree_trans *trans,
355                         enum btree_id btree, unsigned level,
356                         struct bkey_s_c old, struct bkey_s _new,
357                         enum btree_iter_update_trigger_flags flags)
358 {
359         struct bkey_s_c new = _new.s_c;
360         struct bch_fs *c = trans->c;
361         u64 idx = new.k->p.offset;
362         const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
363                 ? bkey_s_c_to_stripe(old).v : NULL;
364         const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
365                 ? bkey_s_c_to_stripe(new).v : NULL;
366 
367         if (unlikely(flags & BTREE_TRIGGER_check_repair))
368                 return bch2_check_fix_ptrs(trans, btree, level, _new.s_c, flags);
369 
370         BUG_ON(new_s && old_s &&
371                (new_s->nr_blocks        != old_s->nr_blocks ||
372                 new_s->nr_redundant     != old_s->nr_redundant));
373 
375         if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) {
376                 /*
377                  * If the pointers aren't changing, we don't need to do anything:
378                  */
379                 if (new_s && old_s &&
380                     new_s->nr_blocks    == old_s->nr_blocks &&
381                     new_s->nr_redundant == old_s->nr_redundant &&
382                     !memcmp(old_s->ptrs, new_s->ptrs,
383                             new_s->nr_blocks * sizeof(struct bch_extent_ptr)))
384                         return 0;
385 
386                 struct gc_stripe *gc = NULL;
387                 if (flags & BTREE_TRIGGER_gc) {
388                         gc = genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL);
389                         if (!gc) {
390                                 bch_err(c, "error allocating memory for gc_stripes, idx %llu", idx);
391                                 return -BCH_ERR_ENOMEM_mark_stripe;
392                         }
393 
394                         /*
395                          * This will be wrong when we bring back runtime gc: we should
396                          * be unmarking the old key and then marking the new key
397                          *
398                          * Also: when we bring back runtime gc, locking
399                          */
400                         gc->alive       = true;
401                         gc->sectors     = le16_to_cpu(new_s->sectors);
402                         gc->nr_blocks   = new_s->nr_blocks;
403                         gc->nr_redundant        = new_s->nr_redundant;
404 
405                         for (unsigned i = 0; i < new_s->nr_blocks; i++)
406                                 gc->ptrs[i] = new_s->ptrs[i];
407 
408                         /*
409                          * gc recalculates this field from stripe ptr
410                          * references:
411                          */
412                         memset(gc->block_sectors, 0, sizeof(gc->block_sectors));
413                 }
414 
415                 if (new_s) {
416                         s64 sectors = (u64) le16_to_cpu(new_s->sectors) * new_s->nr_redundant;
417 
418                         struct disk_accounting_pos acc = {
419                                 .type = BCH_DISK_ACCOUNTING_replicas,
420                         };
421                         bch2_bkey_to_replicas(&acc.replicas, new);
422                         int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc);
423                         if (ret)
424                                 return ret;
425 
426                         if (gc)
427                                 memcpy(&gc->r.e, &acc.replicas, replicas_entry_bytes(&acc.replicas));
428                 }
429 
430                 if (old_s) {
431                         s64 sectors = -((s64) le16_to_cpu(old_s->sectors)) * old_s->nr_redundant;
432 
433                         struct disk_accounting_pos acc = {
434                                 .type = BCH_DISK_ACCOUNTING_replicas,
435                         };
436                         bch2_bkey_to_replicas(&acc.replicas, old);
437                         int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc);
438                         if (ret)
439                                 return ret;
440                 }
441 
442                 int ret = mark_stripe_buckets(trans, old, new, flags);
443                 if (ret)
444                         return ret;
445         }
446 
447         if (flags & BTREE_TRIGGER_atomic) {
448                 struct stripe *m = genradix_ptr(&c->stripes, idx);
449 
450                 if (!m) {
451                         struct printbuf buf1 = PRINTBUF;
452                         struct printbuf buf2 = PRINTBUF;
453 
454                         bch2_bkey_val_to_text(&buf1, c, old);
455                         bch2_bkey_val_to_text(&buf2, c, new);
456                         bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n"
457                                             "old %s\n"
458                                             "new %s", idx, buf1.buf, buf2.buf);
459                         printbuf_exit(&buf2);
460                         printbuf_exit(&buf1);
461                         bch2_inconsistent_error(c);
462                         return -1;
463                 }
464 
465                 if (!new_s) {
466                         bch2_stripes_heap_del(c, m, idx);
467 
468                         memset(m, 0, sizeof(*m));
469                 } else {
470                         m->sectors      = le16_to_cpu(new_s->sectors);
471                         m->algorithm    = new_s->algorithm;
472                         m->nr_blocks    = new_s->nr_blocks;
473                         m->nr_redundant = new_s->nr_redundant;
474                         m->blocks_nonempty = 0;
475 
476                         for (unsigned i = 0; i < new_s->nr_blocks; i++)
477                                 m->blocks_nonempty += !!stripe_blockcount_get(new_s, i);
478 
479                         if (!old_s)
480                                 bch2_stripes_heap_insert(c, m, idx);
481                         else
482                                 bch2_stripes_heap_update(c, m, idx);
483                 }
484         }
485 
486         return 0;
487 }
488 
 489 /* returns the matching extent ptr; the stripe block nr we matched is returned via *block: */
490 static const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s,
491                                                 struct bkey_s_c k, unsigned *block)
492 {
493         struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
494         unsigned i, nr_data = s->nr_blocks - s->nr_redundant;
495 
496         bkey_for_each_ptr(ptrs, ptr)
497                 for (i = 0; i < nr_data; i++)
498                         if (__bch2_ptr_matches_stripe(&s->ptrs[i], ptr,
499                                                       le16_to_cpu(s->sectors))) {
500                                 *block = i;
501                                 return ptr;
502                         }
503 
504         return NULL;
505 }
506 
507 static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx)
508 {
509         switch (k.k->type) {
510         case KEY_TYPE_extent: {
511                 struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
512                 const union bch_extent_entry *entry;
513 
514                 extent_for_each_entry(e, entry)
515                         if (extent_entry_type(entry) ==
516                             BCH_EXTENT_ENTRY_stripe_ptr &&
517                             entry->stripe_ptr.idx == idx)
518                                 return true;
519 
520                 break;
521         }
522         }
523 
524         return false;
525 }
526 
527 /* Stripe bufs: */
528 
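/* Free the per-block data buffers backing an ec_stripe_buf. */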
529 static void ec_stripe_buf_exit(struct ec_stripe_buf *buf)
530 {
531         if (buf->key.k.type == KEY_TYPE_stripe) {
532                 struct bkey_i_stripe *s = bkey_i_to_stripe(&buf->key);
533                 unsigned i;
534 
535                 for (i = 0; i < s->v.nr_blocks; i++) {
536                         kvfree(buf->data[i]);
537                         buf->data[i] = NULL;
538                 }
539         }
540 }
541 
542 /* XXX: this is a non-mempoolified memory allocation: */
543 static int ec_stripe_buf_init(struct ec_stripe_buf *buf,
544                               unsigned offset, unsigned size)
545 {
546         struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
547         unsigned csum_granularity = 1U << v->csum_granularity_bits;
548         unsigned end = offset + size;
549         unsigned i;
550 
551         BUG_ON(end > le16_to_cpu(v->sectors));
552 
553         offset  = round_down(offset, csum_granularity);
554         end     = min_t(unsigned, le16_to_cpu(v->sectors),
555                         round_up(end, csum_granularity));
556 
557         buf->offset     = offset;
558         buf->size       = end - offset;
559 
560         memset(buf->valid, 0xFF, sizeof(buf->valid));
561 
562         for (i = 0; i < v->nr_blocks; i++) {
563                 buf->data[i] = kvmalloc(buf->size << 9, GFP_KERNEL);
564                 if (!buf->data[i])
565                         goto err;
566         }
567 
568         return 0;
569 err:
570         ec_stripe_buf_exit(buf);
571         return -BCH_ERR_ENOMEM_stripe_buf;
572 }
573 
574 /* Checksumming: */
575 
576 static struct bch_csum ec_block_checksum(struct ec_stripe_buf *buf,
577                                          unsigned block, unsigned offset)
578 {
579         struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
580         unsigned csum_granularity = 1 << v->csum_granularity_bits;
581         unsigned end = buf->offset + buf->size;
582         unsigned len = min(csum_granularity, end - offset);
583 
584         BUG_ON(offset >= end);
585         BUG_ON(offset <  buf->offset);
586         BUG_ON(offset & (csum_granularity - 1));
587         BUG_ON(offset + len != le16_to_cpu(v->sectors) &&
588                (len & (csum_granularity - 1)));
589 
590         return bch2_checksum(NULL, v->csum_type,
591                              null_nonce(),
592                              buf->data[block] + ((offset - buf->offset) << 9),
593                              len << 9);
594 }
595 
596 static void ec_generate_checksums(struct ec_stripe_buf *buf)
597 {
598         struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
599         unsigned i, j, csums_per_device = stripe_csums_per_device(v);
600 
601         if (!v->csum_type)
602                 return;
603 
604         BUG_ON(buf->offset);
605         BUG_ON(buf->size != le16_to_cpu(v->sectors));
606 
607         for (i = 0; i < v->nr_blocks; i++)
608                 for (j = 0; j < csums_per_device; j++)
609                         stripe_csum_set(v, i, j,
610                                 ec_block_checksum(buf, i, j << v->csum_granularity_bits));
611 }
612 
613 static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
614 {
615         struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
616         unsigned csum_granularity = 1 << v->csum_granularity_bits;
617         unsigned i;
618 
619         if (!v->csum_type)
620                 return;
621 
622         for (i = 0; i < v->nr_blocks; i++) {
623                 unsigned offset = buf->offset;
624                 unsigned end = buf->offset + buf->size;
625 
626                 if (!test_bit(i, buf->valid))
627                         continue;
628 
629                 while (offset < end) {
630                         unsigned j = offset >> v->csum_granularity_bits;
631                         unsigned len = min(csum_granularity, end - offset);
632                         struct bch_csum want = stripe_csum_get(v, i, j);
633                         struct bch_csum got = ec_block_checksum(buf, i, offset);
634 
635                         if (bch2_crc_cmp(want, got)) {
636                                 struct bch_dev *ca = bch2_dev_tryget(c, v->ptrs[i].dev);
637                                 if (ca) {
638                                         struct printbuf err = PRINTBUF;
639 
640                                         prt_str(&err, "stripe ");
641                                         bch2_csum_err_msg(&err, v->csum_type, want, got);
642                                         prt_printf(&err, "  for %ps at %u of\n  ", (void *) _RET_IP_, i);
643                                         bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key));
644                                         bch_err_ratelimited(ca, "%s", err.buf);
645                                         printbuf_exit(&err);
646 
647                                         bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
648                                 }
649 
650                                 clear_bit(i, buf->valid);
651                                 break;
652                         }
653 
654                         offset += len;
655                 }
656         }
657 }
658 
659 /* Erasure coding: */
660 
661 static void ec_generate_ec(struct ec_stripe_buf *buf)
662 {
663         struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
664         unsigned nr_data = v->nr_blocks - v->nr_redundant;
665         unsigned bytes = le16_to_cpu(v->sectors) << 9;
666 
667         raid_gen(nr_data, v->nr_redundant, bytes, buf->data);
668 }
669 
670 static unsigned ec_nr_failed(struct ec_stripe_buf *buf)
671 {
672         struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
673 
674         return v->nr_blocks - bitmap_weight(buf->valid, v->nr_blocks);
675 }
676 
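/*
 * Reconstruct the data blocks marked invalid in buf->valid, provided no more
 * than nr_redundant blocks in total have failed.
 */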
677 static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf)
678 {
679         struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
680         unsigned i, failed[BCH_BKEY_PTRS_MAX], nr_failed = 0;
681         unsigned nr_data = v->nr_blocks - v->nr_redundant;
682         unsigned bytes = buf->size << 9;
683 
684         if (ec_nr_failed(buf) > v->nr_redundant) {
685                 bch_err_ratelimited(c,
686                         "error doing reconstruct read: unable to read enough blocks");
687                 return -1;
688         }
689 
690         for (i = 0; i < nr_data; i++)
691                 if (!test_bit(i, buf->valid))
692                         failed[nr_failed++] = i;
693 
694         raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data);
695         return 0;
696 }
697 
698 /* IO: */
699 
700 static void ec_block_endio(struct bio *bio)
701 {
702         struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio);
703         struct bch_stripe *v = &bkey_i_to_stripe(&ec_bio->buf->key)->v;
704         struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx];
705         struct bch_dev *ca = ec_bio->ca;
706         struct closure *cl = bio->bi_private;
707 
708         if (bch2_dev_io_err_on(bio->bi_status, ca,
709                                bio_data_dir(bio)
710                                ? BCH_MEMBER_ERROR_write
711                                : BCH_MEMBER_ERROR_read,
712                                "erasure coding %s error: %s",
713                                bio_data_dir(bio) ? "write" : "read",
714                                bch2_blk_status_to_str(bio->bi_status)))
715                 clear_bit(ec_bio->idx, ec_bio->buf->valid);
716 
717         int stale = dev_ptr_stale(ca, ptr);
718         if (stale) {
719                 bch_err_ratelimited(ca->fs,
720                                     "error %s stripe: stale/invalid pointer (%i) after io",
721                                     bio_data_dir(bio) == READ ? "reading from" : "writing to",
722                                     stale);
723                 clear_bit(ec_bio->idx, ec_bio->buf->valid);
724         }
725 
726         bio_put(&ec_bio->bio);
727         percpu_ref_put(&ca->io_ref);
728         closure_put(cl);
729 }
730 
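/*
 * Read or write one block of a stripe buf, splitting into multiple bios if
 * the buffer is bigger than BIO_MAX_VECS pages. Failures are recorded by
 * clearing the block's bit in buf->valid rather than being returned.
 */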
731 static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
732                         blk_opf_t opf, unsigned idx, struct closure *cl)
733 {
734         struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
735         unsigned offset = 0, bytes = buf->size << 9;
736         struct bch_extent_ptr *ptr = &v->ptrs[idx];
737         enum bch_data_type data_type = idx < v->nr_blocks - v->nr_redundant
738                 ? BCH_DATA_user
739                 : BCH_DATA_parity;
740         int rw = op_is_write(opf);
741 
742         struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, rw);
743         if (!ca) {
744                 clear_bit(idx, buf->valid);
745                 return;
746         }
747 
748         int stale = dev_ptr_stale(ca, ptr);
749         if (stale) {
750                 bch_err_ratelimited(c,
751                                     "error %s stripe: stale pointer (%i)",
752                                     rw == READ ? "reading from" : "writing to",
753                                     stale);
 754                 clear_bit(idx, buf->valid);
                     /* balance the ref taken by bch2_dev_get_ioref(): */
                     percpu_ref_put(&ca->io_ref);
 755                 return;
 756         }
 757 
759         this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size);
760 
761         while (offset < bytes) {
762                 unsigned nr_iovecs = min_t(size_t, BIO_MAX_VECS,
763                                            DIV_ROUND_UP(bytes, PAGE_SIZE));
764                 unsigned b = min_t(size_t, bytes - offset,
765                                    nr_iovecs << PAGE_SHIFT);
766                 struct ec_bio *ec_bio;
767 
768                 ec_bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev,
769                                                        nr_iovecs,
770                                                        opf,
771                                                        GFP_KERNEL,
772                                                        &c->ec_bioset),
773                                       struct ec_bio, bio);
774 
775                 ec_bio->ca                      = ca;
776                 ec_bio->buf                     = buf;
777                 ec_bio->idx                     = idx;
778 
779                 ec_bio->bio.bi_iter.bi_sector   = ptr->offset + buf->offset + (offset >> 9);
780                 ec_bio->bio.bi_end_io           = ec_block_endio;
781                 ec_bio->bio.bi_private          = cl;
782 
783                 bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b);
784 
785                 closure_get(cl);
786                 percpu_ref_get(&ca->io_ref);
787 
788                 submit_bio(&ec_bio->bio);
789 
790                 offset += b;
791         }
792 
793         percpu_ref_put(&ca->io_ref);
794 }
795 
796 static int get_stripe_key_trans(struct btree_trans *trans, u64 idx,
797                                 struct ec_stripe_buf *stripe)
798 {
799         struct btree_iter iter;
800         struct bkey_s_c k;
801         int ret;
802 
803         k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
804                                POS(0, idx), BTREE_ITER_slots);
805         ret = bkey_err(k);
806         if (ret)
807                 goto err;
808         if (k.k->type != KEY_TYPE_stripe) {
809                 ret = -ENOENT;
810                 goto err;
811         }
812         bkey_reassemble(&stripe->key, k);
813 err:
814         bch2_trans_iter_exit(trans, &iter);
815         return ret;
816 }
817 
818 /* recovery read path: */
819 int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio)
820 {
821         struct bch_fs *c = trans->c;
822         struct ec_stripe_buf *buf;
823         struct closure cl;
824         struct bch_stripe *v;
825         unsigned i, offset;
826         int ret = 0;
827 
828         closure_init_stack(&cl);
829 
830         BUG_ON(!rbio->pick.has_ec);
831 
832         buf = kzalloc(sizeof(*buf), GFP_NOFS);
833         if (!buf)
834                 return -BCH_ERR_ENOMEM_ec_read_extent;
835 
836         ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf));
837         if (ret) {
838                 bch_err_ratelimited(c,
839                         "error doing reconstruct read: error %i looking up stripe", ret);
840                 kfree(buf);
841                 return -EIO;
842         }
843 
844         v = &bkey_i_to_stripe(&buf->key)->v;
845 
846         if (!bch2_ptr_matches_stripe(v, rbio->pick)) {
847                 bch_err_ratelimited(c,
848                         "error doing reconstruct read: pointer doesn't match stripe");
849                 ret = -EIO;
850                 goto err;
851         }
852 
853         offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset;
854         if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) {
855                 bch_err_ratelimited(c,
856                         "error doing reconstruct read: read is bigger than stripe");
857                 ret = -EIO;
858                 goto err;
859         }
860 
861         ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio));
862         if (ret)
863                 goto err;
864 
865         for (i = 0; i < v->nr_blocks; i++)
866                 ec_block_io(c, buf, REQ_OP_READ, i, &cl);
867 
868         closure_sync(&cl);
869 
870         if (ec_nr_failed(buf) > v->nr_redundant) {
871                 bch_err_ratelimited(c,
872                         "error doing reconstruct read: unable to read enough blocks");
873                 ret = -EIO;
874                 goto err;
875         }
876 
877         ec_validate_checksums(c, buf);
878 
879         ret = ec_do_recov(c, buf);
880         if (ret)
881                 goto err;
882 
883         memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter,
884                       buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9));
885 err:
886         ec_stripe_buf_exit(buf);
887         kfree(buf);
888         return ret;
889 }
890 
891 /* stripe bucket accounting: */
892 
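/*
 * Ensure the in-memory structures indexed by stripe idx (the stripes heap and
 * the radix trees) are large enough for @idx; the replacement heap is
 * allocated outside the lock and swapped in.
 */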
893 static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
894 {
895         ec_stripes_heap n, *h = &c->ec_stripes_heap;
896 
897         if (idx >= h->size) {
898                 if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp))
899                         return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
900 
901                 mutex_lock(&c->ec_stripes_heap_lock);
902                 if (n.size > h->size) {
903                         memcpy(n.data, h->data, h->nr * sizeof(h->data[0]));
904                         n.nr = h->nr;
905                         swap(*h, n);
906                 }
907                 mutex_unlock(&c->ec_stripes_heap_lock);
908 
909                 free_heap(&n);
910         }
911 
912         if (!genradix_ptr_alloc(&c->stripes, idx, gfp))
913                 return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
914 
915         if (c->gc_pos.phase != GC_PHASE_not_running &&
916             !genradix_ptr_alloc(&c->gc_stripes, idx, gfp))
917                 return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
918 
919         return 0;
920 }
921 
922 static int ec_stripe_mem_alloc(struct btree_trans *trans,
923                                struct btree_iter *iter)
924 {
925         return allocate_dropping_locks_errcode(trans,
926                         __ec_stripe_mem_alloc(trans->c, iter->pos.offset, _gfp));
927 }
928 
929 /*
930  * Hash table of open stripes:
931  * Stripes that are being created or modified are kept in a hash table, so that
932  * stripe deletion can skip them.
933  */
934 
935 static bool __bch2_stripe_is_open(struct bch_fs *c, u64 idx)
936 {
937         unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new)));
938         struct ec_stripe_new *s;
939 
940         hlist_for_each_entry(s, &c->ec_stripes_new[hash], hash)
941                 if (s->idx == idx)
942                         return true;
943         return false;
944 }
945 
946 static bool bch2_stripe_is_open(struct bch_fs *c, u64 idx)
947 {
948         bool ret = false;
949 
950         spin_lock(&c->ec_stripes_new_lock);
951         ret = __bch2_stripe_is_open(c, idx);
952         spin_unlock(&c->ec_stripes_new_lock);
953 
954         return ret;
955 }
956 
957 static bool bch2_try_open_stripe(struct bch_fs *c,
958                                  struct ec_stripe_new *s,
959                                  u64 idx)
960 {
961         bool ret;
962 
963         spin_lock(&c->ec_stripes_new_lock);
964         ret = !__bch2_stripe_is_open(c, idx);
965         if (ret) {
966                 unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new)));
967 
968                 s->idx = idx;
969                 hlist_add_head(&s->hash, &c->ec_stripes_new[hash]);
970         }
971         spin_unlock(&c->ec_stripes_new_lock);
972 
973         return ret;
974 }
975 
976 static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s)
977 {
978         BUG_ON(!s->idx);
979 
980         spin_lock(&c->ec_stripes_new_lock);
981         hlist_del_init(&s->hash);
982         spin_unlock(&c->ec_stripes_new_lock);
983 
984         s->idx = 0;
985 }
986 
987 /* Heap of all existing stripes, ordered by blocks_nonempty */
988 
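/*
 * The stripe with the fewest nonempty blocks sits at the top of the heap; if
 * it is completely empty and not currently open, it can be deleted.
 */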
989 static u64 stripe_idx_to_delete(struct bch_fs *c)
990 {
991         ec_stripes_heap *h = &c->ec_stripes_heap;
992 
993         lockdep_assert_held(&c->ec_stripes_heap_lock);
994 
995         if (h->nr &&
996             h->data[0].blocks_nonempty == 0 &&
997             !bch2_stripe_is_open(c, h->data[0].idx))
998                 return h->data[0].idx;
999 
1000         return 0;
1001 }
1002 
1003 static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h,
1004                                                    size_t i)
1005 {
1006         struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap);
1007 
1008         genradix_ptr(&c->stripes, h->data[i].idx)->heap_idx = i;
1009 }
1010 
1011 static inline bool ec_stripes_heap_cmp(const void *l, const void *r, void __always_unused *args)
1012 {
1013         struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l;
1014         struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r;
1015 
1016         return ((_l->blocks_nonempty > _r->blocks_nonempty) <
1017                 (_l->blocks_nonempty < _r->blocks_nonempty));
1018 }
1019 
1020 static inline void ec_stripes_heap_swap(void *l, void *r, void *h)
1021 {
1022         struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l;
1023         struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r;
1024         ec_stripes_heap *_h = (ec_stripes_heap *)h;
1025         size_t i = _l - _h->data;
1026         size_t j = _r - _h->data;
1027 
1028         swap(*_l, *_r);
1029 
1030         ec_stripes_heap_set_backpointer(_h, i);
1031         ec_stripes_heap_set_backpointer(_h, j);
1032 }
1033 
1034 static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
1035 {
1036         ec_stripes_heap *h = &c->ec_stripes_heap;
1037         struct stripe *m = genradix_ptr(&c->stripes, idx);
1038 
1039         BUG_ON(m->heap_idx >= h->nr);
1040         BUG_ON(h->data[m->heap_idx].idx != idx);
1041 }
1042 
1043 void bch2_stripes_heap_del(struct bch_fs *c,
1044                            struct stripe *m, size_t idx)
1045 {
1046         const struct min_heap_callbacks callbacks = {
1047                 .less = ec_stripes_heap_cmp,
1048                 .swp = ec_stripes_heap_swap,
1049         };
1050 
1051         mutex_lock(&c->ec_stripes_heap_lock);
1052         heap_verify_backpointer(c, idx);
1053 
1054         min_heap_del(&c->ec_stripes_heap, m->heap_idx, &callbacks, &c->ec_stripes_heap);
1055         mutex_unlock(&c->ec_stripes_heap_lock);
1056 }
1057 
1058 void bch2_stripes_heap_insert(struct bch_fs *c,
1059                               struct stripe *m, size_t idx)
1060 {
1061         const struct min_heap_callbacks callbacks = {
1062                 .less = ec_stripes_heap_cmp,
1063                 .swp = ec_stripes_heap_swap,
1064         };
1065 
1066         mutex_lock(&c->ec_stripes_heap_lock);
1067         BUG_ON(min_heap_full(&c->ec_stripes_heap));
1068 
1069         genradix_ptr(&c->stripes, idx)->heap_idx = c->ec_stripes_heap.nr;
1070         min_heap_push(&c->ec_stripes_heap, &((struct ec_stripe_heap_entry) {
1071                         .idx = idx,
1072                         .blocks_nonempty = m->blocks_nonempty,
1073                 }),
1074                 &callbacks,
1075                 &c->ec_stripes_heap);
1076 
1077         heap_verify_backpointer(c, idx);
1078         mutex_unlock(&c->ec_stripes_heap_lock);
1079 }
1080 
1081 void bch2_stripes_heap_update(struct bch_fs *c,
1082                               struct stripe *m, size_t idx)
1083 {
1084         const struct min_heap_callbacks callbacks = {
1085                 .less = ec_stripes_heap_cmp,
1086                 .swp = ec_stripes_heap_swap,
1087         };
1088         ec_stripes_heap *h = &c->ec_stripes_heap;
1089         bool do_deletes;
1090         size_t i;
1091 
1092         mutex_lock(&c->ec_stripes_heap_lock);
1093         heap_verify_backpointer(c, idx);
1094 
1095         h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty;
1096 
1097         i = m->heap_idx;
 1098         min_heap_sift_up(h, i, &callbacks, &c->ec_stripes_heap);
1099         min_heap_sift_down(h, i, &callbacks, &c->ec_stripes_heap);
1100 
1101         heap_verify_backpointer(c, idx);
1102 
1103         do_deletes = stripe_idx_to_delete(c) != 0;
1104         mutex_unlock(&c->ec_stripes_heap_lock);
1105 
1106         if (do_deletes)
1107                 bch2_do_stripe_deletes(c);
1108 }
1109 
1110 /* stripe deletion */
1111 
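/*
 * Delete a stripe key, after verifying that it still exists and that every
 * block really is empty.
 */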
1112 static int ec_stripe_delete(struct btree_trans *trans, u64 idx)
1113 {
1114         struct bch_fs *c = trans->c;
1115         struct btree_iter iter;
1116         struct bkey_s_c k;
1117         struct bkey_s_c_stripe s;
1118         int ret;
1119 
1120         k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, POS(0, idx),
1121                                BTREE_ITER_intent);
1122         ret = bkey_err(k);
1123         if (ret)
1124                 goto err;
1125 
1126         if (k.k->type != KEY_TYPE_stripe) {
1127                 bch2_fs_inconsistent(c, "attempting to delete nonexistent stripe %llu", idx);
1128                 ret = -EINVAL;
1129                 goto err;
1130         }
1131 
1132         s = bkey_s_c_to_stripe(k);
1133         for (unsigned i = 0; i < s.v->nr_blocks; i++)
1134                 if (stripe_blockcount_get(s.v, i)) {
1135                         struct printbuf buf = PRINTBUF;
1136 
1137                         bch2_bkey_val_to_text(&buf, c, k);
1138                         bch2_fs_inconsistent(c, "attempting to delete nonempty stripe %s", buf.buf);
1139                         printbuf_exit(&buf);
1140                         ret = -EINVAL;
1141                         goto err;
1142                 }
1143 
1144         ret = bch2_btree_delete_at(trans, &iter, 0);
1145 err:
1146         bch2_trans_iter_exit(trans, &iter);
1147         return ret;
1148 }
1149 
1150 static void ec_stripe_delete_work(struct work_struct *work)
1151 {
1152         struct bch_fs *c =
1153                 container_of(work, struct bch_fs, ec_stripe_delete_work);
1154 
1155         while (1) {
1156                 mutex_lock(&c->ec_stripes_heap_lock);
1157                 u64 idx = stripe_idx_to_delete(c);
1158                 mutex_unlock(&c->ec_stripes_heap_lock);
1159 
1160                 if (!idx)
1161                         break;
1162 
1163                 int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
1164                                         ec_stripe_delete(trans, idx));
1165                 bch_err_fn(c, ret);
1166                 if (ret)
1167                         break;
1168         }
1169 
1170         bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
1171 }
1172 
1173 void bch2_do_stripe_deletes(struct bch_fs *c)
1174 {
1175         if (bch2_write_ref_tryget(c, BCH_WRITE_REF_stripe_delete) &&
1176             !queue_work(c->write_ref_wq, &c->ec_stripe_delete_work))
1177                 bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
1178 }
1179 
1180 /* stripe creation: */
1181 
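/*
 * Insert a new stripe key, or update an existing one in place: on update, the
 * pointers of any block with a nonzero block count must be unchanged, and the
 * existing block counts are carried over into the new key.
 */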
1182 static int ec_stripe_key_update(struct btree_trans *trans,
1183                                 struct bkey_i_stripe *new,
1184                                 bool create)
1185 {
1186         struct bch_fs *c = trans->c;
1187         struct btree_iter iter;
1188         struct bkey_s_c k;
1189         int ret;
1190 
1191         k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
1192                                new->k.p, BTREE_ITER_intent);
1193         ret = bkey_err(k);
1194         if (ret)
1195                 goto err;
1196 
1197         if (k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe)) {
1198                 bch2_fs_inconsistent(c, "error %s stripe: got existing key type %s",
1199                                      create ? "creating" : "updating",
1200                                      bch2_bkey_types[k.k->type]);
1201                 ret = -EINVAL;
1202                 goto err;
1203         }
1204 
1205         if (k.k->type == KEY_TYPE_stripe) {
1206                 const struct bch_stripe *old = bkey_s_c_to_stripe(k).v;
1207                 unsigned i;
1208 
1209                 if (old->nr_blocks != new->v.nr_blocks) {
1210                         bch_err(c, "error updating stripe: nr_blocks does not match");
1211                         ret = -EINVAL;
1212                         goto err;
1213                 }
1214 
1215                 for (i = 0; i < new->v.nr_blocks; i++) {
1216                         unsigned v = stripe_blockcount_get(old, i);
1217 
1218                         BUG_ON(v &&
1219                                (old->ptrs[i].dev != new->v.ptrs[i].dev ||
1220                                 old->ptrs[i].gen != new->v.ptrs[i].gen ||
1221                                 old->ptrs[i].offset != new->v.ptrs[i].offset));
1222 
1223                         stripe_blockcount_set(&new->v, i, v);
1224                 }
1225         }
1226 
1227         ret = bch2_trans_update(trans, &iter, &new->k_i, 0);
1228 err:
1229         bch2_trans_iter_exit(trans, &iter);
1230         return ret;
1231 }
1232 
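/*
 * Using backpointers, find the next extent in @bucket and rewrite it to point
 * into the new stripe: drop pointers to other devices and splice a stripe_ptr
 * entry in after the matching device pointer.
 */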
1233 static int ec_stripe_update_extent(struct btree_trans *trans,
1234                                    struct bch_dev *ca,
1235                                    struct bpos bucket, u8 gen,
1236                                    struct ec_stripe_buf *s,
1237                                    struct bpos *bp_pos)
1238 {
1239         struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
1240         struct bch_fs *c = trans->c;
1241         struct bch_backpointer bp;
1242         struct btree_iter iter;
1243         struct bkey_s_c k;
1244         const struct bch_extent_ptr *ptr_c;
1245         struct bch_extent_ptr *ec_ptr = NULL;
1246         struct bch_extent_stripe_ptr stripe_ptr;
1247         struct bkey_i *n;
1248         int ret, dev, block;
1249 
1250         ret = bch2_get_next_backpointer(trans, ca, bucket, gen,
1251                                 bp_pos, &bp, BTREE_ITER_cached);
1252         if (ret)
1253                 return ret;
1254         if (bpos_eq(*bp_pos, SPOS_MAX))
1255                 return 0;
1256 
1257         if (bp.level) {
1258                 struct printbuf buf = PRINTBUF;
1259                 struct btree_iter node_iter;
1260                 struct btree *b;
1261 
1262                 b = bch2_backpointer_get_node(trans, &node_iter, *bp_pos, bp);
1263                 bch2_trans_iter_exit(trans, &node_iter);
1264 
1265                 if (!b)
1266                         return 0;
1267 
1268                 prt_printf(&buf, "found btree node in erasure coded bucket: b=%px\n", b);
1269                 bch2_backpointer_to_text(&buf, &bp);
1270 
1271                 bch2_fs_inconsistent(c, "%s", buf.buf);
1272                 printbuf_exit(&buf);
1273                 return -EIO;
1274         }
1275 
1276         k = bch2_backpointer_get_key(trans, &iter, *bp_pos, bp, BTREE_ITER_intent);
1277         ret = bkey_err(k);
1278         if (ret)
1279                 return ret;
1280         if (!k.k) {
1281                 /*
1282                  * extent no longer exists - we could flush the btree
1283                  * write buffer and retry to verify, but no need:
1284                  */
1285                 return 0;
1286         }
1287 
1288         if (extent_has_stripe_ptr(k, s->key.k.p.offset))
1289                 goto out;
1290 
1291         ptr_c = bkey_matches_stripe(v, k, &block);
1292         /*
1293          * It doesn't generally make sense to erasure code cached ptrs:
1294          * XXX: should we be incrementing a counter?
1295          */
1296         if (!ptr_c || ptr_c->cached)
1297                 goto out;
1298 
1299         dev = v->ptrs[block].dev;
1300 
1301         n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(stripe_ptr));
1302         ret = PTR_ERR_OR_ZERO(n);
1303         if (ret)
1304                 goto out;
1305 
1306         bkey_reassemble(n, k);
1307 
1308         bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, ptr->dev != dev);
1309         ec_ptr = bch2_bkey_has_device(bkey_i_to_s(n), dev);
1310         BUG_ON(!ec_ptr);
1311 
1312         stripe_ptr = (struct bch_extent_stripe_ptr) {
1313                 .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr,
1314                 .block          = block,
1315                 .redundancy     = v->nr_redundant,
1316                 .idx            = s->key.k.p.offset,
1317         };
1318 
1319         __extent_entry_insert(n,
1320                         (union bch_extent_entry *) ec_ptr,
1321                         (union bch_extent_entry *) &stripe_ptr);
1322 
1323         ret = bch2_trans_update(trans, &iter, n, 0);
1324 out:
1325         bch2_trans_iter_exit(trans, &iter);
1326         return ret;
1327 }
1328 
1329 static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_buf *s,
1330                                    unsigned block)
1331 {
1332         struct bch_fs *c = trans->c;
1333         struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
1334         struct bch_extent_ptr ptr = v->ptrs[block];
1335         struct bpos bp_pos = POS_MIN;
1336         int ret = 0;
1337 
1338         struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev);
1339         if (!ca)
1340                 return -EIO;
1341 
1342         struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr);
1343 
1344         while (1) {
1345                 ret = commit_do(trans, NULL, NULL,
1346                                 BCH_TRANS_COMMIT_no_check_rw|
1347                                 BCH_TRANS_COMMIT_no_enospc,
1348                         ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s, &bp_pos));
1349                 if (ret)
1350                         break;
1351                 if (bkey_eq(bp_pos, POS_MAX))
1352                         break;
1353 
1354                 bp_pos = bpos_nosnap_successor(bp_pos);
1355         }
1356 
1357         bch2_dev_put(ca);
1358         return ret;
1359 }
1360 
1361 static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s)
1362 {
1363         struct btree_trans *trans = bch2_trans_get(c);
1364         struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
1365         unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
1366         int ret = 0;
1367 
1368         ret = bch2_btree_write_buffer_flush_sync(trans);
1369         if (ret)
1370                 goto err;
1371 
1372         for (i = 0; i < nr_data; i++) {
1373                 ret = ec_stripe_update_bucket(trans, s, i);
1374                 if (ret)
1375                         break;
1376         }
1377 err:
1378         bch2_trans_put(trans);
1379 
1380         return ret;
1381 }
1382 
1383 static void zero_out_rest_of_ec_bucket(struct bch_fs *c,
1384                                        struct ec_stripe_new *s,
1385                                        unsigned block,
1386                                        struct open_bucket *ob)
1387 {
1388         struct bch_dev *ca = bch2_dev_get_ioref(c, ob->dev, WRITE);
1389         if (!ca) {
1390                 s->err = -BCH_ERR_erofs_no_writes;
1391                 return;
1392         }
1393 
1394         unsigned offset = ca->mi.bucket_size - ob->sectors_free;
1395         memset(s->new_stripe.data[block] + (offset << 9),
1396                0,
1397                ob->sectors_free << 9);
1398 
1399         int ret = blkdev_issue_zeroout(ca->disk_sb.bdev,
1400                         ob->bucket * ca->mi.bucket_size + offset,
1401                         ob->sectors_free,
1402                         GFP_KERNEL, 0);
1403 
1404         percpu_ref_put(&ca->io_ref);
1405 
1406         if (ret)
1407                 s->err = ret;
1408 }
1409 
1410 void bch2_ec_stripe_new_free(struct bch_fs *c, struct ec_stripe_new *s)
1411 {
1412         if (s->idx)
1413                 bch2_stripe_close(c, s);
1414         kfree(s);
1415 }
1416 
1417 /*
1418  * data buckets of new stripe all written: create the stripe
1419  */
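     /*
      * Steps: zero the unwritten tails of partially filled data buckets,
      * recover any blocks being reused from an existing stripe, generate
      * parity and checksums, write out the p/q blocks, then insert the
      * stripe key and rewrite extents to point at it.
      */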
1420 static void ec_stripe_create(struct ec_stripe_new *s)
1421 {
1422         struct bch_fs *c = s->c;
1423         struct open_bucket *ob;
1424         struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
1425         unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
1426         int ret;
1427 
1428         BUG_ON(s->h->s == s);
1429 
1430         closure_sync(&s->iodone);
1431 
1432         if (!s->err) {
1433                 for (i = 0; i < nr_data; i++)
1434                         if (s->blocks[i]) {
1435                                 ob = c->open_buckets + s->blocks[i];
1436 
1437                                 if (ob->sectors_free)
1438                                         zero_out_rest_of_ec_bucket(c, s, i, ob);
1439                         }
1440         }
1441 
1442         if (s->err) {
1443                 if (!bch2_err_matches(s->err, EROFS))
1444                         bch_err(c, "error creating stripe: error writing data buckets");
1445                 goto err;
1446         }
1447 
1448         if (s->have_existing_stripe) {
1449                 ec_validate_checksums(c, &s->existing_stripe);
1450 
1451                 if (ec_do_recov(c, &s->existing_stripe)) {
1452                         bch_err(c, "error creating stripe: error reading existing stripe");
1453                         goto err;
1454                 }
1455 
1456                 for (i = 0; i < nr_data; i++)
1457                         if (stripe_blockcount_get(&bkey_i_to_stripe(&s->existing_stripe.key)->v, i))
1458                                 swap(s->new_stripe.data[i],
1459                                      s->existing_stripe.data[i]);
1460 
1461                 ec_stripe_buf_exit(&s->existing_stripe);
1462         }
1463 
1464         BUG_ON(!s->allocated);
1465         BUG_ON(!s->idx);
1466 
1467         ec_generate_ec(&s->new_stripe);
1468 
1469         ec_generate_checksums(&s->new_stripe);
1470 
1471         /* write p/q: */
1472         for (i = nr_data; i < v->nr_blocks; i++)
1473                 ec_block_io(c, &s->new_stripe, REQ_OP_WRITE, i, &s->iodone);
1474         closure_sync(&s->iodone);
1475 
1476         if (ec_nr_failed(&s->new_stripe)) {
1477                 bch_err(c, "error creating stripe: error writing redundancy buckets");
1478                 goto err;
1479         }
1480 
1481         ret = bch2_trans_do(c, &s->res, NULL,
1482                             BCH_TRANS_COMMIT_no_check_rw|
1483                             BCH_TRANS_COMMIT_no_enospc,
1484                             ec_stripe_key_update(trans,
1485                                         bkey_i_to_stripe(&s->new_stripe.key),
1486                                         !s->have_existing_stripe));
1487         bch_err_msg(c, ret, "creating stripe key");
1488         if (ret)
1489                 goto err;
1491 
1492         ret = ec_stripe_update_extents(c, &s->new_stripe);
1493         bch_err_msg(c, ret, "error updating extents");
1494         if (ret)
1495                 goto err;
1496 err:
1497         bch2_disk_reservation_put(c, &s->res);
1498 
1499         for (i = 0; i < v->nr_blocks; i++)
1500                 if (s->blocks[i]) {
1501                         ob = c->open_buckets + s->blocks[i];
1502 
1503                         if (i < nr_data) {
1504                                 ob->ec = NULL;
1505                                 __bch2_open_bucket_put(c, ob);
1506                         } else {
1507                                 bch2_open_bucket_put(c, ob);
1508                         }
1509                 }
1510 
1511         mutex_lock(&c->ec_stripe_new_lock);
1512         list_del(&s->list);
1513         mutex_unlock(&c->ec_stripe_new_lock);
1514         wake_up(&c->ec_stripe_new_wait);
1515 
1516         ec_stripe_buf_exit(&s->existing_stripe);
1517         ec_stripe_buf_exit(&s->new_stripe);
1518         closure_debug_destroy(&s->iodone);
1519 
1520         ec_stripe_new_put(c, s, STRIPE_REF_stripe);
1521 }
1522 
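     /*
      * Return the first queued stripe whose data writes have completed (io
      * ref dropped to zero), or NULL if none are ready yet.
      */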
1523 static struct ec_stripe_new *get_pending_stripe(struct bch_fs *c)
1524 {
1525         struct ec_stripe_new *s;
1526 
1527         mutex_lock(&c->ec_stripe_new_lock);
1528         list_for_each_entry(s, &c->ec_stripe_new_list, list)
1529                 if (!atomic_read(&s->ref[STRIPE_REF_io]))
1530                         goto out;
1531         s = NULL;
1532 out:
1533         mutex_unlock(&c->ec_stripe_new_lock);
1534 
1535         return s;
1536 }
1537 
1538 static void ec_stripe_create_work(struct work_struct *work)
1539 {
1540         struct bch_fs *c = container_of(work,
1541                 struct bch_fs, ec_stripe_create_work);
1542         struct ec_stripe_new *s;
1543 
1544         while ((s = get_pending_stripe(c)))
1545                 ec_stripe_create(s);
1546 
1547         bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
1548 }
1549 
1550 void bch2_ec_do_stripe_creates(struct bch_fs *c)
1551 {
1552         bch2_write_ref_get(c, BCH_WRITE_REF_stripe_create);
1553 
1554         if (!queue_work(system_long_wq, &c->ec_stripe_create_work))
1555                 bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
1556 }
1557 
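     /*
      * Detach the stripe from its head and queue it for creation; dropping
      * the io ref here lets ec_stripe_create_work pick it up once all
      * outstanding writes to its buckets have finished.
      */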
1558 static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
1559 {
1560         struct ec_stripe_new *s = h->s;
1561 
1562         BUG_ON(!s->allocated && !s->err);
1563 
1564         h->s            = NULL;
1565         s->pending      = true;
1566 
1567         mutex_lock(&c->ec_stripe_new_lock);
1568         list_add(&s->list, &c->ec_stripe_new_list);
1569         mutex_unlock(&c->ec_stripe_new_lock);
1570 
1571         ec_stripe_new_put(c, s, STRIPE_REF_io);
1572 }
1573 
1574 void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob)
1575 {
1576         struct ec_stripe_new *s = ob->ec;
1577 
1578         s->err = -EIO;
1579 }
1580 
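     /*
      * If this write point is writing to an erasure coded bucket, return
      * the position in the stripe's in-memory buffer that mirrors the
      * bucket's current write offset, so data can be copied there for later
      * parity generation.
      */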
1581 void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
1582 {
1583         struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);
1584         if (!ob)
1585                 return NULL;
1586 
1587         BUG_ON(!ob->ec->new_stripe.data[ob->ec_idx]);
1588 
1589         struct bch_dev *ca      = ob_dev(c, ob);
1590         unsigned offset         = ca->mi.bucket_size - ob->sectors_free;
1591 
1592         return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9);
1593 }
1594 
1595 static int unsigned_cmp(const void *_l, const void *_r)
1596 {
1597         unsigned l = *((const unsigned *) _l);
1598         unsigned r = *((const unsigned *) _r);
1599 
1600         return cmp_int(l, r);
1601 }
1602 
1603 /* pick most common bucket size: */
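     /*
      * Sizes are sorted and scanned run by run, keeping the longest run:
      * e.g. { 128, 128, 512, 1024, 1024, 1024 } has runs of 2, 1 and 3,
      * so 1024 wins.
      */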
1604 static unsigned pick_blocksize(struct bch_fs *c,
1605                                struct bch_devs_mask *devs)
1606 {
1607         unsigned nr = 0, sizes[BCH_SB_MEMBERS_MAX];
1608         struct {
1609                 unsigned nr, size;
1610         } cur = { 0, 0 }, best = { 0, 0 };
1611 
1612         for_each_member_device_rcu(c, ca, devs)
1613                 sizes[nr++] = ca->mi.bucket_size;
1614 
1615         sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL);
1616 
1617         for (unsigned i = 0; i < nr; i++) {
1618                 if (sizes[i] != cur.size) {
1619                         if (cur.nr > best.nr)
1620                                 best = cur;
1621 
1622                         cur.nr = 0;
1623                         cur.size = sizes[i];
1624                 }
1625 
1626                 cur.nr++;
1627         }
1628 
1629         if (cur.nr > best.nr)
1630                 best = cur;
1631 
1632         return best.size;
1633 }
1634 
1635 static bool may_create_new_stripe(struct bch_fs *c)
1636 {
1637         return false;
1638 }
1639 
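     /*
      * Initialize a new stripe key. Checksum granularity starts at
      * encoded_extent_max and is doubled until the per-block checksum
      * arrays fit within the maximum bkey value size.
      */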
1640 static void ec_stripe_key_init(struct bch_fs *c,
1641                                struct bkey_i *k,
1642                                unsigned nr_data,
1643                                unsigned nr_parity,
1644                                unsigned stripe_size)
1645 {
1646         struct bkey_i_stripe *s = bkey_stripe_init(k);
1647         unsigned u64s;
1648 
1649         s->v.sectors                    = cpu_to_le16(stripe_size);
1650         s->v.algorithm                  = 0;
1651         s->v.nr_blocks                  = nr_data + nr_parity;
1652         s->v.nr_redundant               = nr_parity;
1653         s->v.csum_granularity_bits      = ilog2(c->opts.encoded_extent_max >> 9);
1654         s->v.csum_type                  = BCH_CSUM_crc32c;
1655         s->v.pad                        = 0;
1656 
1657         while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) {
1658                 BUG_ON(1 << s->v.csum_granularity_bits >=
1659                        le16_to_cpu(s->v.sectors) ||
1660                        s->v.csum_granularity_bits == U8_MAX);
1661                 s->v.csum_granularity_bits++;
1662         }
1663 
1664         set_bkey_val_u64s(&s->k, u64s);
1665 }
1666 
1667 static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
1668 {
1669         struct ec_stripe_new *s;
1670 
1671         lockdep_assert_held(&h->lock);
1672 
1673         s = kzalloc(sizeof(*s), GFP_KERNEL);
1674         if (!s)
1675                 return -BCH_ERR_ENOMEM_ec_new_stripe_alloc;
1676 
1677         mutex_init(&s->lock);
1678         closure_init(&s->iodone, NULL);
1679         atomic_set(&s->ref[STRIPE_REF_stripe], 1);
1680         atomic_set(&s->ref[STRIPE_REF_io], 1);
1681         s->c            = c;
1682         s->h            = h;
1683         s->nr_data      = min_t(unsigned, h->nr_active_devs,
1684                                 BCH_BKEY_PTRS_MAX) - h->redundancy;
1685         s->nr_parity    = h->redundancy;
1686 
1687         ec_stripe_key_init(c, &s->new_stripe.key,
1688                            s->nr_data, s->nr_parity, h->blocksize);
1689 
1690         h->s = s;
1691         return 0;
1692 }
1693 
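     /*
      * Allocate a stripe head: the candidate devices are the target's rw
      * devices with nonzero durability, narrowed to those whose bucket size
      * matches the most common one, since all blocks in a stripe must be
      * the same size. Returned with h->lock held.
      */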
1694 static struct ec_stripe_head *
1695 ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
1696                          unsigned algo, unsigned redundancy,
1697                          enum bch_watermark watermark)
1698 {
1699         struct ec_stripe_head *h;
1700 
1701         h = kzalloc(sizeof(*h), GFP_KERNEL);
1702         if (!h)
1703                 return NULL;
1704 
1705         mutex_init(&h->lock);
1706         BUG_ON(!mutex_trylock(&h->lock));
1707 
1708         h->target       = target;
1709         h->algo         = algo;
1710         h->redundancy   = redundancy;
1711         h->watermark    = watermark;
1712 
1713         rcu_read_lock();
1714         h->devs = target_rw_devs(c, BCH_DATA_user, target);
1715 
1716         for_each_member_device_rcu(c, ca, &h->devs)
1717                 if (!ca->mi.durability)
1718                         __clear_bit(ca->dev_idx, h->devs.d);
1719 
1720         h->blocksize = pick_blocksize(c, &h->devs);
1721 
1722         for_each_member_device_rcu(c, ca, &h->devs)
1723                 if (ca->mi.bucket_size == h->blocksize)
1724                         h->nr_active_devs++;
1725 
1726         rcu_read_unlock();
1727 
1728         /*
1729          * If we only have redundancy + 1 devices, we're better off with just
1730          * replication:
1731          */
1732         if (h->nr_active_devs < h->redundancy + 2)
1733                 bch_err(c, "insufficient devices available to create stripe (have %u, need %u) - mismatched bucket sizes?",
1734                         h->nr_active_devs, h->redundancy + 2);
1735 
1736         list_add(&h->list, &c->ec_stripe_head_list);
1737         return h;
1738 }
1739 
1740 void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h)
1741 {
1742         if (h->s &&
1743             h->s->allocated &&
1744             bitmap_weight(h->s->blocks_allocated,
1745                           h->s->nr_data) == h->s->nr_data)
1746                 ec_stripe_set_pending(c, h);
1747 
1748         mutex_unlock(&h->lock);
1749 }
1750 
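     /*
      * Find or create the stripe head matching (target, algo, redundancy,
      * watermark), returning it with h->lock held; returns NULL if erasure
      * coding isn't usable here (no redundancy requested, or too few
      * suitable devices).
      */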
1751 static struct ec_stripe_head *
1752 __bch2_ec_stripe_head_get(struct btree_trans *trans,
1753                           unsigned target,
1754                           unsigned algo,
1755                           unsigned redundancy,
1756                           enum bch_watermark watermark)
1757 {
1758         struct bch_fs *c = trans->c;
1759         struct ec_stripe_head *h;
1760         int ret;
1761 
1762         if (!redundancy)
1763                 return NULL;
1764 
1765         ret = bch2_trans_mutex_lock(trans, &c->ec_stripe_head_lock);
1766         if (ret)
1767                 return ERR_PTR(ret);
1768 
1769         if (test_bit(BCH_FS_going_ro, &c->flags)) {
1770                 h = ERR_PTR(-BCH_ERR_erofs_no_writes);
1771                 goto found;
1772         }
1773 
1774         list_for_each_entry(h, &c->ec_stripe_head_list, list)
1775                 if (h->target           == target &&
1776                     h->algo             == algo &&
1777                     h->redundancy       == redundancy &&
1778                     h->watermark        == watermark) {
1779                         ret = bch2_trans_mutex_lock(trans, &h->lock);
1780                         if (ret)
1781                                 h = ERR_PTR(ret);
1782                         goto found;
1783                 }
1784 
1785         h = ec_new_stripe_head_alloc(c, target, algo, redundancy, watermark);
1786 found:
1787         if (!IS_ERR_OR_NULL(h) &&
1788             h->nr_active_devs < h->redundancy + 2) {
1789                 mutex_unlock(&h->lock);
1790                 h = NULL;
1791         }
1792         mutex_unlock(&c->ec_stripe_head_lock);
1793         return h;
1794 }
1795 
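     /*
      * Allocate the buckets this stripe is still missing: parity first,
      * then data, excluding devices that already contribute a block so that
      * every block lands on a distinct device.
      */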
1796 static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_head *h,
1797                                     enum bch_watermark watermark, struct closure *cl)
1798 {
1799         struct bch_fs *c = trans->c;
1800         struct bch_devs_mask devs = h->devs;
1801         struct open_bucket *ob;
1802         struct open_buckets buckets;
1803         struct bch_stripe *v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v;
1804         unsigned i, j, nr_have_parity = 0, nr_have_data = 0;
1805         bool have_cache = true;
1806         int ret = 0;
1807 
1808         BUG_ON(v->nr_blocks     != h->s->nr_data + h->s->nr_parity);
1809         BUG_ON(v->nr_redundant  != h->s->nr_parity);
1810 
1811         /* We bypass the sector allocator which normally does this: */
1812         bitmap_and(devs.d, devs.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX);
1813 
1814         for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) {
1815                 __clear_bit(v->ptrs[i].dev, devs.d);
1816                 if (i < h->s->nr_data)
1817                         nr_have_data++;
1818                 else
1819                         nr_have_parity++;
1820         }
1821 
1822         BUG_ON(nr_have_data     > h->s->nr_data);
1823         BUG_ON(nr_have_parity   > h->s->nr_parity);
1824 
1825         buckets.nr = 0;
1826         if (nr_have_parity < h->s->nr_parity) {
1827                 ret = bch2_bucket_alloc_set_trans(trans, &buckets,
1828                                             &h->parity_stripe,
1829                                             &devs,
1830                                             h->s->nr_parity,
1831                                             &nr_have_parity,
1832                                             &have_cache, 0,
1833                                             BCH_DATA_parity,
1834                                             watermark,
1835                                             cl);
1836 
1837                 open_bucket_for_each(c, &buckets, ob, i) {
1838                         j = find_next_zero_bit(h->s->blocks_gotten,
1839                                                h->s->nr_data + h->s->nr_parity,
1840                                                h->s->nr_data);
1841                         BUG_ON(j >= h->s->nr_data + h->s->nr_parity);
1842 
1843                         h->s->blocks[j] = buckets.v[i];
1844                         v->ptrs[j] = bch2_ob_ptr(c, ob);
1845                         __set_bit(j, h->s->blocks_gotten);
1846                 }
1847 
1848                 if (ret)
1849                         return ret;
1850         }
1851 
1852         buckets.nr = 0;
1853         if (nr_have_data < h->s->nr_data) {
1854                 ret = bch2_bucket_alloc_set_trans(trans, &buckets,
1855                                             &h->block_stripe,
1856                                             &devs,
1857                                             h->s->nr_data,
1858                                             &nr_have_data,
1859                                             &have_cache, 0,
1860                                             BCH_DATA_user,
1861                                             watermark,
1862                                             cl);
1863 
1864                 open_bucket_for_each(c, &buckets, ob, i) {
1865                         j = find_next_zero_bit(h->s->blocks_gotten,
1866                                                h->s->nr_data, 0);
1867                         BUG_ON(j >= h->s->nr_data);
1868 
1869                         h->s->blocks[j] = buckets.v[i];
1870                         v->ptrs[j] = bch2_ob_ptr(c, ob);
1871                         __set_bit(j, h->s->blocks_gotten);
1872                 }
1873 
1874                 if (ret)
1875                         return ret;
1876         }
1877 
1878         return 0;
1879 }
1880 
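     /*
      * Find an existing stripe with empty blocks we can reuse, and open it
      * so no one else grabs it; returns the stripe index, or -1 if there's
      * nothing suitable.
      */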
1881 /* XXX: doesn't obey target: */
1882 static s64 get_existing_stripe(struct bch_fs *c,
1883                                struct ec_stripe_head *head)
1884 {
1885         ec_stripes_heap *h = &c->ec_stripes_heap;
1886         struct stripe *m;
1887         size_t heap_idx;
1888         u64 stripe_idx;
1889         s64 ret = -1;
1890 
1891         if (may_create_new_stripe(c))
1892                 return -1;
1893 
1894         mutex_lock(&c->ec_stripes_heap_lock);
1895         for (heap_idx = 0; heap_idx < h->nr; heap_idx++) {
1896                 /* No blocks worth reusing, stripe will just be deleted: */
1897                 if (!h->data[heap_idx].blocks_nonempty)
1898                         continue;
1899 
1900                 stripe_idx = h->data[heap_idx].idx;
1901 
1902                 m = genradix_ptr(&c->stripes, stripe_idx);
1903 
1904                 if (m->algorithm        == head->algo &&
1905                     m->nr_redundant     == head->redundancy &&
1906                     m->sectors          == head->blocksize &&
1907                     m->blocks_nonempty  < m->nr_blocks - m->nr_redundant &&
1908                     bch2_try_open_stripe(c, head->s, stripe_idx)) {
1909                         ret = stripe_idx;
1910                         break;
1911                 }
1912         }
1913         mutex_unlock(&c->ec_stripes_heap_lock);
1914         return ret;
1915 }
1916 
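     /*
      * Reuse a partially empty existing stripe: read its key and blocks,
      * release any buckets we had already allocated (they could collide
      * with blocks we're keeping), and mark the still-populated blocks as
      * allocated.
      */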
1917 static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h)
1918 {
1919         struct bch_fs *c = trans->c;
1920         struct bch_stripe *new_v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v;
1921         struct bch_stripe *existing_v;
1922         unsigned i;
1923         s64 idx;
1924         int ret;
1925 
1926         /*
1927          * If we can't allocate a new stripe, and there's no stripes with empty
1928          * blocks for us to reuse, that means we have to wait on copygc:
1929          */
1930         idx = get_existing_stripe(c, h);
1931         if (idx < 0)
1932                 return -BCH_ERR_stripe_alloc_blocked;
1933 
1934         ret = get_stripe_key_trans(trans, idx, &h->s->existing_stripe);
1935         bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c,
1936                              "reading stripe key: %s", bch2_err_str(ret));
1937         if (ret) {
1938                 bch2_stripe_close(c, h->s);
1939                 return ret;
1940         }
1941 
1942         existing_v = &bkey_i_to_stripe(&h->s->existing_stripe.key)->v;
1943 
1944         BUG_ON(existing_v->nr_redundant != h->s->nr_parity);
1945         h->s->nr_data = existing_v->nr_blocks -
1946                 existing_v->nr_redundant;
1947 
1948         ret = ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize);
1949         if (ret) {
1950                 bch2_stripe_close(c, h->s);
1951                 return ret;
1952         }
1953 
1954         BUG_ON(h->s->existing_stripe.size != h->blocksize);
1955         BUG_ON(h->s->existing_stripe.size != le16_to_cpu(existing_v->sectors));
1956 
1957         /*
1958          * Free buckets we initially allocated - they might conflict with
1959          * blocks from the stripe we're reusing:
1960          */
1961         for_each_set_bit(i, h->s->blocks_gotten, new_v->nr_blocks) {
1962                 bch2_open_bucket_put(c, c->open_buckets + h->s->blocks[i]);
1963                 h->s->blocks[i] = 0;
1964         }
1965         memset(h->s->blocks_gotten, 0, sizeof(h->s->blocks_gotten));
1966         memset(h->s->blocks_allocated, 0, sizeof(h->s->blocks_allocated));
1967 
1968         for (i = 0; i < existing_v->nr_blocks; i++) {
1969                 if (stripe_blockcount_get(existing_v, i)) {
1970                         __set_bit(i, h->s->blocks_gotten);
1971                         __set_bit(i, h->s->blocks_allocated);
1972                 }
1973 
1974                 ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone);
1975         }
1976 
1977         bkey_copy(&h->s->new_stripe.key, &h->s->existing_stripe.key);
1978         h->s->have_existing_stripe = true;
1979 
1980         return 0;
1981 }
1982 
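     /*
      * Reserve space for the parity blocks and claim a free slot in the
      * stripes btree, scanning forward from ec_stripe_hint and wrapping
      * back to the start once we pass the maximum stripe index.
      */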
1983 static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h)
1984 {
1985         struct bch_fs *c = trans->c;
1986         struct btree_iter iter;
1987         struct bkey_s_c k;
1988         struct bpos min_pos = POS(0, 1);
1989         struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint));
1990         int ret;
1991 
1992         if (!h->s->res.sectors) {
1993                 ret = bch2_disk_reservation_get(c, &h->s->res,
1994                                         h->blocksize,
1995                                         h->s->nr_parity,
1996                                         BCH_DISK_RESERVATION_NOFAIL);
1997                 if (ret)
1998                         return ret;
1999         }
2000 
2001         for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos,
2002                            BTREE_ITER_slots|BTREE_ITER_intent, k, ret) {
2003                 if (bkey_gt(k.k->p, POS(0, U32_MAX))) {
2004                         if (start_pos.offset) {
2005                                 start_pos = min_pos;
2006                                 bch2_btree_iter_set_pos(&iter, start_pos);
2007                                 continue;
2008                         }
2009 
2010                         ret = -BCH_ERR_ENOSPC_stripe_create;
2011                         break;
2012                 }
2013 
2014                 if (bkey_deleted(k.k) &&
2015                     bch2_try_open_stripe(c, h->s, k.k->p.offset))
2016                         break;
2017         }
2018 
2019         c->ec_stripe_hint = iter.pos.offset;
2020 
2021         if (ret)
2022                 goto err;
2023 
2024         ret = ec_stripe_mem_alloc(trans, &iter);
2025         if (ret) {
2026                 bch2_stripe_close(c, h->s);
2027                 goto err;
2028         }
2029 
2030         h->s->new_stripe.key.k.p = iter.pos;
2031 out:
2032         bch2_trans_iter_exit(trans, &iter);
2033         return ret;
2034 err:
2035         bch2_disk_reservation_put(c, &h->s->res);
2036         goto out;
2037 }
2038 
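     /*
      * Get a stripe head with an allocated stripe to write to: try a fresh
      * stripe at the stripe watermark first, fall back to reusing an
      * existing stripe, then retry bucket allocation at the caller's
      * watermark. @cl, if non-NULL, lets us wait on the freelist instead of
      * failing outright.
      */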
2039 struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
2040                                                unsigned target,
2041                                                unsigned algo,
2042                                                unsigned redundancy,
2043                                                enum bch_watermark watermark,
2044                                                struct closure *cl)
2045 {
2046         struct bch_fs *c = trans->c;
2047         struct ec_stripe_head *h;
2048         bool waiting = false;
2049         int ret;
2050 
2051         h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, watermark);
2052         if (IS_ERR_OR_NULL(h))
2053                 return h;
2054 
2055         if (!h->s) {
2056                 ret = ec_new_stripe_alloc(c, h);
2057                 if (ret) {
2058                         bch_err(c, "failed to allocate new stripe");
2059                         goto err;
2060                 }
2061         }
2062 
2063         if (h->s->allocated)
2064                 goto allocated;
2065 
2066         if (h->s->have_existing_stripe)
2067                 goto alloc_existing;
2068 
2069         /* First, try to allocate a full stripe: */
2070         ret =   new_stripe_alloc_buckets(trans, h, BCH_WATERMARK_stripe, NULL) ?:
2071                 __bch2_ec_stripe_head_reserve(trans, h);
2072         if (!ret)
2073                 goto allocate_buf;
2074         if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
2075             bch2_err_matches(ret, ENOMEM))
2076                 goto err;
2077 
2078         /*
2079          * Not enough buckets available for a full stripe: we must reuse an
2080          * existing stripe:
2081          */
2082         while (1) {
2083                 ret = __bch2_ec_stripe_head_reuse(trans, h);
2084                 if (!ret)
2085                         break;
2086                 if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked)
2087                         goto err;
2088 
2089                 if (watermark == BCH_WATERMARK_copygc) {
2090                         ret =   new_stripe_alloc_buckets(trans, h, watermark, NULL) ?:
2091                                 __bch2_ec_stripe_head_reserve(trans, h);
2092                         if (ret)
2093                                 goto err;
2094                         goto allocate_buf;
2095                 }
2096 
2097                 /* XXX freelist_wait? */
2098                 closure_wait(&c->freelist_wait, cl);
2099                 waiting = true;
2100         }
2101 
2102         if (waiting)
2103                 closure_wake_up(&c->freelist_wait);
2104 alloc_existing:
2105         /*
2106          * Retry allocating buckets, with the watermark for this
2107          * particular write:
2108          */
2109         ret = new_stripe_alloc_buckets(trans, h, watermark, cl);
2110         if (ret)
2111                 goto err;
2112 
2113 allocate_buf:
2114         ret = ec_stripe_buf_init(&h->s->new_stripe, 0, h->blocksize);
2115         if (ret)
2116                 goto err;
2117 
2118         h->s->allocated = true;
2119 allocated:
2120         BUG_ON(!h->s->idx);
2121         BUG_ON(!h->s->new_stripe.data[0]);
2122         BUG_ON(trans->restarted);
2123         return h;
2124 err:
2125         bch2_ec_stripe_head_put(c, h);
2126         return ERR_PTR(ret);
2127 }
2128 
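     /*
      * Abort in-progress stripes with a block on @ca (or all stripes, if
      * @ca is NULL) so their open buckets are released - used when a device
      * or the whole filesystem is being stopped.
      */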
2129 static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca)
2130 {
2131         struct ec_stripe_head *h;
2132         struct open_bucket *ob;
2133         unsigned i;
2134 
2135         mutex_lock(&c->ec_stripe_head_lock);
2136         list_for_each_entry(h, &c->ec_stripe_head_list, list) {
2137                 mutex_lock(&h->lock);
2138                 if (!h->s)
2139                         goto unlock;
2140 
2141                 if (!ca)
2142                         goto found;
2143 
2144                 for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++) {
2145                         if (!h->s->blocks[i])
2146                                 continue;
2147 
2148                         ob = c->open_buckets + h->s->blocks[i];
2149                         if (ob->dev == ca->dev_idx)
2150                                 goto found;
2151                 }
2152                 goto unlock;
2153 found:
2154                 h->s->err = -BCH_ERR_erofs_no_writes;
2155                 ec_stripe_set_pending(c, h);
2156 unlock:
2157                 mutex_unlock(&h->lock);
2158         }
2159         mutex_unlock(&c->ec_stripe_head_lock);
2160 }
2161 
2162 void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
2163 {
2164         __bch2_ec_stop(c, ca);
2165 }
2166 
2167 void bch2_fs_ec_stop(struct bch_fs *c)
2168 {
2169         __bch2_ec_stop(c, NULL);
2170 }
2171 
2172 static bool bch2_fs_ec_flush_done(struct bch_fs *c)
2173 {
2174         bool ret;
2175 
2176         mutex_lock(&c->ec_stripe_new_lock);
2177         ret = list_empty(&c->ec_stripe_new_list);
2178         mutex_unlock(&c->ec_stripe_new_lock);
2179 
2180         return ret;
2181 }
2182 
2183 void bch2_fs_ec_flush(struct bch_fs *c)
2184 {
2185         wait_event(c->ec_stripe_new_wait, bch2_fs_ec_flush_done(c));
2186 }
2187 
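     /*
      * At startup, rebuild the in-memory stripe radix tree and stripes heap
      * from the stripes btree.
      */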
2188 int bch2_stripes_read(struct bch_fs *c)
2189 {
2190         int ret = bch2_trans_run(c,
2191                 for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN,
2192                                    BTREE_ITER_prefetch, k, ({
2193                         if (k.k->type != KEY_TYPE_stripe)
2194                                 continue;
2195 
2196                         ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
2197                         if (ret)
2198                                 break;
2199 
2200                         const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
2201 
2202                         struct stripe *m = genradix_ptr(&c->stripes, k.k->p.offset);
2203                         m->sectors      = le16_to_cpu(s->sectors);
2204                         m->algorithm    = s->algorithm;
2205                         m->nr_blocks    = s->nr_blocks;
2206                         m->nr_redundant = s->nr_redundant;
2207                         m->blocks_nonempty = 0;
2208 
2209                         for (unsigned i = 0; i < s->nr_blocks; i++)
2210                                 m->blocks_nonempty += !!stripe_blockcount_get(s, i);
2211 
2212                         bch2_stripes_heap_insert(c, m, k.k->p.offset);
2213                         0;
2214                 })));
2215         bch_err_fn(c, ret);
2216         return ret;
2217 }
2218 
2219 void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c)
2220 {
2221         ec_stripes_heap *h = &c->ec_stripes_heap;
2222         struct stripe *m;
2223         size_t i;
2224 
2225         mutex_lock(&c->ec_stripes_heap_lock);
2226         for (i = 0; i < min_t(size_t, h->nr, 50); i++) {
2227                 m = genradix_ptr(&c->stripes, h->data[i].idx);
2228 
2229                 prt_printf(out, "%zu %u/%u+%u", h->data[i].idx,
2230                        h->data[i].blocks_nonempty,
2231                        m->nr_blocks - m->nr_redundant,
2232                        m->nr_redundant);
2233                 if (bch2_stripe_is_open(c, h->data[i].idx))
2234                         prt_str(out, " open");
2235                 prt_newline(out);
2236         }
2237         mutex_unlock(&c->ec_stripes_heap_lock);
2238 }
2239 
2240 static void bch2_new_stripe_to_text(struct printbuf *out, struct bch_fs *c,
2241                                     struct ec_stripe_new *s)
2242 {
2243         prt_printf(out, "\tidx %llu blocks %u+%u allocated %u ref %u %u %s obs",
2244                    s->idx, s->nr_data, s->nr_parity,
2245                    bitmap_weight(s->blocks_allocated, s->nr_data),
2246                    atomic_read(&s->ref[STRIPE_REF_io]),
2247                    atomic_read(&s->ref[STRIPE_REF_stripe]),
2248                    bch2_watermarks[s->h->watermark]);
2249 
2250         struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
2251         unsigned i;
2252         for_each_set_bit(i, s->blocks_gotten, v->nr_blocks)
2253                 prt_printf(out, " %u", s->blocks[i]);
2254         prt_newline(out);
2255 }
2256 
2257 void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
2258 {
2259         struct ec_stripe_head *h;
2260         struct ec_stripe_new *s;
2261 
2262         mutex_lock(&c->ec_stripe_head_lock);
2263         list_for_each_entry(h, &c->ec_stripe_head_list, list) {
2264                 prt_printf(out, "target %u algo %u redundancy %u %s:\n",
2265                        h->target, h->algo, h->redundancy,
2266                        bch2_watermarks[h->watermark]);
2267 
2268                 if (h->s)
2269                         bch2_new_stripe_to_text(out, c, h->s);
2270         }
2271         mutex_unlock(&c->ec_stripe_head_lock);
2272 
2273         prt_printf(out, "in flight:\n");
2274 
2275         mutex_lock(&c->ec_stripe_new_lock);
2276         list_for_each_entry(s, &c->ec_stripe_new_list, list)
2277                 bch2_new_stripe_to_text(out, c, s);
2278         mutex_unlock(&c->ec_stripe_new_lock);
2279 }
2280 
2281 void bch2_fs_ec_exit(struct bch_fs *c)
2282 {
2283         struct ec_stripe_head *h;
2284         unsigned i;
2285 
2286         while (1) {
2287                 mutex_lock(&c->ec_stripe_head_lock);
2288                 h = list_first_entry_or_null(&c->ec_stripe_head_list,
2289                                              struct ec_stripe_head, list);
2290                 if (h)
2291                         list_del(&h->list);
2292                 mutex_unlock(&c->ec_stripe_head_lock);
2293                 if (!h)
2294                         break;
2295 
2296                 if (h->s) {
2297                         for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++)
2298                                 BUG_ON(h->s->blocks[i]);
2299 
2300                         kfree(h->s);
2301                 }
2302                 kfree(h);
2303         }
2304 
2305         BUG_ON(!list_empty(&c->ec_stripe_new_list));
2306 
2307         free_heap(&c->ec_stripes_heap);
2308         genradix_free(&c->stripes);
2309         bioset_exit(&c->ec_bioset);
2310 }
2311 
2312 void bch2_fs_ec_init_early(struct bch_fs *c)
2313 {
2314         spin_lock_init(&c->ec_stripes_new_lock);
2315         mutex_init(&c->ec_stripes_heap_lock);
2316 
2317         INIT_LIST_HEAD(&c->ec_stripe_head_list);
2318         mutex_init(&c->ec_stripe_head_lock);
2319 
2320         INIT_LIST_HEAD(&c->ec_stripe_new_list);
2321         mutex_init(&c->ec_stripe_new_lock);
2322         init_waitqueue_head(&c->ec_stripe_new_wait);
2323 
2324         INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work);
2325         INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work);
2326 }
2327 
2328 int bch2_fs_ec_init(struct bch_fs *c)
2329 {
2330         return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio),
2331                            BIOSET_NEED_BVECS);
2332 }
2333 
