TOMOYO Linux Cross Reference
Linux/fs/bcachefs/move.c


// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "backpointers.h"
#include "bkey_buf.h"
#include "btree_gc.h"
#include "btree_io.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
#include "compress.h"
#include "disk_groups.h"
#include "ec.h"
#include "errcode.h"
#include "error.h"
#include "inode.h"
#include "io_read.h"
#include "io_write.h"
#include "journal_reclaim.h"
#include "keylist.h"
#include "move.h"
#include "replicas.h"
#include "snapshot.h"
#include "super-io.h"
#include "trace.h"

#include <linux/ioprio.h>
#include <linux/kthread.h>

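/*
 * Human-readable names for the BCH_DATA_OPS() x-macro list, indexed by
 * opcode: each x(t, n, ...) entry expands to [n] = "t".
 */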
const char * const bch2_data_ops_strs[] = {
#define x(t, n, ...) [n] = #t,
        BCH_DATA_OPS()
#undef x
        NULL
};

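/*
 * Tracepoint helpers: these only pay the cost of formatting the key (and
 * update options) into a printbuf when the corresponding tracepoint is
 * actually enabled.
 */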
static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k,
                               struct bch_io_opts *io_opts,
                               struct data_update_opts *data_opts)
{
        if (trace_move_extent_enabled()) {
                struct printbuf buf = PRINTBUF;

                bch2_bkey_val_to_text(&buf, c, k);
                prt_newline(&buf);
                bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts);
                trace_move_extent(c, buf.buf);
                printbuf_exit(&buf);
        }
}

static void trace_move_extent_read2(struct bch_fs *c, struct bkey_s_c k)
{
        if (trace_move_extent_read_enabled()) {
                struct printbuf buf = PRINTBUF;

                bch2_bkey_val_to_text(&buf, c, k);
                trace_move_extent_read(c, buf.buf);
                printbuf_exit(&buf);
        }
}

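/*
 * Per-extent move state: the read is issued first, and when it completes the
 * write half of the data update is submitted from the owning moving_context.
 */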
struct moving_io {
        struct list_head                read_list;
        struct list_head                io_list;
        struct move_bucket_in_flight    *b;
        struct closure                  cl;
        bool                            read_completed;

        unsigned                        read_sectors;
        unsigned                        write_sectors;

        struct bch_read_bio             rbio;

        struct data_update              write;
        /* Must be last since it is variable size */
        struct bio_vec                  bi_inline_vecs[];
};

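/*
 * Tear down a moving_io: drop the in-flight bucket count, release the data
 * update, unlink from the context's io list and wake any waiters.
 */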
static void move_free(struct moving_io *io)
{
        struct moving_context *ctxt = io->write.ctxt;

        if (io->b)
                atomic_dec(&io->b->count);

        bch2_data_update_exit(&io->write);

        mutex_lock(&ctxt->lock);
        list_del(&io->io_list);
        wake_up(&ctxt->wait);
        mutex_unlock(&ctxt->lock);

        kfree(io);
}

static void move_write_done(struct bch_write_op *op)
{
        struct moving_io *io = container_of(op, struct moving_io, write.op);
        struct moving_context *ctxt = io->write.ctxt;

        if (io->write.op.error)
                ctxt->write_error = true;

        atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors);
        atomic_dec(&io->write.ctxt->write_ios);
        move_free(io);
        closure_put(&ctxt->cl);
}

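/*
 * Called once the read side has finished: free the io if the read failed or
 * hit a hole, otherwise submit the write half of the data update.
 */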
static void move_write(struct moving_io *io)
{
        if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) {
                move_free(io);
                return;
        }

        if (trace_move_extent_write_enabled()) {
                struct bch_fs *c = io->write.op.c;
                struct printbuf buf = PRINTBUF;

                bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k));
                trace_move_extent_write(c, buf.buf);
                printbuf_exit(&buf);
        }

        closure_get(&io->write.ctxt->cl);
        atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
        atomic_inc(&io->write.ctxt->write_ios);

        bch2_data_update_read_done(&io->write, io->rbio.pick.crc);
}

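/*
 * Writes are issued in the order their reads were queued on ctxt->reads:
 * return the oldest read, but only once it has completed.
 */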
struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt)
{
        struct moving_io *io =
                list_first_entry_or_null(&ctxt->reads, struct moving_io, read_list);

        return io && io->read_completed ? io : NULL;
}

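/*
 * Read completion: drop this io from the in-flight read counters and mark it
 * ready for the write path; the closure ref taken in bch2_move_extent() is
 * released here.
 */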
static void move_read_endio(struct bio *bio)
{
        struct moving_io *io = container_of(bio, struct moving_io, rbio.bio);
        struct moving_context *ctxt = io->write.ctxt;

        atomic_sub(io->read_sectors, &ctxt->read_sectors);
        atomic_dec(&ctxt->read_ios);
        io->read_completed = true;

        wake_up(&ctxt->wait);
        closure_put(&ctxt->cl);
}

void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt)
{
        struct moving_io *io;

        while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) {
                bch2_trans_unlock_long(ctxt->trans);
                list_del(&io->read_list);
                move_write(io);
        }
}

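/*
 * Wait for in-flight writes to make progress: returns when all writes have
 * drained, or when the number of outstanding write sectors changes.
 */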
void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
{
        unsigned sectors_pending = atomic_read(&ctxt->write_sectors);

        move_ctxt_wait_event(ctxt,
                !atomic_read(&ctxt->write_sectors) ||
                atomic_read(&ctxt->write_sectors) != sectors_pending);
}

void bch2_moving_ctxt_flush_all(struct moving_context *ctxt)
{
        move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
        bch2_trans_unlock_long(ctxt->trans);
        closure_sync(&ctxt->cl);
}

void bch2_moving_ctxt_exit(struct moving_context *ctxt)
{
        struct bch_fs *c = ctxt->trans->c;

        bch2_moving_ctxt_flush_all(ctxt);

        EBUG_ON(atomic_read(&ctxt->write_sectors));
        EBUG_ON(atomic_read(&ctxt->write_ios));
        EBUG_ON(atomic_read(&ctxt->read_sectors));
        EBUG_ON(atomic_read(&ctxt->read_ios));

        mutex_lock(&c->moving_context_lock);
        list_del(&ctxt->list);
        mutex_unlock(&c->moving_context_lock);

        bch2_trans_put(ctxt->trans);
        memset(ctxt, 0, sizeof(*ctxt));
}

void bch2_moving_ctxt_init(struct moving_context *ctxt,
                           struct bch_fs *c,
                           struct bch_ratelimit *rate,
                           struct bch_move_stats *stats,
                           struct write_point_specifier wp,
                           bool wait_on_copygc)
{
        memset(ctxt, 0, sizeof(*ctxt));

        ctxt->trans     = bch2_trans_get(c);
        ctxt->fn        = (void *) _RET_IP_;
        ctxt->rate      = rate;
        ctxt->stats     = stats;
        ctxt->wp        = wp;
        ctxt->wait_on_copygc = wait_on_copygc;

        closure_init_stack(&ctxt->cl);

        mutex_init(&ctxt->lock);
        INIT_LIST_HEAD(&ctxt->reads);
        INIT_LIST_HEAD(&ctxt->ios);
        init_waitqueue_head(&ctxt->wait);

        mutex_lock(&c->moving_context_lock);
        list_add(&ctxt->list, &c->moving_context_list);
        mutex_unlock(&c->moving_context_lock);
}

void bch2_move_stats_exit(struct bch_move_stats *stats, struct bch_fs *c)
{
        trace_move_data(c, stats);
}

void bch2_move_stats_init(struct bch_move_stats *stats, const char *name)
{
        memset(stats, 0, sizeof(*stats));
        stats->data_type = BCH_DATA_user;
        scnprintf(stats->name, sizeof(stats->name), "%s", name);
}

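/*
 * Move a single extent: allocate a moving_io tracking the read and the data
 * update that rewrites the chosen pointers, then issue the read
 * asynchronously; the write is submitted later, via
 * bch2_moving_ctxt_do_pending_writes(), once the read has completed.
 */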
int bch2_move_extent(struct moving_context *ctxt,
                     struct move_bucket_in_flight *bucket_in_flight,
                     struct btree_iter *iter,
                     struct bkey_s_c k,
                     struct bch_io_opts io_opts,
                     struct data_update_opts data_opts)
{
        struct btree_trans *trans = ctxt->trans;
        struct bch_fs *c = trans->c;
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        struct moving_io *io;
        const union bch_extent_entry *entry;
        struct extent_ptr_decoded p;
        unsigned sectors = k.k->size, pages;
        int ret = -ENOMEM;

        trace_move_extent2(c, k, &io_opts, &data_opts);

        if (ctxt->stats)
                ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos);

        bch2_data_update_opts_normalize(k, &data_opts);

        if (!data_opts.rewrite_ptrs &&
            !data_opts.extra_replicas) {
                if (data_opts.kill_ptrs)
                        return bch2_extent_drop_ptrs(trans, iter, k, data_opts);
                return 0;
        }

        /*
         * Before memory allocations & taking nocow locks in
         * bch2_data_update_init():
         */
        bch2_trans_unlock(trans);

        /* write path might have to decompress data: */
        bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
                sectors = max_t(unsigned, sectors, p.crc.uncompressed_size);

        pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
        io = kzalloc(sizeof(struct moving_io) +
                     sizeof(struct bio_vec) * pages, GFP_KERNEL);
        if (!io)
                goto err;

        INIT_LIST_HEAD(&io->io_list);
        io->write.ctxt          = ctxt;
        io->read_sectors        = k.k->size;
        io->write_sectors       = k.k->size;

        bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0);
        bio_set_prio(&io->write.op.wbio.bio,
                     IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));

        if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9,
                                 GFP_KERNEL))
                goto err_free;

        io->rbio.c              = c;
        io->rbio.opts           = io_opts;
        bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0);
        io->rbio.bio.bi_vcnt = pages;
        bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
        io->rbio.bio.bi_iter.bi_size = sectors << 9;

        io->rbio.bio.bi_opf             = REQ_OP_READ;
        io->rbio.bio.bi_iter.bi_sector  = bkey_start_offset(k.k);
        io->rbio.bio.bi_end_io          = move_read_endio;

        ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp,
                                    io_opts, data_opts, iter->btree_id, k);
        if (ret)
                goto err_free_pages;

        io->write.op.end_io = move_write_done;

        if (ctxt->rate)
                bch2_ratelimit_increment(ctxt->rate, k.k->size);

        if (ctxt->stats) {
                atomic64_inc(&ctxt->stats->keys_moved);
                atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
        }

        if (bucket_in_flight) {
                io->b = bucket_in_flight;
                atomic_inc(&io->b->count);
        }

        this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);
        this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size);
        trace_move_extent_read2(c, k);

        mutex_lock(&ctxt->lock);
        atomic_add(io->read_sectors, &ctxt->read_sectors);
        atomic_inc(&ctxt->read_ios);

        list_add_tail(&io->read_list, &ctxt->reads);
        list_add_tail(&io->io_list, &ctxt->ios);
        mutex_unlock(&ctxt->lock);

        /*
         * dropped by move_read_endio() - guards against use after free of
         * ctxt when doing wakeup
         */
        closure_get(&ctxt->cl);
        bch2_read_extent(trans, &io->rbio,
                         bkey_start_pos(k.k),
                         iter->btree_id, k, 0,
                         BCH_READ_NODECODE|
                         BCH_READ_LAST_FRAGMENT);
        return 0;
err_free_pages:
        bio_free_pages(&io->write.op.wbio.bio);
err_free:
        kfree(io);
err:
        if (ret == -BCH_ERR_data_update_done)
                return 0;

        if (bch2_err_matches(ret, EROFS) ||
            bch2_err_matches(ret, BCH_ERR_transaction_restart))
                return ret;

        count_event(c, move_extent_start_fail);

        if (trace_move_extent_start_fail_enabled()) {
                struct printbuf buf = PRINTBUF;

                bch2_bkey_val_to_text(&buf, c, k);
                prt_str(&buf, ": ");
                prt_str(&buf, bch2_err_str(ret));
                trace_move_extent_start_fail(c, buf.buf);
                printbuf_exit(&buf);
        }
        return ret;
}

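/*
 * Look up the io options governing an extent, caching one entry per snapshot
 * of the extent's inode; the cache is rebuilt whenever we move on to a new
 * inode number.
 */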
struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
                                          struct per_snapshot_io_opts *io_opts,
                                          struct bkey_s_c extent_k)
{
        struct bch_fs *c = trans->c;
        u32 restart_count = trans->restart_count;
        int ret = 0;

        if (io_opts->cur_inum != extent_k.k->p.inode) {
                io_opts->d.nr = 0;

                ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode),
                                         BTREE_ITER_all_snapshots, k, ({
                        if (k.k->p.offset != extent_k.k->p.inode)
                                break;

                        if (!bkey_is_inode(k.k))
                                continue;

                        struct bch_inode_unpacked inode;
                        BUG_ON(bch2_inode_unpack(k, &inode));

                        struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot };
                        bch2_inode_opts_get(&e.io_opts, trans->c, &inode);

                        darray_push(&io_opts->d, e);
                }));
                io_opts->cur_inum = extent_k.k->p.inode;
        }

        ret = ret ?: trans_was_restarted(trans, restart_count);
        if (ret)
                return ERR_PTR(ret);

        if (extent_k.k->p.snapshot)
                darray_for_each(io_opts->d, i)
                        if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot))
                                return &i->io_opts;

        return &io_opts->fs_io_opts;
}

int bch2_move_get_io_opts_one(struct btree_trans *trans,
                              struct bch_io_opts *io_opts,
                              struct bkey_s_c extent_k)
{
        struct btree_iter iter;
        struct bkey_s_c k;
        int ret;

        /* reflink btree? */
        if (!extent_k.k->p.inode) {
                *io_opts = bch2_opts_to_inode_opts(trans->c->opts);
                return 0;
        }

        k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
                               SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot),
                               BTREE_ITER_cached);
        ret = bkey_err(k);
        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                return ret;

        if (!ret && bkey_is_inode(k.k)) {
                struct bch_inode_unpacked inode;
                bch2_inode_unpack(k, &inode);
                bch2_inode_opts_get(io_opts, trans->c, &inode);
        } else {
                *io_opts = bch2_opts_to_inode_opts(trans->c->opts);
        }

        bch2_trans_iter_exit(trans, &iter);
        return 0;
}

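/*
 * Throttle the move path: optionally wait for copygc to finish, honour the
 * configured rate limit, and cap the number of IOs and sectors in flight.
 * Returns nonzero if the calling kthread should stop.
 */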
int bch2_move_ratelimit(struct moving_context *ctxt)
{
        struct bch_fs *c = ctxt->trans->c;
        bool is_kthread = current->flags & PF_KTHREAD;
        u64 delay;

        if (ctxt->wait_on_copygc && c->copygc_running) {
                bch2_moving_ctxt_flush_all(ctxt);
                wait_event_killable(c->copygc_running_wq,
                                    !c->copygc_running ||
                                    (is_kthread && kthread_should_stop()));
        }

        do {
                delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0;

                if (is_kthread && kthread_should_stop())
                        return 1;

                if (delay)
                        move_ctxt_wait_event_timeout(ctxt,
                                        freezing(current) ||
                                        (is_kthread && kthread_should_stop()),
                                        delay);

                if (unlikely(freezing(current))) {
                        bch2_moving_ctxt_flush_all(ctxt);
                        try_to_freeze();
                }
        } while (delay);

        /*
         * XXX: these limits really ought to be per device, SSDs and hard drives
         * will want different limits
         */
        move_ctxt_wait_event(ctxt,
                atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 &&
                atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 &&
                atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight &&
                atomic_read(&ctxt->read_ios) < c->opts.move_ios_in_flight);

        return 0;
}

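/*
 * Walk one btree's extents in [start, end), calling @pred to decide what to
 * move and bch2_move_extent() to move it.
 */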
static int bch2_move_data_btree(struct moving_context *ctxt,
                                struct bpos start,
                                struct bpos end,
                                move_pred_fn pred, void *arg,
                                enum btree_id btree_id)
{
        struct btree_trans *trans = ctxt->trans;
        struct bch_fs *c = trans->c;
        struct per_snapshot_io_opts snapshot_io_opts;
        struct bch_io_opts *io_opts;
        struct bkey_buf sk;
        struct btree_iter iter;
        struct bkey_s_c k;
        struct data_update_opts data_opts;
        int ret = 0, ret2;

        per_snapshot_io_opts_init(&snapshot_io_opts, c);
        bch2_bkey_buf_init(&sk);

        if (ctxt->stats) {
                ctxt->stats->data_type  = BCH_DATA_user;
                ctxt->stats->pos        = BBPOS(btree_id, start);
        }

        bch2_trans_begin(trans);
        bch2_trans_iter_init(trans, &iter, btree_id, start,
                             BTREE_ITER_prefetch|
                             BTREE_ITER_all_snapshots);

        if (ctxt->rate)
                bch2_ratelimit_reset(ctxt->rate);

        while (!bch2_move_ratelimit(ctxt)) {
                bch2_trans_begin(trans);

                k = bch2_btree_iter_peek(&iter);
                if (!k.k)
                        break;

                ret = bkey_err(k);
                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        continue;
                if (ret)
                        break;

                if (bkey_ge(bkey_start_pos(k.k), end))
                        break;

                if (ctxt->stats)
                        ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);

                if (!bkey_extent_is_direct_data(k.k))
                        goto next_nondata;

                io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, k);
                ret = PTR_ERR_OR_ZERO(io_opts);
                if (ret)
                        continue;

                memset(&data_opts, 0, sizeof(data_opts));
                if (!pred(c, arg, k, io_opts, &data_opts))
                        goto next;

                /*
                 * The iterator gets unlocked by __bch2_read_extent - need to
                 * save a copy of @k elsewhere:
                 */
                bch2_bkey_buf_reassemble(&sk, c, k);
                k = bkey_i_to_s_c(sk.k);

                ret2 = bch2_move_extent(ctxt, NULL, &iter, k, *io_opts, data_opts);
                if (ret2) {
                        if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
                                continue;

                        if (ret2 == -ENOMEM) {
                                /* memory allocation failure, wait for some IO to finish */
                                bch2_move_ctxt_wait_for_io(ctxt);
                                continue;
                        }

                        /* XXX signal failure */
                        goto next;
                }
next:
                if (ctxt->stats)
                        atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
next_nondata:
                bch2_btree_iter_advance(&iter);
        }

        bch2_trans_iter_exit(trans, &iter);
        bch2_bkey_buf_exit(&sk, c);
        per_snapshot_io_opts_exit(&snapshot_io_opts);

        return ret;
}

int __bch2_move_data(struct moving_context *ctxt,
                     struct bbpos start,
                     struct bbpos end,
                     move_pred_fn pred, void *arg)
{
        struct bch_fs *c = ctxt->trans->c;
        enum btree_id id;
        int ret = 0;

        for (id = start.btree;
             id <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
             id++) {
                ctxt->stats->pos = BBPOS(id, POS_MIN);

                if (!btree_type_has_ptrs(id) ||
                    !bch2_btree_id_root(c, id)->b)
                        continue;

                ret = bch2_move_data_btree(ctxt,
                                           id == start.btree ? start.pos : POS_MIN,
                                           id == end.btree   ? end.pos   : POS_MAX,
                                           pred, arg, id);
                if (ret)
                        break;
        }

        return ret;
}

int bch2_move_data(struct bch_fs *c,
                   struct bbpos start,
                   struct bbpos end,
                   struct bch_ratelimit *rate,
                   struct bch_move_stats *stats,
                   struct write_point_specifier wp,
                   bool wait_on_copygc,
                   move_pred_fn pred, void *arg)
{
        struct moving_context ctxt;
        int ret;

        bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
        ret = __bch2_move_data(&ctxt, start, end, pred, arg);
        bch2_moving_ctxt_exit(&ctxt);

        return ret;
}

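/*
 * Evacuate a single bucket: walk its backpointers, moving every extent and
 * rewriting every btree node that still points into it.
 */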
int bch2_evacuate_bucket(struct moving_context *ctxt,
                         struct move_bucket_in_flight *bucket_in_flight,
                         struct bpos bucket, int gen,
                         struct data_update_opts _data_opts)
{
        struct btree_trans *trans = ctxt->trans;
        struct bch_fs *c = trans->c;
        bool is_kthread = current->flags & PF_KTHREAD;
        struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
        struct btree_iter iter;
        struct bkey_buf sk;
        struct bch_backpointer bp;
        struct bch_alloc_v4 a_convert;
        const struct bch_alloc_v4 *a;
        struct bkey_s_c k;
        struct data_update_opts data_opts;
        unsigned dirty_sectors, bucket_size;
        u64 fragmentation;
        struct bpos bp_pos = POS_MIN;
        int ret = 0;

        struct bch_dev *ca = bch2_dev_tryget(c, bucket.inode);
        if (!ca)
                return 0;

        trace_bucket_evacuate(c, &bucket);

        bch2_bkey_buf_init(&sk);

        /*
         * We're not run in a context that handles transaction restarts:
         */
        bch2_trans_begin(trans);

        bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
                             bucket, BTREE_ITER_cached);
        ret = lockrestart_do(trans,
                        bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
        bch2_trans_iter_exit(trans, &iter);

        bch_err_msg(c, ret, "looking up alloc key");
        if (ret)
                goto err;

        a = bch2_alloc_to_v4(k, &a_convert);
        dirty_sectors = bch2_bucket_sectors_dirty(*a);
        bucket_size = ca->mi.bucket_size;
        fragmentation = a->fragmentation_lru;

        ret = bch2_btree_write_buffer_tryflush(trans);
        bch_err_msg(c, ret, "flushing btree write buffer");
        if (ret)
                goto err;

        while (!(ret = bch2_move_ratelimit(ctxt))) {
                if (is_kthread && kthread_should_stop())
                        break;

                bch2_trans_begin(trans);

                ret = bch2_get_next_backpointer(trans, ca, bucket, gen,
                                                &bp_pos, &bp,
                                                BTREE_ITER_cached);
                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        continue;
                if (ret)
                        goto err;
                if (bkey_eq(bp_pos, POS_MAX))
                        break;

                if (!bp.level) {
                        k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0);
                        ret = bkey_err(k);
                        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                                continue;
                        if (ret)
                                goto err;
                        if (!k.k)
                                goto next;

                        bch2_bkey_buf_reassemble(&sk, c, k);
                        k = bkey_i_to_s_c(sk.k);

                        ret = bch2_move_get_io_opts_one(trans, &io_opts, k);
                        if (ret) {
                                bch2_trans_iter_exit(trans, &iter);
                                continue;
                        }

                        data_opts = _data_opts;
                        data_opts.target        = io_opts.background_target;
                        data_opts.rewrite_ptrs  = 0;

                        unsigned i = 0;
                        bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
                                if (ptr->dev == bucket.inode) {
                                        data_opts.rewrite_ptrs |= 1U << i;
                                        if (ptr->cached) {
                                                bch2_trans_iter_exit(trans, &iter);
                                                goto next;
                                        }
                                }
                                i++;
                        }

                        ret = bch2_move_extent(ctxt, bucket_in_flight,
                                               &iter, k, io_opts, data_opts);
                        bch2_trans_iter_exit(trans, &iter);

                        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                                continue;
                        if (ret == -ENOMEM) {
                                /* memory allocation failure, wait for some IO to finish */
                                bch2_move_ctxt_wait_for_io(ctxt);
                                continue;
                        }
                        if (ret)
                                goto err;

                        if (ctxt->stats)
                                atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
                } else {
                        struct btree *b;

                        b = bch2_backpointer_get_node(trans, &iter, bp_pos, bp);
                        ret = PTR_ERR_OR_ZERO(b);
                        if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
                                continue;
                        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                                continue;
                        if (ret)
                                goto err;
                        if (!b)
                                goto next;

                        unsigned sectors = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key));

                        ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
                        bch2_trans_iter_exit(trans, &iter);

                        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                                continue;
                        if (ret)
                                goto err;

                        if (ctxt->rate)
                                bch2_ratelimit_increment(ctxt->rate, sectors);
                        if (ctxt->stats) {
                                atomic64_add(sectors, &ctxt->stats->sectors_seen);
                                atomic64_add(sectors, &ctxt->stats->sectors_moved);
                        }
                }
next:
                bp_pos = bpos_nosnap_successor(bp_pos);
        }

        trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret);
err:
        bch2_dev_put(ca);
        bch2_bkey_buf_exit(&sk, c);
        return ret;
}

typedef bool (*move_btree_pred)(struct bch_fs *, void *,
                                struct btree *, struct bch_io_opts *,
                                struct data_update_opts *);

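/*
 * Like __bch2_move_data(), but walks whole btree nodes rather than extents,
 * rewriting the nodes that @pred selects.
 */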
static int bch2_move_btree(struct bch_fs *c,
                           struct bbpos start,
                           struct bbpos end,
                           move_btree_pred pred, void *arg,
                           struct bch_move_stats *stats)
{
        bool kthread = (current->flags & PF_KTHREAD) != 0;
        struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
        struct moving_context ctxt;
        struct btree_trans *trans;
        struct btree_iter iter;
        struct btree *b;
        enum btree_id btree;
        struct data_update_opts data_opts;
        int ret = 0;

        bch2_moving_ctxt_init(&ctxt, c, NULL, stats,
                              writepoint_ptr(&c->btree_write_point),
                              true);
        trans = ctxt.trans;

        stats->data_type = BCH_DATA_btree;

        for (btree = start.btree;
             btree <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
             btree++) {
                stats->pos = BBPOS(btree, POS_MIN);

                if (!bch2_btree_id_root(c, btree)->b)
                        continue;

                bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, 0,
                                          BTREE_ITER_prefetch);
retry:
                ret = 0;
                while (bch2_trans_begin(trans),
                       (b = bch2_btree_iter_peek_node(&iter)) &&
                       !(ret = PTR_ERR_OR_ZERO(b))) {
                        if (kthread && kthread_should_stop())
                                break;

                        if ((cmp_int(btree, end.btree) ?:
                             bpos_cmp(b->key.k.p, end.pos)) > 0)
                                break;

                        stats->pos = BBPOS(iter.btree_id, iter.pos);

                        if (!pred(c, arg, b, &io_opts, &data_opts))
                                goto next;

                        ret = bch2_btree_node_rewrite(trans, &iter, b, 0) ?: ret;
                        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                                continue;
                        if (ret)
                                break;
next:
                        bch2_btree_iter_next_node(&iter);
                }
                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        goto retry;

                bch2_trans_iter_exit(trans, &iter);

                if (kthread && kthread_should_stop())
                        break;
        }

        bch_err_fn(c, ret);
        bch2_moving_ctxt_exit(&ctxt);
        bch2_btree_interior_updates_flush(c);

        return ret;
}

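/*
 * Predicates for bch2_data_job(): each inspects a key or btree node and
 * fills in data_update_opts describing what, if anything, should be
 * rewritten.
 */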
static bool rereplicate_pred(struct bch_fs *c, void *arg,
                             struct bkey_s_c k,
                             struct bch_io_opts *io_opts,
                             struct data_update_opts *data_opts)
{
        unsigned nr_good = bch2_bkey_durability(c, k);
        unsigned replicas = bkey_is_btree_ptr(k.k)
                ? c->opts.metadata_replicas
                : io_opts->data_replicas;

        rcu_read_lock();
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        unsigned i = 0;
        bkey_for_each_ptr(ptrs, ptr) {
                struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
                if (!ptr->cached &&
                    (!ca || !ca->mi.durability))
                        data_opts->kill_ptrs |= BIT(i);
                i++;
        }
        rcu_read_unlock();

        if (!data_opts->kill_ptrs &&
            (!nr_good || nr_good >= replicas))
                return false;

        data_opts->target               = 0;
        data_opts->extra_replicas       = replicas - nr_good;
        data_opts->btree_insert_flags   = 0;
        return true;
}

static bool migrate_pred(struct bch_fs *c, void *arg,
                         struct bkey_s_c k,
                         struct bch_io_opts *io_opts,
                         struct data_update_opts *data_opts)
{
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        struct bch_ioctl_data *op = arg;
        unsigned i = 0;

        data_opts->rewrite_ptrs         = 0;
        data_opts->target               = 0;
        data_opts->extra_replicas       = 0;
        data_opts->btree_insert_flags   = 0;

        bkey_for_each_ptr(ptrs, ptr) {
                if (ptr->dev == op->migrate.dev)
                        data_opts->rewrite_ptrs |= 1U << i;
                i++;
        }

        return data_opts->rewrite_ptrs != 0;
}

static bool rereplicate_btree_pred(struct bch_fs *c, void *arg,
                                   struct btree *b,
                                   struct bch_io_opts *io_opts,
                                   struct data_update_opts *data_opts)
{
        return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}

static bool migrate_btree_pred(struct bch_fs *c, void *arg,
                               struct btree *b,
                               struct bch_io_opts *io_opts,
                               struct data_update_opts *data_opts)
{
        return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}

/*
 * Ancient versions of bcachefs produced packed formats which could represent
 * keys that the in memory format cannot represent; this checks for those
 * formats so we can get rid of them.
 */
static bool bformat_needs_redo(struct bkey_format *f)
{
        for (unsigned i = 0; i < f->nr_fields; i++)
                if (bch2_bkey_format_field_overflows(f, i))
                        return true;

        return false;
}

static bool rewrite_old_nodes_pred(struct bch_fs *c, void *arg,
                                   struct btree *b,
                                   struct bch_io_opts *io_opts,
                                   struct data_update_opts *data_opts)
{
        if (b->version_ondisk != c->sb.version ||
            btree_node_need_rewrite(b) ||
            bformat_needs_redo(&b->format)) {
                data_opts->target               = 0;
                data_opts->extra_replicas       = 0;
                data_opts->btree_insert_flags   = 0;
                return true;
        }

        return false;
}

int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
{
        int ret;

        ret = bch2_move_btree(c,
                              BBPOS_MIN,
                              BBPOS_MAX,
                              rewrite_old_nodes_pred, c, stats);
        if (!ret) {
                mutex_lock(&c->sb_lock);
                c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
                c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);
                c->disk_sb.sb->version_min = c->disk_sb.sb->version;
                bch2_write_super(c);
                mutex_unlock(&c->sb_lock);
        }

        bch_err_fn(c, ret);
        return ret;
}

static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg,
                                     struct bkey_s_c k,
                                     struct bch_io_opts *io_opts,
                                     struct data_update_opts *data_opts)
{
        unsigned durability = bch2_bkey_durability(c, k);
        unsigned replicas = bkey_is_btree_ptr(k.k)
                ? c->opts.metadata_replicas
                : io_opts->data_replicas;
        const union bch_extent_entry *entry;
        struct extent_ptr_decoded p;
        unsigned i = 0;

        rcu_read_lock();
        bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) {
                unsigned d = bch2_extent_ptr_durability(c, &p);

                if (d && durability - d >= replicas) {
                        data_opts->kill_ptrs |= BIT(i);
                        durability -= d;
                }

                i++;
        }
        rcu_read_unlock();

        return data_opts->kill_ptrs != 0;
}

static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg,
                                           struct btree *b,
                                           struct bch_io_opts *io_opts,
                                           struct data_update_opts *data_opts)
{
        return drop_extra_replicas_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}

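/*
 * Dispatch a data job described by struct bch_ioctl_data, moving btree
 * and/or user data as the requested operation demands.
 */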
int bch2_data_job(struct bch_fs *c,
                  struct bch_move_stats *stats,
                  struct bch_ioctl_data op)
{
        struct bbpos start      = BBPOS(op.start_btree, op.start_pos);
        struct bbpos end        = BBPOS(op.end_btree, op.end_pos);
        int ret = 0;

        if (op.op >= BCH_DATA_OP_NR)
                return -EINVAL;

        bch2_move_stats_init(stats, bch2_data_ops_strs[op.op]);

        switch (op.op) {
        case BCH_DATA_OP_rereplicate:
                stats->data_type = BCH_DATA_journal;
                ret = bch2_journal_flush_device_pins(&c->journal, -1);
                ret = bch2_move_btree(c, start, end,
                                      rereplicate_btree_pred, c, stats) ?: ret;
                ret = bch2_move_data(c, start, end,
                                     NULL,
                                     stats,
                                     writepoint_hashed((unsigned long) current),
                                     true,
                                     rereplicate_pred, c) ?: ret;
                ret = bch2_replicas_gc2(c) ?: ret;
                break;
        case BCH_DATA_OP_migrate:
                if (op.migrate.dev >= c->sb.nr_devices)
                        return -EINVAL;

                stats->data_type = BCH_DATA_journal;
                ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
                ret = bch2_move_btree(c, start, end,
                                      migrate_btree_pred, &op, stats) ?: ret;
                ret = bch2_move_data(c, start, end,
                                     NULL,
                                     stats,
                                     writepoint_hashed((unsigned long) current),
                                     true,
                                     migrate_pred, &op) ?: ret;
                ret = bch2_replicas_gc2(c) ?: ret;
                break;
        case BCH_DATA_OP_rewrite_old_nodes:
                ret = bch2_scan_old_btree_nodes(c, stats);
                break;
        case BCH_DATA_OP_drop_extra_replicas:
                ret = bch2_move_btree(c, start, end,
                                drop_extra_replicas_btree_pred, c, stats) ?: ret;
                ret = bch2_move_data(c, start, end, NULL, stats,
                                writepoint_hashed((unsigned long) current),
                                true,
                                drop_extra_replicas_pred, c) ?: ret;
                ret = bch2_replicas_gc2(c) ?: ret;
                break;
        default:
                ret = -EINVAL;
        }

        bch2_move_stats_exit(stats, c);
        return ret;
}

void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
{
        prt_printf(out, "%s: data type=", stats->name);
        bch2_prt_data_type(out, stats->data_type);
        prt_str(out, " pos=");
        bch2_bbpos_to_text(out, stats->pos);
        prt_newline(out);
        printbuf_indent_add(out, 2);

        prt_printf(out, "keys moved:  %llu\n",  atomic64_read(&stats->keys_moved));
        prt_printf(out, "keys raced:  %llu\n",  atomic64_read(&stats->keys_raced));
        prt_printf(out, "bytes seen:  ");
        prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9);
        prt_newline(out);

        prt_printf(out, "bytes moved: ");
        prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9);
        prt_newline(out);

        prt_printf(out, "bytes raced: ");
        prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9);
        prt_newline(out);

        printbuf_indent_sub(out, 2);
}

static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt)
{
        struct moving_io *io;

        bch2_move_stats_to_text(out, ctxt->stats);
        printbuf_indent_add(out, 2);

        prt_printf(out, "reads: ios %u/%u sectors %u/%u\n",
                   atomic_read(&ctxt->read_ios),
                   c->opts.move_ios_in_flight,
                   atomic_read(&ctxt->read_sectors),
                   c->opts.move_bytes_in_flight >> 9);

        prt_printf(out, "writes: ios %u/%u sectors %u/%u\n",
                   atomic_read(&ctxt->write_ios),
                   c->opts.move_ios_in_flight,
                   atomic_read(&ctxt->write_sectors),
                   c->opts.move_bytes_in_flight >> 9);

        printbuf_indent_add(out, 2);

        mutex_lock(&ctxt->lock);
        list_for_each_entry(io, &ctxt->ios, io_list)
                bch2_write_op_to_text(out, &io->write.op);
        mutex_unlock(&ctxt->lock);

        printbuf_indent_sub(out, 4);
}

void bch2_fs_moving_ctxts_to_text(struct printbuf *out, struct bch_fs *c)
{
        struct moving_context *ctxt;

        mutex_lock(&c->moving_context_lock);
        list_for_each_entry(ctxt, &c->moving_context_list, list)
                bch2_moving_ctxt_to_text(out, c, ctxt);
        mutex_unlock(&c->moving_context_lock);
}

void bch2_fs_move_init(struct bch_fs *c)
{
        INIT_LIST_HEAD(&c->moving_context_list);
        mutex_init(&c->moving_context_lock);
}
