Linux/fs/bcachefs/super.c

// SPDX-License-Identifier: GPL-2.0
/*
 * bcachefs setup/teardown code, and some metadata io - read a superblock and
 * figure out what to do with it.
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "bkey_sort.h"
#include "btree_cache.h"
#include "btree_gc.h"
#include "btree_journal_iter.h"
#include "btree_key_cache.h"
#include "btree_node_scan.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "btree_write_buffer.h"
#include "buckets_waiting_for_journal.h"
#include "chardev.h"
#include "checksum.h"
#include "clock.h"
#include "compress.h"
#include "debug.h"
#include "disk_accounting.h"
#include "disk_groups.h"
#include "ec.h"
#include "errcode.h"
#include "error.h"
#include "fs.h"
#include "fs-io.h"
#include "fs-io-buffered.h"
#include "fs-io-direct.h"
#include "fsck.h"
#include "inode.h"
#include "io_read.h"
#include "io_write.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
#include "move.h"
#include "migrate.h"
#include "movinggc.h"
#include "nocow_locking.h"
#include "quota.h"
#include "rebalance.h"
#include "recovery.h"
#include "replicas.h"
#include "sb-clean.h"
#include "sb-counters.h"
#include "sb-errors.h"
#include "sb-members.h"
#include "snapshot.h"
#include "subvolume.h"
#include "super.h"
#include "super-io.h"
#include "sysfs.h"
#include "thread_with_file.h"
#include "trace.h"

#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/debugfs.h>
#include <linux/device.h>
#include <linux/idr.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/random.h>
#include <linux/sysfs.h>
#include <crypto/hash.h>

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
MODULE_DESCRIPTION("bcachefs filesystem");
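/*
 * Soft module dependencies: ask the module loader to bring in the checksum
 * and crypto implementations bcachefs may use before this module initializes.
 */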
MODULE_SOFTDEP("pre: crc32c");
MODULE_SOFTDEP("pre: crc64");
MODULE_SOFTDEP("pre: sha256");
MODULE_SOFTDEP("pre: chacha20");
MODULE_SOFTDEP("pre: poly1305");
MODULE_SOFTDEP("pre: xxhash");

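/*
 * Filesystem flag names, generated from the BCH_FS_FLAGS() x-macro (see
 * bcachefs.h); the trailing NULL terminates the array for printing code.
 */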
const char * const bch2_fs_flag_strs[] = {
#define x(n)            #n,
        BCH_FS_FLAGS()
#undef x
        NULL
};

void bch2_print_str(struct bch_fs *c, const char *str)
{
#ifdef __KERNEL__
        struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c);

        if (unlikely(stdio)) {
                bch2_stdio_redirect_printf(stdio, true, "%s", str);
                return;
        }
#endif
        bch2_print_string_as_lines(KERN_ERR, str);
}

__printf(2, 0)
static void bch2_print_maybe_redirect(struct stdio_redirect *stdio, const char *fmt, va_list args)
{
#ifdef __KERNEL__
        if (unlikely(stdio)) {
                if (fmt[0] == KERN_SOH[0])
                        fmt += 2;

                bch2_stdio_redirect_vprintf(stdio, true, fmt, args);
                return;
        }
#endif
        vprintk(fmt, args);
}

void bch2_print_opts(struct bch_opts *opts, const char *fmt, ...)
{
        struct stdio_redirect *stdio = (void *)(unsigned long)opts->stdio;

        va_list args;
        va_start(args, fmt);
        bch2_print_maybe_redirect(stdio, fmt, args);
        va_end(args);
}

void __bch2_print(struct bch_fs *c, const char *fmt, ...)
{
        struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c);

        va_list args;
        va_start(args, fmt);
        bch2_print_maybe_redirect(stdio, fmt, args);
        va_end(args);
}

#define KTYPE(type)                                                     \
static const struct attribute_group type ## _group = {                  \
        .attrs = type ## _files                                         \
};                                                                      \
                                                                        \
static const struct attribute_group *type ## _groups[] = {              \
        &type ## _group,                                                \
        NULL                                                            \
};                                                                      \
                                                                        \
static const struct kobj_type type ## _ktype = {                        \
        .release        = type ## _release,                             \
        .sysfs_ops      = &type ## _sysfs_ops,                          \
        .default_groups = type ## _groups                               \
}
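/*
 * For example, KTYPE(bch2_fs) defines bch2_fs_group, bch2_fs_groups and
 * bch2_fs_ktype, tying the bch2_fs_files attributes and bch2_fs_sysfs_ops
 * (defined in sysfs.c) to the bch2_fs_release() destructor declared below.
 */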

static void bch2_fs_release(struct kobject *);
static void bch2_dev_release(struct kobject *);
static void bch2_fs_counters_release(struct kobject *k)
{
}

static void bch2_fs_internal_release(struct kobject *k)
{
}

static void bch2_fs_opts_dir_release(struct kobject *k)
{
}

static void bch2_fs_time_stats_release(struct kobject *k)
{
}

KTYPE(bch2_fs);
KTYPE(bch2_fs_counters);
KTYPE(bch2_fs_internal);
KTYPE(bch2_fs_opts_dir);
KTYPE(bch2_fs_time_stats);
KTYPE(bch2_dev);

static struct kset *bcachefs_kset;
static LIST_HEAD(bch_fs_list);
static DEFINE_MUTEX(bch_fs_list_lock);

DECLARE_WAIT_QUEUE_HEAD(bch2_read_only_wait);

static void bch2_dev_free(struct bch_dev *);
static int bch2_dev_alloc(struct bch_fs *, unsigned);
static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *);
static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *);

struct bch_fs *bch2_dev_to_fs(dev_t dev)
{
        struct bch_fs *c;

        mutex_lock(&bch_fs_list_lock);
        rcu_read_lock();

        list_for_each_entry(c, &bch_fs_list, list)
                for_each_member_device_rcu(c, ca, NULL)
                        if (ca->disk_sb.bdev && ca->disk_sb.bdev->bd_dev == dev) {
                                closure_get(&c->cl);
                                goto found;
                        }
        c = NULL;
found:
        rcu_read_unlock();
        mutex_unlock(&bch_fs_list_lock);

        return c;
}
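/*
 * Usage sketch (not from this file): on success bch2_dev_to_fs() has taken a
 * ref on the filesystem's closure, which the caller must drop when done:
 *
 *      struct bch_fs *c = bch2_dev_to_fs(dev);
 *      if (c) {
 *              ...
 *              closure_put(&c->cl);
 *      }
 */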

static struct bch_fs *__bch2_uuid_to_fs(__uuid_t uuid)
{
        struct bch_fs *c;

        lockdep_assert_held(&bch_fs_list_lock);

        list_for_each_entry(c, &bch_fs_list, list)
                if (!memcmp(&c->disk_sb.sb->uuid, &uuid, sizeof(uuid)))
                        return c;

        return NULL;
}

struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid)
{
        struct bch_fs *c;

        mutex_lock(&bch_fs_list_lock);
        c = __bch2_uuid_to_fs(uuid);
        if (c)
                closure_get(&c->cl);
        mutex_unlock(&bch_fs_list_lock);

        return c;
}

/* Filesystem RO/RW: */

/*
 * For startup/shutdown of RW stuff, the dependencies are:
 *
 * - foreground writes depend on copygc and rebalance (to free up space)
 *
 * - copygc and rebalance depend on mark and sweep gc (they actually probably
 *   don't because they either reserve ahead of time or don't block if
 *   allocations fail, but allocations can require mark and sweep gc to run
 *   because of generation number wraparound)
 *
 * - all of the above depends on the allocator threads
 *
 * - allocator depends on the journal (when it rewrites prios and gens)
 */
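/*
 * __bch2_fs_read_only() below tears things down in roughly reverse dependency
 * order: erasure coding and open buckets first, then rebalance and copygc,
 * then the journal and finally the allocators.
 */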

static void __bch2_fs_read_only(struct bch_fs *c)
{
        unsigned clean_passes = 0;
        u64 seq = 0;

        bch2_fs_ec_stop(c);
        bch2_open_buckets_stop(c, NULL, true);
        bch2_rebalance_stop(c);
        bch2_copygc_stop(c);
        bch2_fs_ec_flush(c);

        bch_verbose(c, "flushing journal and stopping allocators, journal seq %llu",
                    journal_cur_seq(&c->journal));

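        /*
         * Flushing interior btree updates, journal pins and btree node writes
         * can each generate more work for the others, so keep looping until
         * two consecutive passes find nothing left to flush and the journal
         * sequence number has stopped advancing:
         */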
        do {
                clean_passes++;

                if (bch2_btree_interior_updates_flush(c) ||
                    bch2_journal_flush_all_pins(&c->journal) ||
                    bch2_btree_flush_all_writes(c) ||
                    seq != atomic64_read(&c->journal.seq)) {
                        seq = atomic64_read(&c->journal.seq);
                        clean_passes = 0;
                }
        } while (clean_passes < 2);

        bch_verbose(c, "flushing journal and stopping allocators complete, journal seq %llu",
                    journal_cur_seq(&c->journal));

        if (test_bit(JOURNAL_replay_done, &c->journal.flags) &&
            !test_bit(BCH_FS_emergency_ro, &c->flags))
                set_bit(BCH_FS_clean_shutdown, &c->flags);

        bch2_fs_journal_stop(&c->journal);

        bch_info(c, "%sshutdown complete, journal seq %llu",
                 test_bit(BCH_FS_clean_shutdown, &c->flags) ? "" : "un",
                 c->journal.seq_ondisk);

        /*
         * After stopping the journal, it's safe to tear down the allocators:
         */
        for_each_member_device(c, ca)
                bch2_dev_allocator_remove(c, ca);
}

#ifndef BCH_WRITE_REF_DEBUG
static void bch2_writes_disabled(struct percpu_ref *writes)
{
        struct bch_fs *c = container_of(writes, struct bch_fs, writes);

        set_bit(BCH_FS_write_disable_complete, &c->flags);
        wake_up(&bch2_read_only_wait);
}
#endif

void bch2_fs_read_only(struct bch_fs *c)
{
        if (!test_bit(BCH_FS_rw, &c->flags)) {
                bch2_journal_reclaim_stop(&c->journal);
                return;
        }

        BUG_ON(test_bit(BCH_FS_write_disable_complete, &c->flags));

        bch_verbose(c, "going read-only");

        /*
         * Block new foreground-end write operations from starting - any new
         * writes will return -EROFS:
         */
        set_bit(BCH_FS_going_ro, &c->flags);
#ifndef BCH_WRITE_REF_DEBUG
        percpu_ref_kill(&c->writes);
#else
        for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++)
                bch2_write_ref_put(c, i);
#endif

        /*
         * If we're not doing an emergency shutdown, we want to wait on
         * outstanding writes to complete so they don't see spurious errors due
         * to shutting down the allocator.
         *
         * If we are doing an emergency shutdown, outstanding writes may hang
         * until we shut down the allocator, so we don't want to wait on
         * outstanding writes before shutting everything down - but we do need
         * to wait on them before returning and signalling that going RO is
         * complete:
         */
        wait_event(bch2_read_only_wait,
                   test_bit(BCH_FS_write_disable_complete, &c->flags) ||
                   test_bit(BCH_FS_emergency_ro, &c->flags));

        bool writes_disabled = test_bit(BCH_FS_write_disable_complete, &c->flags);
        if (writes_disabled)
                bch_verbose(c, "finished waiting for writes to stop");

        __bch2_fs_read_only(c);

        wait_event(bch2_read_only_wait,
                   test_bit(BCH_FS_write_disable_complete, &c->flags));

        if (!writes_disabled)
                bch_verbose(c, "finished waiting for writes to stop");

        clear_bit(BCH_FS_write_disable_complete, &c->flags);
        clear_bit(BCH_FS_going_ro, &c->flags);
        clear_bit(BCH_FS_rw, &c->flags);

        if (!bch2_journal_error(&c->journal) &&
            !test_bit(BCH_FS_error, &c->flags) &&
            !test_bit(BCH_FS_emergency_ro, &c->flags) &&
            test_bit(BCH_FS_started, &c->flags) &&
            test_bit(BCH_FS_clean_shutdown, &c->flags) &&
            c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) {
                BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal));
                BUG_ON(atomic_read(&c->btree_cache.dirty));
                BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty));
                BUG_ON(c->btree_write_buffer.inc.keys.nr);
                BUG_ON(c->btree_write_buffer.flushing.keys.nr);
                bch2_verify_accounting_clean(c);

                bch_verbose(c, "marking filesystem clean");
                bch2_fs_mark_clean(c);
        } else {
                bch_verbose(c, "done going read-only, filesystem not clean");
        }
}

static void bch2_fs_read_only_work(struct work_struct *work)
{
        struct bch_fs *c =
                container_of(work, struct bch_fs, read_only_work);

        down_write(&c->state_lock);
        bch2_fs_read_only(c);
        up_write(&c->state_lock);
}

static void bch2_fs_read_only_async(struct bch_fs *c)
{
        queue_work(system_long_wq, &c->read_only_work);
}

bool bch2_fs_emergency_read_only(struct bch_fs *c)
{
        bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags);

        bch2_journal_halt(&c->journal);
        bch2_fs_read_only_async(c);

        wake_up(&bch2_read_only_wait);
        return ret;
}

static int bch2_fs_read_write_late(struct bch_fs *c)
{
        int ret;

        /*
         * Data move operations can't run until after check_snapshots has
         * completed, and bch2_snapshot_is_ancestor() is available.
         *
         * Ideally we'd start copygc/rebalance earlier instead of waiting for
         * all of recovery/fsck to complete:
         */
        ret = bch2_copygc_start(c);
        if (ret) {
                bch_err(c, "error starting copygc thread");
                return ret;
        }

        ret = bch2_rebalance_start(c);
        if (ret) {
                bch_err(c, "error starting rebalance thread");
                return ret;
        }

        return 0;
}

static int __bch2_fs_read_write(struct bch_fs *c, bool early)
{
        int ret;

        if (test_bit(BCH_FS_initial_gc_unfixed, &c->flags)) {
                bch_err(c, "cannot go rw, unfixed btree errors");
                return -BCH_ERR_erofs_unfixed_errors;
        }

        if (test_bit(BCH_FS_rw, &c->flags))
                return 0;

        bch_info(c, "going read-write");

        ret = bch2_sb_members_v2_init(c);
        if (ret)
                goto err;

        ret = bch2_fs_mark_dirty(c);
        if (ret)
                goto err;

        clear_bit(BCH_FS_clean_shutdown, &c->flags);

        /*
         * First journal write must be a flush write: after a clean shutdown we
         * don't read the journal, so the first journal write may end up
         * overwriting whatever was there previously, and there must always be
         * at least one non-flush write in the journal or recovery will fail:
         */
        set_bit(JOURNAL_need_flush_write, &c->journal.flags);
        set_bit(JOURNAL_running, &c->journal.flags);

        for_each_rw_member(c, ca)
                bch2_dev_allocator_add(c, ca);
        bch2_recalc_capacity(c);

        set_bit(BCH_FS_rw, &c->flags);
        set_bit(BCH_FS_was_rw, &c->flags);

#ifndef BCH_WRITE_REF_DEBUG
        percpu_ref_reinit(&c->writes);
#else
        for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) {
                BUG_ON(atomic_long_read(&c->writes[i]));
                atomic_long_inc(&c->writes[i]);
        }
#endif

        ret = bch2_journal_reclaim_start(&c->journal);
        if (ret)
                goto err;

        if (!early) {
                ret = bch2_fs_read_write_late(c);
                if (ret)
                        goto err;
        }

        bch2_do_discards(c);
        bch2_do_invalidates(c);
        bch2_do_stripe_deletes(c);
        bch2_do_pending_node_rewrites(c);
        return 0;
err:
        if (test_bit(BCH_FS_rw, &c->flags))
                bch2_fs_read_only(c);
        else
                __bch2_fs_read_only(c);
        return ret;
}

int bch2_fs_read_write(struct bch_fs *c)
{
        if (c->opts.recovery_pass_last &&
            c->opts.recovery_pass_last < BCH_RECOVERY_PASS_journal_replay)
                return -BCH_ERR_erofs_norecovery;

        if (c->opts.nochanges)
                return -BCH_ERR_erofs_nochanges;

        return __bch2_fs_read_write(c, false);
}

int bch2_fs_read_write_early(struct bch_fs *c)
{
        lockdep_assert_held(&c->state_lock);

        return __bch2_fs_read_write(c, true);
}

/* Filesystem startup/shutdown: */

static void __bch2_fs_free(struct bch_fs *c)
{
        for (unsigned i = 0; i < BCH_TIME_STAT_NR; i++)
                bch2_time_stats_exit(&c->times[i]);

        bch2_find_btree_nodes_exit(&c->found_btree_nodes);
        bch2_free_pending_node_rewrites(c);
        bch2_fs_accounting_exit(c);
        bch2_fs_sb_errors_exit(c);
        bch2_fs_counters_exit(c);
        bch2_fs_snapshots_exit(c);
        bch2_fs_quota_exit(c);
        bch2_fs_fs_io_direct_exit(c);
        bch2_fs_fs_io_buffered_exit(c);
        bch2_fs_fsio_exit(c);
        bch2_fs_ec_exit(c);
        bch2_fs_encryption_exit(c);
        bch2_fs_nocow_locking_exit(c);
        bch2_fs_io_write_exit(c);
        bch2_fs_io_read_exit(c);
        bch2_fs_buckets_waiting_for_journal_exit(c);
        bch2_fs_btree_interior_update_exit(c);
        bch2_fs_btree_key_cache_exit(&c->btree_key_cache);
        bch2_fs_btree_cache_exit(c);
        bch2_fs_btree_iter_exit(c);
        bch2_fs_replicas_exit(c);
        bch2_fs_journal_exit(&c->journal);
        bch2_io_clock_exit(&c->io_clock[WRITE]);
        bch2_io_clock_exit(&c->io_clock[READ]);
        bch2_fs_compress_exit(c);
        bch2_journal_keys_put_initial(c);
        bch2_find_btree_nodes_exit(&c->found_btree_nodes);
        BUG_ON(atomic_read(&c->journal_keys.ref));
        bch2_fs_btree_write_buffer_exit(c);
        percpu_free_rwsem(&c->mark_lock);
        if (c->online_reserved) {
                u64 v = percpu_u64_get(c->online_reserved);
                WARN(v, "online_reserved not 0 at shutdown: %lli", v);
                free_percpu(c->online_reserved);
        }

        darray_exit(&c->btree_roots_extra);
        free_percpu(c->pcpu);
        free_percpu(c->usage);
        mempool_exit(&c->large_bkey_pool);
        mempool_exit(&c->btree_bounce_pool);
        bioset_exit(&c->btree_bio);
        mempool_exit(&c->fill_iter);
#ifndef BCH_WRITE_REF_DEBUG
        percpu_ref_exit(&c->writes);
#endif
        kfree(rcu_dereference_protected(c->disk_groups, 1));
        kfree(c->journal_seq_blacklist_table);
        kfree(c->unused_inode_hints);

        if (c->write_ref_wq)
                destroy_workqueue(c->write_ref_wq);
        if (c->btree_write_submit_wq)
                destroy_workqueue(c->btree_write_submit_wq);
        if (c->btree_read_complete_wq)
                destroy_workqueue(c->btree_read_complete_wq);
        if (c->copygc_wq)
                destroy_workqueue(c->copygc_wq);
        if (c->btree_io_complete_wq)
                destroy_workqueue(c->btree_io_complete_wq);
        if (c->btree_update_wq)
                destroy_workqueue(c->btree_update_wq);

        bch2_free_super(&c->disk_sb);
        kvfree(c);
        module_put(THIS_MODULE);
}

static void bch2_fs_release(struct kobject *kobj)
{
        struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);

        __bch2_fs_free(c);
}

void __bch2_fs_stop(struct bch_fs *c)
{
        bch_verbose(c, "shutting down");

        set_bit(BCH_FS_stopping, &c->flags);

        down_write(&c->state_lock);
        bch2_fs_read_only(c);
        up_write(&c->state_lock);

        for_each_member_device(c, ca)
                if (ca->kobj.state_in_sysfs &&
                    ca->disk_sb.bdev)
                        sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");

        if (c->kobj.state_in_sysfs)
                kobject_del(&c->kobj);

        bch2_fs_debug_exit(c);
        bch2_fs_chardev_exit(c);

        bch2_ro_ref_put(c);
        wait_event(c->ro_ref_wait, !refcount_read(&c->ro_ref));

        kobject_put(&c->counters_kobj);
        kobject_put(&c->time_stats);
        kobject_put(&c->opts_dir);
        kobject_put(&c->internal);

        /* btree prefetch might have kicked off reads in the background: */
        bch2_btree_flush_all_reads(c);

        for_each_member_device(c, ca)
                cancel_work_sync(&ca->io_error_work);

        cancel_work_sync(&c->read_only_work);
}

void bch2_fs_free(struct bch_fs *c)
{
        unsigned i;

        mutex_lock(&bch_fs_list_lock);
        list_del(&c->list);
        mutex_unlock(&bch_fs_list_lock);

        closure_sync(&c->cl);
        closure_debug_destroy(&c->cl);

        for (i = 0; i < c->sb.nr_devices; i++) {
                struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true);

                if (ca) {
                        EBUG_ON(atomic_long_read(&ca->ref) != 1);
                        bch2_free_super(&ca->disk_sb);
                        bch2_dev_free(ca);
                }
        }

        bch_verbose(c, "shutdown complete");

        kobject_put(&c->kobj);
}

void bch2_fs_stop(struct bch_fs *c)
{
        __bch2_fs_stop(c);
        bch2_fs_free(c);
}

static int bch2_fs_online(struct bch_fs *c)
{
        int ret = 0;

        lockdep_assert_held(&bch_fs_list_lock);

        if (__bch2_uuid_to_fs(c->sb.uuid)) {
                bch_err(c, "filesystem UUID already open");
                return -EINVAL;
        }

        ret = bch2_fs_chardev_init(c);
        if (ret) {
                bch_err(c, "error creating character device");
                return ret;
        }

        bch2_fs_debug_init(c);

        ret = kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ?:
            kobject_add(&c->internal, &c->kobj, "internal") ?:
            kobject_add(&c->opts_dir, &c->kobj, "options") ?:
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
            kobject_add(&c->time_stats, &c->kobj, "time_stats") ?:
#endif
            kobject_add(&c->counters_kobj, &c->kobj, "counters") ?:
            bch2_opts_create_sysfs_files(&c->opts_dir);
        if (ret) {
                bch_err(c, "error creating sysfs objects");
                return ret;
        }

        down_write(&c->state_lock);

        for_each_member_device(c, ca) {
                ret = bch2_dev_sysfs_online(c, ca);
                if (ret) {
                        bch_err(c, "error creating sysfs objects");
                        bch2_dev_put(ca);
                        goto err;
                }
        }

        BUG_ON(!list_empty(&c->list));
        list_add(&c->list, &bch_fs_list);
err:
        up_write(&c->state_lock);
        return ret;
}

static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
{
        struct bch_fs *c;
        struct printbuf name = PRINTBUF;
        unsigned i, iter_size;
        int ret = 0;

        c = kvmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO);
        if (!c) {
                c = ERR_PTR(-BCH_ERR_ENOMEM_fs_alloc);
                goto out;
        }

        c->stdio = (void *)(unsigned long) opts.stdio;

        __module_get(THIS_MODULE);

        closure_init(&c->cl, NULL);

        c->kobj.kset = bcachefs_kset;
        kobject_init(&c->kobj, &bch2_fs_ktype);
        kobject_init(&c->internal, &bch2_fs_internal_ktype);
        kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype);
        kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype);
        kobject_init(&c->counters_kobj, &bch2_fs_counters_ktype);

        c->minor                = -1;
        c->disk_sb.fs_sb        = true;

        init_rwsem(&c->state_lock);
        mutex_init(&c->sb_lock);
        mutex_init(&c->replicas_gc_lock);
        mutex_init(&c->btree_root_lock);
        INIT_WORK(&c->read_only_work, bch2_fs_read_only_work);

        refcount_set(&c->ro_ref, 1);
        init_waitqueue_head(&c->ro_ref_wait);
        sema_init(&c->online_fsck_mutex, 1);

        init_rwsem(&c->gc_lock);
        mutex_init(&c->gc_gens_lock);
        atomic_set(&c->journal_keys.ref, 1);
        c->journal_keys.initial_ref_held = true;

        for (i = 0; i < BCH_TIME_STAT_NR; i++)
                bch2_time_stats_init(&c->times[i]);

        bch2_fs_gc_init(c);
        bch2_fs_copygc_init(c);
        bch2_fs_btree_key_cache_init_early(&c->btree_key_cache);
        bch2_fs_btree_iter_init_early(c);
        bch2_fs_btree_interior_update_init_early(c);
        bch2_fs_allocator_background_init(c);
        bch2_fs_allocator_foreground_init(c);
        bch2_fs_rebalance_init(c);
        bch2_fs_quota_init(c);
        bch2_fs_ec_init_early(c);
        bch2_fs_move_init(c);
        bch2_fs_sb_errors_init_early(c);

        INIT_LIST_HEAD(&c->list);

        mutex_init(&c->bio_bounce_pages_lock);
        mutex_init(&c->snapshot_table_lock);
        init_rwsem(&c->snapshot_create_lock);

        spin_lock_init(&c->btree_write_error_lock);

        INIT_LIST_HEAD(&c->journal_iters);

        INIT_LIST_HEAD(&c->fsck_error_msgs);
        mutex_init(&c->fsck_error_msgs_lock);

        seqcount_init(&c->usage_lock);

        sema_init(&c->io_in_flight, 128);

        INIT_LIST_HEAD(&c->vfs_inodes_list);
        mutex_init(&c->vfs_inodes_lock);

        c->copy_gc_enabled              = 1;
        c->rebalance.enabled            = 1;
        c->promote_whole_extents        = true;

        c->journal.flush_write_time     = &c->times[BCH_TIME_journal_flush_write];
        c->journal.noflush_write_time   = &c->times[BCH_TIME_journal_noflush_write];
        c->journal.flush_seq_time       = &c->times[BCH_TIME_journal_flush_seq];

        bch2_fs_btree_cache_init_early(&c->btree_cache);

        mutex_init(&c->sectors_available_lock);

        ret = percpu_init_rwsem(&c->mark_lock);
        if (ret)
                goto err;

        mutex_lock(&c->sb_lock);
        ret = bch2_sb_to_fs(c, sb);
        mutex_unlock(&c->sb_lock);

        if (ret)
                goto err;

        pr_uuid(&name, c->sb.user_uuid.b);
        ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0;
        if (ret)
                goto err;

        strscpy(c->name, name.buf, sizeof(c->name));
        printbuf_exit(&name);

        /* Compat: */
        if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 &&
            !BCH_SB_JOURNAL_FLUSH_DELAY(sb))
                SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000);

        if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 &&
            !BCH_SB_JOURNAL_RECLAIM_DELAY(sb))
                SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 100);

        c->opts = bch2_opts_default;
        ret = bch2_opts_from_sb(&c->opts, sb);
        if (ret)
                goto err;

        bch2_opts_apply(&c->opts, opts);

        c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc;
        if (c->opts.inodes_use_key_cache)
                c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes;
        c->btree_key_cache_btrees |= 1U << BTREE_ID_logged_ops;

        c->block_bits           = ilog2(block_sectors(c));
        c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c);

        if (bch2_fs_init_fault("fs_alloc")) {
                bch_err(c, "fs_alloc fault injected");
                ret = -EFAULT;
                goto err;
        }

        iter_size = sizeof(struct sort_iter) +
                (btree_blocks(c) + 1) * 2 *
                sizeof(struct sort_iter_set);

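        /*
         * One shard of unused_inode_hints per possible CPU, rounded up to a
         * power of two; this sizes the kcalloc() below:
         */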
        c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus()));

        if (!(c->btree_update_wq = alloc_workqueue("bcachefs",
                                WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) ||
            !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io",
                                WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
            !(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
                                WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
            !(c->btree_read_complete_wq = alloc_workqueue("bcachefs_btree_read_complete",
                                WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512)) ||
            !(c->btree_write_submit_wq = alloc_workqueue("bcachefs_btree_write_submit",
                                WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
            !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref",
                                WQ_FREEZABLE, 0)) ||
#ifndef BCH_WRITE_REF_DEBUG
            percpu_ref_init(&c->writes, bch2_writes_disabled,
                            PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
#endif
            mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
            bioset_init(&c->btree_bio, 1,
                        max(offsetof(struct btree_read_bio, bio),
                            offsetof(struct btree_write_bio, wbio.bio)),
                        BIOSET_NEED_BVECS) ||
            !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
            !(c->usage = alloc_percpu(struct bch_fs_usage_base)) ||
            !(c->online_reserved = alloc_percpu(u64)) ||
            mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1,
                                       c->opts.btree_node_size) ||
            mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) ||
            !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits,
                                              sizeof(u64), GFP_KERNEL))) {
                ret = -BCH_ERR_ENOMEM_fs_other_alloc;
                goto err;
        }

        ret = bch2_fs_counters_init(c) ?:
            bch2_fs_sb_errors_init(c) ?:
            bch2_io_clock_init(&c->io_clock[READ]) ?:
            bch2_io_clock_init(&c->io_clock[WRITE]) ?:
            bch2_fs_journal_init(&c->journal) ?:
            bch2_fs_btree_iter_init(c) ?:
            bch2_fs_btree_cache_init(c) ?:
            bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?:
            bch2_fs_btree_interior_update_init(c) ?:
            bch2_fs_buckets_waiting_for_journal_init(c) ?:
            bch2_fs_btree_write_buffer_init(c) ?:
            bch2_fs_subvolumes_init(c) ?:
            bch2_fs_io_read_init(c) ?:
            bch2_fs_io_write_init(c) ?:
            bch2_fs_nocow_locking_init(c) ?:
            bch2_fs_encryption_init(c) ?:
            bch2_fs_compress_init(c) ?:
            bch2_fs_ec_init(c) ?:
            bch2_fs_fsio_init(c) ?:
            bch2_fs_fs_io_buffered_init(c) ?:
            bch2_fs_fs_io_direct_init(c);
        if (ret)
                goto err;

        for (i = 0; i < c->sb.nr_devices; i++) {
                if (!bch2_member_exists(c->disk_sb.sb, i))
                        continue;
                ret = bch2_dev_alloc(c, i);
                if (ret)
                        goto err;
        }

        bch2_journal_entry_res_resize(&c->journal,
                        &c->btree_root_journal_res,
                        BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX));
        bch2_journal_entry_res_resize(&c->journal,
                        &c->clock_journal_res,
                        (sizeof(struct jset_entry_clock) / sizeof(u64)) * 2);

        mutex_lock(&bch_fs_list_lock);
        ret = bch2_fs_online(c);
        mutex_unlock(&bch_fs_list_lock);

        if (ret)
                goto err;
out:
        return c;
err:
        bch2_fs_free(c);
        c = ERR_PTR(ret);
        goto out;
}

noinline_for_stack
static void print_mount_opts(struct bch_fs *c)
{
        enum bch_opt_id i;
        struct printbuf p = PRINTBUF;
        bool first = true;

        prt_str(&p, "starting version ");
        bch2_version_to_text(&p, c->sb.version);

        if (c->opts.read_only) {
                prt_str(&p, " opts=");
                first = false;
                prt_printf(&p, "ro");
        }

        for (i = 0; i < bch2_opts_nr; i++) {
                const struct bch_option *opt = &bch2_opt_table[i];
                u64 v = bch2_opt_get_by_id(&c->opts, i);

                if (!(opt->flags & OPT_MOUNT))
                        continue;

                if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
                        continue;

                prt_str(&p, first ? " opts=" : ",");
                first = false;
                bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE);
        }

        bch_info(c, "%s", p.buf);
        printbuf_exit(&p);
}

int bch2_fs_start(struct bch_fs *c)
{
        time64_t now = ktime_get_real_seconds();
        int ret;

        print_mount_opts(c);

        down_write(&c->state_lock);

        BUG_ON(test_bit(BCH_FS_started, &c->flags));

        mutex_lock(&c->sb_lock);

        ret = bch2_sb_members_v2_init(c);
        if (ret) {
                mutex_unlock(&c->sb_lock);
                goto err;
        }

        for_each_online_member(c, ca)
                bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = cpu_to_le64(now);

        struct bch_sb_field_ext *ext =
                bch2_sb_field_get_minsize(&c->disk_sb, ext, sizeof(*ext) / sizeof(u64));
        mutex_unlock(&c->sb_lock);

        if (!ext) {
                bch_err(c, "insufficient space in superblock for sb_field_ext");
                ret = -BCH_ERR_ENOSPC_sb;
                goto err;
        }

        for_each_rw_member(c, ca)
                bch2_dev_allocator_add(c, ca);
        bch2_recalc_capacity(c);

        ret = BCH_SB_INITIALIZED(c->disk_sb.sb)
                ? bch2_fs_recovery(c)
                : bch2_fs_initialize(c);
        if (ret)
                goto err;

        ret = bch2_opts_check_may_set(c);
        if (ret)
                goto err;

        if (bch2_fs_init_fault("fs_start")) {
                bch_err(c, "fs_start fault injected");
                ret = -EINVAL;
                goto err;
        }

        set_bit(BCH_FS_started, &c->flags);

        if (c->opts.read_only) {
                bch2_fs_read_only(c);
        } else {
                ret = !test_bit(BCH_FS_rw, &c->flags)
                        ? bch2_fs_read_write(c)
                        : bch2_fs_read_write_late(c);
                if (ret)
                        goto err;
        }

        ret = 0;
err:
        if (ret)
                bch_err_msg(c, ret, "starting filesystem");
        else
                bch_verbose(c, "done starting filesystem");
        up_write(&c->state_lock);
        return ret;
}

static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
{
        struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);

        if (le16_to_cpu(sb->block_size) != block_sectors(c))
                return -BCH_ERR_mismatched_block_size;

        if (le16_to_cpu(m.bucket_size) <
            BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb))
                return -BCH_ERR_bucket_size_too_small;

        return 0;
}

static int bch2_dev_in_fs(struct bch_sb_handle *fs,
                          struct bch_sb_handle *sb,
                          struct bch_opts *opts)
{
        if (fs == sb)
                return 0;

        if (!uuid_equal(&fs->sb->uuid, &sb->sb->uuid))
                return -BCH_ERR_device_not_a_member_of_filesystem;

        if (!bch2_member_exists(fs->sb, sb->sb->dev_idx))
                return -BCH_ERR_device_has_been_removed;

        if (fs->sb->block_size != sb->sb->block_size)
                return -BCH_ERR_mismatched_block_size;

        if (le16_to_cpu(fs->sb->version) < bcachefs_metadata_version_member_seq ||
            le16_to_cpu(sb->sb->version) < bcachefs_metadata_version_member_seq)
                return 0;

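        /*
         * Identical sequence numbers with different write times means the two
         * superblocks were last written independently of each other - i.e.
         * split brain:
         */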
        if (fs->sb->seq == sb->sb->seq &&
            fs->sb->write_time != sb->sb->write_time) {
                struct printbuf buf = PRINTBUF;

                prt_str(&buf, "Split brain detected between ");
                prt_bdevname(&buf, sb->bdev);
                prt_str(&buf, " and ");
                prt_bdevname(&buf, fs->bdev);
                prt_char(&buf, ':');
                prt_newline(&buf);
                prt_printf(&buf, "seq=%llu but write_time different, got", le64_to_cpu(sb->sb->seq));
                prt_newline(&buf);

                prt_bdevname(&buf, fs->bdev);
                prt_char(&buf, ' ');
                bch2_prt_datetime(&buf, le64_to_cpu(fs->sb->write_time));
                prt_newline(&buf);

                prt_bdevname(&buf, sb->bdev);
                prt_char(&buf, ' ');
                bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time));
                prt_newline(&buf);

                if (!opts->no_splitbrain_check)
                        prt_printf(&buf, "Not using older sb");

                pr_err("%s", buf.buf);
                printbuf_exit(&buf);

                if (!opts->no_splitbrain_check)
                        return -BCH_ERR_device_splitbrain;
        }

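        /*
         * The filesystem superblock records the last seq it saw from each
         * member; if this device's own seq is newer, the device saw writes
         * the rest of the filesystem did not - also split brain:
         */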
        struct bch_member m = bch2_sb_member_get(fs->sb, sb->sb->dev_idx);
        u64 seq_from_fs         = le64_to_cpu(m.seq);
        u64 seq_from_member     = le64_to_cpu(sb->sb->seq);

        if (seq_from_fs && seq_from_fs < seq_from_member) {
                struct printbuf buf = PRINTBUF;

                prt_str(&buf, "Split brain detected between ");
                prt_bdevname(&buf, sb->bdev);
                prt_str(&buf, " and ");
                prt_bdevname(&buf, fs->bdev);
                prt_char(&buf, ':');
                prt_newline(&buf);

                prt_bdevname(&buf, fs->bdev);
                prt_str(&buf, " believes seq of ");
                prt_bdevname(&buf, sb->bdev);
                prt_printf(&buf, " to be %llu, but ", seq_from_fs);
                prt_bdevname(&buf, sb->bdev);
                prt_printf(&buf, " has %llu\n", seq_from_member);

                if (!opts->no_splitbrain_check) {
                        prt_str(&buf, "Not using ");
                        prt_bdevname(&buf, sb->bdev);
                }

                pr_err("%s", buf.buf);
                printbuf_exit(&buf);

                if (!opts->no_splitbrain_check)
                        return -BCH_ERR_device_splitbrain;
        }

        return 0;
}

/* Device startup/shutdown: */

static void bch2_dev_release(struct kobject *kobj)
{
        struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);

        kfree(ca);
}

static void bch2_dev_free(struct bch_dev *ca)
{
        cancel_work_sync(&ca->io_error_work);

        if (ca->kobj.state_in_sysfs &&
            ca->disk_sb.bdev)
                sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");

        if (ca->kobj.state_in_sysfs)
                kobject_del(&ca->kobj);

        bch2_free_super(&ca->disk_sb);
        bch2_dev_allocator_background_exit(ca);
        bch2_dev_journal_exit(ca);

        free_percpu(ca->io_done);
        bch2_dev_buckets_free(ca);
        free_page((unsigned long) ca->sb_read_scratch);

        bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]);
        bch2_time_stats_quantiles_exit(&ca->io_latency[READ]);

        percpu_ref_exit(&ca->io_ref);
#ifndef CONFIG_BCACHEFS_DEBUG
        percpu_ref_exit(&ca->ref);
#endif
        kobject_put(&ca->kobj);
}

static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca)
{
        lockdep_assert_held(&c->state_lock);

        if (percpu_ref_is_zero(&ca->io_ref))
                return;

        __bch2_dev_read_only(c, ca);

        reinit_completion(&ca->io_ref_completion);
        percpu_ref_kill(&ca->io_ref);
        wait_for_completion(&ca->io_ref_completion);

        if (ca->kobj.state_in_sysfs) {
                sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");
                sysfs_remove_link(&ca->kobj, "block");
        }

        bch2_free_super(&ca->disk_sb);
        bch2_dev_journal_exit(ca);
}

#ifndef CONFIG_BCACHEFS_DEBUG
static void bch2_dev_ref_complete(struct percpu_ref *ref)
{
        struct bch_dev *ca = container_of(ref, struct bch_dev, ref);

        complete(&ca->ref_completion);
}
#endif

static void bch2_dev_io_ref_complete(struct percpu_ref *ref)
{
        struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref);

        complete(&ca->io_ref_completion);
}

static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca)
{
        int ret;

        if (!c->kobj.state_in_sysfs)
                return 0;

        if (!ca->kobj.state_in_sysfs) {
                ret = kobject_add(&ca->kobj, &c->kobj,
                                  "dev-%u", ca->dev_idx);
                if (ret)
                        return ret;
        }

        if (ca->disk_sb.bdev) {
                struct kobject *block = bdev_kobj(ca->disk_sb.bdev);

                ret = sysfs_create_link(block, &ca->kobj, "bcachefs");
                if (ret)
                        return ret;

                ret = sysfs_create_link(&ca->kobj, block, "block");
                if (ret)
                        return ret;
        }

        return 0;
}

static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
                                        struct bch_member *member)
{
        struct bch_dev *ca;
        unsigned i;

        ca = kzalloc(sizeof(*ca), GFP_KERNEL);
        if (!ca)
                return NULL;

        kobject_init(&ca->kobj, &bch2_dev_ktype);
        init_completion(&ca->ref_completion);
        init_completion(&ca->io_ref_completion);

        init_rwsem(&ca->bucket_lock);

        INIT_WORK(&ca->io_error_work, bch2_io_error_work);

        bch2_time_stats_quantiles_init(&ca->io_latency[READ]);
        bch2_time_stats_quantiles_init(&ca->io_latency[WRITE]);

        ca->mi = bch2_mi_to_cpu(member);

        for (i = 0; i < ARRAY_SIZE(member->errors); i++)
                atomic64_set(&ca->errors[i], le64_to_cpu(member->errors[i]));

        ca->uuid = member->uuid;

        ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE,
                             ca->mi.bucket_size / btree_sectors(c));

#ifndef CONFIG_BCACHEFS_DEBUG
        if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, 0, GFP_KERNEL))
                goto err;
#else
        atomic_long_set(&ca->ref, 1);
#endif

        bch2_dev_allocator_background_init(ca);

        if (percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete,
                            PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
            !(ca->sb_read_scratch = (void *) __get_free_page(GFP_KERNEL)) ||
            bch2_dev_buckets_alloc(c, ca) ||
            !(ca->io_done       = alloc_percpu(*ca->io_done)))
                goto err;

        return ca;
err:
        bch2_dev_free(ca);
        return NULL;
}

static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca,
                            unsigned dev_idx)
{
        ca->dev_idx = dev_idx;
        __set_bit(ca->dev_idx, ca->self.d);
        scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx);

        ca->fs = c;
        rcu_assign_pointer(c->devs[ca->dev_idx], ca);

        if (bch2_dev_sysfs_online(c, ca))
                pr_warn("error creating sysfs objects");
}

static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
{
        struct bch_member member = bch2_sb_member_get(c->disk_sb.sb, dev_idx);
        struct bch_dev *ca = NULL;
        int ret = 0;

        if (bch2_fs_init_fault("dev_alloc"))
                goto err;

        ca = __bch2_dev_alloc(c, &member);
        if (!ca)
                goto err;

        ca->fs = c;

        bch2_dev_attach(c, ca, dev_idx);
        return ret;
err:
        if (ca)
                bch2_dev_free(ca);
        return -BCH_ERR_ENOMEM_dev_alloc;
}

static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
{
        int ret;
1375 
1376         if (bch2_dev_is_online(ca)) {
1377                 bch_err(ca, "already have device online in slot %u",
1378                         sb->sb->dev_idx);
1379                 return -BCH_ERR_device_already_online;
1380         }
1381 
1382         if (get_capacity(sb->bdev->bd_disk) <
1383             ca->mi.bucket_size * ca->mi.nbuckets) {
1384                 bch_err(ca, "cannot online: device too small");
1385                 return -BCH_ERR_device_size_too_small;
1386         }
1387 
1388         BUG_ON(!percpu_ref_is_zero(&ca->io_ref));
1389 
1390         ret = bch2_dev_journal_init(ca, sb->sb);
1391         if (ret)
1392                 return ret;
1393 
1394         /* Commit: */
1395         ca->disk_sb = *sb;
1396         memset(sb, 0, sizeof(*sb));
1397 
1398         ca->dev = ca->disk_sb.bdev->bd_dev;
1399 
1400         percpu_ref_reinit(&ca->io_ref);
1401 
1402         return 0;
1403 }
1404 
1405 static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
1406 {
1407         struct bch_dev *ca;
1408         int ret;
1409 
1410         lockdep_assert_held(&c->state_lock);
1411 
1412         if (le64_to_cpu(sb->sb->seq) >
1413             le64_to_cpu(c->disk_sb.sb->seq))
1414                 bch2_sb_to_fs(c, sb->sb);
1415 
1416         BUG_ON(!bch2_dev_exists(c, sb->sb->dev_idx));
1417 
1418         ca = bch2_dev_locked(c, sb->sb->dev_idx);
1419 
1420         ret = __bch2_dev_attach_bdev(ca, sb);
1421         if (ret)
1422                 return ret;
1423 
1424         bch2_dev_sysfs_online(c, ca);
1425 
1426         struct printbuf name = PRINTBUF;
1427         prt_bdevname(&name, ca->disk_sb.bdev);
1428 
1429         if (c->sb.nr_devices == 1)
1430                 strscpy(c->name, name.buf, sizeof(c->name));
1431         strscpy(ca->name, name.buf, sizeof(ca->name));
1432 
1433         printbuf_exit(&name);
1434 
1435         rebalance_wakeup(c);
1436         return 0;
1437 }
1438 
1439 /* Device management: */
1440 
1441 /*
1442  * Note: this function is also used by the error paths - when a particular
1443  * device sees an error, we call it to determine whether we can just set the
1444  * device RO, or - if this function returns false - we'll set the whole
1445  * filesystem RO:
1446  *
1447  * XXX: maybe we should be more explicit about whether we're changing state
1448  * because we got an error or what have you?
1449  */
1450 bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
1451                             enum bch_member_state new_state, int flags)
1452 {
1453         struct bch_devs_mask new_online_devs;
1454         int nr_rw = 0, required;
1455 
1456         lockdep_assert_held(&c->state_lock);
1457 
1458         switch (new_state) {
1459         case BCH_MEMBER_STATE_rw:
1460                 return true;
1461         case BCH_MEMBER_STATE_ro:
1462                 if (ca->mi.state != BCH_MEMBER_STATE_rw)
1463                         return true;
1464 
1465                 /* do we have enough devices to write to?  */
1466                 for_each_member_device(c, ca2)
1467                         if (ca2 != ca)
1468                                 nr_rw += ca2->mi.state == BCH_MEMBER_STATE_rw;
1469 
1470                 required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED)
1471                                ? c->opts.metadata_replicas
1472                                : metadata_replicas_required(c),
1473                                !(flags & BCH_FORCE_IF_DATA_DEGRADED)
1474                                ? c->opts.data_replicas
1475                                : data_replicas_required(c));
1476 
1477                 return nr_rw >= required;
1478         case BCH_MEMBER_STATE_failed:
1479         case BCH_MEMBER_STATE_spare:
1480                 if (ca->mi.state != BCH_MEMBER_STATE_rw &&
1481                     ca->mi.state != BCH_MEMBER_STATE_ro)
1482                         return true;
1483 
1484                 /* do we have enough devices to read from?  */
1485                 new_online_devs = bch2_online_devs(c);
1486                 __clear_bit(ca->dev_idx, new_online_devs.d);
1487 
1488                 return bch2_have_enough_devs(c, new_online_devs, flags, false);
1489         default:
1490                 BUG();
1491         }
1492 }
1493 
1494 static bool bch2_fs_may_start(struct bch_fs *c)
1495 {
1496         struct bch_dev *ca;
1497         unsigned i, flags = 0;
1498 
1499         if (c->opts.very_degraded)
1500                 flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST;
1501 
1502         if (c->opts.degraded)
1503                 flags |= BCH_FORCE_IF_DEGRADED;
1504 
1505         if (!c->opts.degraded &&
1506             !c->opts.very_degraded) {
1507                 mutex_lock(&c->sb_lock);
1508 
1509                 for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
1510                         if (!bch2_member_exists(c->disk_sb.sb, i))
1511                                 continue;
1512 
1513                         ca = bch2_dev_locked(c, i);
1514 
1515                         if (!bch2_dev_is_online(ca) &&
1516                             (ca->mi.state == BCH_MEMBER_STATE_rw ||
1517                              ca->mi.state == BCH_MEMBER_STATE_ro)) {
1518                                 mutex_unlock(&c->sb_lock);
1519                                 return false;
1520                         }
1521                 }
1522                 mutex_unlock(&c->sb_lock);
1523         }
1524 
1525         return bch2_have_enough_devs(c, bch2_online_devs(c), flags, true);
1526 }
1527 
1528 static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
1529 {
1530         /*
1531          * The allocator thread itself allocates btree nodes, so stop it first:
1532          */
1533         bch2_dev_allocator_remove(c, ca);
1534         bch2_recalc_capacity(c);
1535         bch2_dev_journal_stop(&c->journal, ca);
1536 }
1537 
1538 static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
1539 {
1540         lockdep_assert_held(&c->state_lock);
1541 
1542         BUG_ON(ca->mi.state != BCH_MEMBER_STATE_rw);
1543 
1544         bch2_dev_allocator_add(c, ca);
1545         bch2_recalc_capacity(c);
1546         bch2_dev_do_discards(ca);
1547 }
1548 
1549 int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
1550                          enum bch_member_state new_state, int flags)
1551 {
1552         struct bch_member *m;
1553         int ret = 0;
1554 
1555         if (ca->mi.state == new_state)
1556                 return 0;
1557 
1558         if (!bch2_dev_state_allowed(c, ca, new_state, flags))
1559                 return -BCH_ERR_device_state_not_allowed;
1560 
1561         if (new_state != BCH_MEMBER_STATE_rw)
1562                 __bch2_dev_read_only(c, ca);
1563 
1564         bch_notice(ca, "%s", bch2_member_states[new_state]);
1565 
1566         mutex_lock(&c->sb_lock);
1567         m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
1568         SET_BCH_MEMBER_STATE(m, new_state);
1569         bch2_write_super(c);
1570         mutex_unlock(&c->sb_lock);
1571 
1572         if (new_state == BCH_MEMBER_STATE_rw)
1573                 __bch2_dev_read_write(c, ca);
1574 
1575         rebalance_wakeup(c);
1576 
1577         return ret;
1578 }
1579 
1580 int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
1581                        enum bch_member_state new_state, int flags)
1582 {
1583         int ret;
1584 
1585         down_write(&c->state_lock);
1586         ret = __bch2_dev_set_state(c, ca, new_state, flags);
1587         up_write(&c->state_lock);
1588 
1589         return ret;
1590 }
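/*
 * bch2_dev_set_state() follows the usual kernel locking convention: the
 * public function takes state_lock, while the double-underscore helper
 * assumes the lock is already held (hence the lockdep_assert_held() in
 * __bch2_dev_read_write()). The same convention sketched generically with
 * pthreads (not part of super.c; all names hypothetical):
 */
#if 0
#include <pthread.h>

static pthread_rwlock_t state_lock = PTHREAD_RWLOCK_INITIALIZER;
static int dev_state;

/* __set_state: caller must hold state_lock for write */
static int __set_state(int new_state)
{
	if (dev_state == new_state)
		return 0;
	dev_state = new_state;
	return 0;
}

/* set_state: public entry point, takes the lock itself */
static int set_state(int new_state)
{
	pthread_rwlock_wrlock(&state_lock);
	int ret = __set_state(new_state);
	pthread_rwlock_unlock(&state_lock);
	return ret;
}

int main(void)
{
	return set_state(1);
}
#endif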
1591 
1592 /* Device add/removal: */
1593 
1594 static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
1595 {
1596         struct bpos start       = POS(ca->dev_idx, 0);
1597         struct bpos end         = POS(ca->dev_idx, U64_MAX);
1598         int ret;
1599 
1600         /*
1601          * We clear the LRU and need_discard btrees first so that we don't race
1602          * with bch2_do_invalidates() and bch2_do_discards()
1603          */
1604         ret =   bch2_btree_delete_range(c, BTREE_ID_lru, start, end,
1605                                         BTREE_TRIGGER_norun, NULL) ?:
1606                 bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end,
1607                                         BTREE_TRIGGER_norun, NULL) ?:
1608                 bch2_btree_delete_range(c, BTREE_ID_freespace, start, end,
1609                                         BTREE_TRIGGER_norun, NULL) ?:
1610                 bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end,
1611                                         BTREE_TRIGGER_norun, NULL) ?:
1612                 bch2_btree_delete_range(c, BTREE_ID_alloc, start, end,
1613                                         BTREE_TRIGGER_norun, NULL) ?:
1614                 bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end,
1615                                         BTREE_TRIGGER_norun, NULL) ?:
1616                 bch2_dev_usage_remove(c, ca->dev_idx);
1617         bch_err_msg(c, ret, "removing dev alloc info");
1618         return ret;
1619 }
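/*
 * The delete chain above uses GCC's binary "?:" extension: a ?: b
 * evaluates a once, yields it if nonzero, and only evaluates b otherwise.
 * Since these helpers return 0 on success, the first error return
 * short-circuits the rest of the chain. A standalone demonstration (not
 * part of super.c):
 */
#if 0
#include <stdio.h>

static int calls;

static int step(int ret)
{
	calls++;
	return ret;
}

int main(void)
{
	int ret = step(0) ?: step(-5) ?: step(0);

	/* step(-5) failed, so the third step never ran */
	printf("ret=%d calls=%d\n", ret, calls);	/* ret=-5 calls=2 */
	return 0;
}
#endif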
1620 
1621 int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
1622 {
1623         struct bch_member *m;
1624         unsigned dev_idx = ca->dev_idx, data;
1625         int ret;
1626 
1627         down_write(&c->state_lock);
1628 
1629         /*
1630          * We consume a reference to ca->ref, regardless of whether we succeed
1631          * or fail:
1632          */
1633         bch2_dev_put(ca);
1634 
1635         if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
1636                 bch_err(ca, "Cannot remove without losing data");
1637                 ret = -BCH_ERR_device_state_not_allowed;
1638                 goto err;
1639         }
1640 
1641         __bch2_dev_read_only(c, ca);
1642 
1643         ret = bch2_dev_data_drop(c, ca->dev_idx, flags);
1644         bch_err_msg(ca, ret, "bch2_dev_data_drop()");
1645         if (ret)
1646                 goto err;
1647 
1648         ret = bch2_dev_remove_alloc(c, ca);
1649         bch_err_msg(ca, ret, "bch2_dev_remove_alloc()");
1650         if (ret)
1651                 goto err;
1652 
1653         /*
1654          * We need to flush the entire journal to get rid of keys that reference
1655          * the device being removed before removing the superblock entry
1656          */
1657         bch2_journal_flush_all_pins(&c->journal);
1658 
1659         /*
1660          * this is really just needed for the bch2_replicas_gc_(start|end)
1661          * calls, and could be cleaned up:
1662          */
1663         ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
1664         bch_err_msg(ca, ret, "bch2_journal_flush_device_pins()");
1665         if (ret)
1666                 goto err;
1667 
1668         ret = bch2_journal_flush(&c->journal);
1669         bch_err_msg(ca, ret, "bch2_journal_flush()");
1670         if (ret)
1671                 goto err;
1672 
1673         ret = bch2_replicas_gc2(c);
1674         bch_err_msg(ca, ret, "bch2_replicas_gc2()");
1675         if (ret)
1676                 goto err;
1677 
1678         data = bch2_dev_has_data(c, ca);
1679         if (data) {
1680                 struct printbuf data_has = PRINTBUF;
1681 
1682                 prt_bitflags(&data_has, __bch2_data_types, data);
1683                 bch_err(ca, "Remove failed, still has data (%s)", data_has.buf);
1684                 printbuf_exit(&data_has);
1685                 ret = -EBUSY;
1686                 goto err;
1687         }
1688 
1689         __bch2_dev_offline(c, ca);
1690 
1691         mutex_lock(&c->sb_lock);
1692         rcu_assign_pointer(c->devs[ca->dev_idx], NULL);
1693         mutex_unlock(&c->sb_lock);
1694 
1695 #ifndef CONFIG_BCACHEFS_DEBUG
1696         percpu_ref_kill(&ca->ref);
1697 #else
1698         ca->dying = true;
1699         bch2_dev_put(ca);
1700 #endif
1701         wait_for_completion(&ca->ref_completion);
1702 
1703         bch2_dev_free(ca);
1704 
1705         /*
1706          * Free this device's slot in the bch_member array - all pointers to
1707          * this device must be gone:
1708          */
1709         mutex_lock(&c->sb_lock);
1710         m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx);
1711         memset(&m->uuid, 0, sizeof(m->uuid));
1712 
1713         bch2_write_super(c);
1714 
1715         mutex_unlock(&c->sb_lock);
1716         up_write(&c->state_lock);
1717         return 0;
1718 err:
1719         if (ca->mi.state == BCH_MEMBER_STATE_rw &&
1720             !percpu_ref_is_zero(&ca->io_ref))
1721                 __bch2_dev_read_write(c, ca);
1722         up_write(&c->state_lock);
1723         return ret;
1724 }
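/*
 * The teardown above kills the device's reference count and then blocks on
 * ref_completion until the last holder drops its ref; the ref's release
 * hook is what completes ref_completion. (The CONFIG_BCACHEFS_DEBUG
 * variant tracks references individually, hence the explicit dying flag
 * and put.) The shape of that handshake, modeled in userspace with C11
 * atomics (not part of super.c; names hypothetical):
 */
#if 0
#include <pthread.h>
#include <stdatomic.h>

static atomic_int refs = 1;	/* the initial reference */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t ref_completion = PTHREAD_COND_INITIALIZER;

static void ref_put(void)
{
	if (atomic_fetch_sub(&refs, 1) == 1) {	/* last ref dropped */
		pthread_mutex_lock(&lock);
		pthread_cond_signal(&ref_completion);
		pthread_mutex_unlock(&lock);
	}
}

int main(void)
{
	ref_put();			/* "kill": drop the initial ref */

	pthread_mutex_lock(&lock);	/* wait_for_completion() analogue */
	while (atomic_load(&refs))
		pthread_cond_wait(&ref_completion, &lock);
	pthread_mutex_unlock(&lock);
	/* now safe to free the object */
	return 0;
}
#endif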
1725 
1726 /* Add new device to running filesystem: */
1727 int bch2_dev_add(struct bch_fs *c, const char *path)
1728 {
1729         struct bch_opts opts = bch2_opts_empty();
1730         struct bch_sb_handle sb;
1731         struct bch_dev *ca = NULL;
1732         struct bch_sb_field_members_v2 *mi;
1733         struct bch_member dev_mi;
1734         unsigned dev_idx, nr_devices, u64s;
1735         struct printbuf errbuf = PRINTBUF;
1736         struct printbuf label = PRINTBUF;
1737         int ret;
1738 
1739         ret = bch2_read_super(path, &opts, &sb);
1740         bch_err_msg(c, ret, "reading super");
1741         if (ret)
1742                 goto err;
1743 
1744         dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx);
1745 
1746         if (BCH_MEMBER_GROUP(&dev_mi)) {
1747                 bch2_disk_path_to_text_sb(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1);
1748                 if (label.allocation_failure) {
1749                         ret = -ENOMEM;
1750                         goto err;
1751                 }
1752         }
1753 
1754         ret = bch2_dev_may_add(sb.sb, c);
1755         if (ret)
1756                 goto err;
1757 
1758         ca = __bch2_dev_alloc(c, &dev_mi);
1759         if (!ca) {
1760                 ret = -ENOMEM;
1761                 goto err;
1762         }
1763 
1764         ret = __bch2_dev_attach_bdev(ca, &sb);
1765         if (ret)
1766                 goto err;
1767 
1768         ret = bch2_dev_journal_alloc(ca, true);
1769         bch_err_msg(c, ret, "allocating journal");
1770         if (ret)
1771                 goto err;
1772 
1773         down_write(&c->state_lock);
1774         mutex_lock(&c->sb_lock);
1775 
1776         ret = bch2_sb_from_fs(c, ca);
1777         bch_err_msg(c, ret, "setting up new superblock");
1778         if (ret)
1779                 goto err_unlock;
1780 
1781         if (dynamic_fault("bcachefs:add:no_slot"))
1782                 goto no_slot;
1783 
1784         if (c->sb.nr_devices < BCH_SB_MEMBERS_MAX) {
1785                 dev_idx = c->sb.nr_devices;
1786                 goto have_slot;
1787         }
1788 
1789         int best = -1;
1790         u64 best_last_mount = 0;
1791         for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) {
1792                 struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, dev_idx);
1793                 if (bch2_member_alive(&m))
1794                         continue;
1795 
1796                 u64 last_mount = le64_to_cpu(m.last_mount);
1797                 if (best < 0 || last_mount < best_last_mount) {
1798                         best = dev_idx;
1799                         best_last_mount = last_mount;
1800                 }
1801         }
1802         if (best >= 0) {
1803                 dev_idx = best;
1804                 goto have_slot;
1805         }
1806 no_slot:
1807         ret = -BCH_ERR_ENOSPC_sb_members;
1808         bch_err_msg(c, ret, "setting up new superblock");
1809         goto err_unlock;
1810 
1811 have_slot:
1812         nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);
1813 
1814         mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
1815         u64s = DIV_ROUND_UP(sizeof(struct bch_sb_field_members_v2) +
1816                             le16_to_cpu(mi->member_bytes) * nr_devices, sizeof(u64));
1817 
1818         mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s);
1819         if (!mi) {
1820                 ret = -BCH_ERR_ENOSPC_sb_members;
1821                 bch_err_msg(c, ret, "setting up new superblock");
1822                 goto err_unlock;
1823         }
1824         struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx);
1825 
1826         /* success: */
1827 
1828         *m = dev_mi;
1829         m->last_mount = cpu_to_le64(ktime_get_real_seconds());
1830         c->disk_sb.sb->nr_devices       = nr_devices;
1831 
1832         ca->disk_sb.sb->dev_idx = dev_idx;
1833         bch2_dev_attach(c, ca, dev_idx);
1834 
1835         if (BCH_MEMBER_GROUP(&dev_mi)) {
1836                 ret = __bch2_dev_group_set(c, ca, label.buf);
1837                 bch_err_msg(c, ret, "creating new label");
1838                 if (ret)
1839                         goto err_unlock;
1840         }
1841 
1842         bch2_write_super(c);
1843         mutex_unlock(&c->sb_lock);
1844 
1845         ret = bch2_dev_usage_init(ca, false);
1846         if (ret)
1847                 goto err_late;
1848 
1849         ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
1850         bch_err_msg(ca, ret, "marking new superblock");
1851         if (ret)
1852                 goto err_late;
1853 
1854         ret = bch2_fs_freespace_init(c);
1855         bch_err_msg(ca, ret, "initializing free space");
1856         if (ret)
1857                 goto err_late;
1858 
1859         ca->new_fs_bucket_idx = 0;
1860 
1861         if (ca->mi.state == BCH_MEMBER_STATE_rw)
1862                 __bch2_dev_read_write(c, ca);
1863 
1864         up_write(&c->state_lock);
1865         return 0;
1866 
1867 err_unlock:
1868         mutex_unlock(&c->sb_lock);
1869         up_write(&c->state_lock);
1870 err:
1871         if (ca)
1872                 bch2_dev_free(ca);
1873         bch2_free_super(&sb);
1874         printbuf_exit(&label);
1875         printbuf_exit(&errbuf);
1876         bch_err_fn(c, ret);
1877         return ret;
1878 err_late:
1879         up_write(&c->state_lock);
1880         ca = NULL;
1881         goto err;
1882 }
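/*
 * When all BCH_SB_MEMBERS_MAX slots are populated, the loop above recycles
 * the dead member slot with the oldest last_mount, i.e. the entry that has
 * gone unused the longest. That selection in isolation (not part of
 * super.c; the sample data is made up):
 */
#if 0
#include <stdint.h>
#include <stdio.h>

struct member { int alive; uint64_t last_mount; };

int main(void)
{
	struct member members[] = {
		{ 1, 900 }, { 0, 300 }, { 0, 100 }, { 1, 500 },
	};
	int best = -1;
	uint64_t best_last_mount = 0;

	for (int i = 0; i < 4; i++) {
		if (members[i].alive)
			continue;

		if (best < 0 || members[i].last_mount < best_last_mount) {
			best = i;
			best_last_mount = members[i].last_mount;
		}
	}

	printf("reuse slot %d\n", best);	/* slot 2: oldest dead entry */
	return 0;
}
#endif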
1883 
1884 /* Hot add existing device to running filesystem: */
1885 int bch2_dev_online(struct bch_fs *c, const char *path)
1886 {
1887         struct bch_opts opts = bch2_opts_empty();
1888         struct bch_sb_handle sb = { NULL };
1889         struct bch_dev *ca;
1890         unsigned dev_idx;
1891         int ret;
1892 
1893         down_write(&c->state_lock);
1894 
1895         ret = bch2_read_super(path, &opts, &sb);
1896         if (ret) {
1897                 up_write(&c->state_lock);
1898                 return ret;
1899         }
1900 
1901         dev_idx = sb.sb->dev_idx;
1902 
1903         ret = bch2_dev_in_fs(&c->disk_sb, &sb, &c->opts);
1904         bch_err_msg(c, ret, "bringing %s online", path);
1905         if (ret)
1906                 goto err;
1907 
1908         ret = bch2_dev_attach_bdev(c, &sb);
1909         if (ret)
1910                 goto err;
1911 
1912         ca = bch2_dev_locked(c, dev_idx);
1913 
1914         ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
1915         bch_err_msg(c, ret, "bringing %s online: error from bch2_trans_mark_dev_sb", path);
1916         if (ret)
1917                 goto err;
1918 
1919         if (ca->mi.state == BCH_MEMBER_STATE_rw)
1920                 __bch2_dev_read_write(c, ca);
1921 
1922         if (!ca->mi.freespace_initialized) {
1923                 ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
1924                 bch_err_msg(ca, ret, "initializing free space");
1925                 if (ret)
1926                         goto err;
1927         }
1928 
1929         if (!ca->journal.nr) {
1930                 ret = bch2_dev_journal_alloc(ca, false);
1931                 bch_err_msg(ca, ret, "allocating journal");
1932                 if (ret)
1933                         goto err;
1934         }
1935 
1936         mutex_lock(&c->sb_lock);
1937         bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount =
1938                 cpu_to_le64(ktime_get_real_seconds());
1939         bch2_write_super(c);
1940         mutex_unlock(&c->sb_lock);
1941 
1942         up_write(&c->state_lock);
1943         return 0;
1944 err:
1945         up_write(&c->state_lock);
1946         bch2_free_super(&sb);
1947         return ret;
1948 }
1949 
1950 int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
1951 {
1952         down_write(&c->state_lock);
1953 
1954         if (!bch2_dev_is_online(ca)) {
1955                 bch_err(ca, "Already offline");
1956                 up_write(&c->state_lock);
1957                 return 0;
1958         }
1959 
1960         if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
1961                 bch_err(ca, "Cannot offline required disk");
1962                 up_write(&c->state_lock);
1963                 return -BCH_ERR_device_state_not_allowed;
1964         }
1965 
1966         __bch2_dev_offline(c, ca);
1967 
1968         up_write(&c->state_lock);
1969         return 0;
1970 }
1971 
1972 int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
1973 {
1974         struct bch_member *m;
1975         u64 old_nbuckets;
1976         int ret = 0;
1977 
1978         down_write(&c->state_lock);
1979         old_nbuckets = ca->mi.nbuckets;
1980 
1981         if (nbuckets < ca->mi.nbuckets) {
1982                 bch_err(ca, "Cannot shrink yet");
1983                 ret = -EINVAL;
1984                 goto err;
1985         }
1986 
1987         if (nbuckets > BCH_MEMBER_NBUCKETS_MAX) {
1988                 bch_err(ca, "New device size too big (%llu greater than max %u)",
1989                         nbuckets, BCH_MEMBER_NBUCKETS_MAX);
1990                 ret = -BCH_ERR_device_size_too_big;
1991                 goto err;
1992         }
1993 
1994         if (bch2_dev_is_online(ca) &&
1995             get_capacity(ca->disk_sb.bdev->bd_disk) <
1996             ca->mi.bucket_size * nbuckets) {
1997                 bch_err(ca, "New size larger than device");
1998                 ret = -BCH_ERR_device_size_too_small;
1999                 goto err;
2000         }
2001 
2002         ret = bch2_dev_buckets_resize(c, ca, nbuckets);
2003         bch_err_msg(ca, ret, "resizing buckets");
2004         if (ret)
2005                 goto err;
2006 
2007         ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
2008         if (ret)
2009                 goto err;
2010 
2011         mutex_lock(&c->sb_lock);
2012         m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
2013         m->nbuckets = cpu_to_le64(nbuckets);
2014 
2015         bch2_write_super(c);
2016         mutex_unlock(&c->sb_lock);
2017 
2018         if (ca->mi.freespace_initialized) {
2019                 struct disk_accounting_pos acc = {
2020                         .type = BCH_DISK_ACCOUNTING_dev_data_type,
2021                         .dev_data_type.dev = ca->dev_idx,
2022                         .dev_data_type.data_type = BCH_DATA_free,
2023                 };
2024                 u64 v[3] = { nbuckets - old_nbuckets, 0, 0 };
2025 
2026                 ret   = bch2_trans_do(ca->fs, NULL, NULL, 0,
2027                                 bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), false)) ?:
2028                         bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets);
2029                 if (ret)
2030                         goto err;
2031         }
2032 
2033         bch2_recalc_capacity(c);
2034 err:
2035         up_write(&c->state_lock);
2036         return ret;
2037 }
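/*
 * On a grow, bch2_dev_resize() credits the free-bucket accounting counter
 * with the delta and then initializes the freespace btree over just the
 * new range [old_nbuckets, nbuckets). A worked example of that arithmetic
 * (not part of super.c; the counts are made up, and the two zeroed
 * counters are presumably the sector/fragmentation fields):
 */
#if 0
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t old_nbuckets = 1000, nbuckets = 1500;

	/* as in v[] above: every new bucket starts out free */
	uint64_t v[3] = { nbuckets - old_nbuckets, 0, 0 };

	printf("free buckets += %llu, freespace init over [%llu, %llu)\n",
	       (unsigned long long)v[0],
	       (unsigned long long)old_nbuckets,
	       (unsigned long long)nbuckets);
	return 0;
}
#endif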
2038 
2039 /* return with ref on ca->ref: */
2040 struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name)
2041 {
2042         if (!strncmp(name, "/dev/", strlen("/dev/")))
2043                 name += strlen("/dev/");
2044 
2045         for_each_member_device(c, ca)
2046                 if (!strcmp(name, ca->name))
2047                         return ca;
2048         return ERR_PTR(-BCH_ERR_ENOENT_dev_not_found);
2049 }
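/*
 * bch2_dev_lookup() accepts either a bare member name or a /dev/ path; the
 * strncmp() just strips a leading "/dev/" before comparing. That
 * normalization by itself (not part of super.c):
 */
#if 0
#include <stdio.h>
#include <string.h>

static const char *normalize_dev_name(const char *name)
{
	if (!strncmp(name, "/dev/", strlen("/dev/")))
		name += strlen("/dev/");
	return name;
}

int main(void)
{
	printf("%s %s\n", normalize_dev_name("/dev/sda"),
	       normalize_dev_name("sdb"));	/* prints: sda sdb */
	return 0;
}
#endif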
2050 
2051 /* Filesystem open: */
2052 
2053 static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r)
2054 {
2055         return  cmp_int(le64_to_cpu(l->seq), le64_to_cpu(r->seq)) ?:
2056                 cmp_int(le64_to_cpu(l->write_time), le64_to_cpu(r->write_time));
2057 }
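/*
 * sb_cmp() gives a total order on superblock freshness: seq is the primary
 * key and write_time only breaks ties, chained with the same binary "?:"
 * trick (a nonzero first comparison wins). The idiom generically, with
 * cmp_int() re-implemented for the sketch (not part of super.c):
 */
#if 0
#include <stdio.h>

#define cmp_int(l, r)	(((l) > (r)) - ((l) < (r)))	/* -1, 0 or 1 */

struct sb { unsigned long long seq, write_time; };

static int sb_cmp(struct sb l, struct sb r)
{
	return cmp_int(l.seq, r.seq) ?: cmp_int(l.write_time, r.write_time);
}

int main(void)
{
	struct sb a = { 10, 100 }, b = { 10, 200 }, c = { 11, 50 };

	printf("%d %d\n", sb_cmp(a, b), sb_cmp(c, a));	/* -1 1 */
	return 0;
}
#endif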
2058 
2059 struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
2060                             struct bch_opts opts)
2061 {
2062         DARRAY(struct bch_sb_handle) sbs = { 0 };
2063         struct bch_fs *c = NULL;
2064         struct bch_sb_handle *best = NULL;
2065         struct printbuf errbuf = PRINTBUF;
2066         int ret = 0;
2067 
2068         if (!try_module_get(THIS_MODULE))
2069                 return ERR_PTR(-ENODEV);
2070 
2071         if (!nr_devices) {
2072                 ret = -EINVAL;
2073                 goto err;
2074         }
2075 
2076         ret = darray_make_room(&sbs, nr_devices);
2077         if (ret)
2078                 goto err;
2079 
2080         for (unsigned i = 0; i < nr_devices; i++) {
2081                 struct bch_sb_handle sb = { NULL };
2082 
2083                 ret = bch2_read_super(devices[i], &opts, &sb);
2084                 if (ret)
2085                         goto err;
2086 
2087                 BUG_ON(darray_push(&sbs, sb));
2088         }
2089 
2090         if (opts.nochanges && !opts.read_only) {
2091                 ret = -BCH_ERR_erofs_nochanges;
2092                 goto err_print;
2093         }
2094 
2095         darray_for_each(sbs, sb)
2096                 if (!best || sb_cmp(sb->sb, best->sb) > 0)
2097                         best = sb;
2098 
2099         darray_for_each_reverse(sbs, sb) {
2100                 ret = bch2_dev_in_fs(best, sb, &opts);
2101 
2102                 if (ret == -BCH_ERR_device_has_been_removed ||
2103                     ret == -BCH_ERR_device_splitbrain) {
2104                         bch2_free_super(sb);
2105                         darray_remove_item(&sbs, sb);
2106                         best -= best > sb;
2107                         ret = 0;
2108                         continue;
2109                 }
2110 
2111                 if (ret)
2112                         goto err_print;
2113         }
2114 
2115         c = bch2_fs_alloc(best->sb, opts);
2116         ret = PTR_ERR_OR_ZERO(c);
2117         if (ret)
2118                 goto err;
2119 
2120         down_write(&c->state_lock);
2121         darray_for_each(sbs, sb) {
2122                 ret = bch2_dev_attach_bdev(c, sb);
2123                 if (ret) {
2124                         up_write(&c->state_lock);
2125                         goto err;
2126                 }
2127         }
2128         up_write(&c->state_lock);
2129 
2130         if (!bch2_fs_may_start(c)) {
2131                 ret = -BCH_ERR_insufficient_devices_to_start;
2132                 goto err_print;
2133         }
2134 
2135         if (!c->opts.nostart) {
2136                 ret = bch2_fs_start(c);
2137                 if (ret)
2138                         goto err;
2139         }
2140 out:
2141         darray_for_each(sbs, sb)
2142                 bch2_free_super(sb);
2143         darray_exit(&sbs);
2144         printbuf_exit(&errbuf);
2145         module_put(THIS_MODULE);
2146         return c;
2147 err_print:
2148         pr_err("bch_fs_open err opening %s: %s",
2149                devices[0], bch2_err_str(ret));
2150 err:
2151         if (!IS_ERR_OR_NULL(c))
2152                 bch2_fs_stop(c);
2153         c = ERR_PTR(ret);
2154         goto out;
2155 }
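/*
 * One subtlety in the open loop above: darray_remove_item() shifts the
 * array tail down one slot, so the saved best pointer must also slide down
 * if it pointed past the removed element; that is all "best -= best > sb;"
 * does. The same fixup on a plain array (not part of super.c):
 */
#if 0
#include <stdio.h>
#include <string.h>

int main(void)
{
	int a[5] = { 10, 20, 30, 40, 50 };
	int n = 5;
	int *best = &a[3];	/* points at 40 */
	int *sb = &a[1];	/* remove the 20 */

	/* remove *sb, shifting the tail left by one */
	memmove(sb, sb + 1, (--n - (sb - a)) * sizeof(*a));

	/* best pointed past the removed slot, so it moved down one */
	best -= best > sb;

	printf("%d\n", *best);	/* still 40 */
	return 0;
}
#endif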
2156 
2157 /* Global interfaces/init */
2158 
2159 static void bcachefs_exit(void)
2160 {
2161         bch2_debug_exit();
2162         bch2_vfs_exit();
2163         bch2_chardev_exit();
2164         bch2_btree_key_cache_exit();
2165         if (bcachefs_kset)
2166                 kset_unregister(bcachefs_kset);
2167 }
2168 
2169 static int __init bcachefs_init(void)
2170 {
2171         bch2_bkey_pack_test();
2172 
2173         if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) ||
2174             bch2_btree_key_cache_init() ||
2175             bch2_chardev_init() ||
2176             bch2_vfs_init() ||
2177             bch2_debug_init())
2178                 goto err;
2179 
2180         return 0;
2181 err:
2182         bcachefs_exit();
2183         return -ENOMEM;
2184 }
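/*
 * bcachefs_init() chains its setup steps with ||, so the first failure
 * short-circuits straight to err, and recovery is simply bcachefs_exit():
 * that works because every teardown helper tolerates never-initialized
 * state (note the "if (bcachefs_kset)" guard above). A minimal model of
 * the idiom (not part of super.c; names hypothetical):
 */
#if 0
#include <stdio.h>
#include <stdlib.h>

static void *res_a, *res_b;

static int init_a(void) { return (res_a = malloc(16)) ? 0 : -1; }
static int init_b(void) { return (res_b = malloc(16)) ? 0 : -1; }

static void mod_exit(void)
{
	/* free(NULL) is a no-op, so partial init is fine */
	free(res_b);
	free(res_a);
}

static int mod_init(void)
{
	if (init_a() ||
	    init_b())
		goto err;
	return 0;
err:
	mod_exit();
	return -1;
}

int main(void)
{
	int ret = mod_init();

	if (!ret)
		mod_exit();
	printf("%d\n", ret);
	return 0;
}
#endif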
2185 
2186 #define BCH_DEBUG_PARAM(name, description)                      \
2187         bool bch2_##name;                                       \
2188         module_param_named(name, bch2_##name, bool, 0644);      \
2189         MODULE_PARM_DESC(name, description);
2190 BCH_DEBUG_PARAMS()
2191 #undef BCH_DEBUG_PARAM
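/*
 * The block above is an X-macro: BCH_DEBUG_PARAMS(), defined elsewhere as
 * a list of BCH_DEBUG_PARAM(name, description) entries, is expanded here
 * with a definition that stamps out a bool plus module-parameter glue per
 * debug knob. The pattern in miniature (not part of super.c; the parameter
 * names are made up):
 */
#if 0
#include <stdbool.h>
#include <stdio.h>

/* the list lives in one place... (entries hypothetical) */
#define DEBUG_PARAMS()			\
	DEBUG_PARAM(expensive_checks)	\
	DEBUG_PARAM(verify_writes)

/* ...and each expansion site supplies its own DEBUG_PARAM definition */
#define DEBUG_PARAM(name) bool opt_##name;
DEBUG_PARAMS()
#undef DEBUG_PARAM

int main(void)
{
	opt_expensive_checks = true;

#define DEBUG_PARAM(name) printf(#name "=%d\n", opt_##name);
	DEBUG_PARAMS()
#undef DEBUG_PARAM
	return 0;
}
#endif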
2192 
2193 __maybe_unused
2194 static unsigned bch2_metadata_version = bcachefs_metadata_version_current;
2195 module_param_named(version, bch2_metadata_version, uint, 0400);
2196 
2197 module_exit(bcachefs_exit);
2198 module_init(bcachefs_init);
2199 
