TOMOYO Linux Cross Reference
Linux/fs/bcachefs/btree_update_interior.h


/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_BTREE_UPDATE_INTERIOR_H
#define _BCACHEFS_BTREE_UPDATE_INTERIOR_H

#include "btree_cache.h"
#include "btree_locking.h"
#include "btree_update.h"

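/*
 * Bounds (as used throughout this file) on how many nodes a single interior
 * update may allocate or free, and on the journal space, in u64s, reserved
 * for their btree pointer keys:
 */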
#define BTREE_UPDATE_NODES_MAX          ((BTREE_MAX_DEPTH - 2) * 2 + GC_MERGE_NODES)

#define BTREE_UPDATE_JOURNAL_RES        (BTREE_UPDATE_NODES_MAX * (BKEY_BTREE_PTR_U64s_MAX + 1))

int bch2_btree_node_check_topology(struct btree_trans *, struct btree *);

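/*
 * x-macro list of update modes; expanded below into BTREE_UPDATE_none,
 * BTREE_UPDATE_node, BTREE_UPDATE_root and BTREE_UPDATE_update:
 */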
#define BTREE_UPDATE_MODES()    \
        x(none)                 \
        x(node)                 \
        x(root)                 \
        x(update)

enum btree_update_mode {
#define x(n)    BTREE_UPDATE_##n,
        BTREE_UPDATE_MODES()
#undef x
};

/*
 * Tracks an in progress split/rewrite of a btree node and the update to the
 * parent node:
 *
 * When we split/rewrite a node, we do all the updates in memory without
 * waiting for any writes to complete - we allocate the new node(s) and update
 * the parent node, possibly recursively up to the root.
 *
 * The end result is that we have one or more new nodes being written -
 * possibly several, if there were multiple splits - and then a write (updating
 * an interior node) which will make all these new nodes visible.
 *
 * Additionally, as we split/rewrite nodes we free the old nodes - but the old
 * nodes can't be freed (their space on disk can't be reclaimed) until the
 * update to the interior node that makes the new node visible completes -
 * until then, the old nodes are still reachable on disk.
 *
 */
struct btree_update {
        struct closure                  cl;
        struct bch_fs                   *c;
        u64                             start_time;
        unsigned long                   ip_started;

        struct list_head                list;
        struct list_head                unwritten_list;

        enum btree_update_mode          mode;
        enum bch_trans_commit_flags     flags;
        unsigned                        nodes_written:1;
        unsigned                        took_gc_lock:1;

        enum btree_id                   btree_id;
        unsigned                        update_level_start;
        unsigned                        update_level_end;

        struct disk_reservation         disk_res;

        /*
         * BTREE_UPDATE_node:
         * The update that made the new nodes visible was a regular update to an
         * existing interior node - @b. We can't write out the update to @b
         * until the new nodes we created are finished writing, so we block @b
         * from writing by putting this btree_interior update on the
         * @b->write_blocked list with @write_blocked_list:
         */
        struct btree                    *b;
        struct list_head                write_blocked_list;

        /*
         * We may be freeing nodes that were dirty, and thus had journal entries
         * pinned: we need to transfer the oldest of those pins to the
         * btree_update operation, and release it when the new node(s)
         * are all persistent and reachable:
         */
        struct journal_entry_pin        journal;

        /* Preallocated nodes we reserve when we start the update: */
        struct prealloc_nodes {
                struct btree            *b[BTREE_UPDATE_NODES_MAX];
                unsigned                nr;
        }                               prealloc_nodes[2];

        /* Nodes being freed: */
        struct keylist                  old_keys;
        u64                             _old_keys[BTREE_UPDATE_NODES_MAX *
                                                  BKEY_BTREE_PTR_U64s_MAX];

        /* Nodes being added: */
        struct keylist                  new_keys;
        u64                             _new_keys[BTREE_UPDATE_NODES_MAX *
                                                  BKEY_BTREE_PTR_U64s_MAX];

        /* New nodes, that will be made reachable by this update: */
        struct btree                    *new_nodes[BTREE_UPDATE_NODES_MAX];
        unsigned                        nr_new_nodes;

        struct btree                    *old_nodes[BTREE_UPDATE_NODES_MAX];
        __le64                          old_nodes_seq[BTREE_UPDATE_NODES_MAX];
        unsigned                        nr_old_nodes;

        open_bucket_idx_t               open_buckets[BTREE_UPDATE_NODES_MAX *
                                                     BCH_REPLICAS_MAX];
        open_bucket_idx_t               nr_open_buckets;

        unsigned                        journal_u64s;
        u64                             journal_entries[BTREE_UPDATE_JOURNAL_RES];

        /* Only here to reduce stack usage on recursive splits: */
        struct keylist                  parent_keys;
        /*
         * Enough room for btree_split's keys without realloc - btree node
         * pointers never have crc/compression info, so we only need to account
         * for the pointers for three keys
         */
        u64                             inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3];
};

struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
                                                  struct btree_trans *,
                                                  struct btree *,
                                                  struct bkey_format);

int bch2_btree_split_leaf(struct btree_trans *, btree_path_idx_t, unsigned);

int bch2_btree_increase_depth(struct btree_trans *, btree_path_idx_t, unsigned);

int __bch2_foreground_maybe_merge(struct btree_trans *, btree_path_idx_t,
                                  unsigned, unsigned, enum btree_node_sibling);

static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans,
                                        btree_path_idx_t path_idx,
                                        unsigned level, unsigned flags,
                                        enum btree_node_sibling sib)
{
        struct btree_path *path = trans->paths + path_idx;
        struct btree *b;

        EBUG_ON(!btree_node_locked(path, level));

        if (bch2_btree_node_merging_disabled)
                return 0;

        b = path->l[level].b;
        if (b->sib_u64s[sib] > trans->c->btree_foreground_merge_threshold)
                return 0;

        return __bch2_foreground_maybe_merge(trans, path_idx, level, flags, sib);
}

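/*
 * Try merging with the previous sibling, then the next; the ?: chain stops
 * at the first nonzero (error) return:
 */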
static inline int bch2_foreground_maybe_merge(struct btree_trans *trans,
                                              btree_path_idx_t path,
                                              unsigned level,
                                              unsigned flags)
{
        return  bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
                                                    btree_prev_sib) ?:
                bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
                                                    btree_next_sib);
}

int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *,
                            struct btree *, unsigned);
void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *);
int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *,
                               struct btree *, struct bkey_i *,
                               unsigned, bool);
int bch2_btree_node_update_key_get_iter(struct btree_trans *, struct btree *,
                                        struct bkey_i *, unsigned, bool);

void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *);

int bch2_btree_root_alloc_fake_trans(struct btree_trans *, enum btree_id, unsigned);
void bch2_btree_root_alloc_fake(struct bch_fs *, enum btree_id, unsigned);

static inline unsigned btree_update_reserve_required(struct bch_fs *c,
                                                     struct btree *b)
{
        unsigned depth = btree_node_root(c, b)->c.level + 1;

        /*
         * Number of nodes we might have to allocate in a worst case btree
         * split operation - we split all the way up to the root, then allocate
         * a new root, unless we're already at max depth:
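         *
         * e.g. splitting a leaf (level 0) with the root at level 2: two new
         * nodes at each of levels 0-2, plus a new root = (3 - 0) * 2 + 1 = 7.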
         */
        if (depth < BTREE_MAX_DEPTH)
                return (depth - b->c.level) * 2 + 1;
        else
                return (depth - b->c.level) * 2 - 1;
}

static inline void btree_node_reset_sib_u64s(struct btree *b)
{
        b->sib_u64s[0] = b->nr.live_u64s;
        b->sib_u64s[1] = b->nr.live_u64s;
}

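/*
 * Unwritten whiteouts accumulate at the very end of the node buffer, growing
 * down from btree_data_end():
 */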
static inline void *btree_data_end(struct btree *b)
{
        return (void *) b->data + btree_buf_bytes(b);
}

static inline struct bkey_packed *unwritten_whiteouts_start(struct btree *b)
{
        return (void *) ((u64 *) btree_data_end(b) - b->whiteout_u64s);
}

static inline struct bkey_packed *unwritten_whiteouts_end(struct btree *b)
{
        return btree_data_end(b);
}

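/*
 * b->written is in 512-byte sectors, so write_block() points at the first
 * block not yet written out; addresses below it lie in bsets that have
 * already been written:
 */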
static inline void *write_block(struct btree *b)
{
        return (void *) b->data + (b->written << 9);
}

static inline bool __btree_addr_written(struct btree *b, void *p)
{
        return p < write_block(b);
}

static inline bool bset_written(struct btree *b, struct bset *i)
{
        return __btree_addr_written(b, i);
}

static inline bool bkey_written(struct btree *b, struct bkey_packed *k)
{
        return __btree_addr_written(b, k);
}

static inline ssize_t __bch2_btree_u64s_remaining(struct btree *b, void *end)
{
        ssize_t used = bset_byte_offset(b, end) / sizeof(u64) +
                b->whiteout_u64s;
        ssize_t total = btree_buf_bytes(b) >> 3;

        /* Always leave one extra u64 for bch2_varint_decode: */
        used++;

        return total - used;
}

static inline size_t bch2_btree_keys_u64s_remaining(struct btree *b)
{
        ssize_t remaining = __bch2_btree_u64s_remaining(b,
                                btree_bkey_last(b, bset_tree_last(b)));

        BUG_ON(remaining < 0);

        if (bset_written(b, btree_bset_last(b)))
                return 0;

        return remaining;
}

#define BTREE_WRITE_SET_U64s_BITS       9

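/* currently 8 << 9 = 4096 bytes: */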
static inline unsigned btree_write_set_buffer(struct btree *b)
{
        /*
         * Could buffer up larger amounts of keys for btrees with larger keys,
         * pending benchmarking:
         */
        return 8 << BTREE_WRITE_SET_U64s_BITS;
}

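/*
 * Returns where a new bset (btree_node_entry) should be started, if the
 * current one has already been written out or has grown past the write set
 * buffer size and there is still room in the node; NULL otherwise:
 */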
static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, struct btree *b)
{
        struct bset_tree *t = bset_tree_last(b);
        struct btree_node_entry *bne = max(write_block(b),
                        (void *) btree_bkey_last(b, bset_tree_last(b)));
        ssize_t remaining_space =
                __bch2_btree_u64s_remaining(b, bne->keys.start);

        if (unlikely(bset_written(b, bset(b, t)))) {
                if (remaining_space > (ssize_t) (block_bytes(c) >> 3))
                        return bne;
        } else {
                if (unlikely(bset_u64s(t) * sizeof(u64) > btree_write_set_buffer(b)) &&
                    remaining_space > (ssize_t) (btree_write_set_buffer(b) >> 3))
                        return bne;
        }

        return NULL;
}

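/*
 * Append a whiteout key for @pos to the unwritten whiteouts area at the end
 * of the node buffer:
 */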
static inline void push_whiteout(struct btree *b, struct bpos pos)
{
        struct bkey_packed k;

        BUG_ON(bch2_btree_keys_u64s_remaining(b) < BKEY_U64s);
        EBUG_ON(btree_node_just_written(b));

        if (!bkey_pack_pos(&k, pos, b)) {
                struct bkey *u = (void *) &k;

                bkey_init(u);
                u->p = pos;
        }

        k.needs_whiteout = true;

        b->whiteout_u64s += k.u64s;
        bkey_p_copy(unwritten_whiteouts_start(b), &k);
}

/*
 * write lock must be held on @b (else the dirty bset that we were going to
 * insert into could be written out from under us)
 */
static inline bool bch2_btree_node_insert_fits(struct btree *b, unsigned u64s)
{
        if (unlikely(btree_node_need_rewrite(b)))
                return false;

        return u64s <= bch2_btree_keys_u64s_remaining(b);
}

void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *);

bool bch2_btree_interior_updates_flush(struct bch_fs *);

void bch2_journal_entry_to_btree_root(struct bch_fs *, struct jset_entry *);
struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *,
                                        struct jset_entry *, unsigned long);

void bch2_do_pending_node_rewrites(struct bch_fs *);
void bch2_free_pending_node_rewrites(struct bch_fs *);

void bch2_btree_reserve_cache_to_text(struct printbuf *, struct bch_fs *);

void bch2_fs_btree_interior_update_exit(struct bch_fs *);
void bch2_fs_btree_interior_update_init_early(struct bch_fs *);
int bch2_fs_btree_interior_update_init(struct bch_fs *);

#endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */
