~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

TOMOYO Linux Cross Reference
Linux/fs/bcachefs/io_write.c

Version: ~ [ linux-6.12-rc7 ] ~ [ linux-6.11.7 ] ~ [ linux-6.10.14 ] ~ [ linux-6.9.12 ] ~ [ linux-6.8.12 ] ~ [ linux-6.7.12 ] ~ [ linux-6.6.60 ] ~ [ linux-6.5.13 ] ~ [ linux-6.4.16 ] ~ [ linux-6.3.13 ] ~ [ linux-6.2.16 ] ~ [ linux-6.1.116 ] ~ [ linux-6.0.19 ] ~ [ linux-5.19.17 ] ~ [ linux-5.18.19 ] ~ [ linux-5.17.15 ] ~ [ linux-5.16.20 ] ~ [ linux-5.15.171 ] ~ [ linux-5.14.21 ] ~ [ linux-5.13.19 ] ~ [ linux-5.12.19 ] ~ [ linux-5.11.22 ] ~ [ linux-5.10.229 ] ~ [ linux-5.9.16 ] ~ [ linux-5.8.18 ] ~ [ linux-5.7.19 ] ~ [ linux-5.6.19 ] ~ [ linux-5.5.19 ] ~ [ linux-5.4.285 ] ~ [ linux-5.3.18 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.323 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.336 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.337 ] ~ [ linux-4.4.302 ] ~ [ linux-3.10.108 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.12 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

Diff markup

Differences between /fs/bcachefs/io_write.c (Version linux-6.12-rc7) and /fs/bcachefs/io_write.c (Version linux-6.5.13)


  1 // SPDX-License-Identifier: GPL-2.0                 1 
  2 /*                                                
  3  * Copyright 2010, 2011 Kent Overstreet <kent.    
  4  * Copyright 2012 Google, Inc.                    
  5  */                                               
  6                                                   
  7 #include "bcachefs.h"                             
  8 #include "alloc_foreground.h"                     
  9 #include "bkey_buf.h"                             
 10 #include "bset.h"                                 
 11 #include "btree_update.h"                         
 12 #include "buckets.h"                              
 13 #include "checksum.h"                             
 14 #include "clock.h"                                
 15 #include "compress.h"                             
 16 #include "debug.h"                                
 17 #include "ec.h"                                   
 18 #include "error.h"                                
 19 #include "extent_update.h"                        
 20 #include "inode.h"                                
 21 #include "io_write.h"                             
 22 #include "journal.h"                              
 23 #include "keylist.h"                              
 24 #include "move.h"                                 
 25 #include "nocow_locking.h"                        
 26 #include "rebalance.h"                            
 27 #include "subvolume.h"                            
 28 #include "super.h"                                
 29 #include "super-io.h"                             
 30 #include "trace.h"                                
 31                                                   
 32 #include <linux/blkdev.h>                         
 33 #include <linux/prefetch.h>                       
 34 #include <linux/random.h>                         
 35 #include <linux/sched/mm.h>                       
 36                                                   
 37 #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT           
 38                                                   
 39 static inline void bch2_congested_acct(struct     
 40                                        u64 now    
 41 {                                                 
 42         u64 latency_capable =                     
 43                 ca->io_latency[rw].quantiles.e    
 44         /* ideally we'd be taking into account    
 45         u64 latency_threshold = latency_capabl    
 46         s64 latency_over = io_latency - latenc    
 47                                                   
 48         if (latency_threshold && latency_over     
 49                 /*                                
 50                  * bump up congested by approx    
 51                  * latency_threshold - we don'    
 52                  * bother with the divide:        
 53                  */                               
 54                 if (atomic_read(&ca->congested    
 55                         atomic_add(latency_ove    
 56                                    max_t(int,     
 57                                    &ca->conges    
 58                                                   
 59                 ca->congested_last = now;         
 60         } else if (atomic_read(&ca->congested)    
 61                 atomic_dec(&ca->congested);       
 62         }                                         
 63 }                                                 
 64                                                   
 65 void bch2_latency_acct(struct bch_dev *ca, u64    
 66 {                                                 
 67         atomic64_t *latency = &ca->cur_latency    
 68         u64 now = local_clock();                  
 69         u64 io_latency = time_after64(now, sub    
 70                 ? now - submit_time               
 71                 : 0;                              
 72         u64 old, new;                             
 73                                                   
 74         old = atomic64_read(latency);             
 75         do {                                      
 76                 /*                                
 77                  * If the io latency was reaso    
 78                  * latency, skip doing the upd    
 79                  * the time:                      
 80                  */                               
 81                 if (abs((int) (old - io_latenc    
 82                     now & ~(~0U << 5))            
 83                         break;                    
 84                                                   
 85                 new = ewma_add(old, io_latency    
 86         } while (!atomic64_try_cmpxchg(latency    
 87                                                   
 88         bch2_congested_acct(ca, io_latency, no    
 89                                                   
 90         __bch2_time_stats_update(&ca->io_laten    
 91 }                                                 
 92                                                   
 93 #endif                                            
 94                                                   
 95 /* Allocate, free from mempool: */                
 96                                                   
 97 void bch2_bio_free_pages_pool(struct bch_fs *c    
 98 {                                                 
 99         struct bvec_iter_all iter;                
100         struct bio_vec *bv;                       
101                                                   
102         bio_for_each_segment_all(bv, bio, iter    
103                 if (bv->bv_page != ZERO_PAGE(0    
104                         mempool_free(bv->bv_pa    
105         bio->bi_vcnt = 0;                         
106 }                                                 
107                                                   
108 static struct page *__bio_alloc_page_pool(stru    
109 {                                                 
110         struct page *page;                        
111                                                   
112         if (likely(!*using_mempool)) {            
113                 page = alloc_page(GFP_NOFS);      
114                 if (unlikely(!page)) {            
115                         mutex_lock(&c->bio_bou    
116                         *using_mempool = true;    
117                         goto pool_alloc;          
118                                                   
119                 }                                 
120         } else {                                  
121 pool_alloc:                                       
122                 page = mempool_alloc(&c->bio_b    
123         }                                         
124                                                   
125         return page;                              
126 }                                                 
127                                                   
128 void bch2_bio_alloc_pages_pool(struct bch_fs *    
129                                size_t size)       
130 {                                                 
131         bool using_mempool = false;               
132                                                   
133         while (size) {                            
134                 struct page *page = __bio_allo    
135                 unsigned len = min_t(size_t, P    
136                                                   
137                 BUG_ON(!bio_add_page(bio, page    
138                 size -= len;                      
139         }                                         
140                                                   
141         if (using_mempool)                        
142                 mutex_unlock(&c->bio_bounce_pa    
143 }                                                 
144                                                   
145 /* Extent update path: */                         
146                                                   
147 int bch2_sum_sector_overwrites(struct btree_tr    
148                                struct btree_it    
149                                struct bkey_i *    
150                                bool *usage_inc    
151                                s64 *i_sectors_    
152                                s64 *disk_secto    
153 {                                                 
154         struct bch_fs *c = trans->c;              
155         struct btree_iter iter;                   
156         struct bkey_s_c old;                      
157         unsigned new_replicas = bch2_bkey_repl    
158         bool new_compressed = bch2_bkey_sector    
159         int ret = 0;                              
160                                                   
161         *usage_increasing       = false;          
162         *i_sectors_delta        = 0;              
163         *disk_sectors_delta     = 0;              
164                                                   
165         bch2_trans_copy_iter(&iter, extent_ite    
166                                                   
167         for_each_btree_key_upto_continue_nores    
168                                 new->k.p, BTRE    
169                 s64 sectors = min(new->k.p.off    
170                         max(bkey_start_offset(    
171                             bkey_start_offset(    
172                                                   
173                 *i_sectors_delta += sectors *     
174                         (bkey_extent_is_alloca    
175                          bkey_extent_is_alloca    
176                                                   
177                 *disk_sectors_delta += sectors    
178                 *disk_sectors_delta -= new->k.    
179                         ? sectors * bch2_bkey_    
180                         : 0;                      
181                                                   
182                 if (!*usage_increasing &&         
183                     (new->k.p.snapshot != old.    
184                      new_replicas > bch2_bkey_    
185                      (!new_compressed && bch2_    
186                         *usage_increasing = tr    
187                                                   
188                 if (bkey_ge(old.k->p, new->k.p    
189                         break;                    
190         }                                         
191                                                   
192         bch2_trans_iter_exit(trans, &iter);       
193         return ret;                               
194 }                                                 
195                                                   
196 static inline int bch2_extent_update_i_size_se    
197                                                   
198                                                   
199                                                   
200 {                                                 
201         /*                                        
202          * Crazy performance optimization:        
203          * Every extent update needs to also u    
204          * will set bi->journal_seq to the jou    
205          * transaction - for fsync.               
206          *                                        
207          * But if that's the only reason we're    
208          * updating bi_size or bi_sectors), th    
209          * to be journalled - if we crash, the    
210          * lost, but that's fine.                 
211          */                                       
212         unsigned inode_update_flags = BTREE_UP    
213                                                   
214         struct btree_iter iter;                   
215         struct bkey_s_c k = bch2_bkey_get_iter    
216                               SPOS(0,             
217                                    extent_iter    
218                                    extent_iter    
219                               BTREE_ITER_cache    
220         int ret = bkey_err(k);                    
221         if (unlikely(ret))                        
222                 return ret;                       
223                                                   
224         /*                                        
225          * varint_decode_fast(), in the inode     
226          * bytes past the end of the buffer:      
227          */                                       
228         struct bkey_i *k_mut = bch2_trans_kmal    
229         ret = PTR_ERR_OR_ZERO(k_mut);             
230         if (unlikely(ret))                        
231                 goto err;                         
232                                                   
233         bkey_reassemble(k_mut, k);                
234                                                   
235         if (unlikely(k_mut->k.type != KEY_TYPE    
236                 k_mut = bch2_inode_to_v3(trans    
237                 ret = PTR_ERR_OR_ZERO(k_mut);     
238                 if (unlikely(ret))                
239                         goto err;                 
240         }                                         
241                                                   
242         struct bkey_i_inode_v3 *inode = bkey_i    
243                                                   
244         if (!(le64_to_cpu(inode->v.bi_flags) &    
245             new_i_size > le64_to_cpu(inode->v.    
246                 inode->v.bi_size = cpu_to_le64    
247                 inode_update_flags = 0;           
248         }                                         
249                                                   
250         if (i_sectors_delta) {                    
251                 le64_add_cpu(&inode->v.bi_sect    
252                 inode_update_flags = 0;           
253         }                                         
254                                                   
255         if (inode->k.p.snapshot != iter.snapsh    
256                 inode->k.p.snapshot = iter.sna    
257                 inode_update_flags = 0;           
258         }                                         
259                                                   
260         ret = bch2_trans_update(trans, &iter,     
261                                 BTREE_UPDATE_i    
262                                 inode_update_f    
263 err:                                              
264         bch2_trans_iter_exit(trans, &iter);       
265         return ret;                               
266 }                                                 
267                                                   
268 int bch2_extent_update(struct btree_trans *tra    
269                        subvol_inum inum,          
270                        struct btree_iter *iter    
271                        struct bkey_i *k,          
272                        struct disk_reservation    
273                        u64 new_i_size,            
274                        s64 *i_sectors_delta_to    
275                        bool check_enospc)         
276 {                                                 
277         struct bpos next_pos;                     
278         bool usage_increasing;                    
279         s64 i_sectors_delta = 0, disk_sectors_    
280         int ret;                                  
281                                                   
282         /*                                        
283          * This traverses us the iterator with    
284          * search_key() (which is pos + 1 for     
285          * path already traversed at iter->pos    
286          * bch2_trans_extent_update() will use    
287          */                                       
288         ret = __bch2_btree_iter_traverse(iter)    
289         if (ret)                                  
290                 return ret;                       
291                                                   
292         ret = bch2_extent_trim_atomic(trans, i    
293         if (ret)                                  
294                 return ret;                       
295                                                   
296         next_pos = k->k.p;                        
297                                                   
298         ret = bch2_sum_sector_overwrites(trans    
299                         &usage_increasing,        
300                         &i_sectors_delta,         
301                         &disk_sectors_delta);     
302         if (ret)                                  
303                 return ret;                       
304                                                   
305         if (disk_res &&                           
306             disk_sectors_delta > (s64) disk_re    
307                 ret = bch2_disk_reservation_ad    
308                                         disk_s    
309                                         !check    
310                                         ? BCH_    
311                 if (ret)                          
312                         return ret;               
313         }                                         
314                                                   
315         /*                                        
316          * Note:                                  
317          * We always have to do an inode updat    
318          * aren't changing - for fsync to work    
319          * inode->bi_journal_seq which is upda    
320          */                                       
321         ret =   bch2_extent_update_i_size_sect    
322                                                   
323                                                   
324                 bch2_trans_update(trans, iter,    
325                 bch2_trans_commit(trans, disk_    
326                                 BCH_TRANS_COMM    
327                                 BCH_TRANS_COMM    
328         if (unlikely(ret))                        
329                 return ret;                       
330                                                   
331         if (i_sectors_delta_total)                
332                 *i_sectors_delta_total += i_se    
333         bch2_btree_iter_set_pos(iter, next_pos    
334         return 0;                                 
335 }                                                 
336                                                   
337 static int bch2_write_index_default(struct bch    
338 {                                                 
339         struct bch_fs *c = op->c;                 
340         struct bkey_buf sk;                       
341         struct keylist *keys = &op->insert_key    
342         struct bkey_i *k = bch2_keylist_front(    
343         struct btree_trans *trans = bch2_trans    
344         struct btree_iter iter;                   
345         subvol_inum inum = {                      
346                 .subvol = op->subvol,             
347                 .inum   = k->k.p.inode,           
348         };                                        
349         int ret;                                  
350                                                   
351         BUG_ON(!inum.subvol);                     
352                                                   
353         bch2_bkey_buf_init(&sk);                  
354                                                   
355         do {                                      
356                 bch2_trans_begin(trans);          
357                                                   
358                 k = bch2_keylist_front(keys);     
359                 bch2_bkey_buf_copy(&sk, c, k);    
360                                                   
361                 ret = bch2_subvolume_get_snaps    
362                                                   
363                 if (bch2_err_matches(ret, BCH_    
364                         continue;                 
365                 if (ret)                          
366                         break;                    
367                                                   
368                 bch2_trans_iter_init(trans, &i    
369                                      bkey_star    
370                                      BTREE_ITE    
371                                                   
372                 ret =   bch2_bkey_set_needs_re    
373                         bch2_extent_update(tra    
374                                         &op->r    
375                                         op->ne    
376                                         op->fl    
377                 bch2_trans_iter_exit(trans, &i    
378                                                   
379                 if (bch2_err_matches(ret, BCH_    
380                         continue;                 
381                 if (ret)                          
382                         break;                    
383                                                   
384                 if (bkey_ge(iter.pos, k->k.p))    
385                         bch2_keylist_pop_front    
386                 else                              
387                         bch2_cut_front(iter.po    
388         } while (!bch2_keylist_empty(keys));      
389                                                   
390         bch2_trans_put(trans);                    
391         bch2_bkey_buf_exit(&sk, c);               
392                                                   
393         return ret;                               
394 }                                                 
395                                                   
396 /* Writes */                                      
397                                                   
398 void bch2_submit_wbio_replicas(struct bch_writ    
399                                enum bch_data_t    
400                                const struct bk    
401                                bool nocow)        
402 {                                                 
403         struct bkey_ptrs_c ptrs = bch2_bkey_pt    
404         struct bch_write_bio *n;                  
405                                                   
406         BUG_ON(c->opts.nochanges);                
407                                                   
408         bkey_for_each_ptr(ptrs, ptr) {            
409                 struct bch_dev *ca = nocow        
410                         ? bch2_dev_have_ref(c,    
411                         : bch2_dev_get_ioref(c    
412                                                   
413                 if (to_entry(ptr + 1) < ptrs.e    
414                         n = to_wbio(bio_alloc_    
415                                                   
416                         n->bio.bi_end_io          
417                         n->bio.bi_private         
418                         n->parent                 
419                         n->split                  
420                         n->bounce                 
421                         n->put_bio                
422                         n->bio.bi_opf             
423                         bio_inc_remaining(&wbi    
424                 } else {                          
425                         n = wbio;                 
426                         n->split                  
427                 }                                 
428                                                   
429                 n->c                    = c;      
430                 n->dev                  = ptr-    
431                 n->have_ioref           = ca !    
432                 n->nocow                = noco    
433                 n->submit_time          = loca    
434                 n->inode_offset         = bkey    
435                 if (nocow)                        
436                         n->nocow_bucket = PTR_    
437                 n->bio.bi_iter.bi_sector = ptr    
438                                                   
439                 if (likely(n->have_ioref)) {      
440                         this_cpu_add(ca->io_do    
441                                      bio_secto    
442                                                   
443                         bio_set_dev(&n->bio, c    
444                                                   
445                         if (type != BCH_DATA_b    
446                                 bio_endio(&n->    
447                                 continue;         
448                         }                         
449                                                   
450                         submit_bio(&n->bio);      
451                 } else {                          
452                         n->bio.bi_status          
453                         bio_endio(&n->bio);       
454                 }                                 
455         }                                         
456 }                                                 
457                                                   
458 static void __bch2_write(struct bch_write_op *    
459                                                   
460 static void bch2_write_done(struct closure *cl    
461 {                                                 
462         struct bch_write_op *op = container_of    
463         struct bch_fs *c = op->c;                 
464                                                   
465         EBUG_ON(op->open_buckets.nr);             
466                                                   
467         bch2_time_stats_update(&c->times[BCH_T    
468         bch2_disk_reservation_put(c, &op->res)    
469                                                   
470         if (!(op->flags & BCH_WRITE_MOVE))        
471                 bch2_write_ref_put(c, BCH_WRIT    
472         bch2_keylist_free(&op->insert_keys, op    
473                                                   
474         EBUG_ON(cl->parent);                      
475         closure_debug_destroy(cl);                
476         if (op->end_io)                           
477                 op->end_io(op);                   
478 }                                                 
479                                                   
480 static noinline int bch2_write_drop_io_error_p    
481 {                                                 
482         struct keylist *keys = &op->insert_key    
483         struct bkey_i *src, *dst = keys->keys,    
484                                                   
485         for (src = keys->keys; src != keys->to    
486                 n = bkey_next(src);               
487                                                   
488                 if (bkey_extent_is_direct_data    
489                         bch2_bkey_drop_ptrs(bk    
490                                             te    
491                                                   
492                         if (!bch2_bkey_nr_ptrs    
493                                 return -EIO;      
494                 }                                 
495                                                   
496                 if (dst != src)                   
497                         memmove_u64s_down(dst,    
498                 dst = bkey_next(dst);             
499         }                                         
500                                                   
501         keys->top = dst;                          
502         return 0;                                 
503 }                                                 
504                                                   
505 /**                                               
506  * __bch2_write_index - after a write, update     
507  * @op:         bch_write_op to process           
508  */                                               
509 static void __bch2_write_index(struct bch_writ    
510 {                                                 
511         struct bch_fs *c = op->c;                 
512         struct keylist *keys = &op->insert_key    
513         unsigned dev;                             
514         int ret = 0;                              
515                                                   
516         if (unlikely(op->flags & BCH_WRITE_IO_    
517                 ret = bch2_write_drop_io_error    
518                 if (ret)                          
519                         goto err;                 
520         }                                         
521                                                   
522         if (!bch2_keylist_empty(keys)) {          
523                 u64 sectors_start = keylist_se    
524                                                   
525                 ret = !(op->flags & BCH_WRITE_    
526                         ? bch2_write_index_def    
527                         : bch2_data_update_ind    
528                                                   
529                 BUG_ON(bch2_err_matches(ret, B    
530                 BUG_ON(keylist_sectors(keys) &    
531                                                   
532                 op->written += sectors_start -    
533                                                   
534                 if (ret && !bch2_err_matches(r    
535                         struct bkey_i *insert     
536                                                   
537                         bch_err_inum_offset_ra    
538                                 insert->k.p.in    
539                                 "%s write erro    
540                                 op->flags & BC    
541                                 bch2_err_str(r    
542                 }                                 
543                                                   
544                 if (ret)                          
545                         goto err;                 
546         }                                         
547 out:                                              
548         /* If some a bucket wasn't written, we    
549         for_each_set_bit(dev, op->failed.d, BC    
550                 bch2_open_bucket_write_error(c    
551                                                   
552         bch2_open_buckets_put(c, &op->open_buc    
553         return;                                   
554 err:                                              
555         keys->top = keys->keys;                   
556         op->error = ret;                          
557         op->flags |= BCH_WRITE_SUBMITTED;         
558         goto out;                                 
559 }                                                 
560                                                   
/*
 * Transition the write point into a new state, accounting the time spent
 * in the previous state into wp->time[wp->state] before switching.
 * No-op when the state is unchanged.
 *
 * NOTE(review): the parameter list is truncated in this listing — presumably
 * (struct write_point *wp, enum write_point_state state); confirm against the
 * full source.  Call sites below take wp->writes_lock around this, so the
 * caller appears responsible for serialization — verify.
 */
561 static inline void __wp_update_state(struct wr
562 {
563         if (state != wp->state) {
564                 u64 now = ktime_get_ns();
565
                /* Only charge elapsed time once a prior change was recorded
                 * and the clock actually moved forward. */
566                 if (wp->last_state_change &&
567                     time_after64(now, wp->last
568                         wp->time[wp->state] +=
569                 wp->state = state;
570                 wp->last_state_change = now;
571         }
572 }
573                                                   
/*
 * Derive the write point's state from current activity and apply it via
 * __wp_update_state(): one state when an index update is running, another
 * when wp->writes is non-empty, a third otherwise.
 * NOTE(review): the three WRITE_POINT_* constants in the ternary chain are
 * cut off in this listing — confirm their identities against the full source.
 */
574 static inline void wp_update_state(struct writ
575 {
576         enum write_point_state state;
577
578         state = running                  ? WRI
579                 !list_empty(&wp->writes) ? WRI
580                                          : WRI
581
582         __wp_update_state(wp, state);
583 }
584                                                   
/*
 * Closure callback run when a write's IO completes: hand the op to the
 * write point's index-update worker.
 *
 * Frees the op's bounce pages back to the pool early for some combination of
 * BCH_WRITE_SUBMITTED/BCH_WRITE_MOVE (the exact condition is truncated in
 * this listing — confirm), then, under wp->writes_lock (irqsave variant, so
 * presumably callable from IO completion/interrupt context — verify), bumps
 * the write point out of WRITE_POINT_waiting_i[o], appends the op to
 * wp->writes, and queues wp->index_update_work on the chosen workqueue.
 */
585 static CLOSURE_CALLBACK(bch2_write_index)
586 {
587         closure_type(op, struct bch_write_op,
588         struct write_point *wp = op->wp;
589         struct workqueue_struct *wq = index_up
590         unsigned long flags;
591
592         if ((op->flags & BCH_WRITE_SUBMITTED)
593             (op->flags & BCH_WRITE_MOVE))
594                 bch2_bio_free_pages_pool(op->c
595
596         spin_lock_irqsave(&wp->writes_lock, fl
597         if (wp->state == WRITE_POINT_waiting_i
598                 __wp_update_state(wp, WRITE_PO
599         list_add_tail(&op->wp_list, &wp->write
600         spin_unlock_irqrestore (&wp->writes_lo
601
602         queue_work(wq, &wp->index_update_work)
603 }
604                                                   
/*
 * Associate @op with write point @wp before submitting its IO.  If the write
 * point is currently stopped, transition it to a new state (target state
 * truncated in this listing — presumably "waiting io"; confirm) under
 * wp->writes_lock so the state/time accounting stays consistent.
 */
605 static inline void bch2_write_queue(struct bch
606 {
607         op->wp = wp;
608
609         if (wp->state == WRITE_POINT_stopped)
610                 spin_lock_irq(&wp->writes_lock
611                 __wp_update_state(wp, WRITE_PO
612                 spin_unlock_irq(&wp->writes_lo
613         }
614 }
615                                                   
/*
 * Workqueue function (wp->index_update_work): drain the write point's list
 * of completed write ops and run their btree index updates.
 *
 * Each iteration pops the head of wp->writes under wp->writes_lock, refreshes
 * the write point state (running while an op is in hand, idle/stopped when the
 * list drains — exact states handled by wp_update_state()), marks the op as
 * being processed in the worker, and calls __bch2_write_index().  If the op
 * still has data left to write (BCH_WRITE_SUBMITTED not set) the write is
 * continued via __bch2_write(); otherwise it is completed with
 * bch2_write_done().
 */
616 void bch2_write_point_do_index_updates(struct
617 {
618         struct write_point *wp =
619                 container_of(work, struct writ
620         struct bch_write_op *op;
621
622         while (1) {
623                 spin_lock_irq(&wp->writes_lock
624                 op = list_first_entry_or_null(
625                 if (op)
626                         list_del(&op->wp_list)
627                 wp_update_state(wp, op != NULL
628                 spin_unlock_irq(&wp->writes_lo
629
630                 if (!op)
631                         break;
632
633                 op->flags |= BCH_WRITE_IN_WORK
634
635                 __bch2_write_index(op);
636
637                 if (!(op->flags & BCH_WRITE_SU
638                         __bch2_write(op);
639                 else
640                         bch2_write_done(&op->c
641         }
642 }
643                                                   
/*
 * Per-device bio completion handler for data writes.
 *
 * On device IO error: record the failing device in op->failed and set
 * BCH_WRITE_IO_ERRO[R] so the index-update path can react.  For nocow
 * writes: drop the nocow bucket lock taken at submission and mark the
 * device as needing a flush (op->devs_ne...).  Then release the device
 * io_ref (with latency accounting), return bounce pages to the pool,
 * free the bio if this path owns it, and either complete the parent
 * (split) bio or drop the op's closure ref.
 */
644 static void bch2_write_endio(struct bio *bio)
645 {
646         struct closure *cl              = bio-
647         struct bch_write_op *op         = cont
648         struct bch_write_bio *wbio      = to_w
649         struct bch_write_bio *parent    = wbio
650         struct bch_fs *c                = wbio
651         struct bch_dev *ca              = wbio
652                 ? bch2_dev_have_ref(c, wbio->d
653                 : NULL;
654
655         if (bch2_dev_inum_io_err_on(bio->bi_st
656                                     op->pos.in
657                                     wbio->inod
658                                     "data writ
659                                     bch2_blk_s
660                 set_bit(wbio->dev, op->failed.
661                 op->flags |= BCH_WRITE_IO_ERRO
662         }
663
664         if (wbio->nocow) {
665                 bch2_bucket_nocow_unlock(&c->n
666                                          POS(c
667                                          BUCKE
668                 set_bit(wbio->dev, op->devs_ne
669         }
670
671         if (wbio->have_ioref) {
672                 bch2_latency_acct(ca, wbio->su
673                 percpu_ref_put(&ca->io_ref);
674         }
675
676         if (wbio->bounce)
677                 bch2_bio_free_pages_pool(c, bi
678
679         if (wbio->put_bio)
680                 bio_put(bio);
681
            /* Split bios chain completion to the parent; the top-level bio
             * instead releases its reference on the write op's closure. */
682         if (parent)
683                 bio_endio(&parent->bio);
684         else
685                 closure_put(cl);
686 }
687                                                   
/*
 * Append a new extent key to op->insert_keys describing the chunk just
 * staged for writing.
 *
 * Advances op->pos by crc.uncompressed_siz[e] and initializes the extent
 * at the keylist top with that position/size/version.  A crc entry is only
 * appended when the extent is actually checksummed, compressed, or carries
 * a nonce; the allocated sector pointers are then appended from @wp and the
 * key is pushed onto the keylist.
 */
688 static void init_append_extent(struct bch_writ
689                                struct write_po
690                                struct bversion
691                                struct bch_exte
692 {
693         struct bkey_i_extent *e;
694
695         op->pos.offset += crc.uncompressed_siz
696
697         e = bkey_extent_init(op->insert_keys.t
698         e->k.p          = op->pos;
699         e->k.size       = crc.uncompressed_siz
700         e->k.bversion   = version;
701
702         if (crc.csum_type ||
703             crc.compression_type ||
704             crc.nonce)
705                 bch2_extent_crc_append(&e->k_i
706
707         bch2_alloc_sectors_append_ptrs_inlined
708                                        op->fla
709
710         bch2_keylist_push(&op->insert_keys);
711 }
712                                                   
/*
 * Allocate the destination (bounce) bio for a write.
 *
 * Sized to the smaller of the write point's free space (in bytes,
 * wp->sectors_free << 9) and the source bio's size, capped at BIO_MAX_VECS
 * pages.  When @buf is supplied (e.g. a preallocated EC buffer — see caller)
 * the bio is simply mapped over it and no pages are allocated.  Otherwise
 * pages come from the filesystem's bio page pool, falling back to plain page
 * allocation when the pool can't cover the full size; a short fallback
 * allocation is reported through *page_alloc_failed so the caller can write
 * less than requested.
 * NOTE(review): several argument lists are truncated in this listing —
 * confirm the fallback-GFP flags and pool cap against the full source.
 */
713 static struct bio *bch2_write_bio_alloc(struct
714                                         struct
715                                         struct
716                                         bool *
717                                         void *
718 {
719         struct bch_write_bio *wbio;
720         struct bio *bio;
721         unsigned output_available =
722                 min(wp->sectors_free << 9, src
723         unsigned pages = DIV_ROUND_UP(output_a
724                                       (buf
725                                        ? ((uns
726                                        : 0), P
727
728         pages = min(pages, BIO_MAX_VECS);
729
730         bio = bio_alloc_bioset(NULL, pages, 0,
731                                GFP_NOFS, &c->b
732         wbio                    = wbio_init(bi
733         wbio->put_bio           = true;
734         /* copy WRITE_SYNC flag */
735         wbio->bio.bi_opf        = src->bi_opf;
736
737         if (buf) {
738                 bch2_bio_map(bio, buf, output_
739                 return bio;
740         }
741
742         wbio->bounce            = true;
743
744         /*
745          * We can't use mempool for more than
746          * worth of pages, but we'd like to al
747          */
748         bch2_bio_alloc_pages_pool(c, bio,
749                                   min_t(unsign
750                                         c->opt
751
752         if (bio->bi_iter.bi_size < output_avai
753                 *page_alloc_failed =
754                         bch2_bio_alloc_pages(b
755                                              o
756                                              b
757                                              G
758
759         return bio;
760 }
761                                                   
/*
 * Recompute the checksum of the op's data with @new_csum_type and narrow the
 * bio to the live portion of the extent.
 *
 * Returns 0 on success, or the error from bch2_rechecksum_bio() (which also
 * verifies the existing checksum while it reads the data — inferred from the
 * callers' treatment of failure as a checksum error; confirm).  On success
 * the bio is advanced past op->crc.offset and clamped to op->crc.live_si[ze]
 * (both in 512-byte sectors, hence << 9), and op->crc is replaced with the
 * new unpacked crc.  Encryption is preserved: if the old checksum type was an
 * encrypting type and the new one isn't, the old type is kept, because
 * bch2_rechecksum_bio() can't encrypt/decrypt (per the comment kept below).
 */
762 static int bch2_write_rechecksum(struct bch_fs
763                                  struct bch_wr
764                                  unsigned new_
765 {
766         struct bio *bio = &op->wbio.bio;
767         struct bch_extent_crc_unpacked new_crc
768         int ret;
769
770         /* bch2_rechecksum_bio() can't encrypt
771
772         if (bch2_csum_type_is_encryption(op->c
773             bch2_csum_type_is_encryption(new_c
774                 new_csum_type = op->crc.csum_t
775
776         ret = bch2_rechecksum_bio(c, bio, op->
777                                   NULL, &new_c
778                                   op->crc.offs
779                                   new_csum_typ
780         if (ret)
781                 return ret;
782
783         bio_advance(bio, op->crc.offset << 9);
784         bio->bi_iter.bi_size = op->crc.live_si
785         op->crc = new_crc;
786         return 0;
787 }
788                                                   
/*
 * Decrypt the op's data in place if its existing crc uses an encrypting
 * checksum type; no-op (returns 0) otherwise.
 *
 * Verifies the existing checksum first — this is the last point it can be
 * checked before the data is transformed (see the retained comment below) —
 * returning -EIO on mismatch (the truncated condition on the compare line
 * appears to gate this on something additional, likely !c->opts.no_data_io;
 * confirm against the full source).  On success the crc's csum_type and csum
 * are cleared, since the data is now plaintext.  Returns the result of
 * bch2_encrypt_bio() (encryption and decryption being the same XOR-style
 * operation here is an inference — verify).
 */
789 static int bch2_write_decrypt(struct bch_write
790 {
791         struct bch_fs *c = op->c;
792         struct nonce nonce = extent_nonce(op->
793         struct bch_csum csum;
794         int ret;
795
796         if (!bch2_csum_type_is_encryption(op->
797                 return 0;
798
799         /*
800          * If we need to decrypt data in the w
801          * to verify the existing checksum (po
802          * it's decrypted - this is the last p
803          * checksum:
804          */
805         csum = bch2_checksum_bio(c, op->crc.cs
806         if (bch2_crc_cmp(op->crc.csum, csum) &
807                 return -EIO;
808
809         ret = bch2_encrypt_bio(c, op->crc.csum
810         op->crc.csum_type = 0;
811         op->crc.csum = (struct bch_csum) { 0,
812         return ret;
813 }
814                                                   
/*
 * Prepare already-encoded data (BCH_WRITE_DATA_ENCOD[ED] set, i.e. data that
 * already carries a crc from a previous read, as in a move/rebalance) for
 * writing.
 *
 * Returns:
 *   PREP_ENCODED_OK           — data is plain (or was successfully normalized)
 *                               and the generic write path should process it
 *   PREP_ENCODED_DO_WRITE     — the extent can be written out exactly as-is
 *   PREP_ENCODED_ERR          — decompression failed
 *   PREP_ENCODED_CHECKSUM_ERR — a checksum verification failed along the way
 *
 * The fast path (first if) writes the whole extent unchanged when its size,
 * compression and checksum settings are all compatible with the target write
 * point — the truncated conditions appear to compare against the opts'
 * encoded-extent-max and wp->sec[tors_free]; confirm.  Otherwise compressed
 * data is decrypted, checksum-verified (last chance before transformation),
 * and decompressed in place; then the data is rechecksummed/decrypted as
 * needed so compression or a different checksum can be applied downstream.
 */
815 static enum prep_encoded_ret {
816         PREP_ENCODED_OK,
817         PREP_ENCODED_ERR,
818         PREP_ENCODED_CHECKSUM_ERR,
819         PREP_ENCODED_DO_WRITE,
820 } bch2_write_prep_encoded_data(struct bch_writ
821 {
822         struct bch_fs *c = op->c;
823         struct bio *bio = &op->wbio.bio;
824
825         if (!(op->flags & BCH_WRITE_DATA_ENCOD
826                 return PREP_ENCODED_OK;
827
828         BUG_ON(bio_sectors(bio) != op->crc.com
829
830         /* Can we just write the entire extent
831         if (op->crc.uncompressed_size == op->c
832             op->crc.uncompressed_size <= c->op
833             op->crc.compressed_size <= wp->sec
834             (op->crc.compression_type == bch2_
835              op->incompressible)) {
836                 if (!crc_is_compressed(op->crc
837                     op->csum_type != op->crc.c
838                     bch2_write_rechecksum(c, o
839                     !c->opts.no_data_io)
840                         return PREP_ENCODED_CH
841
842                 return PREP_ENCODED_DO_WRITE;
843         }
844
845         /*
846          * If the data is compressed and we co
847          * is, we have to decompress it:
848          */
849         if (crc_is_compressed(op->crc)) {
850                 struct bch_csum csum;
851
852                 if (bch2_write_decrypt(op))
853                         return PREP_ENCODED_CH
854
855                 /* Last point we can still ver
856                 csum = bch2_checksum_bio(c, op
857                                          exten
858                                          bio);
859                 if (bch2_crc_cmp(op->crc.csum,
860                         return PREP_ENCODED_CH
861
862                 if (bch2_bio_uncompress_inplac
863                         return PREP_ENCODED_ER
864         }
865
866         /*
867          * No longer have compressed data afte
868          * encrypted:
869          */
870
871         /*
872          * If the data is checksummed and we'r
873          * rechecksum and adjust bio to point
874          */
875         if ((op->crc.live_size != op->crc.unco
876              op->crc.csum_type != op->csum_typ
877             bch2_write_rechecksum(c, op, op->c
878             !c->opts.no_data_io)
879                 return PREP_ENCODED_CHECKSUM_E
880
881         /*
882          * If we want to compress the data, it
883          */
884         if ((op->compression_opt ||
885              bch2_csum_type_is_encryption(op->
886              bch2_csum_type_is_encryption(op->
887             bch2_write_decrypt(op))
888                 return PREP_ENCODED_CHECKSUM_E
889
890         return PREP_ENCODED_OK;
891 }
892                                                   
/*
 * Core per-extent write path: carve the source bio into one or more extents,
 * applying compression, encryption and checksumming, and stage the matching
 * keys on op->insert_keys.
 *
 * Returns >0 (more) when source data remains and the caller must allocate
 * another write point and call again, 0 when all data was consumed, or a
 * negative error.  On success *_dst is set to the bio to submit (which may be
 * a bounce bio, a split of @src, or @src itself).
 *
 * Flow: already-encoded data is first normalized by
 * bch2_write_prep_encoded_data(); PREP_ENCODED_DO_WRITE short-circuits to
 * do_write with the extent unchanged (bounced through the EC buffer if one
 * exists).  A bounce bio is allocated when there is an EC buffer, compression
 * is enabled, or data must be checksummed/encrypted while the pages aren't
 * stable/owned (conditions truncated — confirm flag names).  The main loop
 * then compresses (or copies) one chunk at a time, assigns a nonce/version
 * for encrypting checksum types, rechecksums or encrypts+checksums the chunk,
 * and appends an extent key, until the destination, source, write-point space
 * or keylist space runs out.
 */
893 static int bch2_write_extent(struct bch_write_
894                              struct bio **_dst
895 {
896         struct bch_fs *c = op->c;
897         struct bio *src = &op->wbio.bio, *dst
898         struct bvec_iter saved_iter;
899         void *ec_buf;
900         unsigned total_output = 0, total_input
901         bool bounce = false;
902         bool page_alloc_failed = false;
903         int ret, more = 0;
904
905         BUG_ON(!bio_sectors(src));
906
907         ec_buf = bch2_writepoint_ec_buf(c, wp)
908
909         switch (bch2_write_prep_encoded_data(o
910         case PREP_ENCODED_OK:
911                 break;
912         case PREP_ENCODED_ERR:
913                 ret = -EIO;
914                 goto err;
915         case PREP_ENCODED_CHECKSUM_ERR:
916                 goto csum_err;
917         case PREP_ENCODED_DO_WRITE:
918                 /* XXX look for bug here */
919                 if (ec_buf) {
920                         dst = bch2_write_bio_a
921
922
923                         bio_copy_data(dst, src
924                         bounce = true;
925                 }
926                 init_append_extent(op, wp, op-
927                 goto do_write;
928         }
929
930         if (ec_buf ||
931             op->compression_opt ||
932             (op->csum_type &&
933              !(op->flags & BCH_WRITE_PAGES_STA
934             (bch2_csum_type_is_encryption(op->
935              !(op->flags & BCH_WRITE_PAGES_OWN
936                 dst = bch2_write_bio_alloc(c,
937                                            &pa
938                                            ec_
939                 bounce = true;
940         }
941
942         saved_iter = dst->bi_iter;
943
        /* One iteration per extent produced; dst_len/src_len are the output
         * and input byte counts for the current chunk. */
944         do {
945                 struct bch_extent_crc_unpacked
946                 struct bversion version = op->
947                 size_t dst_len = 0, src_len =
948
949                 if (page_alloc_failed &&
950                     dst->bi_iter.bi_size  < (w
951                     dst->bi_iter.bi_size < c->
952                         break;
953
954                 BUG_ON(op->compression_opt &&
955                        (op->flags & BCH_WRITE_
956                        bch2_csum_type_is_encry
957                 BUG_ON(op->compression_opt &&
958
959                 crc.compression_type = op->inc
960                         ? BCH_COMPRESSION_TYPE
961                         : op->compression_opt
962                         ? bch2_bio_compress(c,
963                                             op
964                         : 0;
965                 if (!crc_is_compressed(crc)) {
966                         dst_len = min(dst->bi_
967                         dst_len = min_t(unsign
968
969                         if (op->csum_type)
970                                 dst_len = min_
971
972
973                         if (bounce) {
974                                 swap(dst->bi_i
975                                 bio_copy_data(
976                                 swap(dst->bi_i
977                         }
978
979                         src_len = dst_len;
980                 }
981
982                 BUG_ON(!src_len || !dst_len);
983
                /* Encrypting checksum types need a unique nonce per chunk:
                 * either a fresh version.lo or an explicit crc nonce advanced
                 * by the chunk size (truncated — confirm the increment). */
984                 if (bch2_csum_type_is_encrypti
985                         if (bversion_zero(vers
986                                 version.lo = a
987                         } else {
988                                 crc.nonce = op
989                                 op->nonce += s
990                         }
991                 }
992
993                 if ((op->flags & BCH_WRITE_DAT
994                     !crc_is_compressed(crc) &&
995                     bch2_csum_type_is_encrypti
996                     bch2_csum_type_is_encrypti
997                         u8 compression_type =
998                         u16 nonce = crc.nonce;
999                         /*
1000                          * Note: when we're u
1001                          * checksumming @src
1002                          * existing checksum
1003                          * were trying to com
1004                          * part of the data t
1005                          *
1006                          * But normally we wa
1007                          * because part of th
1008                          * data can't be modi
1009                          * flight.
1010                          */
1011                         if (bch2_rechecksum_b
1012                                         &crc,
1013                                         src_l
1014                                         bio_s
1015                                         op->c
1016                                 goto csum_err
1017                         /*
1018                          * rchecksum_bio sets
1019                          * this isn't always
1020                          * an extent from unc
1021                          */
1022                         crc.compression_type
1023                         crc.nonce = nonce;
1024                 } else {
1025                         if ((op->flags & BCH_
1026                             bch2_rechecksum_b
1027                                         NULL,
1028                                         src_l
1029                                         bio_s
1030                                         op->c
1031                                 goto csum_err
1032
1033                         crc.compressed_size
1034                         crc.uncompressed_size
1035                         crc.live_size
1036
1037                         swap(dst->bi_iter.bi_
1038                         ret = bch2_encrypt_bi
1039
1040                         if (ret)
1041                                 goto err;
1042
1043                         crc.csum = bch2_check
1044                                          exte
1045                         crc.csum_type = op->c
1046                         swap(dst->bi_iter.bi_
1047                 }
1048
1049                 init_append_extent(op, wp, ve
1050
1051                 if (dst != src)
1052                         bio_advance(dst, dst_
1053                 bio_advance(src, src_len);
1054                 total_output    += dst_len;
1055                 total_input     += src_len;
1056         } while (dst->bi_iter.bi_size &&
1057                  src->bi_iter.bi_size &&
1058                  wp->sectors_free &&
1059                  !bch2_keylist_realloc(&op->i
1060                                       op->inl
1061                                       ARRAY_S
1062                                       BKEY_EX
1063
1064         more = src->bi_iter.bi_size != 0;
1065
1066         dst->bi_iter = saved_iter;
1067
        /* Writing straight from @src with data left over: split off just the
         * consumed portion so the remainder can go to another write point. */
1068         if (dst == src && more) {
1069                 BUG_ON(total_output != total_
1070
1071                 dst = bio_split(src, total_in
1072                                 GFP_NOFS, &c-
1073                 wbio_init(dst)->put_bio = tru
1074                 /* copy WRITE_SYNC flag */
1075                 dst->bi_opf             = src
1076         }
1077
1078         dst->bi_iter.bi_size = total_output;
1079 do_write:
1080         *_dst = dst;
1081         return more;
1082 csum_err:
1083         bch_err_inum_offset_ratelimited(c,
1084                 op->pos.inode,
1085                 op->pos.offset << 9,
1086                 "%s write error: error verify
1087                 op->flags & BCH_WRITE_MOVE ?
1088                 ret = -EIO;
1089 err:
1090         if (to_wbio(dst)->bounce)
1091                 bch2_bio_free_pages_pool(c, d
1092         if (to_wbio(dst)->put_bio)
1093                 bio_put(dst);
1094
1095         return ret;
1096 }
1097                                                  
/*
 * Decide whether an existing extent can be overwritten in place (nocow path):
 * true only for KEY_TYPE_extent keys with no encoded (checksummed/compressed
 * — the truncated condition likely adds more, e.g. cached pointers; confirm)
 * pointers, whose durability sums to at least the op's requested replica
 * count.  The pointer walk runs under rcu_read_lock(), presumably because
 * durability lookups dereference RCU-protected device state — verify.
 */
1098 static bool bch2_extent_is_writeable(struct b
1099                                      struct b
1100 {
1101         struct bch_fs *c = op->c;
1102         struct bkey_s_c_extent e;
1103         struct extent_ptr_decoded p;
1104         const union bch_extent_entry *entry;
1105         unsigned replicas = 0;
1106
1107         if (k.k->type != KEY_TYPE_extent)
1108                 return false;
1109
1110         e = bkey_s_c_to_extent(k);
1111
1112         rcu_read_lock();
1113         extent_for_each_ptr_decode(e, p, entr
1114                 if (crc_is_encoded(p.crc) ||
1115                         rcu_read_unlock();
1116                         return false;
1117                 }
1118
1119                 replicas += bch2_extent_ptr_d
1120         }
1121         rcu_read_unlock();
1122
1123         return replicas >= op->opts.data_repl
1124 }
1125                                                  
/*
 * After a nocow write to unwritten space completes: convert the overlapping
 * range of one btree extent from unwritten to written.
 *
 * Silently succeeds (returns 0) if the extent no longer matches what we
 * wrote — someone else changed it while the IO was in flight (the comment
 * "trace this" marks a missing tracepoint).  Otherwise makes a mutable copy
 * of the key, trims it to exactly the written range [start(orig), orig.k.p],
 * clears the unwritten flag on every pointer, and issues the i_size/sectors
 * update plus the btree update in one expression (both truncated — the trans
 * update appears to pass a BTREE_UPDAT[E_*] flag; confirm).  Per the retained
 * comment, snapshot-aware updates were already done when the write was
 * kicked off, so this intentionally updates only the leaf extent.
 * NOTE(review): the parameter list (trans, iter, orig, ...) is truncated —
 * confirm against the full source.
 */
1126 static int bch2_nocow_write_convert_one_unwri
1127
1128
1129
1130
1131 {
1132         if (!bch2_extents_match(bkey_i_to_s_c
1133                 /* trace this */
1134                 return 0;
1135         }
1136
1137         struct bkey_i *new = bch2_bkey_make_m
1138         int ret = PTR_ERR_OR_ZERO(new);
1139         if (ret)
1140                 return ret;
1141
1142         bch2_cut_front(bkey_start_pos(&orig->
1143         bch2_cut_back(orig->k.p, new);
1144
1145         struct bkey_ptrs ptrs = bch2_bkey_ptr
1146         bkey_for_each_ptr(ptrs, ptr)
1147                 ptr->unwritten = 0;
1148
1149         /*
1150          * Note that we're not calling bch2_s
1151          * that was done when we kicked off t
1152          * that we update the extent that we
1153          * since been created. The write is s
1154          * w.r.t. snapshot atomicity:
1155          */
1156         return  bch2_extent_update_i_size_sec
1157                                         min(n
1158                 bch2_trans_update(trans, iter
1159                                   BTREE_UPDAT
1160 }
1161                                                  
1162 static void bch2_nocow_write_convert_unwritte    
1163 {                                                
1164         struct bch_fs *c = op->c;                
1165         struct btree_trans *trans = bch2_tran    
1166                                                  
1167         for_each_keylist_key(&op->insert_keys    
1168                 int ret = for_each_btree_key_    
1169                                      bkey_sta    
1170                                      BTREE_IT    
1171                                      NULL, NU    
1172                         bch2_nocow_write_conv    
1173                 }));                             
1174                                                  
1175                 if (ret && !bch2_err_matches(    
1176                         struct bkey_i *insert    
1177                                                  
1178                         bch_err_inum_offset_r    
1179                                 insert->k.p.i    
1180                                 "%s write err    
1181                                 op->flags & B    
1182                                 bch2_err_str(    
1183                 }                                
1184                                                  
1185                 if (ret) {                       
1186                         op->error = ret;         
1187                         break;                   
1188                 }                                
1189         }                                        
1190                                                  
1191         bch2_trans_put(trans);                   
1192 }                                                
1193                                                  
1194 static void __bch2_nocow_write_done(struct bc    
1195 {                                                
1196         if (unlikely(op->flags & BCH_WRITE_IO    
1197                 op->error = -EIO;                
1198         } else if (unlikely(op->flags & BCH_W    
1199                 bch2_nocow_write_convert_unwr    
1200 }                                                
1201                                                  
1202 static CLOSURE_CALLBACK(bch2_nocow_write_done    
1203 {                                                
1204         closure_type(op, struct bch_write_op,    
1205                                                  
1206         __bch2_nocow_write_done(op);             
1207         bch2_write_done(cl);                     
1208 }                                                
1209                                                  
1210 struct bucket_to_lock {                          
1211         struct bpos             b;               
1212         unsigned                gen;             
1213         struct nocow_lock_bucket *l;             
1214 };                                               
1215                                                  
1216 static void bch2_nocow_write(struct bch_write    
1217 {                                                
1218         struct bch_fs *c = op->c;                
1219         struct btree_trans *trans;               
1220         struct btree_iter iter;                  
1221         struct bkey_s_c k;                       
1222         DARRAY_PREALLOCATED(struct bucket_to_    
1223         u32 snapshot;                            
1224         struct bucket_to_lock *stale_at;         
1225         int stale, ret;                          
1226                                                  
1227         if (op->flags & BCH_WRITE_MOVE)          
1228                 return;                          
1229                                                  
1230         darray_init(&buckets);                   
1231         trans = bch2_trans_get(c);               
1232 retry:                                           
1233         bch2_trans_begin(trans);                 
1234                                                  
1235         ret = bch2_subvolume_get_snapshot(tra    
1236         if (unlikely(ret))                       
1237                 goto err;                        
1238                                                  
1239         bch2_trans_iter_init(trans, &iter, BT    
1240                              SPOS(op->pos.ino    
1241                              BTREE_ITER_slots    
1242         while (1) {                              
1243                 struct bio *bio = &op->wbio.b    
1244                                                  
1245                 buckets.nr = 0;                  
1246                                                  
1247                 ret = bch2_trans_relock(trans    
1248                 if (ret)                         
1249                         break;                   
1250                                                  
1251                 k = bch2_btree_iter_peek_slot    
1252                 ret = bkey_err(k);               
1253                 if (ret)                         
1254                         break;                   
1255                                                  
1256                 /* fall back to normal cow wr    
1257                 if (unlikely(k.k->p.snapshot     
1258                              !bch2_extent_is_    
1259                         break;                   
1260                                                  
1261                 if (bch2_keylist_realloc(&op-    
1262                                          op->    
1263                                          ARRA    
1264                                          k.k-    
1265                         break;                   
1266                                                  
1267                 /* Get iorefs before dropping    
1268                 struct bkey_ptrs_c ptrs = bch    
1269                 bkey_for_each_ptr(ptrs, ptr)     
1270                         struct bch_dev *ca =     
1271                         if (unlikely(!ca))       
1272                                 goto err_get_    
1273                                                  
1274                         struct bpos b = PTR_B    
1275                         struct nocow_lock_buc    
1276                                 bucket_nocow_    
1277                         prefetch(l);             
1278                                                  
1279                         /* XXX allocating mem    
1280                         darray_push_gfp(&buck    
1281                                                  
1282                                                  
1283                                                  
1284                         if (ptr->unwritten)      
1285                                 op->flags |=     
1286                 }                                
1287                                                  
1288                 /* Unlock before taking nocow    
1289                 bkey_reassemble(op->insert_ke    
1290                 bch2_trans_unlock(trans);        
1291                                                  
1292                 bch2_cut_front(op->pos, op->i    
1293                 if (op->flags & BCH_WRITE_CON    
1294                         bch2_cut_back(POS(op-    
1295                                                  
1296                 darray_for_each(buckets, i) {    
1297                         struct bch_dev *ca =     
1298                                                  
1299                         __bch2_bucket_nocow_l    
1300                                                  
1301                                                  
1302                                                  
1303                         int gen = bucket_gen_    
1304                         stale = gen < 0 ? gen    
1305                         if (unlikely(stale))     
1306                                 stale_at = i;    
1307                                 goto err_buck    
1308                         }                        
1309                 }                                
1310                                                  
1311                 bio = &op->wbio.bio;             
1312                 if (k.k->p.offset < op->pos.o    
1313                         bio = bio_split(bio,     
1314                                         GFP_K    
1315                         wbio_init(bio)->put_b    
1316                         bio->bi_opf = op->wbi    
1317                 } else {                         
1318                         op->flags |= BCH_WRIT    
1319                 }                                
1320                                                  
1321                 op->pos.offset += bio_sectors    
1322                 op->written += bio_sectors(bi    
1323                                                  
1324                 bio->bi_end_io  = bch2_write_    
1325                 bio->bi_private = &op->cl;       
1326                 bio->bi_opf |= REQ_OP_WRITE;     
1327                 closure_get(&op->cl);            
1328                 bch2_submit_wbio_replicas(to_    
1329                                           op-    
1330                                                  
1331                 bch2_keylist_push(&op->insert    
1332                 if (op->flags & BCH_WRITE_SUB    
1333                         break;                   
1334                 bch2_btree_iter_advance(&iter    
1335         }                                        
1336 out:                                             
1337         bch2_trans_iter_exit(trans, &iter);      
1338 err:                                             
1339         if (bch2_err_matches(ret, BCH_ERR_tra    
1340                 goto retry;                      
1341                                                  
1342         if (ret) {                               
1343                 bch_err_inum_offset_ratelimit    
1344                         op->pos.inode, op->po    
1345                         "%s: btree lookup err    
1346                 op->error = ret;                 
1347                 op->flags |= BCH_WRITE_SUBMIT    
1348         }                                        
1349                                                  
1350         bch2_trans_put(trans);                   
1351         darray_exit(&buckets);                   
1352                                                  
1353         /* fallback to cow write path? */        
1354         if (!(op->flags & BCH_WRITE_SUBMITTED    
1355                 closure_sync(&op->cl);           
1356                 __bch2_nocow_write_done(op);     
1357                 op->insert_keys.top = op->ins    
1358         } else if (op->flags & BCH_WRITE_SYNC    
1359                 closure_sync(&op->cl);           
1360                 bch2_nocow_write_done(&op->cl    
1361         } else {                                 
1362                 /*                               
1363                  * XXX                           
1364                  * needs to run out of proces    
1365                  * a mutex                       
1366                  */                              
1367                 continue_at(&op->cl, bch2_noc    
1368         }                                        
1369         return;                                  
1370 err_get_ioref:                                   
1371         darray_for_each(buckets, i)              
1372                 percpu_ref_put(&bch2_dev_have    
1373                                                  
1374         /* Fall back to COW path: */             
1375         goto out;                                
1376 err_bucket_stale:                                
1377         darray_for_each(buckets, i) {            
1378                 bch2_bucket_nocow_unlock(&c->    
1379                 if (i == stale_at)               
1380                         break;                   
1381         }                                        
1382                                                  
1383         struct printbuf buf = PRINTBUF;          
1384         if (bch2_fs_inconsistent_on(stale < 0    
1385                                     "pointer     
1386                                     stale_at-    
1387                                     (bch2_bke    
1388                 ret = -EIO;                      
1389         } else {                                 
1390                 /* We can retry this: */         
1391                 ret = -BCH_ERR_transaction_re    
1392         }                                        
1393         printbuf_exit(&buf);                     
1394                                                  
1395         goto err_get_ioref;                      
1396 }                                                
1397                                                  
1398 static void __bch2_write(struct bch_write_op     
1399 {                                                
1400         struct bch_fs *c = op->c;                
1401         struct write_point *wp = NULL;           
1402         struct bio *bio = NULL;                  
1403         unsigned nofs_flags;                     
1404         int ret;                                 
1405                                                  
1406         nofs_flags = memalloc_nofs_save();       
1407                                                  
1408         if (unlikely(op->opts.nocow && c->opt    
1409                 bch2_nocow_write(op);            
1410                 if (op->flags & BCH_WRITE_SUB    
1411                         goto out_nofs_restore    
1412         }                                        
1413 again:                                           
1414         memset(&op->failed, 0, sizeof(op->fai    
1415                                                  
1416         do {                                     
1417                 struct bkey_i *key_to_write;     
1418                 unsigned key_to_write_offset     
1419                         op->insert_keys.keys_    
1420                                                  
1421                 /* +1 for possible cache devi    
1422                 if (op->open_buckets.nr + op-    
1423                     ARRAY_SIZE(op->open_bucke    
1424                         break;                   
1425                                                  
1426                 if (bch2_keylist_realloc(&op-    
1427                                         op->i    
1428                                         ARRAY    
1429                                         BKEY_    
1430                         break;                   
1431                                                  
1432                 /*                               
1433                  * The copygc thread is now g    
1434                  * freeing up space on specif    
1435                  * allocations for specific d    
1436                  */                              
1437                 ret = bch2_trans_run(c, lockr    
1438                         bch2_alloc_sectors_st    
1439                                 op->target,      
1440                                 op->opts.eras    
1441                                 op->write_poi    
1442                                 &op->devs_hav    
1443                                 op->nr_replic    
1444                                 op->nr_replic    
1445                                 op->watermark    
1446                                 op->flags,       
1447                                 &op->cl, &wp)    
1448                 if (unlikely(ret)) {             
1449                         if (bch2_err_matches(    
1450                                 break;           
1451                                                  
1452                         goto err;                
1453                 }                                
1454                                                  
1455                 EBUG_ON(!wp);                    
1456                                                  
1457                 bch2_open_bucket_get(c, wp, &    
1458                 ret = bch2_write_extent(op, w    
1459                                                  
1460                 bch2_alloc_sectors_done_inlin    
1461 err:                                             
1462                 if (ret <= 0) {                  
1463                         op->flags |= BCH_WRIT    
1464                                                  
1465                         if (ret < 0) {           
1466                                 if (!(op->fla    
1467                                         bch_e    
1468                                                  
1469                                                  
1470                                                  
1471                                                  
1472                                                  
1473                                 op->error = r    
1474                                 break;           
1475                         }                        
1476                 }                                
1477                                                  
1478                 bio->bi_end_io  = bch2_write_    
1479                 bio->bi_private = &op->cl;       
1480                 bio->bi_opf |= REQ_OP_WRITE;     
1481                                                  
1482                 closure_get(bio->bi_private);    
1483                                                  
1484                 key_to_write = (void *) (op->    
1485                                          key_    
1486                                                  
1487                 bch2_submit_wbio_replicas(to_    
1488                                           key    
1489         } while (ret);                           
1490                                                  
1491         /*                                       
1492          * Sync or no?                           
1493          *                                       
1494          * If we're running asynchronously, w    
1495          * synchronously here if we weren't a    
1496          * once, as that signals backpressure    
1497          */                                      
1498         if ((op->flags & BCH_WRITE_SYNC) ||      
1499             (!(op->flags & BCH_WRITE_SUBMITTE    
1500              !(op->flags & BCH_WRITE_IN_WORKE    
1501                 bch2_wait_on_allocator(c, &op    
1502                                                  
1503                 __bch2_write_index(op);          
1504                                                  
1505                 if (!(op->flags & BCH_WRITE_S    
1506                         goto again;              
1507                 bch2_write_done(&op->cl);        
1508         } else {                                 
1509                 bch2_write_queue(op, wp);        
1510                 continue_at(&op->cl, bch2_wri    
1511         }                                        
1512 out_nofs_restore:                                
1513         memalloc_nofs_restore(nofs_flags);       
1514 }                                                
1515                                                  
1516 static void bch2_write_data_inline(struct bch    
1517 {                                                
1518         struct bio *bio = &op->wbio.bio;         
1519         struct bvec_iter iter;                   
1520         struct bkey_i_inline_data *id;           
1521         unsigned sectors;                        
1522         int ret;                                 
1523                                                  
1524         memset(&op->failed, 0, sizeof(op->fai    
1525                                                  
1526         op->flags |= BCH_WRITE_WROTE_DATA_INL    
1527         op->flags |= BCH_WRITE_SUBMITTED;        
1528                                                  
1529         bch2_check_set_feature(op->c, BCH_FEA    
1530                                                  
1531         ret = bch2_keylist_realloc(&op->inser    
1532                                    ARRAY_SIZE    
1533                                    BKEY_U64s     
1534         if (ret) {                               
1535                 op->error = ret;                 
1536                 goto err;                        
1537         }                                        
1538                                                  
1539         sectors = bio_sectors(bio);              
1540         op->pos.offset += sectors;               
1541                                                  
1542         id = bkey_inline_data_init(op->insert    
1543         id->k.p         = op->pos;               
1544         id->k.bversion  = op->version;           
1545         id->k.size      = sectors;               
1546                                                  
1547         iter = bio->bi_iter;                     
1548         iter.bi_size = data_len;                 
1549         memcpy_from_bio(id->v.data, bio, iter    
1550                                                  
1551         while (data_len & 7)                     
1552                 id->v.data[data_len++] = '\0'    
1553         set_bkey_val_bytes(&id->k, data_len);    
1554         bch2_keylist_push(&op->insert_keys);     
1555                                                  
1556         __bch2_write_index(op);                  
1557 err:                                             
1558         bch2_write_done(&op->cl);                
1559 }                                                
1560                                                  
1561 /**                                              
1562  * bch2_write() - handle a write to a cache d    
1563  * @cl:         &bch_write_op->cl                
1564  *                                               
1565  * This is the starting point for any data to    
1566  * be from a normal write, or a writeback wri    
1567  * volume - it's also used by the moving garb    
1568  * mostly empty buckets.                         
1569  *                                               
1570  * It first writes the data to the cache, cre    
1571  * (if the data won't fit in a single open bu    
1572  * after the data is written it calls bch_jou    
1573  * added to the next journal write they're in    
1574  *                                               
1575  * If op->discard is true, instead of inserti    
1576  * region of the cache represented by op->bio    
1577  */                                              
1578 CLOSURE_CALLBACK(bch2_write)                     
1579 {                                                
1580         closure_type(op, struct bch_write_op,    
1581         struct bio *bio = &op->wbio.bio;         
1582         struct bch_fs *c = op->c;                
1583         unsigned data_len;                       
1584                                                  
1585         EBUG_ON(op->cl.parent);                  
1586         BUG_ON(!op->nr_replicas);                
1587         BUG_ON(!op->write_point.v);              
1588         BUG_ON(bkey_eq(op->pos, POS_MAX));       
1589                                                  
1590         if (op->flags & BCH_WRITE_ONLY_SPECIF    
1591                 op->flags |= BCH_WRITE_ALLOC_    
1592                                                  
1593         op->nr_replicas_required = min_t(unsi    
1594         op->start_time = local_clock();          
1595         bch2_keylist_init(&op->insert_keys, o    
1596         wbio_init(bio)->put_bio = false;         
1597                                                  
1598         if (bio->bi_iter.bi_size & (c->opts.b    
1599                 bch_err_inum_offset_ratelimit    
1600                         op->pos.inode,           
1601                         op->pos.offset << 9,     
1602                         "%s write error: misa    
1603                         op->flags & BCH_WRITE    
1604                 op->error = -EIO;                
1605                 goto err;                        
1606         }                                        
1607                                                  
1608         if (c->opts.nochanges) {                 
1609                 op->error = -BCH_ERR_erofs_no    
1610                 goto err;                        
1611         }                                        
1612                                                  
1613         if (!(op->flags & BCH_WRITE_MOVE) &&     
1614             !bch2_write_ref_tryget(c, BCH_WRI    
1615                 op->error = -BCH_ERR_erofs_no    
1616                 goto err;                        
1617         }                                        
1618                                                  
1619         this_cpu_add(c->counters[BCH_COUNTER_    
1620         bch2_increment_clock(c, bio_sectors(b    
1621                                                  
1622         data_len = min_t(u64, bio->bi_iter.bi    
1623                          op->new_i_size - (op    
1624                                                  
1625         if (c->opts.inline_data &&               
1626             data_len <= min(block_bytes(c) /     
1627                 bch2_write_data_inline(op, da    
1628                 return;                          
1629         }                                        
1630                                                  
1631         __bch2_write(op);                        
1632         return;                                  
1633 err:                                             
1634         bch2_disk_reservation_put(c, &op->res    
1635                                                  
1636         closure_debug_destroy(&op->cl);          
1637         if (op->end_io)                          
1638                 op->end_io(op);                  
1639 }                                                
1640                                                  
1641 static const char * const bch2_write_flags[]     
1642 #define x(f)    #f,                              
1643         BCH_WRITE_FLAGS()                        
1644 #undef x                                         
1645         NULL                                     
1646 };                                               
1647                                                  
1648 void bch2_write_op_to_text(struct printbuf *o    
1649 {                                                
1650         prt_str(out, "pos: ");                   
1651         bch2_bpos_to_text(out, op->pos);         
1652         prt_newline(out);                        
1653         printbuf_indent_add(out, 2);             
1654                                                  
1655         prt_str(out, "started: ");               
1656         bch2_pr_time_units(out, local_clock()    
1657         prt_newline(out);                        
1658                                                  
1659         prt_str(out, "flags: ");                 
1660         prt_bitflags(out, bch2_write_flags, o    
1661         prt_newline(out);                        
1662                                                  
1663         prt_printf(out, "ref: %u\n", closure_    
1664                                                  
1665         printbuf_indent_sub(out, 2);             
1666 }                                                
1667                                                  
1668 void bch2_fs_io_write_exit(struct bch_fs *c)     
1669 {                                                
1670         mempool_exit(&c->bio_bounce_pages);      
1671         bioset_exit(&c->replica_set);            
1672         bioset_exit(&c->bio_write);              
1673 }                                                
1674                                                  
1675 int bch2_fs_io_write_init(struct bch_fs *c)      
1676 {                                                
1677         if (bioset_init(&c->bio_write,   1, o    
1678             bioset_init(&c->replica_set, 4, o    
1679                 return -BCH_ERR_ENOMEM_bio_wr    
1680                                                  
1681         if (mempool_init_page_pool(&c->bio_bo    
1682                                    max_t(unsi    
1683                                          c->o    
1684                                          c->o    
1685                                    PAGE_SIZE,    
1686                 return -BCH_ERR_ENOMEM_bio_bo    
1687                                                  
1688         return 0;                                
1689 }                                                
1690                                                  

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

kernel.org | git.kernel.org | LWN.net | Project Home | SVN repository | Mail admin

Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.

sflogo.php