1 // SPDX-License-Identifier: GPL-2.0 1 // SPDX-License-Identifier: GPL-2.0 2 #ifndef NO_BCACHEFS_FS 2 #ifndef NO_BCACHEFS_FS 3 3 4 #include "bcachefs.h" 4 #include "bcachefs.h" 5 #include "alloc_foreground.h" 5 #include "alloc_foreground.h" 6 #include "bkey_buf.h" 6 #include "bkey_buf.h" 7 #include "btree_update.h" 7 #include "btree_update.h" 8 #include "buckets.h" 8 #include "buckets.h" 9 #include "clock.h" 9 #include "clock.h" 10 #include "error.h" 10 #include "error.h" 11 #include "extents.h" 11 #include "extents.h" 12 #include "extent_update.h" 12 #include "extent_update.h" 13 #include "fs.h" 13 #include "fs.h" 14 #include "fs-io.h" 14 #include "fs-io.h" 15 #include "fs-io-buffered.h" 15 #include "fs-io-buffered.h" 16 #include "fs-io-pagecache.h" 16 #include "fs-io-pagecache.h" 17 #include "fsck.h" 17 #include "fsck.h" 18 #include "inode.h" 18 #include "inode.h" 19 #include "journal.h" 19 #include "journal.h" 20 #include "io_misc.h" 20 #include "io_misc.h" 21 #include "keylist.h" 21 #include "keylist.h" 22 #include "quota.h" 22 #include "quota.h" 23 #include "reflink.h" 23 #include "reflink.h" 24 #include "trace.h" 24 #include "trace.h" 25 25 26 #include <linux/aio.h> 26 #include <linux/aio.h> 27 #include <linux/backing-dev.h> 27 #include <linux/backing-dev.h> 28 #include <linux/falloc.h> 28 #include <linux/falloc.h> 29 #include <linux/migrate.h> 29 #include <linux/migrate.h> 30 #include <linux/mmu_context.h> 30 #include <linux/mmu_context.h> 31 #include <linux/pagevec.h> 31 #include <linux/pagevec.h> 32 #include <linux/rmap.h> 32 #include <linux/rmap.h> 33 #include <linux/sched/signal.h> 33 #include <linux/sched/signal.h> 34 #include <linux/task_io_accounting_ops.h> 34 #include <linux/task_io_accounting_ops.h> 35 #include <linux/uio.h> 35 #include <linux/uio.h> 36 36 37 #include <trace/events/writeback.h> 37 #include <trace/events/writeback.h> 38 38 39 struct nocow_flush { 39 struct nocow_flush { 40 struct closure *cl; 40 struct closure *cl; 41 struct bch_dev *ca; 41 struct bch_dev *ca; 42 struct bio bio; 42 struct bio bio; 43 }; 43 }; 44 44 45 static void nocow_flush_endio(struct bio *_bio 45 static void nocow_flush_endio(struct bio *_bio) 46 { 46 { 47 47 48 struct nocow_flush *bio = container_of 48 struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio); 49 49 50 closure_put(bio->cl); 50 closure_put(bio->cl); 51 percpu_ref_put(&bio->ca->io_ref); 51 percpu_ref_put(&bio->ca->io_ref); 52 bio_put(&bio->bio); 52 bio_put(&bio->bio); 53 } 53 } 54 54 55 void bch2_inode_flush_nocow_writes_async(struc 55 void bch2_inode_flush_nocow_writes_async(struct bch_fs *c, 56 struc 56 struct bch_inode_info *inode, 57 struc 57 struct closure *cl) 58 { 58 { 59 struct nocow_flush *bio; 59 struct nocow_flush *bio; 60 struct bch_dev *ca; 60 struct bch_dev *ca; 61 struct bch_devs_mask devs; 61 struct bch_devs_mask devs; 62 unsigned dev; 62 unsigned dev; 63 63 64 dev = find_first_bit(inode->ei_devs_ne 64 dev = find_first_bit(inode->ei_devs_need_flush.d, BCH_SB_MEMBERS_MAX); 65 if (dev == BCH_SB_MEMBERS_MAX) 65 if (dev == BCH_SB_MEMBERS_MAX) 66 return; 66 return; 67 67 68 devs = inode->ei_devs_need_flush; 68 devs = inode->ei_devs_need_flush; 69 memset(&inode->ei_devs_need_flush, 0, 69 memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush)); 70 70 71 for_each_set_bit(dev, devs.d, BCH_SB_M 71 for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) { 72 rcu_read_lock(); 72 rcu_read_lock(); 73 ca = rcu_dereference(c->devs[d 73 ca = rcu_dereference(c->devs[dev]); 74 if (ca && !percpu_ref_tryget(& 74 if (ca && !percpu_ref_tryget(&ca->io_ref)) 75 ca = NULL; 75 ca = NULL; 76 rcu_read_unlock(); 76 rcu_read_unlock(); 77 77 78 if (!ca) 78 if (!ca) 79 continue; 79 continue; 80 80 81 bio = container_of(bio_alloc_b 81 bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0, 82 82 REQ_OP_WRITE|REQ_PREFLUSH, 83 83 GFP_KERNEL, 84 84 &c->nocow_flush_bioset), 85 struct noco 85 struct nocow_flush, bio); 86 bio->cl = cl; 86 bio->cl = cl; 87 bio->ca = ca; 87 bio->ca = ca; 88 bio->bio.bi_end_io = noco 88 bio->bio.bi_end_io = nocow_flush_endio; 89 closure_bio_submit(&bio->bio, 89 closure_bio_submit(&bio->bio, cl); 90 } 90 } 91 } 91 } 92 92 93 static int bch2_inode_flush_nocow_writes(struc 93 static int bch2_inode_flush_nocow_writes(struct bch_fs *c, 94 struc 94 struct bch_inode_info *inode) 95 { 95 { 96 struct closure cl; 96 struct closure cl; 97 97 98 closure_init_stack(&cl); 98 closure_init_stack(&cl); 99 bch2_inode_flush_nocow_writes_async(c, 99 bch2_inode_flush_nocow_writes_async(c, inode, &cl); 100 closure_sync(&cl); 100 closure_sync(&cl); 101 101 102 return 0; 102 return 0; 103 } 103 } 104 104 105 /* i_size updates: */ 105 /* i_size updates: */ 106 106 107 struct inode_new_size { 107 struct inode_new_size { 108 loff_t new_size; 108 loff_t new_size; 109 u64 now; 109 u64 now; 110 unsigned fields; 110 unsigned fields; 111 }; 111 }; 112 112 113 static int inode_set_size(struct btree_trans * 113 static int inode_set_size(struct btree_trans *trans, 114 struct bch_inode_inf 114 struct bch_inode_info *inode, 115 struct bch_inode_unp 115 struct bch_inode_unpacked *bi, 116 void *p) 116 void *p) 117 { 117 { 118 struct inode_new_size *s = p; 118 struct inode_new_size *s = p; 119 119 120 bi->bi_size = s->new_size; 120 bi->bi_size = s->new_size; 121 if (s->fields & ATTR_ATIME) 121 if (s->fields & ATTR_ATIME) 122 bi->bi_atime = s->now; 122 bi->bi_atime = s->now; 123 if (s->fields & ATTR_MTIME) 123 if (s->fields & ATTR_MTIME) 124 bi->bi_mtime = s->now; 124 bi->bi_mtime = s->now; 125 if (s->fields & ATTR_CTIME) 125 if (s->fields & ATTR_CTIME) 126 bi->bi_ctime = s->now; 126 bi->bi_ctime = s->now; 127 127 128 return 0; 128 return 0; 129 } 129 } 130 130 131 int __must_check bch2_write_inode_size(struct 131 int __must_check bch2_write_inode_size(struct bch_fs *c, 132 struct 132 struct bch_inode_info *inode, 133 loff_t 133 loff_t new_size, unsigned fields) 134 { 134 { 135 struct inode_new_size s = { 135 struct inode_new_size s = { 136 .new_size = new_size, 136 .new_size = new_size, 137 .now = bch2_current 137 .now = bch2_current_time(c), 138 .fields = fields, 138 .fields = fields, 139 }; 139 }; 140 140 141 return bch2_write_inode(c, inode, inod 141 return bch2_write_inode(c, inode, inode_set_size, &s, fields); 142 } 142 } 143 143 144 void __bch2_i_sectors_acct(struct bch_fs *c, s 144 void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, 145 struct quota_res *q 145 struct quota_res *quota_res, s64 sectors) 146 { 146 { 147 bch2_fs_inconsistent_on((s64) inode->v 147 bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c, 148 "inode %lu i_b 148 "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)", 149 inode->v.i_ino 149 inode->v.i_ino, (u64) inode->v.i_blocks, sectors, 150 inode->ei_inod 150 inode->ei_inode.bi_sectors); 151 inode->v.i_blocks += sectors; 151 inode->v.i_blocks += sectors; 152 152 153 #ifdef CONFIG_BCACHEFS_QUOTA 153 #ifdef CONFIG_BCACHEFS_QUOTA 154 if (quota_res && 154 if (quota_res && 155 !test_bit(EI_INODE_SNAPSHOT, &inod 155 !test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags) && 156 sectors > 0) { 156 sectors > 0) { 157 BUG_ON(sectors > quota_res->se 157 BUG_ON(sectors > quota_res->sectors); 158 BUG_ON(sectors > inode->ei_quo 158 BUG_ON(sectors > inode->ei_quota_reserved); 159 159 160 quota_res->sectors -= sectors; 160 quota_res->sectors -= sectors; 161 inode->ei_quota_reserved -= se 161 inode->ei_quota_reserved -= sectors; 162 } else { 162 } else { 163 bch2_quota_acct(c, inode->ei_q 163 bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN); 164 } 164 } 165 #endif 165 #endif 166 } 166 } 167 167 168 /* fsync: */ 168 /* fsync: */ 169 169 170 /* 170 /* 171 * inode->ei_inode.bi_journal_seq won't be up 171 * inode->ei_inode.bi_journal_seq won't be up to date since it's set in an 172 * insert trigger: look up the btree inode ins 172 * insert trigger: look up the btree inode instead 173 */ 173 */ 174 static int bch2_flush_inode(struct bch_fs *c, 174 static int bch2_flush_inode(struct bch_fs *c, 175 struct bch_inode_i 175 struct bch_inode_info *inode) 176 { 176 { 177 if (c->opts.journal_flush_disabled) 177 if (c->opts.journal_flush_disabled) 178 return 0; 178 return 0; 179 179 180 if (!bch2_write_ref_tryget(c, BCH_WRIT 180 if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync)) 181 return -EROFS; 181 return -EROFS; 182 182 183 struct bch_inode_unpacked u; 183 struct bch_inode_unpacked u; 184 int ret = bch2_inode_find_by_inum(c, i 184 int ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u) ?: 185 bch2_journal_flush_seq(&c->j !! 185 bch2_journal_flush_seq(&c->journal, u.bi_journal_seq) ?: 186 bch2_inode_flush_nocow_write 186 bch2_inode_flush_nocow_writes(c, inode); 187 bch2_write_ref_put(c, BCH_WRITE_REF_fs 187 bch2_write_ref_put(c, BCH_WRITE_REF_fsync); 188 return ret; 188 return ret; 189 } 189 } 190 190 191 int bch2_fsync(struct file *file, loff_t start 191 int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) 192 { 192 { 193 struct bch_inode_info *inode = file_bc 193 struct bch_inode_info *inode = file_bch_inode(file); 194 struct bch_fs *c = inode->v.i_sb->s_fs 194 struct bch_fs *c = inode->v.i_sb->s_fs_info; 195 int ret, err; !! 195 int ret; 196 << 197 trace_bch2_fsync(file, datasync); << 198 196 199 ret = file_write_and_wait_range(file, 197 ret = file_write_and_wait_range(file, start, end); 200 if (ret) 198 if (ret) 201 goto out; 199 goto out; 202 ret = sync_inode_metadata(&inode->v, 1 200 ret = sync_inode_metadata(&inode->v, 1); 203 if (ret) 201 if (ret) 204 goto out; 202 goto out; 205 ret = bch2_flush_inode(c, inode); 203 ret = bch2_flush_inode(c, inode); 206 out: 204 out: 207 ret = bch2_err_class(ret); 205 ret = bch2_err_class(ret); 208 if (ret == -EROFS) 206 if (ret == -EROFS) 209 ret = -EIO; 207 ret = -EIO; 210 << 211 err = file_check_and_advance_wb_err(fi << 212 if (!ret) << 213 ret = err; << 214 << 215 return ret; 208 return ret; 216 } 209 } 217 210 218 /* truncate: */ 211 /* truncate: */ 219 212 220 static inline int range_has_data(struct bch_fs 213 static inline int range_has_data(struct bch_fs *c, u32 subvol, 221 struct bpos s 214 struct bpos start, 222 struct bpos e 215 struct bpos end) 223 { 216 { 224 return bch2_trans_run(c, !! 217 struct btree_trans *trans = bch2_trans_get(c); 225 for_each_btree_key_in_subvolum !! 218 struct btree_iter iter; 226 !! 219 struct bkey_s_c k; 227 bkey_extent_is_data(k. !! 220 int ret = 0; 228 }))); !! 221 retry: >> 222 bch2_trans_begin(trans); >> 223 >> 224 ret = bch2_subvolume_get_snapshot(trans, subvol, &start.snapshot); >> 225 if (ret) >> 226 goto err; >> 227 >> 228 for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents, start, end, 0, k, ret) >> 229 if (bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k)) { >> 230 ret = 1; >> 231 break; >> 232 } >> 233 start = iter.pos; >> 234 bch2_trans_iter_exit(trans, &iter); >> 235 err: >> 236 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) >> 237 goto retry; >> 238 >> 239 bch2_trans_put(trans); >> 240 return ret; 229 } 241 } 230 242 231 static int __bch2_truncate_folio(struct bch_in 243 static int __bch2_truncate_folio(struct bch_inode_info *inode, 232 pgoff_t index 244 pgoff_t index, loff_t start, loff_t end) 233 { 245 { 234 struct bch_fs *c = inode->v.i_sb->s_fs 246 struct bch_fs *c = inode->v.i_sb->s_fs_info; 235 struct address_space *mapping = inode- 247 struct address_space *mapping = inode->v.i_mapping; 236 struct bch_folio *s; 248 struct bch_folio *s; 237 unsigned start_offset; 249 unsigned start_offset; 238 unsigned end_offset; 250 unsigned end_offset; 239 unsigned i; 251 unsigned i; 240 struct folio *folio; 252 struct folio *folio; 241 s64 i_sectors_delta = 0; 253 s64 i_sectors_delta = 0; 242 int ret = 0; 254 int ret = 0; 243 u64 end_pos; 255 u64 end_pos; 244 256 245 folio = filemap_lock_folio(mapping, in 257 folio = filemap_lock_folio(mapping, index); 246 if (IS_ERR_OR_NULL(folio)) { 258 if (IS_ERR_OR_NULL(folio)) { 247 /* 259 /* 248 * XXX: we're doing two index 260 * XXX: we're doing two index lookups when we end up reading the 249 * folio 261 * folio 250 */ 262 */ 251 ret = range_has_data(c, inode- !! 263 ret = range_has_data(c, inode->ei_subvol, 252 POS(inode->v.i 264 POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT)), 253 POS(inode->v.i 265 POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT) + PAGE_SECTORS)); 254 if (ret <= 0) 266 if (ret <= 0) 255 return ret; 267 return ret; 256 268 257 folio = __filemap_get_folio(ma 269 folio = __filemap_get_folio(mapping, index, 258 FG 270 FGP_LOCK|FGP_CREAT, GFP_KERNEL); 259 if (IS_ERR_OR_NULL(folio)) { 271 if (IS_ERR_OR_NULL(folio)) { 260 ret = -ENOMEM; 272 ret = -ENOMEM; 261 goto out; 273 goto out; 262 } 274 } 263 } 275 } 264 276 265 BUG_ON(start >= folio_end_pos(folio 277 BUG_ON(start >= folio_end_pos(folio)); 266 BUG_ON(end <= folio_pos(folio)); 278 BUG_ON(end <= folio_pos(folio)); 267 279 268 start_offset = max(start, folio_pos 280 start_offset = max(start, folio_pos(folio)) - folio_pos(folio); 269 end_offset = min_t(u64, end, foli 281 end_offset = min_t(u64, end, folio_end_pos(folio)) - folio_pos(folio); 270 282 271 /* Folio boundary? Nothing to do */ 283 /* Folio boundary? Nothing to do */ 272 if (start_offset == 0 && 284 if (start_offset == 0 && 273 end_offset == folio_size(folio)) { 285 end_offset == folio_size(folio)) { 274 ret = 0; 286 ret = 0; 275 goto unlock; 287 goto unlock; 276 } 288 } 277 289 278 s = bch2_folio_create(folio, 0); 290 s = bch2_folio_create(folio, 0); 279 if (!s) { 291 if (!s) { 280 ret = -ENOMEM; 292 ret = -ENOMEM; 281 goto unlock; 293 goto unlock; 282 } 294 } 283 295 284 if (!folio_test_uptodate(folio)) { 296 if (!folio_test_uptodate(folio)) { 285 ret = bch2_read_single_folio(f 297 ret = bch2_read_single_folio(folio, mapping); 286 if (ret) 298 if (ret) 287 goto unlock; 299 goto unlock; 288 } 300 } 289 301 290 ret = bch2_folio_set(c, inode_inum(ino 302 ret = bch2_folio_set(c, inode_inum(inode), &folio, 1); 291 if (ret) 303 if (ret) 292 goto unlock; 304 goto unlock; 293 305 294 for (i = round_up(start_offset, block_ 306 for (i = round_up(start_offset, block_bytes(c)) >> 9; 295 i < round_down(end_offset, block_ 307 i < round_down(end_offset, block_bytes(c)) >> 9; 296 i++) { 308 i++) { 297 s->s[i].nr_replicas = 0; 309 s->s[i].nr_replicas = 0; 298 310 299 i_sectors_delta -= s->s[i].sta 311 i_sectors_delta -= s->s[i].state == SECTOR_dirty; 300 bch2_folio_sector_set(folio, s 312 bch2_folio_sector_set(folio, s, i, SECTOR_unallocated); 301 } 313 } 302 314 303 bch2_i_sectors_acct(c, inode, NULL, i_ 315 bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); 304 316 305 /* 317 /* 306 * Caller needs to know whether this f 318 * Caller needs to know whether this folio will be written out by 307 * writeback - doing an i_size update 319 * writeback - doing an i_size update if necessary - or whether it will 308 * be responsible for the i_size updat 320 * be responsible for the i_size update. 309 * 321 * 310 * Note that we shouldn't ever see a f 322 * Note that we shouldn't ever see a folio beyond EOF, but check and 311 * warn if so. This has been observed 323 * warn if so. This has been observed by failure to clean up folios 312 * after a short write and there's sti 324 * after a short write and there's still a chance reclaim will fix 313 * things up. 325 * things up. 314 */ 326 */ 315 WARN_ON_ONCE(folio_pos(folio) >= inode 327 WARN_ON_ONCE(folio_pos(folio) >= inode->v.i_size); 316 end_pos = folio_end_pos(folio); 328 end_pos = folio_end_pos(folio); 317 if (inode->v.i_size > folio_pos(folio) 329 if (inode->v.i_size > folio_pos(folio)) 318 end_pos = min_t(u64, inode->v. 330 end_pos = min_t(u64, inode->v.i_size, end_pos); 319 ret = s->s[folio_pos_to_s(folio, end_p 331 ret = s->s[folio_pos_to_s(folio, end_pos - 1)].state >= SECTOR_dirty; 320 332 321 folio_zero_segment(folio, start_offset 333 folio_zero_segment(folio, start_offset, end_offset); 322 334 323 /* 335 /* 324 * Bit of a hack - we don't want trunc 336 * Bit of a hack - we don't want truncate to fail due to -ENOSPC. 325 * 337 * 326 * XXX: because we aren't currently tr 338 * XXX: because we aren't currently tracking whether the folio has actual 327 * data in it (vs. just 0s, or only pa 339 * data in it (vs. just 0s, or only partially written) this wrong. ick. 328 */ 340 */ 329 BUG_ON(bch2_get_folio_disk_reservation 341 BUG_ON(bch2_get_folio_disk_reservation(c, inode, folio, false)); 330 342 331 /* 343 /* 332 * This removes any writeable userspac 344 * This removes any writeable userspace mappings; we need to force 333 * .page_mkwrite to be called again be 345 * .page_mkwrite to be called again before any mmapped writes, to 334 * redirty the full page: 346 * redirty the full page: 335 */ 347 */ 336 folio_mkclean(folio); 348 folio_mkclean(folio); 337 filemap_dirty_folio(mapping, folio); 349 filemap_dirty_folio(mapping, folio); 338 unlock: 350 unlock: 339 folio_unlock(folio); 351 folio_unlock(folio); 340 folio_put(folio); 352 folio_put(folio); 341 out: 353 out: 342 return ret; 354 return ret; 343 } 355 } 344 356 345 static int bch2_truncate_folio(struct bch_inod 357 static int bch2_truncate_folio(struct bch_inode_info *inode, loff_t from) 346 { 358 { 347 return __bch2_truncate_folio(inode, fr 359 return __bch2_truncate_folio(inode, from >> PAGE_SHIFT, 348 from, ANY 360 from, ANYSINT_MAX(loff_t)); 349 } 361 } 350 362 351 static int bch2_truncate_folios(struct bch_ino 363 static int bch2_truncate_folios(struct bch_inode_info *inode, 352 loff_t start, 364 loff_t start, loff_t end) 353 { 365 { 354 int ret = __bch2_truncate_folio(inode, 366 int ret = __bch2_truncate_folio(inode, start >> PAGE_SHIFT, 355 start, 367 start, end); 356 368 357 if (ret >= 0 && 369 if (ret >= 0 && 358 start >> PAGE_SHIFT != end >> PAGE 370 start >> PAGE_SHIFT != end >> PAGE_SHIFT) 359 ret = __bch2_truncate_folio(in 371 ret = __bch2_truncate_folio(inode, 360 (end - 372 (end - 1) >> PAGE_SHIFT, 361 start, 373 start, end); 362 return ret; 374 return ret; 363 } 375 } 364 376 365 static int bch2_extend(struct mnt_idmap *idmap 377 static int bch2_extend(struct mnt_idmap *idmap, 366 struct bch_inode_info * 378 struct bch_inode_info *inode, 367 struct bch_inode_unpack 379 struct bch_inode_unpacked *inode_u, 368 struct iattr *iattr) 380 struct iattr *iattr) 369 { 381 { 370 struct address_space *mapping = inode- 382 struct address_space *mapping = inode->v.i_mapping; 371 int ret; 383 int ret; 372 384 373 /* 385 /* 374 * sync appends: 386 * sync appends: 375 * 387 * 376 * this has to be done _before_ extend 388 * this has to be done _before_ extending i_size: 377 */ 389 */ 378 ret = filemap_write_and_wait_range(map 390 ret = filemap_write_and_wait_range(mapping, inode_u->bi_size, S64_MAX); 379 if (ret) 391 if (ret) 380 return ret; 392 return ret; 381 393 382 truncate_setsize(&inode->v, iattr->ia_ 394 truncate_setsize(&inode->v, iattr->ia_size); 383 395 384 return bch2_setattr_nonsize(idmap, ino 396 return bch2_setattr_nonsize(idmap, inode, iattr); 385 } 397 } 386 398 387 int bchfs_truncate(struct mnt_idmap *idmap, 399 int bchfs_truncate(struct mnt_idmap *idmap, 388 struct bch_inode_info *inode 400 struct bch_inode_info *inode, struct iattr *iattr) 389 { 401 { 390 struct bch_fs *c = inode->v.i_sb->s_fs 402 struct bch_fs *c = inode->v.i_sb->s_fs_info; 391 struct address_space *mapping = inode- 403 struct address_space *mapping = inode->v.i_mapping; 392 struct bch_inode_unpacked inode_u; 404 struct bch_inode_unpacked inode_u; 393 s64 i_sectors_delta = 0; 405 s64 i_sectors_delta = 0; 394 int ret = 0; 406 int ret = 0; 395 407 396 /* 408 /* 397 * If the truncate call with change th 409 * If the truncate call with change the size of the file, the 398 * cmtimes should be updated. If the s 410 * cmtimes should be updated. If the size will not change, we 399 * do not need to update the cmtimes. 411 * do not need to update the cmtimes. 400 */ 412 */ 401 if (iattr->ia_size != inode->v.i_size) 413 if (iattr->ia_size != inode->v.i_size) { 402 if (!(iattr->ia_valid & ATTR_M 414 if (!(iattr->ia_valid & ATTR_MTIME)) 403 ktime_get_coarse_real_ 415 ktime_get_coarse_real_ts64(&iattr->ia_mtime); 404 if (!(iattr->ia_valid & ATTR_C 416 if (!(iattr->ia_valid & ATTR_CTIME)) 405 ktime_get_coarse_real_ 417 ktime_get_coarse_real_ts64(&iattr->ia_ctime); 406 iattr->ia_valid |= ATTR_MTIME| 418 iattr->ia_valid |= ATTR_MTIME|ATTR_CTIME; 407 } 419 } 408 420 409 inode_dio_wait(&inode->v); 421 inode_dio_wait(&inode->v); 410 bch2_pagecache_block_get(inode); 422 bch2_pagecache_block_get(inode); 411 423 412 ret = bch2_inode_find_by_inum(c, inode 424 ret = bch2_inode_find_by_inum(c, inode_inum(inode), &inode_u); 413 if (ret) 425 if (ret) 414 goto err; 426 goto err; 415 427 416 /* 428 /* 417 * check this before next assertion; o 429 * check this before next assertion; on filesystem error our normal 418 * invariants are a bit broken (trunca 430 * invariants are a bit broken (truncate has to truncate the page cache 419 * before the inode). 431 * before the inode). 420 */ 432 */ 421 ret = bch2_journal_error(&c->journal); 433 ret = bch2_journal_error(&c->journal); 422 if (ret) 434 if (ret) 423 goto err; 435 goto err; 424 436 425 WARN_ONCE(!test_bit(EI_INODE_ERROR, &i 437 WARN_ONCE(!test_bit(EI_INODE_ERROR, &inode->ei_flags) && 426 inode->v.i_size < inode_u.bi 438 inode->v.i_size < inode_u.bi_size, 427 "truncate spotted in mem i_s 439 "truncate spotted in mem i_size < btree i_size: %llu < %llu\n", 428 (u64) inode->v.i_size, inode 440 (u64) inode->v.i_size, inode_u.bi_size); 429 441 430 if (iattr->ia_size > inode->v.i_size) 442 if (iattr->ia_size > inode->v.i_size) { 431 ret = bch2_extend(idmap, inode 443 ret = bch2_extend(idmap, inode, &inode_u, iattr); 432 goto err; 444 goto err; 433 } 445 } 434 446 435 iattr->ia_valid &= ~ATTR_SIZE; 447 iattr->ia_valid &= ~ATTR_SIZE; 436 448 437 ret = bch2_truncate_folio(inode, iattr 449 ret = bch2_truncate_folio(inode, iattr->ia_size); 438 if (unlikely(ret < 0)) 450 if (unlikely(ret < 0)) 439 goto err; 451 goto err; 440 452 441 truncate_setsize(&inode->v, iattr->ia_ 453 truncate_setsize(&inode->v, iattr->ia_size); 442 454 443 /* 455 /* 444 * When extending, we're going to writ 456 * When extending, we're going to write the new i_size to disk 445 * immediately so we need to flush any 457 * immediately so we need to flush anything above the current on disk 446 * i_size first: 458 * i_size first: 447 * 459 * 448 * Also, when extending we need to flu 460 * Also, when extending we need to flush the page that i_size currently 449 * straddles - if it's mapped to users 461 * straddles - if it's mapped to userspace, we need to ensure that 450 * userspace has to redirty it and cal 462 * userspace has to redirty it and call .mkwrite -> set_page_dirty 451 * again to allocate the part of the p 463 * again to allocate the part of the page that was extended. 452 */ 464 */ 453 if (iattr->ia_size > inode_u.bi_size) 465 if (iattr->ia_size > inode_u.bi_size) 454 ret = filemap_write_and_wait_r 466 ret = filemap_write_and_wait_range(mapping, 455 inode_u.bi_siz 467 inode_u.bi_size, 456 iattr->ia_size 468 iattr->ia_size - 1); 457 else if (iattr->ia_size & (PAGE_SIZE - 469 else if (iattr->ia_size & (PAGE_SIZE - 1)) 458 ret = filemap_write_and_wait_r 470 ret = filemap_write_and_wait_range(mapping, 459 round_down(iat 471 round_down(iattr->ia_size, PAGE_SIZE), 460 iattr->ia_size 472 iattr->ia_size - 1); 461 if (ret) 473 if (ret) 462 goto err; 474 goto err; 463 475 464 ret = bch2_truncate(c, inode_inum(inod 476 ret = bch2_truncate(c, inode_inum(inode), iattr->ia_size, &i_sectors_delta); 465 bch2_i_sectors_acct(c, inode, NULL, i_ 477 bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); 466 478 467 if (unlikely(ret)) { 479 if (unlikely(ret)) { 468 /* 480 /* 469 * If we error here, VFS cache 481 * If we error here, VFS caches are now inconsistent with btree 470 */ 482 */ 471 set_bit(EI_INODE_ERROR, &inode 483 set_bit(EI_INODE_ERROR, &inode->ei_flags); 472 goto err; 484 goto err; 473 } 485 } 474 486 475 bch2_fs_inconsistent_on(!inode->v.i_si 487 bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks && 476 !bch2_journal_ 488 !bch2_journal_error(&c->journal), c, 477 "inode %lu tru 489 "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)", 478 inode->v.i_ino 490 inode->v.i_ino, (u64) inode->v.i_blocks, 479 inode->ei_inod 491 inode->ei_inode.bi_sectors); 480 492 481 ret = bch2_setattr_nonsize(idmap, inod 493 ret = bch2_setattr_nonsize(idmap, inode, iattr); 482 err: 494 err: 483 bch2_pagecache_block_put(inode); 495 bch2_pagecache_block_put(inode); 484 return bch2_err_class(ret); 496 return bch2_err_class(ret); 485 } 497 } 486 498 487 /* fallocate: */ 499 /* fallocate: */ 488 500 489 static int inode_update_times_fn(struct btree_ 501 static int inode_update_times_fn(struct btree_trans *trans, 490 struct bch_in 502 struct bch_inode_info *inode, 491 struct bch_in 503 struct bch_inode_unpacked *bi, void *p) 492 { 504 { 493 struct bch_fs *c = inode->v.i_sb->s_fs 505 struct bch_fs *c = inode->v.i_sb->s_fs_info; 494 506 495 bi->bi_mtime = bi->bi_ctime = bch2_cur 507 bi->bi_mtime = bi->bi_ctime = bch2_current_time(c); 496 return 0; 508 return 0; 497 } 509 } 498 510 499 static noinline long bchfs_fpunch(struct bch_i !! 511 static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) 500 { 512 { 501 struct bch_fs *c = inode->v.i_sb->s_fs 513 struct bch_fs *c = inode->v.i_sb->s_fs_info; 502 u64 end = offset + len; 514 u64 end = offset + len; 503 u64 block_start = round_up(offset, blo 515 u64 block_start = round_up(offset, block_bytes(c)); 504 u64 block_end = round_down(end, bloc 516 u64 block_end = round_down(end, block_bytes(c)); 505 bool truncated_last_page; 517 bool truncated_last_page; 506 int ret = 0; 518 int ret = 0; 507 519 508 ret = bch2_truncate_folios(inode, offs 520 ret = bch2_truncate_folios(inode, offset, end); 509 if (unlikely(ret < 0)) 521 if (unlikely(ret < 0)) 510 goto err; 522 goto err; 511 523 512 truncated_last_page = ret; 524 truncated_last_page = ret; 513 525 514 truncate_pagecache_range(&inode->v, of 526 truncate_pagecache_range(&inode->v, offset, end - 1); 515 527 516 if (block_start < block_end) { 528 if (block_start < block_end) { 517 s64 i_sectors_delta = 0; 529 s64 i_sectors_delta = 0; 518 530 519 ret = bch2_fpunch(c, inode_inu 531 ret = bch2_fpunch(c, inode_inum(inode), 520 block_start 532 block_start >> 9, block_end >> 9, 521 &i_sectors_d 533 &i_sectors_delta); 522 bch2_i_sectors_acct(c, inode, 534 bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); 523 } 535 } 524 536 525 mutex_lock(&inode->ei_update_lock); 537 mutex_lock(&inode->ei_update_lock); 526 if (end >= inode->v.i_size && !truncat 538 if (end >= inode->v.i_size && !truncated_last_page) { 527 ret = bch2_write_inode_size(c, 539 ret = bch2_write_inode_size(c, inode, inode->v.i_size, 528 AT 540 ATTR_MTIME|ATTR_CTIME); 529 } else { 541 } else { 530 ret = bch2_write_inode(c, inod 542 ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, 531 ATTR_MT 543 ATTR_MTIME|ATTR_CTIME); 532 } 544 } 533 mutex_unlock(&inode->ei_update_lock); 545 mutex_unlock(&inode->ei_update_lock); 534 err: 546 err: 535 return ret; 547 return ret; 536 } 548 } 537 549 538 static noinline long bchfs_fcollapse_finsert(s !! 550 static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, 539 loff_t offs 551 loff_t offset, loff_t len, 540 bool insert 552 bool insert) 541 { 553 { 542 struct bch_fs *c = inode->v.i_sb->s_fs 554 struct bch_fs *c = inode->v.i_sb->s_fs_info; 543 struct address_space *mapping = inode- 555 struct address_space *mapping = inode->v.i_mapping; 544 s64 i_sectors_delta = 0; 556 s64 i_sectors_delta = 0; 545 int ret = 0; 557 int ret = 0; 546 558 547 if ((offset | len) & (block_bytes(c) - 559 if ((offset | len) & (block_bytes(c) - 1)) 548 return -EINVAL; 560 return -EINVAL; 549 561 550 if (insert) { 562 if (insert) { 551 if (offset >= inode->v.i_size) 563 if (offset >= inode->v.i_size) 552 return -EINVAL; 564 return -EINVAL; 553 } else { 565 } else { 554 if (offset + len >= inode->v.i 566 if (offset + len >= inode->v.i_size) 555 return -EINVAL; 567 return -EINVAL; 556 } 568 } 557 569 558 ret = bch2_write_invalidate_inode_page 570 ret = bch2_write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX); 559 if (ret) 571 if (ret) 560 return ret; 572 return ret; 561 573 562 if (insert) 574 if (insert) 563 i_size_write(&inode->v, inode- 575 i_size_write(&inode->v, inode->v.i_size + len); 564 576 565 ret = bch2_fcollapse_finsert(c, inode_ 577 ret = bch2_fcollapse_finsert(c, inode_inum(inode), offset >> 9, len >> 9, 566 insert, & 578 insert, &i_sectors_delta); 567 if (!ret && !insert) 579 if (!ret && !insert) 568 i_size_write(&inode->v, inode- 580 i_size_write(&inode->v, inode->v.i_size - len); 569 bch2_i_sectors_acct(c, inode, NULL, i_ 581 bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); 570 582 571 return ret; 583 return ret; 572 } 584 } 573 585 574 static noinline int __bchfs_fallocate(struct b !! 586 static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, 575 u64 start_sector, 587 u64 start_sector, u64 end_sector) 576 { 588 { 577 struct bch_fs *c = inode->v.i_sb->s_fs 589 struct bch_fs *c = inode->v.i_sb->s_fs_info; 578 struct btree_trans *trans = bch2_trans 590 struct btree_trans *trans = bch2_trans_get(c); 579 struct btree_iter iter; 591 struct btree_iter iter; 580 struct bpos end_pos = POS(inode->v.i_i 592 struct bpos end_pos = POS(inode->v.i_ino, end_sector); 581 struct bch_io_opts opts; 593 struct bch_io_opts opts; 582 int ret = 0; 594 int ret = 0; 583 595 584 bch2_inode_opts_get(&opts, c, &inode-> 596 bch2_inode_opts_get(&opts, c, &inode->ei_inode); 585 597 586 bch2_trans_iter_init(trans, &iter, BTR 598 bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, 587 POS(inode->v.i_ino, st 599 POS(inode->v.i_ino, start_sector), 588 BTREE_ITER_slots|BTREE 600 BTREE_ITER_slots|BTREE_ITER_intent); 589 601 590 while (!ret) { !! 602 while (!ret && bkey_lt(iter.pos, end_pos)) { 591 s64 i_sectors_delta = 0; 603 s64 i_sectors_delta = 0; 592 struct quota_res quota_res = { 604 struct quota_res quota_res = { 0 }; 593 struct bkey_s_c k; 605 struct bkey_s_c k; 594 unsigned sectors; 606 unsigned sectors; 595 bool is_allocation; 607 bool is_allocation; 596 u64 hole_start, hole_end; 608 u64 hole_start, hole_end; 597 u32 snapshot; 609 u32 snapshot; 598 610 599 bch2_trans_begin(trans); 611 bch2_trans_begin(trans); 600 612 601 if (bkey_ge(iter.pos, end_pos) << 602 break; << 603 << 604 ret = bch2_subvolume_get_snaps 613 ret = bch2_subvolume_get_snapshot(trans, 605 inode- !! 614 inode->ei_subvol, &snapshot); 606 if (ret) 615 if (ret) 607 goto bkey_err; 616 goto bkey_err; 608 617 609 bch2_btree_iter_set_snapshot(& 618 bch2_btree_iter_set_snapshot(&iter, snapshot); 610 619 611 k = bch2_btree_iter_peek_slot( 620 k = bch2_btree_iter_peek_slot(&iter); 612 if ((ret = bkey_err(k))) 621 if ((ret = bkey_err(k))) 613 goto bkey_err; 622 goto bkey_err; 614 623 615 hole_start = iter.pos.off 624 hole_start = iter.pos.offset; 616 hole_end = bpos_min(k.k 625 hole_end = bpos_min(k.k->p, end_pos).offset; 617 is_allocation = bkey_extent_ 626 is_allocation = bkey_extent_is_allocation(k.k); 618 627 619 /* already reserved */ 628 /* already reserved */ 620 if (bkey_extent_is_reservation 629 if (bkey_extent_is_reservation(k) && 621 bch2_bkey_nr_ptrs_fully_al 630 bch2_bkey_nr_ptrs_fully_allocated(k) >= opts.data_replicas) { 622 bch2_btree_iter_advanc 631 bch2_btree_iter_advance(&iter); 623 continue; 632 continue; 624 } 633 } 625 634 626 if (bkey_extent_is_data(k.k) & 635 if (bkey_extent_is_data(k.k) && 627 !(mode & FALLOC_FL_ZERO_RA 636 !(mode & FALLOC_FL_ZERO_RANGE)) { 628 bch2_btree_iter_advanc 637 bch2_btree_iter_advance(&iter); 629 continue; 638 continue; 630 } 639 } 631 640 632 if (!(mode & FALLOC_FL_ZERO_RA 641 if (!(mode & FALLOC_FL_ZERO_RANGE)) { 633 /* 642 /* 634 * Lock ordering - can 643 * Lock ordering - can't be holding btree locks while 635 * blocking on a folio 644 * blocking on a folio lock: 636 */ 645 */ 637 if (bch2_clamp_data_ho 646 if (bch2_clamp_data_hole(&inode->v, 638 647 &hole_start, 639 648 &hole_end, 640 !! 649 opts.data_replicas, true)) 641 ret = drop_loc 650 ret = drop_locks_do(trans, 642 (bch2_ 651 (bch2_clamp_data_hole(&inode->v, 643 652 &hole_start, 644 653 &hole_end, 645 654 opts.data_replicas, false), 0)); 646 if (ret) << 647 goto b << 648 } << 649 bch2_btree_iter_set_po 655 bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, hole_start)); 650 656 651 if (ret) 657 if (ret) 652 goto bkey_err; 658 goto bkey_err; 653 659 654 if (hole_start == hole 660 if (hole_start == hole_end) 655 continue; 661 continue; 656 } 662 } 657 663 658 sectors = hole_end - hole_star 664 sectors = hole_end - hole_start; 659 665 660 if (!is_allocation) { 666 if (!is_allocation) { 661 ret = bch2_quota_reser 667 ret = bch2_quota_reservation_add(c, inode, 662 "a 668 "a_res, sectors, true); 663 if (unlikely(ret)) 669 if (unlikely(ret)) 664 goto bkey_err; 670 goto bkey_err; 665 } 671 } 666 672 667 ret = bch2_extent_fallocate(tr 673 ret = bch2_extent_fallocate(trans, inode_inum(inode), &iter, 668 se 674 sectors, opts, &i_sectors_delta, 669 wr 675 writepoint_hashed((unsigned long) current)); 670 if (ret) 676 if (ret) 671 goto bkey_err; 677 goto bkey_err; 672 678 673 bch2_i_sectors_acct(c, inode, 679 bch2_i_sectors_acct(c, inode, "a_res, i_sectors_delta); 674 680 675 if (bch2_mark_pagecache_reserv 681 if (bch2_mark_pagecache_reserved(inode, &hole_start, 676 !! 682 iter.pos.offset, true)) 677 ret = drop_locks_do(tr !! 683 drop_locks_do(trans, 678 bch2_mark_page 684 bch2_mark_pagecache_reserved(inode, &hole_start, 679 685 iter.pos.offset, false)); 680 if (ret) << 681 goto bkey_err; << 682 } << 683 bkey_err: 686 bkey_err: 684 bch2_quota_reservation_put(c, 687 bch2_quota_reservation_put(c, inode, "a_res); 685 if (bch2_err_matches(ret, BCH_ 688 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 686 ret = 0; 689 ret = 0; 687 } 690 } 688 691 689 if (bch2_err_matches(ret, ENOSPC) && ( 692 if (bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)) { 690 struct quota_res quota_res = { 693 struct quota_res quota_res = { 0 }; 691 s64 i_sectors_delta = 0; 694 s64 i_sectors_delta = 0; 692 695 693 bch2_fpunch_at(trans, &iter, i 696 bch2_fpunch_at(trans, &iter, inode_inum(inode), 694 end_sector, &i_ 697 end_sector, &i_sectors_delta); 695 bch2_i_sectors_acct(c, inode, 698 bch2_i_sectors_acct(c, inode, "a_res, i_sectors_delta); 696 bch2_quota_reservation_put(c, 699 bch2_quota_reservation_put(c, inode, "a_res); 697 } 700 } 698 701 699 bch2_trans_iter_exit(trans, &iter); 702 bch2_trans_iter_exit(trans, &iter); 700 bch2_trans_put(trans); 703 bch2_trans_put(trans); 701 return ret; 704 return ret; 702 } 705 } 703 706 704 static noinline long bchfs_fallocate(struct bc !! 707 static long bchfs_fallocate(struct bch_inode_info *inode, int mode, 705 loff_t offset, lof 708 loff_t offset, loff_t len) 706 { 709 { 707 struct bch_fs *c = inode->v.i_sb->s_fs 710 struct bch_fs *c = inode->v.i_sb->s_fs_info; 708 u64 end = offset + len; 711 u64 end = offset + len; 709 u64 block_start = round_down(offset, 712 u64 block_start = round_down(offset, block_bytes(c)); 710 u64 block_end = round_up(end, 713 u64 block_end = round_up(end, block_bytes(c)); 711 bool truncated_last_page = false; 714 bool truncated_last_page = false; 712 int ret, ret2 = 0; 715 int ret, ret2 = 0; 713 716 714 if (!(mode & FALLOC_FL_KEEP_SIZE) && e 717 if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) { 715 ret = inode_newsize_ok(&inode- 718 ret = inode_newsize_ok(&inode->v, end); 716 if (ret) 719 if (ret) 717 return ret; 720 return ret; 718 } 721 } 719 722 720 if (mode & FALLOC_FL_ZERO_RANGE) { 723 if (mode & FALLOC_FL_ZERO_RANGE) { 721 ret = bch2_truncate_folios(ino 724 ret = bch2_truncate_folios(inode, offset, end); 722 if (unlikely(ret < 0)) 725 if (unlikely(ret < 0)) 723 return ret; 726 return ret; 724 727 725 truncated_last_page = ret; 728 truncated_last_page = ret; 726 729 727 truncate_pagecache_range(&inod 730 truncate_pagecache_range(&inode->v, offset, end - 1); 728 731 729 block_start = round_up(off 732 block_start = round_up(offset, block_bytes(c)); 730 block_end = round_down(e 733 block_end = round_down(end, block_bytes(c)); 731 } 734 } 732 735 733 ret = __bchfs_fallocate(inode, mode, b 736 ret = __bchfs_fallocate(inode, mode, block_start >> 9, block_end >> 9); 734 737 735 /* 738 /* 736 * On -ENOSPC in ZERO_RANGE mode, we s 739 * On -ENOSPC in ZERO_RANGE mode, we still want to do the inode update, 737 * so that the VFS cache i_size is con 740 * so that the VFS cache i_size is consistent with the btree i_size: 738 */ 741 */ 739 if (ret && 742 if (ret && 740 !(bch2_err_matches(ret, ENOSPC) && 743 !(bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE))) 741 return ret; 744 return ret; 742 745 743 if (mode & FALLOC_FL_KEEP_SIZE && end 746 if (mode & FALLOC_FL_KEEP_SIZE && end > inode->v.i_size) 744 end = inode->v.i_size; 747 end = inode->v.i_size; 745 748 746 if (end >= inode->v.i_size && 749 if (end >= inode->v.i_size && 747 (((mode & FALLOC_FL_ZERO_RANGE) && 750 (((mode & FALLOC_FL_ZERO_RANGE) && !truncated_last_page) || 748 !(mode & FALLOC_FL_KEEP_SIZE))) { 751 !(mode & FALLOC_FL_KEEP_SIZE))) { 749 spin_lock(&inode->v.i_lock); 752 spin_lock(&inode->v.i_lock); 750 i_size_write(&inode->v, end); 753 i_size_write(&inode->v, end); 751 spin_unlock(&inode->v.i_lock); 754 spin_unlock(&inode->v.i_lock); 752 755 753 mutex_lock(&inode->ei_update_l 756 mutex_lock(&inode->ei_update_lock); 754 ret2 = bch2_write_inode_size(c 757 ret2 = bch2_write_inode_size(c, inode, end, 0); 755 mutex_unlock(&inode->ei_update 758 mutex_unlock(&inode->ei_update_lock); 756 } 759 } 757 760 758 return ret ?: ret2; 761 return ret ?: ret2; 759 } 762 } 760 763 761 long bch2_fallocate_dispatch(struct file *file 764 long bch2_fallocate_dispatch(struct file *file, int mode, 762 loff_t offset, lo 765 loff_t offset, loff_t len) 763 { 766 { 764 struct bch_inode_info *inode = file_bc 767 struct bch_inode_info *inode = file_bch_inode(file); 765 struct bch_fs *c = inode->v.i_sb->s_fs 768 struct bch_fs *c = inode->v.i_sb->s_fs_info; 766 long ret; 769 long ret; 767 770 768 if (!bch2_write_ref_tryget(c, BCH_WRIT 771 if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fallocate)) 769 return -EROFS; 772 return -EROFS; 770 773 771 inode_lock(&inode->v); 774 inode_lock(&inode->v); 772 inode_dio_wait(&inode->v); 775 inode_dio_wait(&inode->v); 773 bch2_pagecache_block_get(inode); 776 bch2_pagecache_block_get(inode); 774 777 775 ret = file_modified(file); 778 ret = file_modified(file); 776 if (ret) 779 if (ret) 777 goto err; 780 goto err; 778 781 779 if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FAL 782 if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE))) 780 ret = bchfs_fallocate(inode, m 783 ret = bchfs_fallocate(inode, mode, offset, len); 781 else if (mode == (FALLOC_FL_PUNCH_HOLE 784 else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE)) 782 ret = bchfs_fpunch(inode, offs 785 ret = bchfs_fpunch(inode, offset, len); 783 else if (mode == FALLOC_FL_INSERT_RANG 786 else if (mode == FALLOC_FL_INSERT_RANGE) 784 ret = bchfs_fcollapse_finsert( 787 ret = bchfs_fcollapse_finsert(inode, offset, len, true); 785 else if (mode == FALLOC_FL_COLLAPSE_RA 788 else if (mode == FALLOC_FL_COLLAPSE_RANGE) 786 ret = bchfs_fcollapse_finsert( 789 ret = bchfs_fcollapse_finsert(inode, offset, len, false); 787 else 790 else 788 ret = -EOPNOTSUPP; 791 ret = -EOPNOTSUPP; 789 err: 792 err: 790 bch2_pagecache_block_put(inode); 793 bch2_pagecache_block_put(inode); 791 inode_unlock(&inode->v); 794 inode_unlock(&inode->v); 792 bch2_write_ref_put(c, BCH_WRITE_REF_fa 795 bch2_write_ref_put(c, BCH_WRITE_REF_fallocate); 793 796 794 return bch2_err_class(ret); 797 return bch2_err_class(ret); 795 } 798 } 796 799 797 /* 800 /* 798 * Take a quota reservation for unallocated bl 801 * Take a quota reservation for unallocated blocks in a given file range 799 * Does not check pagecache 802 * Does not check pagecache 800 */ 803 */ 801 static int quota_reserve_range(struct bch_inod 804 static int quota_reserve_range(struct bch_inode_info *inode, 802 struct quota_re 805 struct quota_res *res, 803 u64 start, u64 806 u64 start, u64 end) 804 { 807 { 805 struct bch_fs *c = inode->v.i_sb->s_fs 808 struct bch_fs *c = inode->v.i_sb->s_fs_info; >> 809 struct btree_trans *trans = bch2_trans_get(c); >> 810 struct btree_iter iter; >> 811 struct bkey_s_c k; >> 812 u32 snapshot; 806 u64 sectors = end - start; 813 u64 sectors = end - start; >> 814 u64 pos = start; >> 815 int ret; >> 816 retry: >> 817 bch2_trans_begin(trans); >> 818 >> 819 ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot); >> 820 if (ret) >> 821 goto err; 807 822 808 int ret = bch2_trans_run(c, !! 823 bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, 809 for_each_btree_key_in_subvolum !! 824 SPOS(inode->v.i_ino, pos, snapshot), 0); 810 BTREE_ID_exten !! 825 811 POS(inode->v.i !! 826 while (!(ret = btree_trans_too_many_iters(trans)) && 812 POS(inode->v.i !! 827 (k = bch2_btree_iter_peek_upto(&iter, POS(inode->v.i_ino, end - 1))).k && 813 inode->ei_inum !! 828 !(ret = bkey_err(k))) { 814 if (bkey_extent_is_all !! 829 if (bkey_extent_is_allocation(k.k)) { 815 u64 s = min(en !! 830 u64 s = min(end, k.k->p.offset) - 816 max(st !! 831 max(start, bkey_start_offset(k.k)); 817 BUG_ON(s > sec !! 832 BUG_ON(s > sectors); 818 sectors -= s; !! 833 sectors -= s; 819 } !! 834 } >> 835 bch2_btree_iter_advance(&iter); >> 836 } >> 837 pos = iter.pos.offset; >> 838 bch2_trans_iter_exit(trans, &iter); >> 839 err: >> 840 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) >> 841 goto retry; 820 842 821 0; !! 843 bch2_trans_put(trans); 822 }))); << 823 844 824 return ret ?: bch2_quota_reservation_a 845 return ret ?: bch2_quota_reservation_add(c, inode, res, sectors, true); 825 } 846 } 826 847 827 loff_t bch2_remap_file_range(struct file *file 848 loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, 828 struct file *file 849 struct file *file_dst, loff_t pos_dst, 829 loff_t len, unsig 850 loff_t len, unsigned remap_flags) 830 { 851 { 831 struct bch_inode_info *src = file_bch_ 852 struct bch_inode_info *src = file_bch_inode(file_src); 832 struct bch_inode_info *dst = file_bch_ 853 struct bch_inode_info *dst = file_bch_inode(file_dst); 833 struct bch_fs *c = src->v.i_sb->s_fs_i 854 struct bch_fs *c = src->v.i_sb->s_fs_info; 834 struct quota_res quota_res = { 0 }; 855 struct quota_res quota_res = { 0 }; 835 s64 i_sectors_delta = 0; 856 s64 i_sectors_delta = 0; 836 u64 aligned_len; 857 u64 aligned_len; 837 loff_t ret = 0; 858 loff_t ret = 0; 838 859 839 if (remap_flags & ~(REMAP_FILE_DEDUP|R 860 if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY)) 840 return -EINVAL; 861 return -EINVAL; 841 862 >> 863 if (remap_flags & REMAP_FILE_DEDUP) >> 864 return -EOPNOTSUPP; >> 865 842 if ((pos_src & (block_bytes(c) - 1)) | 866 if ((pos_src & (block_bytes(c) - 1)) || 843 (pos_dst & (block_bytes(c) - 1))) 867 (pos_dst & (block_bytes(c) - 1))) 844 return -EINVAL; 868 return -EINVAL; 845 869 846 if (src == dst && 870 if (src == dst && 847 abs(pos_src - pos_dst) < len) 871 abs(pos_src - pos_dst) < len) 848 return -EINVAL; 872 return -EINVAL; 849 873 850 lock_two_nondirectories(&src->v, &dst- 874 lock_two_nondirectories(&src->v, &dst->v); 851 bch2_lock_inodes(INODE_PAGECACHE_BLOCK 875 bch2_lock_inodes(INODE_PAGECACHE_BLOCK, src, dst); 852 876 853 inode_dio_wait(&src->v); 877 inode_dio_wait(&src->v); 854 inode_dio_wait(&dst->v); 878 inode_dio_wait(&dst->v); 855 879 856 ret = generic_remap_file_range_prep(fi 880 ret = generic_remap_file_range_prep(file_src, pos_src, 857 fi 881 file_dst, pos_dst, 858 &l 882 &len, remap_flags); 859 if (ret < 0 || len == 0) 883 if (ret < 0 || len == 0) 860 goto err; 884 goto err; 861 885 862 aligned_len = round_up((u64) len, bloc 886 aligned_len = round_up((u64) len, block_bytes(c)); 863 887 864 ret = bch2_write_invalidate_inode_page 888 ret = bch2_write_invalidate_inode_pages_range(dst->v.i_mapping, 865 pos_dst, pos_d 889 pos_dst, pos_dst + len - 1); 866 if (ret) 890 if (ret) 867 goto err; 891 goto err; 868 892 869 ret = quota_reserve_range(dst, "a_ 893 ret = quota_reserve_range(dst, "a_res, pos_dst >> 9, 870 (pos_dst + a 894 (pos_dst + aligned_len) >> 9); 871 if (ret) 895 if (ret) 872 goto err; 896 goto err; 873 897 874 if (!(remap_flags & REMAP_FILE_DEDUP)) !! 898 file_update_time(file_dst); 875 file_update_time(file_dst); << 876 899 877 bch2_mark_pagecache_unallocated(src, p 900 bch2_mark_pagecache_unallocated(src, pos_src >> 9, 878 (pos_src + 901 (pos_src + aligned_len) >> 9); 879 902 880 ret = bch2_remap_range(c, 903 ret = bch2_remap_range(c, 881 inode_inum(dst) 904 inode_inum(dst), pos_dst >> 9, 882 inode_inum(src) 905 inode_inum(src), pos_src >> 9, 883 aligned_len >> 906 aligned_len >> 9, 884 pos_dst + len, 907 pos_dst + len, &i_sectors_delta); 885 if (ret < 0) 908 if (ret < 0) 886 goto err; 909 goto err; 887 910 888 /* 911 /* 889 * due to alignment, we might have rem 912 * due to alignment, we might have remapped slightly more than requsted 890 */ 913 */ 891 ret = min((u64) ret << 9, (u64) len); 914 ret = min((u64) ret << 9, (u64) len); 892 915 893 bch2_i_sectors_acct(c, dst, "a_res 916 bch2_i_sectors_acct(c, dst, "a_res, i_sectors_delta); 894 917 895 spin_lock(&dst->v.i_lock); 918 spin_lock(&dst->v.i_lock); 896 if (pos_dst + ret > dst->v.i_size) 919 if (pos_dst + ret > dst->v.i_size) 897 i_size_write(&dst->v, pos_dst 920 i_size_write(&dst->v, pos_dst + ret); 898 spin_unlock(&dst->v.i_lock); 921 spin_unlock(&dst->v.i_lock); 899 922 900 if ((file_dst->f_flags & (__O_SYNC | O 923 if ((file_dst->f_flags & (__O_SYNC | O_DSYNC)) || 901 IS_SYNC(file_inode(file_dst))) 924 IS_SYNC(file_inode(file_dst))) 902 ret = bch2_flush_inode(c, dst) 925 ret = bch2_flush_inode(c, dst); 903 err: 926 err: 904 bch2_quota_reservation_put(c, dst, &qu 927 bch2_quota_reservation_put(c, dst, "a_res); 905 bch2_unlock_inodes(INODE_PAGECACHE_BLO 928 bch2_unlock_inodes(INODE_PAGECACHE_BLOCK, src, dst); 906 unlock_two_nondirectories(&src->v, &ds 929 unlock_two_nondirectories(&src->v, &dst->v); 907 930 908 return bch2_err_class(ret); 931 return bch2_err_class(ret); 909 } 932 } 910 933 911 /* fseek: */ 934 /* fseek: */ 912 935 913 static loff_t bch2_seek_data(struct file *file 936 static loff_t bch2_seek_data(struct file *file, u64 offset) 914 { 937 { 915 struct bch_inode_info *inode = file_bc 938 struct bch_inode_info *inode = file_bch_inode(file); 916 struct bch_fs *c = inode->v.i_sb->s_fs 939 struct bch_fs *c = inode->v.i_sb->s_fs_info; >> 940 struct btree_trans *trans; >> 941 struct btree_iter iter; >> 942 struct bkey_s_c k; 917 subvol_inum inum = inode_inum(inode); 943 subvol_inum inum = inode_inum(inode); 918 u64 isize, next_data = MAX_LFS_FILESIZ 944 u64 isize, next_data = MAX_LFS_FILESIZE; >> 945 u32 snapshot; >> 946 int ret; 919 947 920 isize = i_size_read(&inode->v); 948 isize = i_size_read(&inode->v); 921 if (offset >= isize) 949 if (offset >= isize) 922 return -ENXIO; 950 return -ENXIO; 923 951 924 int ret = bch2_trans_run(c, !! 952 trans = bch2_trans_get(c); 925 for_each_btree_key_in_subvolum !! 953 retry: 926 POS(inode-> !! 954 bch2_trans_begin(trans); 927 POS(inode-> !! 955 928 inum.subvol !! 956 ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); 929 if (bkey_extent_is_dat !! 957 if (ret) 930 next_data = ma !! 958 goto err; 931 break; !! 959 932 } else if (k.k->p.offs !! 960 for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents, 933 break; !! 961 SPOS(inode->v.i_ino, offset >> 9, snapshot), 934 0; !! 962 POS(inode->v.i_ino, U64_MAX), 935 }))); !! 963 0, k, ret) { >> 964 if (bkey_extent_is_data(k.k)) { >> 965 next_data = max(offset, bkey_start_offset(k.k) << 9); >> 966 break; >> 967 } else if (k.k->p.offset >> 9 > isize) >> 968 break; >> 969 } >> 970 bch2_trans_iter_exit(trans, &iter); >> 971 err: >> 972 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) >> 973 goto retry; >> 974 >> 975 bch2_trans_put(trans); 936 if (ret) 976 if (ret) 937 return ret; 977 return ret; 938 978 939 if (next_data > offset) 979 if (next_data > offset) 940 next_data = bch2_seek_pagecach 980 next_data = bch2_seek_pagecache_data(&inode->v, 941 offset 981 offset, next_data, 0, false); 942 982 943 if (next_data >= isize) 983 if (next_data >= isize) 944 return -ENXIO; 984 return -ENXIO; 945 985 946 return vfs_setpos(file, next_data, MAX 986 return vfs_setpos(file, next_data, MAX_LFS_FILESIZE); 947 } 987 } 948 988 949 static loff_t bch2_seek_hole(struct file *file 989 static loff_t bch2_seek_hole(struct file *file, u64 offset) 950 { 990 { 951 struct bch_inode_info *inode = file_bc 991 struct bch_inode_info *inode = file_bch_inode(file); 952 struct bch_fs *c = inode->v.i_sb->s_fs 992 struct bch_fs *c = inode->v.i_sb->s_fs_info; >> 993 struct btree_trans *trans; >> 994 struct btree_iter iter; >> 995 struct bkey_s_c k; 953 subvol_inum inum = inode_inum(inode); 996 subvol_inum inum = inode_inum(inode); 954 u64 isize, next_hole = MAX_LFS_FILESIZ 997 u64 isize, next_hole = MAX_LFS_FILESIZE; >> 998 u32 snapshot; >> 999 int ret; 955 1000 956 isize = i_size_read(&inode->v); 1001 isize = i_size_read(&inode->v); 957 if (offset >= isize) 1002 if (offset >= isize) 958 return -ENXIO; 1003 return -ENXIO; 959 1004 960 int ret = bch2_trans_run(c, !! 1005 trans = bch2_trans_get(c); 961 for_each_btree_key_in_subvolum !! 1006 retry: 962 POS(inode-> !! 1007 bch2_trans_begin(trans); 963 POS(inode-> !! 1008 964 inum.subvol !! 1009 ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); 965 if (k.k->p.inode != in !! 1010 if (ret) 966 next_hole = bc !! 1011 goto err; 967 !! 1012 >> 1013 for_each_btree_key_norestart(trans, iter, BTREE_ID_extents, >> 1014 SPOS(inode->v.i_ino, offset >> 9, snapshot), >> 1015 BTREE_ITER_slots, k, ret) { >> 1016 if (k.k->p.inode != inode->v.i_ino) { >> 1017 next_hole = bch2_seek_pagecache_hole(&inode->v, >> 1018 offset, MAX_LFS_FILESIZE, 0, false); >> 1019 break; >> 1020 } else if (!bkey_extent_is_data(k.k)) { >> 1021 next_hole = bch2_seek_pagecache_hole(&inode->v, >> 1022 max(offset, bkey_start_offset(k.k) << 9), >> 1023 k.k->p.offset << 9, 0, false); >> 1024 >> 1025 if (next_hole < k.k->p.offset << 9) 968 break; 1026 break; 969 } else if (!bkey_exten !! 1027 } else { 970 next_hole = bc !! 1028 offset = max(offset, bkey_start_offset(k.k) << 9); 971 !! 1029 } 972 !! 1030 } 973 !! 1031 bch2_trans_iter_exit(trans, &iter); 974 if (next_hole !! 1032 err: 975 break; !! 1033 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 976 } else { !! 1034 goto retry; 977 offset = max(o !! 1035 978 } !! 1036 bch2_trans_put(trans); 979 0; << 980 }))); << 981 if (ret) 1037 if (ret) 982 return ret; 1038 return ret; 983 1039 984 if (next_hole > isize) 1040 if (next_hole > isize) 985 next_hole = isize; 1041 next_hole = isize; 986 1042 987 return vfs_setpos(file, next_hole, MAX 1043 return vfs_setpos(file, next_hole, MAX_LFS_FILESIZE); 988 } 1044 } 989 1045 990 loff_t bch2_llseek(struct file *file, loff_t o 1046 loff_t bch2_llseek(struct file *file, loff_t offset, int whence) 991 { 1047 { 992 loff_t ret; 1048 loff_t ret; 993 1049 994 switch (whence) { 1050 switch (whence) { 995 case SEEK_SET: 1051 case SEEK_SET: 996 case SEEK_CUR: 1052 case SEEK_CUR: 997 case SEEK_END: 1053 case SEEK_END: 998 ret = generic_file_llseek(file 1054 ret = generic_file_llseek(file, offset, whence); 999 break; 1055 break; 1000 case SEEK_DATA: 1056 case SEEK_DATA: 1001 ret = bch2_seek_data(file, of 1057 ret = bch2_seek_data(file, offset); 1002 break; 1058 break; 1003 case SEEK_HOLE: 1059 case SEEK_HOLE: 1004 ret = bch2_seek_hole(file, of 1060 ret = bch2_seek_hole(file, offset); 1005 break; 1061 break; 1006 default: 1062 default: 1007 ret = -EINVAL; 1063 ret = -EINVAL; 1008 break; 1064 break; 1009 } 1065 } 1010 1066 1011 return bch2_err_class(ret); 1067 return bch2_err_class(ret); 1012 } 1068 } 1013 1069 1014 void bch2_fs_fsio_exit(struct bch_fs *c) 1070 void bch2_fs_fsio_exit(struct bch_fs *c) 1015 { 1071 { 1016 bioset_exit(&c->nocow_flush_bioset); 1072 bioset_exit(&c->nocow_flush_bioset); 1017 } 1073 } 1018 1074 1019 int bch2_fs_fsio_init(struct bch_fs *c) 1075 int bch2_fs_fsio_init(struct bch_fs *c) 1020 { 1076 { 1021 if (bioset_init(&c->nocow_flush_biose 1077 if (bioset_init(&c->nocow_flush_bioset, 1022 1, offsetof(struct no 1078 1, offsetof(struct nocow_flush, bio), 0)) 1023 return -BCH_ERR_ENOMEM_nocow_ 1079 return -BCH_ERR_ENOMEM_nocow_flush_bioset_init; 1024 1080 1025 return 0; 1081 return 0; 1026 } 1082 } 1027 1083 1028 #endif /* NO_BCACHEFS_FS */ 1084 #endif /* NO_BCACHEFS_FS */ 1029 1085
Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.