1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * (C) 2001 Clemson University and The University of Chicago 4 * Copyright 2018 Omnibond Systems, L.L.C. 5 * 6 * See COPYING in top-level directory. 7 */ 8 9 /* 10 * Linux VFS file operations. 11 */ 12 13 #include "protocol.h" 14 #include "orangefs-kernel.h" 15 #include "orangefs-bufmap.h" 16 #include <linux/fs.h> 17 #include <linux/filelock.h> 18 #include <linux/pagemap.h> 19 20 static int flush_racache(struct inode *inode) 21 { 22 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); 23 struct orangefs_kernel_op_s *new_op; 24 int ret; 25 26 gossip_debug(GOSSIP_UTILS_DEBUG, 27 "%s: %pU: Handle is %pU | fs_id %d\n", __func__, 28 get_khandle_from_ino(inode), &orangefs_inode->refn.khandle, 29 orangefs_inode->refn.fs_id); 30 31 new_op = op_alloc(ORANGEFS_VFS_OP_RA_FLUSH); 32 if (!new_op) 33 return -ENOMEM; 34 new_op->upcall.req.ra_cache_flush.refn = orangefs_inode->refn; 35 36 ret = service_operation(new_op, "orangefs_flush_racache", 37 get_interruptible_flag(inode)); 38 39 gossip_debug(GOSSIP_UTILS_DEBUG, "%s: got return value of %d\n", 40 __func__, ret); 41 42 op_release(new_op); 43 return ret; 44 } 45 46 /* 47 * Post and wait for the I/O upcall to finish 48 */ 49 ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode, 50 loff_t *offset, struct iov_iter *iter, size_t total_size, 51 loff_t readahead_size, struct orangefs_write_range *wr, 52 int *index_return, struct file *file) 53 { 54 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); 55 struct orangefs_khandle *handle = &orangefs_inode->refn.khandle; 56 struct orangefs_kernel_op_s *new_op = NULL; 57 int buffer_index; 58 ssize_t ret; 59 size_t copy_amount; 60 int open_for_read; 61 int open_for_write; 62 63 new_op = op_alloc(ORANGEFS_VFS_OP_FILE_IO); 64 if (!new_op) 65 return -ENOMEM; 66 67 /* synchronous I/O */ 68 new_op->upcall.req.io.readahead_size = readahead_size; 69 new_op->upcall.req.io.io_type = type; 70 new_op->upcall.req.io.refn = orangefs_inode->refn; 71 72 populate_shared_memory: 73 /* get a shared buffer index */ 74 buffer_index = orangefs_bufmap_get(); 75 if (buffer_index < 0) { 76 ret = buffer_index; 77 gossip_debug(GOSSIP_FILE_DEBUG, 78 "%s: orangefs_bufmap_get failure (%zd)\n", 79 __func__, ret); 80 goto out; 81 } 82 gossip_debug(GOSSIP_FILE_DEBUG, 83 "%s(%pU): GET op %p -> buffer_index %d\n", 84 __func__, 85 handle, 86 new_op, 87 buffer_index); 88 89 new_op->uses_shared_memory = 1; 90 new_op->upcall.req.io.buf_index = buffer_index; 91 new_op->upcall.req.io.count = total_size; 92 new_op->upcall.req.io.offset = *offset; 93 if (type == ORANGEFS_IO_WRITE && wr) { 94 new_op->upcall.uid = from_kuid(&init_user_ns, wr->uid); 95 new_op->upcall.gid = from_kgid(&init_user_ns, wr->gid); 96 } 97 /* 98 * Orangefs has no open, and orangefs checks file permissions 99 * on each file access. Posix requires that file permissions 100 * be checked on open and nowhere else. Orangefs-through-the-kernel 101 * needs to seem posix compliant. 102 * 103 * The VFS opens files, even if the filesystem provides no 104 * method. We can see if a file was successfully opened for 105 * read and or for write by looking at file->f_mode. 106 * 107 * When writes are flowing from the page cache, file is no 108 * longer available. We can trust the VFS to have checked 109 * file->f_mode before writing to the page cache. 110 * 111 * The mode of a file might change between when it is opened 112 * and IO commences, or it might be created with an arbitrary mode. 113 * 114 * We'll make sure we don't hit EACCES during the IO stage by 115 * using UID 0. Some of the time we have access without changing 116 * to UID 0 - how to check? 117 */ 118 if (file) { 119 open_for_write = file->f_mode & FMODE_WRITE; 120 open_for_read = file->f_mode & FMODE_READ; 121 } else { 122 open_for_write = 1; 123 open_for_read = 0; /* not relevant? */ 124 } 125 if ((type == ORANGEFS_IO_WRITE) && open_for_write) 126 new_op->upcall.uid = 0; 127 if ((type == ORANGEFS_IO_READ) && open_for_read) 128 new_op->upcall.uid = 0; 129 130 gossip_debug(GOSSIP_FILE_DEBUG, 131 "%s(%pU): offset: %llu total_size: %zd\n", 132 __func__, 133 handle, 134 llu(*offset), 135 total_size); 136 /* 137 * Stage 1: copy the buffers into client-core's address space 138 */ 139 if (type == ORANGEFS_IO_WRITE && total_size) { 140 ret = orangefs_bufmap_copy_from_iovec(iter, buffer_index, 141 total_size); 142 if (ret < 0) { 143 gossip_err("%s: Failed to copy-in buffers. Please make sure that the pvfs2-client is running. %ld\n", 144 __func__, (long)ret); 145 goto out; 146 } 147 } 148 149 gossip_debug(GOSSIP_FILE_DEBUG, 150 "%s(%pU): Calling post_io_request with tag (%llu)\n", 151 __func__, 152 handle, 153 llu(new_op->tag)); 154 155 /* Stage 2: Service the I/O operation */ 156 ret = service_operation(new_op, 157 type == ORANGEFS_IO_WRITE ? 158 "file_write" : 159 "file_read", 160 get_interruptible_flag(inode)); 161 162 /* 163 * If service_operation() returns -EAGAIN #and# the operation was 164 * purged from orangefs_request_list or htable_ops_in_progress, then 165 * we know that the client was restarted, causing the shared memory 166 * area to be wiped clean. To restart a write operation in this 167 * case, we must re-copy the data from the user's iovec to a NEW 168 * shared memory location. To restart a read operation, we must get 169 * a new shared memory location. 170 */ 171 if (ret == -EAGAIN && op_state_purged(new_op)) { 172 orangefs_bufmap_put(buffer_index); 173 if (type == ORANGEFS_IO_WRITE) 174 iov_iter_revert(iter, total_size); 175 gossip_debug(GOSSIP_FILE_DEBUG, 176 "%s:going to repopulate_shared_memory.\n", 177 __func__); 178 goto populate_shared_memory; 179 } 180 181 if (ret < 0) { 182 if (ret == -EINTR) { 183 /* 184 * We can't return EINTR if any data was written, 185 * it's not POSIX. It is minimally acceptable 186 * to give a partial write, the way NFS does. 187 * 188 * It would be optimal to return all or nothing, 189 * but if a userspace write is bigger than 190 * an IO buffer, and the interrupt occurs 191 * between buffer writes, that would not be 192 * possible. 193 */ 194 switch (new_op->op_state - OP_VFS_STATE_GIVEN_UP) { 195 /* 196 * If the op was waiting when the interrupt 197 * occurred, then the client-core did not 198 * trigger the write. 199 */ 200 case OP_VFS_STATE_WAITING: 201 if (*offset == 0) 202 ret = -EINTR; 203 else 204 ret = 0; 205 break; 206 /* 207 * If the op was in progress when the interrupt 208 * occurred, then the client-core was able to 209 * trigger the write. 210 */ 211 case OP_VFS_STATE_INPROGR: 212 if (type == ORANGEFS_IO_READ) 213 ret = -EINTR; 214 else 215 ret = total_size; 216 break; 217 default: 218 gossip_err("%s: unexpected op state :%d:.\n", 219 __func__, 220 new_op->op_state); 221 ret = 0; 222 break; 223 } 224 gossip_debug(GOSSIP_FILE_DEBUG, 225 "%s: got EINTR, state:%d: %p\n", 226 __func__, 227 new_op->op_state, 228 new_op); 229 } else { 230 gossip_err("%s: error in %s handle %pU, returning %zd\n", 231 __func__, 232 type == ORANGEFS_IO_READ ? 233 "read from" : "write to", 234 handle, ret); 235 } 236 if (orangefs_cancel_op_in_progress(new_op)) 237 return ret; 238 239 goto out; 240 } 241 242 /* 243 * Stage 3: Post copy buffers from client-core's address space 244 */ 245 if (type == ORANGEFS_IO_READ && new_op->downcall.resp.io.amt_complete) { 246 /* 247 * NOTE: the iovector can either contain addresses which 248 * can futher be kernel-space or user-space addresses. 249 * or it can pointers to struct page's 250 */ 251 252 copy_amount = new_op->downcall.resp.io.amt_complete; 253 254 ret = orangefs_bufmap_copy_to_iovec(iter, buffer_index, 255 copy_amount); 256 if (ret < 0) { 257 gossip_err("%s: Failed to copy-out buffers. Please make sure that the pvfs2-client is running (%ld)\n", 258 __func__, (long)ret); 259 goto out; 260 } 261 } 262 gossip_debug(GOSSIP_FILE_DEBUG, 263 "%s(%pU): Amount %s, returned by the sys-io call:%d\n", 264 __func__, 265 handle, 266 type == ORANGEFS_IO_READ ? "read" : "written", 267 (int)new_op->downcall.resp.io.amt_complete); 268 269 ret = new_op->downcall.resp.io.amt_complete; 270 271 out: 272 if (buffer_index >= 0) { 273 orangefs_bufmap_put(buffer_index); 274 gossip_debug(GOSSIP_FILE_DEBUG, 275 "%s(%pU): PUT buffer_index %d\n", 276 __func__, handle, buffer_index); 277 } 278 op_release(new_op); 279 return ret; 280 } 281 282 int orangefs_revalidate_mapping(struct inode *inode) 283 { 284 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); 285 struct address_space *mapping = inode->i_mapping; 286 unsigned long *bitlock = &orangefs_inode->bitlock; 287 int ret; 288 289 while (1) { 290 ret = wait_on_bit(bitlock, 1, TASK_KILLABLE); 291 if (ret) 292 return ret; 293 spin_lock(&inode->i_lock); 294 if (test_bit(1, bitlock)) { 295 spin_unlock(&inode->i_lock); 296 continue; 297 } 298 if (!time_before(jiffies, orangefs_inode->mapping_time)) 299 break; 300 spin_unlock(&inode->i_lock); 301 return 0; 302 } 303 304 set_bit(1, bitlock); 305 smp_wmb(); 306 spin_unlock(&inode->i_lock); 307 308 unmap_mapping_range(mapping, 0, 0, 0); 309 ret = filemap_write_and_wait(mapping); 310 if (!ret) 311 ret = invalidate_inode_pages2(mapping); 312 313 orangefs_inode->mapping_time = jiffies + 314 orangefs_cache_timeout_msecs*HZ/1000; 315 316 clear_bit(1, bitlock); 317 smp_mb__after_atomic(); 318 wake_up_bit(bitlock, 1); 319 320 return ret; 321 } 322 323 static ssize_t orangefs_file_read_iter(struct kiocb *iocb, 324 struct iov_iter *iter) 325 { 326 int ret; 327 orangefs_stats.reads++; 328 329 down_read(&file_inode(iocb->ki_filp)->i_rwsem); 330 ret = orangefs_revalidate_mapping(file_inode(iocb->ki_filp)); 331 if (ret) 332 goto out; 333 334 ret = generic_file_read_iter(iocb, iter); 335 out: 336 up_read(&file_inode(iocb->ki_filp)->i_rwsem); 337 return ret; 338 } 339 340 static ssize_t orangefs_file_splice_read(struct file *in, loff_t *ppos, 341 struct pipe_inode_info *pipe, 342 size_t len, unsigned int flags) 343 { 344 struct inode *inode = file_inode(in); 345 ssize_t ret; 346 347 orangefs_stats.reads++; 348 349 down_read(&inode->i_rwsem); 350 ret = orangefs_revalidate_mapping(inode); 351 if (ret) 352 goto out; 353 354 ret = filemap_splice_read(in, ppos, pipe, len, flags); 355 out: 356 up_read(&inode->i_rwsem); 357 return ret; 358 } 359 360 static ssize_t orangefs_file_write_iter(struct kiocb *iocb, 361 struct iov_iter *iter) 362 { 363 int ret; 364 orangefs_stats.writes++; 365 366 if (iocb->ki_pos > i_size_read(file_inode(iocb->ki_filp))) { 367 ret = orangefs_revalidate_mapping(file_inode(iocb->ki_filp)); 368 if (ret) 369 return ret; 370 } 371 372 ret = generic_file_write_iter(iocb, iter); 373 return ret; 374 } 375 376 static vm_fault_t orangefs_fault(struct vm_fault *vmf) 377 { 378 struct file *file = vmf->vma->vm_file; 379 int ret; 380 ret = orangefs_inode_getattr(file->f_mapping->host, 381 ORANGEFS_GETATTR_SIZE); 382 if (ret == -ESTALE) 383 ret = -EIO; 384 if (ret) { 385 gossip_err("%s: orangefs_inode_getattr failed, " 386 "ret:%d:.\n", __func__, ret); 387 return VM_FAULT_SIGBUS; 388 } 389 return filemap_fault(vmf); 390 } 391 392 static const struct vm_operations_struct orangefs_file_vm_ops = { 393 .fault = orangefs_fault, 394 .map_pages = filemap_map_pages, 395 .page_mkwrite = orangefs_page_mkwrite, 396 }; 397 398 /* 399 * Memory map a region of a file. 400 */ 401 static int orangefs_file_mmap(struct file *file, struct vm_area_struct *vma) 402 { 403 int ret; 404 405 ret = orangefs_revalidate_mapping(file_inode(file)); 406 if (ret) 407 return ret; 408 409 gossip_debug(GOSSIP_FILE_DEBUG, 410 "orangefs_file_mmap: called on %pD\n", file); 411 412 /* set the sequential readahead hint */ 413 vm_flags_mod(vma, VM_SEQ_READ, VM_RAND_READ); 414 415 file_accessed(file); 416 vma->vm_ops = &orangefs_file_vm_ops; 417 return 0; 418 } 419 420 #define mapping_nrpages(idata) ((idata)->nrpages) 421 422 /* 423 * Called to notify the module that there are no more references to 424 * this file (i.e. no processes have it open). 425 * 426 * \note Not called when each file is closed. 427 */ 428 static int orangefs_file_release(struct inode *inode, struct file *file) 429 { 430 gossip_debug(GOSSIP_FILE_DEBUG, 431 "orangefs_file_release: called on %pD\n", 432 file); 433 434 /* 435 * remove all associated inode pages from the page cache and 436 * readahead cache (if any); this forces an expensive refresh of 437 * data for the next caller of mmap (or 'get_block' accesses) 438 */ 439 if (mapping_nrpages(file->f_mapping)) { 440 if (orangefs_features & ORANGEFS_FEATURE_READAHEAD) { 441 gossip_debug(GOSSIP_INODE_DEBUG, 442 "calling flush_racache on %pU\n", 443 get_khandle_from_ino(inode)); 444 flush_racache(inode); 445 gossip_debug(GOSSIP_INODE_DEBUG, 446 "flush_racache finished\n"); 447 } 448 449 } 450 return 0; 451 } 452 453 /* 454 * Push all data for a specific file onto permanent storage. 455 */ 456 static int orangefs_fsync(struct file *file, 457 loff_t start, 458 loff_t end, 459 int datasync) 460 { 461 int ret; 462 struct orangefs_inode_s *orangefs_inode = 463 ORANGEFS_I(file_inode(file)); 464 struct orangefs_kernel_op_s *new_op = NULL; 465 466 ret = filemap_write_and_wait_range(file_inode(file)->i_mapping, 467 start, end); 468 if (ret < 0) 469 return ret; 470 471 new_op = op_alloc(ORANGEFS_VFS_OP_FSYNC); 472 if (!new_op) 473 return -ENOMEM; 474 new_op->upcall.req.fsync.refn = orangefs_inode->refn; 475 476 ret = service_operation(new_op, 477 "orangefs_fsync", 478 get_interruptible_flag(file_inode(file))); 479 480 gossip_debug(GOSSIP_FILE_DEBUG, 481 "orangefs_fsync got return value of %d\n", 482 ret); 483 484 op_release(new_op); 485 return ret; 486 } 487 488 /* 489 * Change the file pointer position for an instance of an open file. 490 * 491 * \note If .llseek is overriden, we must acquire lock as described in 492 * Documentation/filesystems/locking.rst. 493 * 494 * Future upgrade could support SEEK_DATA and SEEK_HOLE but would 495 * require much changes to the FS 496 */ 497 static loff_t orangefs_file_llseek(struct file *file, loff_t offset, int origin) 498 { 499 int ret = -EINVAL; 500 struct inode *inode = file_inode(file); 501 502 if (origin == SEEK_END) { 503 /* 504 * revalidate the inode's file size. 505 * NOTE: We are only interested in file size here, 506 * so we set mask accordingly. 507 */ 508 ret = orangefs_inode_getattr(file->f_mapping->host, 509 ORANGEFS_GETATTR_SIZE); 510 if (ret == -ESTALE) 511 ret = -EIO; 512 if (ret) { 513 gossip_debug(GOSSIP_FILE_DEBUG, 514 "%s:%s:%d calling make bad inode\n", 515 __FILE__, 516 __func__, 517 __LINE__); 518 return ret; 519 } 520 } 521 522 gossip_debug(GOSSIP_FILE_DEBUG, 523 "orangefs_file_llseek: offset is %ld | origin is %d" 524 " | inode size is %lu\n", 525 (long)offset, 526 origin, 527 (unsigned long)i_size_read(inode)); 528 529 return generic_file_llseek(file, offset, origin); 530 } 531 532 /* 533 * Support local locks (locks that only this kernel knows about) 534 * if Orangefs was mounted -o local_lock. 535 */ 536 static int orangefs_lock(struct file *filp, int cmd, struct file_lock *fl) 537 { 538 int rc = -EINVAL; 539 540 if (ORANGEFS_SB(file_inode(filp)->i_sb)->flags & ORANGEFS_OPT_LOCAL_LOCK) { 541 if (cmd == F_GETLK) { 542 rc = 0; 543 posix_test_lock(filp, fl); 544 } else { 545 rc = posix_lock_file(filp, fl, NULL); 546 } 547 } 548 549 return rc; 550 } 551 552 static int orangefs_flush(struct file *file, fl_owner_t id) 553 { 554 /* 555 * This is vfs_fsync_range(file, 0, LLONG_MAX, 0) without the 556 * service_operation in orangefs_fsync. 557 * 558 * Do not send fsync to OrangeFS server on a close. Do send fsync 559 * on an explicit fsync call. This duplicates historical OrangeFS 560 * behavior. 561 */ 562 int r; 563 564 r = filemap_write_and_wait_range(file->f_mapping, 0, LLONG_MAX); 565 if (r > 0) 566 return 0; 567 else 568 return r; 569 } 570 571 /** ORANGEFS implementation of VFS file operations */ 572 const struct file_operations orangefs_file_operations = { 573 .llseek = orangefs_file_llseek, 574 .read_iter = orangefs_file_read_iter, 575 .write_iter = orangefs_file_write_iter, 576 .lock = orangefs_lock, 577 .mmap = orangefs_file_mmap, 578 .open = generic_file_open, 579 .splice_read = orangefs_file_splice_read, 580 .splice_write = iter_file_splice_write, 581 .flush = orangefs_flush, 582 .release = orangefs_file_release, 583 .fsync = orangefs_fsync, 584 }; 585
Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.