1 // SPDX-License-Identifier: GPL-2.0-only << 2 /* 1 /* 3 * fs/dax.c - Direct Access filesystem code 2 * fs/dax.c - Direct Access filesystem code 4 * Copyright (c) 2013-2014 Intel Corporation 3 * Copyright (c) 2013-2014 Intel Corporation 5 * Author: Matthew Wilcox <matthew.r.wilcox@in 4 * Author: Matthew Wilcox <matthew.r.wilcox@intel.com> 6 * Author: Ross Zwisler <ross.zwisler@linux.in 5 * Author: Ross Zwisler <ross.zwisler@linux.intel.com> >> 6 * >> 7 * This program is free software; you can redistribute it and/or modify it >> 8 * under the terms and conditions of the GNU General Public License, >> 9 * version 2, as published by the Free Software Foundation. >> 10 * >> 11 * This program is distributed in the hope it will be useful, but WITHOUT >> 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or >> 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for >> 14 * more details. 7 */ 15 */ 8 16 9 #include <linux/atomic.h> 17 #include <linux/atomic.h> 10 #include <linux/blkdev.h> 18 #include <linux/blkdev.h> 11 #include <linux/buffer_head.h> 19 #include <linux/buffer_head.h> 12 #include <linux/dax.h> 20 #include <linux/dax.h> 13 #include <linux/fs.h> 21 #include <linux/fs.h> >> 22 #include <linux/genhd.h> 14 #include <linux/highmem.h> 23 #include <linux/highmem.h> 15 #include <linux/memcontrol.h> 24 #include <linux/memcontrol.h> 16 #include <linux/mm.h> 25 #include <linux/mm.h> 17 #include <linux/mutex.h> 26 #include <linux/mutex.h> 18 #include <linux/pagevec.h> 27 #include <linux/pagevec.h> >> 28 #include <linux/pmem.h> 19 #include <linux/sched.h> 29 #include <linux/sched.h> 20 #include <linux/sched/signal.h> 30 #include <linux/sched/signal.h> 21 #include <linux/uio.h> 31 #include <linux/uio.h> 22 #include <linux/vmstat.h> 32 #include <linux/vmstat.h> 23 #include <linux/pfn_t.h> 33 #include <linux/pfn_t.h> 24 #include <linux/sizes.h> 34 #include <linux/sizes.h> 25 #include <linux/mmu_notifier.h> 35 #include <linux/mmu_notifier.h> 26 #include <linux/iomap.h> 36 #include <linux/iomap.h> 27 #include <linux/rmap.h> !! 37 #include "internal.h" 28 #include <asm/pgalloc.h> << 29 38 30 #define CREATE_TRACE_POINTS 39 #define CREATE_TRACE_POINTS 31 #include <trace/events/fs_dax.h> 40 #include <trace/events/fs_dax.h> 32 41 33 /* We choose 4096 entries - same as per-zone p 42 /* We choose 4096 entries - same as per-zone page wait tables */ 34 #define DAX_WAIT_TABLE_BITS 12 43 #define DAX_WAIT_TABLE_BITS 12 35 #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_ 44 #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS) 36 45 37 /* The 'colour' (ie low bits) within a PMD of << 38 #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHI << 39 #define PG_PMD_NR (PMD_SIZE >> PAGE_SHIF << 40 << 41 static wait_queue_head_t wait_table[DAX_WAIT_T 46 static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; 42 47 43 static int __init init_dax_wait_table(void) 48 static int __init init_dax_wait_table(void) 44 { 49 { 45 int i; 50 int i; 46 51 47 for (i = 0; i < DAX_WAIT_TABLE_ENTRIES 52 for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++) 48 init_waitqueue_head(wait_table 53 init_waitqueue_head(wait_table + i); 49 return 0; 54 return 0; 50 } 55 } 51 fs_initcall(init_dax_wait_table); 56 fs_initcall(init_dax_wait_table); 52 57 53 /* !! 58 static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax) 54 * DAX pagecache entries use XArray value entr << 55 * for pages. 
We use one bit for locking, one << 56 * and two more to tell us if the entry is a z << 57 * is just used for locking. In total four sp << 58 * << 59 * If the PMD bit isn't set the entry has size << 60 * and EMPTY bits aren't set the entry is a no << 61 * block allocation. << 62 */ << 63 #define DAX_SHIFT (4) << 64 #define DAX_LOCKED (1UL << 0) << 65 #define DAX_PMD (1UL << 1) << 66 #define DAX_ZERO_PAGE (1UL << 2) << 67 #define DAX_EMPTY (1UL << 3) << 68 << 69 static unsigned long dax_to_pfn(void *entry) << 70 { 59 { 71 return xa_to_value(entry) >> DAX_SHIFT !! 60 struct request_queue *q = bdev->bd_queue; 72 } !! 61 long rc = -EIO; 73 62 74 static void *dax_make_entry(pfn_t pfn, unsigne !! 63 dax->addr = ERR_PTR(-EIO); 75 { !! 64 if (blk_queue_enter(q, true) != 0) 76 return xa_mk_value(flags | (pfn_t_to_p !! 65 return rc; 77 } << 78 66 79 static bool dax_is_locked(void *entry) !! 67 rc = bdev_direct_access(bdev, dax); 80 { !! 68 if (rc < 0) { 81 return xa_to_value(entry) & DAX_LOCKED !! 69 dax->addr = ERR_PTR(rc); >> 70 blk_queue_exit(q); >> 71 return rc; >> 72 } >> 73 return rc; 82 } 74 } 83 75 84 static unsigned int dax_entry_order(void *entr !! 76 static void dax_unmap_atomic(struct block_device *bdev, >> 77 const struct blk_dax_ctl *dax) 85 { 78 { 86 if (xa_to_value(entry) & DAX_PMD) !! 79 if (IS_ERR(dax->addr)) 87 return PMD_ORDER; !! 80 return; 88 return 0; !! 81 blk_queue_exit(bdev->bd_queue); 89 } 82 } 90 83 91 static unsigned long dax_is_pmd_entry(void *en !! 84 static int dax_is_pmd_entry(void *entry) 92 { 85 { 93 return xa_to_value(entry) & DAX_PMD; !! 86 return (unsigned long)entry & RADIX_DAX_PMD; 94 } 87 } 95 88 96 static bool dax_is_pte_entry(void *entry) !! 89 static int dax_is_pte_entry(void *entry) 97 { 90 { 98 return !(xa_to_value(entry) & DAX_PMD) !! 91 return !((unsigned long)entry & RADIX_DAX_PMD); 99 } 92 } 100 93 101 static int dax_is_zero_entry(void *entry) 94 static int dax_is_zero_entry(void *entry) 102 { 95 { 103 return xa_to_value(entry) & DAX_ZERO_P !! 96 return (unsigned long)entry & RADIX_DAX_HZP; 104 } 97 } 105 98 106 static int dax_is_empty_entry(void *entry) 99 static int dax_is_empty_entry(void *entry) 107 { 100 { 108 return xa_to_value(entry) & DAX_EMPTY; !! 101 return (unsigned long)entry & RADIX_DAX_EMPTY; 109 } 102 } 110 103 111 /* !! 104 struct page *read_dax_sector(struct block_device *bdev, sector_t n) 112 * true if the entry that was found is of a sm << 113 * we were looking for << 114 */ << 115 static bool dax_is_conflict(void *entry) << 116 { 105 { 117 return entry == XA_RETRY_ENTRY; !! 106 struct page *page = alloc_pages(GFP_KERNEL, 0); >> 107 struct blk_dax_ctl dax = { >> 108 .size = PAGE_SIZE, >> 109 .sector = n & ~((((int) PAGE_SIZE) / 512) - 1), >> 110 }; >> 111 long rc; >> 112 >> 113 if (!page) >> 114 return ERR_PTR(-ENOMEM); >> 115 >> 116 rc = dax_map_atomic(bdev, &dax); >> 117 if (rc < 0) >> 118 return ERR_PTR(rc); >> 119 memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE); >> 120 dax_unmap_atomic(bdev, &dax); >> 121 return page; 118 } 122 } 119 123 120 /* 124 /* 121 * DAX page cache entry locking !! 125 * DAX radix tree locking 122 */ 126 */ 123 struct exceptional_entry_key { 127 struct exceptional_entry_key { 124 struct xarray *xa; !! 128 struct address_space *mapping; 125 pgoff_t entry_start; 129 pgoff_t entry_start; 126 }; 130 }; 127 131 128 struct wait_exceptional_entry_queue { 132 struct wait_exceptional_entry_queue { 129 wait_queue_entry_t wait; !! 
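/*
 * Editorial aside, not part of fs/dax.c: a standalone userspace sketch of how
 * the new code above packs a pfn plus the four special bits into one unsigned
 * long.  The real helpers additionally wrap the value with xa_mk_value() /
 * xa_to_value(); that step is omitted here.
 */
#include <assert.h>
#include <stdio.h>

#define DAX_SHIFT       (4)
#define DAX_LOCKED      (1UL << 0)
#define DAX_PMD         (1UL << 1)
#define DAX_ZERO_PAGE   (1UL << 2)
#define DAX_EMPTY       (1UL << 3)

/* mirrors dax_make_entry() without the xa_mk_value() wrapping */
static unsigned long make_entry(unsigned long pfn, unsigned long flags)
{
        return flags | (pfn << DAX_SHIFT);
}

/* mirrors dax_to_pfn() without the xa_to_value() unwrapping */
static unsigned long entry_to_pfn(unsigned long entry)
{
        return entry >> DAX_SHIFT;
}

int main(void)
{
        unsigned long e = make_entry(0x1234, DAX_PMD | DAX_EMPTY);

        assert(entry_to_pfn(e) == 0x1234);
        assert(e & DAX_PMD);            /* a PMD sized entry */
        assert(!(e & DAX_LOCKED));      /* not locked yet */
        printf("entry=%#lx pfn=%#lx\n", e, entry_to_pfn(e));
        return 0;
}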
133 wait_queue_t wait; 130 struct exceptional_entry_key key; 134 struct exceptional_entry_key key; 131 }; 135 }; 132 136 133 /** !! 137 static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping, 134 * enum dax_wake_mode: waitqueue wakeup behavi !! 138 pgoff_t index, void *entry, struct exceptional_entry_key *key) 135 * @WAKE_ALL: wake all waiters in the waitqueu << 136 * @WAKE_NEXT: wake only the first waiter in t << 137 */ << 138 enum dax_wake_mode { << 139 WAKE_ALL, << 140 WAKE_NEXT, << 141 }; << 142 << 143 static wait_queue_head_t *dax_entry_waitqueue( << 144 void *entry, struct exceptiona << 145 { 139 { 146 unsigned long hash; 140 unsigned long hash; 147 unsigned long index = xas->xa_index; << 148 141 149 /* 142 /* 150 * If 'entry' is a PMD, align the 'ind 143 * If 'entry' is a PMD, align the 'index' that we use for the wait 151 * queue to the start of that PMD. Th 144 * queue to the start of that PMD. This ensures that all offsets in 152 * the range covered by the PMD map to 145 * the range covered by the PMD map to the same bit lock. 153 */ 146 */ 154 if (dax_is_pmd_entry(entry)) 147 if (dax_is_pmd_entry(entry)) 155 index &= ~PG_PMD_COLOUR; !! 148 index &= ~((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1); 156 key->xa = xas->xa; !! 149 >> 150 key->mapping = mapping; 157 key->entry_start = index; 151 key->entry_start = index; 158 152 159 hash = hash_long((unsigned long)xas->x !! 153 hash = hash_long((unsigned long)mapping ^ index, DAX_WAIT_TABLE_BITS); 160 return wait_table + hash; 154 return wait_table + hash; 161 } 155 } 162 156 163 static int wake_exceptional_entry_func(wait_qu !! 157 static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode, 164 unsigned int mode, int sync, v !! 158 int sync, void *keyp) 165 { 159 { 166 struct exceptional_entry_key *key = ke 160 struct exceptional_entry_key *key = keyp; 167 struct wait_exceptional_entry_queue *e 161 struct wait_exceptional_entry_queue *ewait = 168 container_of(wait, struct wait 162 container_of(wait, struct wait_exceptional_entry_queue, wait); 169 163 170 if (key->xa != ewait->key.xa || !! 164 if (key->mapping != ewait->key.mapping || 171 key->entry_start != ewait->key.ent 165 key->entry_start != ewait->key.entry_start) 172 return 0; 166 return 0; 173 return autoremove_wake_function(wait, 167 return autoremove_wake_function(wait, mode, sync, NULL); 174 } 168 } 175 169 176 /* 170 /* 177 * @entry may no longer be the entry at the in !! 171 * Check whether the given slot is locked. The function must be called with 178 * The important information it's conveying is !! 172 * mapping->tree_lock held 179 * this index used to be a PMD entry. << 180 */ 173 */ 181 static void dax_wake_entry(struct xa_state *xa !! 174 static inline int slot_locked(struct address_space *mapping, void **slot) 182 enum dax_wake_mode << 183 { 175 { 184 struct exceptional_entry_key key; !! 176 unsigned long entry = (unsigned long) 185 wait_queue_head_t *wq; !! 177 radix_tree_deref_slot_protected(slot, &mapping->tree_lock); >> 178 return entry & RADIX_DAX_ENTRY_LOCK; >> 179 } 186 180 187 wq = dax_entry_waitqueue(xas, entry, & !! 181 /* >> 182 * Mark the given slot is locked. The function must be called with >> 183 * mapping->tree_lock held >> 184 */ >> 185 static inline void *lock_slot(struct address_space *mapping, void **slot) >> 186 { >> 187 unsigned long entry = (unsigned long) >> 188 radix_tree_deref_slot_protected(slot, &mapping->tree_lock); 188 189 189 /* !! 
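/*
 * Editorial aside, not part of fs/dax.c: a userspace model of the waitqueue
 * selection in dax_entry_waitqueue() above.  Offsets inside one PMD are
 * aligned down to the PMD start so they all hash to the same bucket of the
 * fixed 4096-entry wait table.  hash_long() is replaced by a trivial
 * stand-in, and 4K pages / 2M PMDs (the x86-64 defaults) are assumed.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT              12
#define PMD_SHIFT               21
#define PG_PMD_COLOUR           ((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1)
#define DAX_WAIT_TABLE_BITS     12

/* not the kernel's hash_long(); any decent integer hash works for the model */
static unsigned long long hash_model(unsigned long long v, unsigned int bits)
{
        return (v * 0x9E3779B97F4A7C15ULL) >> (64 - bits);
}

static unsigned long long wait_bucket(const void *xa, unsigned long index,
                                      bool pmd_entry)
{
        if (pmd_entry)
                index &= ~PG_PMD_COLOUR;        /* whole PMD shares one bucket */
        return hash_model((uintptr_t)xa ^ index, DAX_WAIT_TABLE_BITS);
}

int main(void)
{
        int mapping;    /* stands in for the xarray / mapping pointer */

        /* two offsets inside the same 2MiB extent pick the same bucket */
        printf("%llu %llu\n",
               wait_bucket(&mapping, 0x200, true),
               wait_bucket(&mapping, 0x3ff, true));
        return 0;
}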
190 entry |= RADIX_DAX_ENTRY_LOCK; 190 * Checking for locked entry and prepa !! 191 radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry); 191 * under the i_pages lock, ditto for e !! 192 return (void *)entry; 192 * So at this point all tasks that cou << 193 * must be in the waitqueue and the fo << 194 */ << 195 if (waitqueue_active(wq)) << 196 __wake_up(wq, TASK_NORMAL, mod << 197 } 193 } 198 194 199 /* 195 /* 200 * Look up entry in page cache, wait for it to !! 196 * Mark the given slot is unlocked. The function must be called with 201 * is a DAX entry and return it. The caller m !! 197 * mapping->tree_lock held 202 * put_unlocked_entry() if it did not lock the !! 198 */ 203 * if it did. The entry returned may have a l !! 199 static inline void *unlock_slot(struct address_space *mapping, void **slot) 204 * If @order is larger than the order of the e !! 200 { 205 * function returns a dax_is_conflict entry. !! 201 unsigned long entry = (unsigned long) >> 202 radix_tree_deref_slot_protected(slot, &mapping->tree_lock); >> 203 >> 204 entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK; >> 205 radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry); >> 206 return (void *)entry; >> 207 } >> 208 >> 209 /* >> 210 * Lookup entry in radix tree, wait for it to become unlocked if it is >> 211 * exceptional entry and return it. The caller must call >> 212 * put_unlocked_mapping_entry() when he decided not to lock the entry or >> 213 * put_locked_mapping_entry() when he locked the entry and now wants to >> 214 * unlock it. 206 * 215 * 207 * Must be called with the i_pages lock held. !! 216 * The function must be called with mapping->tree_lock held. 208 */ 217 */ 209 static void *get_unlocked_entry(struct xa_stat !! 218 static void *get_unlocked_mapping_entry(struct address_space *mapping, >> 219 pgoff_t index, void ***slotp) 210 { 220 { 211 void *entry; !! 221 void *entry, **slot; 212 struct wait_exceptional_entry_queue ew 222 struct wait_exceptional_entry_queue ewait; 213 wait_queue_head_t *wq; 223 wait_queue_head_t *wq; 214 224 215 init_wait(&ewait.wait); 225 init_wait(&ewait.wait); 216 ewait.wait.func = wake_exceptional_ent 226 ewait.wait.func = wake_exceptional_entry_func; 217 227 218 for (;;) { 228 for (;;) { 219 entry = xas_find_conflict(xas) !! 229 entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, 220 if (!entry || WARN_ON_ONCE(!xa !! 230 &slot); 221 return entry; !! 231 if (!entry || !radix_tree_exceptional_entry(entry) || 222 if (dax_entry_order(entry) < o !! 232 !slot_locked(mapping, slot)) { 223 return XA_RETRY_ENTRY; !! 233 if (slotp) 224 if (!dax_is_locked(entry)) !! 234 *slotp = slot; 225 return entry; 235 return entry; >> 236 } 226 237 227 wq = dax_entry_waitqueue(xas, !! 238 wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key); 228 prepare_to_wait_exclusive(wq, 239 prepare_to_wait_exclusive(wq, &ewait.wait, 229 TASK 240 TASK_UNINTERRUPTIBLE); 230 xas_unlock_irq(xas); !! 241 spin_unlock_irq(&mapping->tree_lock); 231 xas_reset(xas); << 232 schedule(); 242 schedule(); 233 finish_wait(wq, &ewait.wait); 243 finish_wait(wq, &ewait.wait); 234 xas_lock_irq(xas); !! 
244 spin_lock_irq(&mapping->tree_lock); 235 } << 236 } << 237 << 238 /* << 239 * The only thing keeping the address space ar << 240 * (it's cycled in clear_inode() after removin << 241 * After we call xas_unlock_irq(), we cannot t << 242 */ << 243 static void wait_entry_unlocked(struct xa_stat << 244 { << 245 struct wait_exceptional_entry_queue ew << 246 wait_queue_head_t *wq; << 247 << 248 init_wait(&ewait.wait); << 249 ewait.wait.func = wake_exceptional_ent << 250 << 251 wq = dax_entry_waitqueue(xas, entry, & << 252 /* << 253 * Unlike get_unlocked_entry() there i << 254 * path ever successfully retrieves an << 255 * inode dies. Perform a non-exclusive << 256 * never successfully performs its own << 257 */ << 258 prepare_to_wait(wq, &ewait.wait, TASK_ << 259 xas_unlock_irq(xas); << 260 schedule(); << 261 finish_wait(wq, &ewait.wait); << 262 } << 263 << 264 static void put_unlocked_entry(struct xa_state << 265 enum dax_wake_m << 266 { << 267 if (entry && !dax_is_conflict(entry)) << 268 dax_wake_entry(xas, entry, mod << 269 } << 270 << 271 /* << 272 * We used the xa_state to get the entry, but << 273 * dropped the xa_lock, so we know the xa_stat << 274 * before use. << 275 */ << 276 static void dax_unlock_entry(struct xa_state * << 277 { << 278 void *old; << 279 << 280 BUG_ON(dax_is_locked(entry)); << 281 xas_reset(xas); << 282 xas_lock_irq(xas); << 283 old = xas_store(xas, entry); << 284 xas_unlock_irq(xas); << 285 BUG_ON(!dax_is_locked(old)); << 286 dax_wake_entry(xas, entry, WAKE_NEXT); << 287 } << 288 << 289 /* << 290 * Return: The entry stored at this location b << 291 */ << 292 static void *dax_lock_entry(struct xa_state *x << 293 { << 294 unsigned long v = xa_to_value(entry); << 295 return xas_store(xas, xa_mk_value(v | << 296 } << 297 << 298 static unsigned long dax_entry_size(void *entr << 299 { << 300 if (dax_is_zero_entry(entry)) << 301 return 0; << 302 else if (dax_is_empty_entry(entry)) << 303 return 0; << 304 else if (dax_is_pmd_entry(entry)) << 305 return PMD_SIZE; << 306 else << 307 return PAGE_SIZE; << 308 } << 309 << 310 static unsigned long dax_end_pfn(void *entry) << 311 { << 312 return dax_to_pfn(entry) + dax_entry_s << 313 } << 314 << 315 /* << 316 * Iterate through all mapped pfns represented << 317 * 'empty' and 'zero' entries. << 318 */ << 319 #define for_each_mapped_pfn(entry, pfn) \ << 320 for (pfn = dax_to_pfn(entry); \ << 321 pfn < dax_end_pfn(entr << 322 << 323 static inline bool dax_page_is_shared(struct p << 324 { << 325 return page->mapping == PAGE_MAPPING_D << 326 } << 327 << 328 /* << 329 * Set the page->mapping with PAGE_MAPPING_DAX << 330 * refcount. << 331 */ << 332 static inline void dax_page_share_get(struct p << 333 { << 334 if (page->mapping != PAGE_MAPPING_DAX_ << 335 /* << 336 * Reset the index if the page << 337 * regularly before. << 338 */ << 339 if (page->mapping) << 340 page->share = 1; << 341 page->mapping = PAGE_MAPPING_D << 342 } 245 } 343 page->share++; << 344 } 246 } 345 247 346 static inline unsigned long dax_page_share_put !! 248 static void dax_unlock_mapping_entry(struct address_space *mapping, >> 249 pgoff_t index) 347 { 250 { 348 return --page->share; !! 251 void *entry, **slot; 349 } << 350 252 351 /* !! 253 spin_lock_irq(&mapping->tree_lock); 352 * When it is called in dax_insert_entry(), th !! 254 entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot); 353 * whether this entry is shared by multiple fi !! 
255 if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) || 354 * PAGE_MAPPING_DAX_SHARED, and use page->shar !! 256 !slot_locked(mapping, slot))) { 355 */ !! 257 spin_unlock_irq(&mapping->tree_lock); 356 static void dax_associate_entry(void *entry, s << 357 struct vm_area_struct *vma, un << 358 { << 359 unsigned long size = dax_entry_size(en << 360 int i = 0; << 361 << 362 if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) << 363 return; << 364 << 365 index = linear_page_index(vma, address << 366 for_each_mapped_pfn(entry, pfn) { << 367 struct page *page = pfn_to_pag << 368 << 369 if (shared) { << 370 dax_page_share_get(pag << 371 } else { << 372 WARN_ON_ONCE(page->map << 373 page->mapping = mappin << 374 page->index = index + << 375 } << 376 } << 377 } << 378 << 379 static void dax_disassociate_entry(void *entry << 380 bool trunc) << 381 { << 382 unsigned long pfn; << 383 << 384 if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) << 385 return; 258 return; 386 << 387 for_each_mapped_pfn(entry, pfn) { << 388 struct page *page = pfn_to_pag << 389 << 390 WARN_ON_ONCE(trunc && page_ref << 391 if (dax_page_is_shared(page)) << 392 /* keep the shared fla << 393 if (dax_page_share_put << 394 continue; << 395 } else << 396 WARN_ON_ONCE(page->map << 397 page->mapping = NULL; << 398 page->index = 0; << 399 } << 400 } << 401 << 402 static struct page *dax_busy_page(void *entry) << 403 { << 404 unsigned long pfn; << 405 << 406 for_each_mapped_pfn(entry, pfn) { << 407 struct page *page = pfn_to_pag << 408 << 409 if (page_ref_count(page) > 1) << 410 return page; << 411 } 259 } 412 return NULL; !! 260 unlock_slot(mapping, slot); >> 261 spin_unlock_irq(&mapping->tree_lock); >> 262 dax_wake_mapping_entry_waiter(mapping, index, entry, false); 413 } 263 } 414 264 415 /** !! 265 static void put_locked_mapping_entry(struct address_space *mapping, 416 * dax_lock_folio - Lock the DAX entry corresp !! 266 pgoff_t index, void *entry) 417 * @folio: The folio whose entry we want to lo << 418 * << 419 * Context: Process context. << 420 * Return: A cookie to pass to dax_unlock_foli << 421 * not be locked. << 422 */ << 423 dax_entry_t dax_lock_folio(struct folio *folio << 424 { 267 { 425 XA_STATE(xas, NULL, 0); !! 268 if (!radix_tree_exceptional_entry(entry)) { 426 void *entry; !! 269 unlock_page(entry); 427 !! 270 put_page(entry); 428 /* Ensure folio->mapping isn't freed w !! 271 } else { 429 rcu_read_lock(); !! 272 dax_unlock_mapping_entry(mapping, index); 430 for (;;) { << 431 struct address_space *mapping << 432 << 433 entry = NULL; << 434 if (!mapping || !dax_mapping(m << 435 break; << 436 << 437 /* << 438 * In the device-dax case ther << 439 * struct dev_pagemap pin is s << 440 * inode alive, and we assume << 441 * otherwise we would not have << 442 * translation. 
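/*
 * Editorial aside, not part of fs/dax.c: a standalone model of the pfn walk
 * behind for_each_mapped_pfn() above.  The entry size decides how many pfns
 * dax_associate_entry()/dax_disassociate_entry() touch; zero and empty
 * entries map no pages at all.  x86-64 page/PMD sizes are assumed.
 */
#include <assert.h>

#define PAGE_SIZE       4096UL
#define PMD_SIZE        (2UL * 1024 * 1024)
#define DAX_SHIFT       (4)
#define DAX_PMD         (1UL << 1)
#define DAX_ZERO_PAGE   (1UL << 2)
#define DAX_EMPTY       (1UL << 3)

/* mirrors dax_entry_size() */
static unsigned long entry_size(unsigned long entry)
{
        if (entry & (DAX_ZERO_PAGE | DAX_EMPTY))
                return 0;               /* nothing is mapped to real pages */
        return (entry & DAX_PMD) ? PMD_SIZE : PAGE_SIZE;
}

static unsigned long mapped_pfns(unsigned long entry)
{
        return entry_size(entry) / PAGE_SIZE;
}

int main(void)
{
        unsigned long pte_entry = 0x1234UL << DAX_SHIFT;
        unsigned long pmd_entry = (0x1400UL << DAX_SHIFT) | DAX_PMD;

        assert(mapped_pfns(pte_entry) == 1);    /* one 4KiB page */
        assert(mapped_pfns(pmd_entry) == 512);  /* 2MiB / 4KiB */
        assert(mapped_pfns(DAX_EMPTY) == 0);    /* empty entry: no pages */
        return 0;
}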
<< 443 */ << 444 entry = (void *)~0UL; << 445 if (S_ISCHR(mapping->host->i_m << 446 break; << 447 << 448 xas.xa = &mapping->i_pages; << 449 xas_lock_irq(&xas); << 450 if (mapping != folio->mapping) << 451 xas_unlock_irq(&xas); << 452 continue; << 453 } << 454 xas_set(&xas, folio->index); << 455 entry = xas_load(&xas); << 456 if (dax_is_locked(entry)) { << 457 rcu_read_unlock(); << 458 wait_entry_unlocked(&x << 459 rcu_read_lock(); << 460 continue; << 461 } << 462 dax_lock_entry(&xas, entry); << 463 xas_unlock_irq(&xas); << 464 break; << 465 } 273 } 466 rcu_read_unlock(); << 467 return (dax_entry_t)entry; << 468 } << 469 << 470 void dax_unlock_folio(struct folio *folio, dax << 471 { << 472 struct address_space *mapping = folio- << 473 XA_STATE(xas, &mapping->i_pages, folio << 474 << 475 if (S_ISCHR(mapping->host->i_mode)) << 476 return; << 477 << 478 dax_unlock_entry(&xas, (void *)cookie) << 479 } 274 } 480 275 481 /* 276 /* 482 * dax_lock_mapping_entry - Lock the DAX entry !! 277 * Called when we are done with radix tree entry we looked up via 483 * @mapping: the file's mapping whose entry we !! 278 * get_unlocked_mapping_entry() and which we didn't lock in the end. 484 * @index: the offset within this file << 485 * @page: output the dax page corresponding to << 486 * << 487 * Return: A cookie to pass to dax_unlock_mapp << 488 * could not be locked. << 489 */ 279 */ 490 dax_entry_t dax_lock_mapping_entry(struct addr !! 280 static void put_unlocked_mapping_entry(struct address_space *mapping, 491 struct page **page) !! 281 pgoff_t index, void *entry) 492 { 282 { 493 XA_STATE(xas, NULL, 0); !! 283 if (!radix_tree_exceptional_entry(entry)) 494 void *entry; << 495 << 496 rcu_read_lock(); << 497 for (;;) { << 498 entry = NULL; << 499 if (!dax_mapping(mapping)) << 500 break; << 501 << 502 xas.xa = &mapping->i_pages; << 503 xas_lock_irq(&xas); << 504 xas_set(&xas, index); << 505 entry = xas_load(&xas); << 506 if (dax_is_locked(entry)) { << 507 rcu_read_unlock(); << 508 wait_entry_unlocked(&x << 509 rcu_read_lock(); << 510 continue; << 511 } << 512 if (!entry || << 513 dax_is_zero_entry(entry) | << 514 /* << 515 * Because we are look << 516 * and index, so the e << 517 * or even a zero/empt << 518 * an error case. So, << 519 * not output @page. << 520 */ << 521 entry = (void *)~0UL; << 522 } else { << 523 *page = pfn_to_page(da << 524 dax_lock_entry(&xas, e << 525 } << 526 xas_unlock_irq(&xas); << 527 break; << 528 } << 529 rcu_read_unlock(); << 530 return (dax_entry_t)entry; << 531 } << 532 << 533 void dax_unlock_mapping_entry(struct address_s << 534 dax_entry_t cookie) << 535 { << 536 XA_STATE(xas, &mapping->i_pages, index << 537 << 538 if (cookie == ~0UL) << 539 return; 284 return; 540 285 541 dax_unlock_entry(&xas, (void *)cookie) !! 286 /* We have to wake up next waiter for the radix tree entry lock */ >> 287 dax_wake_mapping_entry_waiter(mapping, index, entry, false); 542 } 288 } 543 289 544 /* 290 /* 545 * Find page cache entry at given index. If it !! 291 * Find radix tree entry at given index. If it points to a page, return with 546 * with the entry locked. If the page cache do !! 292 * the page locked. If it points to the exceptional entry, return with the 547 * that index, add a locked empty entry. !! 293 * radix tree entry locked. If the radix tree doesn't contain given index, >> 294 * create empty exceptional entry for the index and return with it locked. 548 * 295 * 549 * When requesting an entry with size DAX_PMD, !! 
296 * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will 550 * either return that locked entry or will ret !! 297 * either return that locked entry or will return an error. This error will 551 * This will happen if there are any PTE entri !! 298 * happen if there are any 4k entries (either zero pages or DAX entries) 552 * that we are requesting. !! 299 * within the 2MiB range that we are requesting. 553 * 300 * 554 * We always favor PTE entries over PMD entrie !! 301 * We always favor 4k entries over 2MiB entries. There isn't a flow where we 555 * evict PTE entries in order to 'upgrade' the !! 302 * evict 4k entries in order to 'upgrade' them to a 2MiB entry. A 2MiB 556 * insertion will fail if it finds any PTE ent !! 303 * insertion will fail if it finds any 4k entries already in the tree, and a 557 * PTE insertion will cause an existing PMD en !! 304 * 4k insertion will cause an existing 2MiB entry to be unmapped and 558 * downgraded to PTE entries. This happens fo !! 305 * downgraded to 4k entries. This happens for both 2MiB huge zero pages as 559 * well as PMD empty entries. !! 306 * well as 2MiB empty entries. 560 * 307 * 561 * The exception to this downgrade path is for !! 308 * The exception to this downgrade path is for 2MiB DAX PMD entries that have 562 * real storage backing them. We will leave t !! 309 * real storage backing them. We will leave these real 2MiB DAX entries in 563 * the tree, and PTE writes will simply dirty !! 310 * the tree, and PTE writes will simply dirty the entire 2MiB DAX entry. 564 * 311 * 565 * Note: Unlike filemap_fault() we don't honor 312 * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For 566 * persistent memory the benefit is doubtful. 313 * persistent memory the benefit is doubtful. We can add that later if we can 567 * show it helps. 314 * show it helps. 568 * << 569 * On error, this function does not return an << 570 * a VM_FAULT code, encoded as an xarray inter << 571 * overlap with xarray value entries. << 572 */ 315 */ 573 static void *grab_mapping_entry(struct xa_stat !! 316 static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index, 574 struct address_space *mapping, !! 317 unsigned long size_flag) 575 { 318 { 576 unsigned long index = xas->xa_index; !! 319 bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? */ 577 bool pmd_downgrade; /* splitting P !! 320 void *entry, **slot; 578 void *entry; << 579 321 580 retry: !! 322 restart: 581 pmd_downgrade = false; !! 323 spin_lock_irq(&mapping->tree_lock); 582 xas_lock_irq(xas); !! 324 entry = get_unlocked_mapping_entry(mapping, index, &slot); 583 entry = get_unlocked_entry(xas, order) << 584 325 585 if (entry) { 326 if (entry) { 586 if (dax_is_conflict(entry)) !! 327 if (size_flag & RADIX_DAX_PMD) { 587 goto fallback; !! 328 if (!radix_tree_exceptional_entry(entry) || 588 if (!xa_is_value(entry)) { !! 329 dax_is_pte_entry(entry)) { 589 xas_set_err(xas, -EIO) !! 330 put_unlocked_mapping_entry(mapping, index, 590 goto out_unlock; !! 331 entry); 591 } !! 332 entry = ERR_PTR(-EEXIST); 592 !! 333 goto out_unlock; 593 if (order == 0) { !! 334 } 594 if (dax_is_pmd_entry(e !! 
335 } else { /* trying to grab a PTE entry */ >> 336 if (radix_tree_exceptional_entry(entry) && >> 337 dax_is_pmd_entry(entry) && 595 (dax_is_zero_entry 338 (dax_is_zero_entry(entry) || 596 dax_is_empty_entr 339 dax_is_empty_entry(entry))) { 597 pmd_downgrade 340 pmd_downgrade = true; 598 } 341 } 599 } 342 } 600 } 343 } 601 344 602 if (pmd_downgrade) { !! 345 /* No entry for given index? Make sure radix tree is big enough. */ 603 /* !! 346 if (!entry || pmd_downgrade) { 604 * Make sure 'entry' remains v !! 347 int err; 605 * the i_pages lock. << 606 */ << 607 dax_lock_entry(xas, entry); << 608 348 >> 349 if (pmd_downgrade) { >> 350 /* >> 351 * Make sure 'entry' remains valid while we drop >> 352 * mapping->tree_lock. >> 353 */ >> 354 entry = lock_slot(mapping, slot); >> 355 } >> 356 >> 357 spin_unlock_irq(&mapping->tree_lock); 609 /* 358 /* 610 * Besides huge zero pages the 359 * Besides huge zero pages the only other thing that gets 611 * downgraded are empty entrie 360 * downgraded are empty entries which don't need to be 612 * unmapped. 361 * unmapped. 613 */ 362 */ 614 if (dax_is_zero_entry(entry)) !! 363 if (pmd_downgrade && dax_is_zero_entry(entry)) 615 xas_unlock_irq(xas); !! 364 unmap_mapping_range(mapping, 616 unmap_mapping_pages(ma !! 365 (index << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0); 617 xas->x !! 366 618 PG_PMD !! 367 err = radix_tree_preload( 619 xas_reset(xas); !! 368 mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM); 620 xas_lock_irq(xas); !! 369 if (err) { >> 370 if (pmd_downgrade) >> 371 put_locked_mapping_entry(mapping, index, entry); >> 372 return ERR_PTR(err); 621 } 373 } >> 374 spin_lock_irq(&mapping->tree_lock); 622 375 623 dax_disassociate_entry(entry, !! 376 if (!entry) { 624 xas_store(xas, NULL); /* und !! 377 /* 625 dax_wake_entry(xas, entry, WAK !! 378 * We needed to drop the page_tree lock while calling 626 mapping->nrpages -= PG_PMD_NR; !! 379 * radix_tree_preload() and we didn't have an entry to 627 entry = NULL; !! 380 * lock. See if another thread inserted an entry at 628 xas_set(xas, index); !! 381 * our index during this time. 629 } !! 382 */ >> 383 entry = __radix_tree_lookup(&mapping->page_tree, index, >> 384 NULL, &slot); >> 385 if (entry) { >> 386 radix_tree_preload_end(); >> 387 spin_unlock_irq(&mapping->tree_lock); >> 388 goto restart; >> 389 } >> 390 } 630 391 631 if (entry) { !! 392 if (pmd_downgrade) { 632 dax_lock_entry(xas, entry); !! 393 radix_tree_delete(&mapping->page_tree, index); 633 } else { !! 394 mapping->nrexceptional--; 634 unsigned long flags = DAX_EMPT !! 395 dax_wake_mapping_entry_waiter(mapping, index, entry, >> 396 true); >> 397 } >> 398 >> 399 entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY); 635 400 636 if (order > 0) !! 401 err = __radix_tree_insert(&mapping->page_tree, index, 637 flags |= DAX_PMD; !! 402 dax_radix_order(entry), entry); 638 entry = dax_make_entry(pfn_to_ !! 403 radix_tree_preload_end(); 639 dax_lock_entry(xas, entry); !! 404 if (err) { 640 if (xas_error(xas)) !! 405 spin_unlock_irq(&mapping->tree_lock); 641 goto out_unlock; !! 406 /* 642 mapping->nrpages += 1UL << ord !! 407 * Our insertion of a DAX entry failed, most likely 643 } !! 408 * because we were inserting a PMD entry and it 644 !! 409 * collided with a PTE sized entry at a different 645 out_unlock: !! 410 * index in the PMD range. We haven't inserted 646 xas_unlock_irq(xas); !! 411 * anything into the radix tree and have no waiters to 647 if (xas_nomem(xas, mapping_gfp_mask(ma !! 412 * wake. 648 goto retry; !! 
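/*
 * Editorial aside, not part of fs/dax.c: a userspace model of the entry-size
 * policy that the grab_mapping_entry() comment above describes, given the
 * flags of an entry already present at the index.  PTE entries are never
 * upgraded to PMD entries, and only zero/empty PMD entries are downgraded
 * when a PTE entry is requested.
 */
#include <assert.h>

#define DAX_PMD         (1UL << 1)
#define DAX_ZERO_PAGE   (1UL << 2)
#define DAX_EMPTY       (1UL << 3)

enum grab_action { USE_EXISTING, FALLBACK, DOWNGRADE };

/* order == 0 requests a PTE entry, order > 0 requests a PMD entry */
static enum grab_action grab_policy(unsigned long existing, unsigned int order)
{
        if (order > 0 && !(existing & DAX_PMD))
                return FALLBACK;        /* PTE already there: PMD fault falls back */
        if (order == 0 && (existing & DAX_PMD) &&
            (existing & (DAX_ZERO_PAGE | DAX_EMPTY)))
                return DOWNGRADE;       /* zero/empty PMD is split for a PTE fault */
        return USE_EXISTING;            /* e.g. a PTE fault just dirties a real PMD */
}

int main(void)
{
        assert(grab_policy(0, 1) == FALLBACK);
        assert(grab_policy(DAX_PMD | DAX_EMPTY, 0) == DOWNGRADE);
        assert(grab_policy(DAX_PMD, 0) == USE_EXISTING);
        return 0;
}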
413 */ 649 if (xas->xa_node == XA_ERROR(-ENOMEM)) !! 414 return ERR_PTR(err); 650 return xa_mk_internal(VM_FAULT !! 415 } 651 if (xas_error(xas)) !! 416 /* Good, we have inserted empty locked entry into the tree. */ 652 return xa_mk_internal(VM_FAULT !! 417 mapping->nrexceptional++; >> 418 spin_unlock_irq(&mapping->tree_lock); >> 419 return entry; >> 420 } >> 421 /* Normal page in radix tree? */ >> 422 if (!radix_tree_exceptional_entry(entry)) { >> 423 struct page *page = entry; >> 424 >> 425 get_page(page); >> 426 spin_unlock_irq(&mapping->tree_lock); >> 427 lock_page(page); >> 428 /* Page got truncated? Retry... */ >> 429 if (unlikely(page->mapping != mapping)) { >> 430 unlock_page(page); >> 431 put_page(page); >> 432 goto restart; >> 433 } >> 434 return page; >> 435 } >> 436 entry = lock_slot(mapping, slot); >> 437 out_unlock: >> 438 spin_unlock_irq(&mapping->tree_lock); 653 return entry; 439 return entry; 654 fallback: << 655 xas_unlock_irq(xas); << 656 return xa_mk_internal(VM_FAULT_FALLBAC << 657 } 440 } 658 441 659 /** !! 442 /* 660 * dax_layout_busy_page_range - find first pin !! 443 * We do not necessarily hold the mapping->tree_lock when we call this 661 * @mapping: address space to scan for a page !! 444 * function so it is possible that 'entry' is no longer a valid item in the 662 * @start: Starting offset. Page containing 's !! 445 * radix tree. This is okay because all we really need to do is to find the 663 * @end: End offset. Page containing 'end' is !! 446 * correct waitqueue where tasks might be waiting for that old 'entry' and 664 * pages from 'start' till the end of fi !! 447 * wake them. 665 * << 666 * DAX requires ZONE_DEVICE mapped pages. Thes << 667 * 'onlined' to the page allocator so they are << 668 * page->count == 1. A filesystem uses this in << 669 * any page in the mapping is busy, i.e. for D << 670 * get_user_pages() usages. << 671 * << 672 * It is expected that the filesystem is holdi << 673 * establishment of new mappings in this addre << 674 * to be able to run unmap_mapping_range() and << 675 * mapping_mapped() becoming true. << 676 */ 448 */ 677 struct page *dax_layout_busy_page_range(struct !! 449 void dax_wake_mapping_entry_waiter(struct address_space *mapping, 678 loff_t !! 450 pgoff_t index, void *entry, bool wake_all) 679 { 451 { 680 void *entry; !! 452 struct exceptional_entry_key key; 681 unsigned int scanned = 0; !! 453 wait_queue_head_t *wq; 682 struct page *page = NULL; << 683 pgoff_t start_idx = start >> PAGE_SHIF << 684 pgoff_t end_idx; << 685 XA_STATE(xas, &mapping->i_pages, start << 686 << 687 /* << 688 * In the 'limited' case get_user_page << 689 */ << 690 if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) << 691 return NULL; << 692 454 693 if (!dax_mapping(mapping) || !mapping_ !! 455 wq = dax_entry_waitqueue(mapping, index, entry, &key); 694 return NULL; << 695 456 696 /* If end == LLONG_MAX, all pages from << 697 if (end == LLONG_MAX) << 698 end_idx = ULONG_MAX; << 699 else << 700 end_idx = end >> PAGE_SHIFT; << 701 /* 457 /* 702 * If we race get_user_pages_fast() he !! 458 * Checking for locked entry and prepare_to_wait_exclusive() happens 703 * elevated page count in the iteratio !! 459 * under mapping->tree_lock, ditto for entry handling in our callers. 704 * get_user_pages_fast() will see that !! 460 * So at this point all tasks that could have seen our entry locked 705 * against is no longer mapped in the !! 461 * must be in the waitqueue and the following check will see them. 706 * get_user_pages() slow path. 
The sl << 707 * pte_lock() and pmd_lock(). New refe << 708 * holding those locks, and unmap_mapp << 709 * pte or pmd without holding the resp << 710 * guaranteed to either see new refere << 711 * references from being established. << 712 */ 462 */ 713 unmap_mapping_pages(mapping, start_idx !! 463 if (waitqueue_active(wq)) 714 !! 464 __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key); 715 xas_lock_irq(&xas); << 716 xas_for_each(&xas, entry, end_idx) { << 717 if (WARN_ON_ONCE(!xa_is_value( << 718 continue; << 719 if (unlikely(dax_is_locked(ent << 720 entry = get_unlocked_e << 721 if (entry) << 722 page = dax_busy_page(e << 723 put_unlocked_entry(&xas, entry << 724 if (page) << 725 break; << 726 if (++scanned % XA_CHECK_SCHED << 727 continue; << 728 << 729 xas_pause(&xas); << 730 xas_unlock_irq(&xas); << 731 cond_resched(); << 732 xas_lock_irq(&xas); << 733 } << 734 xas_unlock_irq(&xas); << 735 return page; << 736 } << 737 EXPORT_SYMBOL_GPL(dax_layout_busy_page_range); << 738 << 739 struct page *dax_layout_busy_page(struct addre << 740 { << 741 return dax_layout_busy_page_range(mapp << 742 } 465 } 743 EXPORT_SYMBOL_GPL(dax_layout_busy_page); << 744 466 745 static int __dax_invalidate_entry(struct addre !! 467 static int __dax_invalidate_mapping_entry(struct address_space *mapping, 746 pgof 468 pgoff_t index, bool trunc) 747 { 469 { 748 XA_STATE(xas, &mapping->i_pages, index << 749 int ret = 0; 470 int ret = 0; 750 void *entry; 471 void *entry; >> 472 struct radix_tree_root *page_tree = &mapping->page_tree; 751 473 752 xas_lock_irq(&xas); !! 474 spin_lock_irq(&mapping->tree_lock); 753 entry = get_unlocked_entry(&xas, 0); !! 475 entry = get_unlocked_mapping_entry(mapping, index, NULL); 754 if (!entry || WARN_ON_ONCE(!xa_is_valu !! 476 if (!entry || !radix_tree_exceptional_entry(entry)) 755 goto out; 477 goto out; 756 if (!trunc && 478 if (!trunc && 757 (xas_get_mark(&xas, PAGECACHE_TAG_ !! 479 (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) || 758 xas_get_mark(&xas, PAGECACHE_TAG_ !! 480 radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))) 759 goto out; 481 goto out; 760 dax_disassociate_entry(entry, mapping, !! 482 radix_tree_delete(page_tree, index); 761 xas_store(&xas, NULL); !! 483 mapping->nrexceptional--; 762 mapping->nrpages -= 1UL << dax_entry_o << 763 ret = 1; 484 ret = 1; 764 out: 485 out: 765 put_unlocked_entry(&xas, entry, WAKE_A !! 486 put_unlocked_mapping_entry(mapping, index, entry); 766 xas_unlock_irq(&xas); !! 487 spin_unlock_irq(&mapping->tree_lock); 767 return ret; 488 return ret; 768 } 489 } 769 << 770 static int __dax_clear_dirty_range(struct addr << 771 pgoff_t start, pgoff_t end) << 772 { << 773 XA_STATE(xas, &mapping->i_pages, start << 774 unsigned int scanned = 0; << 775 void *entry; << 776 << 777 xas_lock_irq(&xas); << 778 xas_for_each(&xas, entry, end) { << 779 entry = get_unlocked_entry(&xa << 780 xas_clear_mark(&xas, PAGECACHE << 781 xas_clear_mark(&xas, PAGECACHE << 782 put_unlocked_entry(&xas, entry << 783 << 784 if (++scanned % XA_CHECK_SCHED << 785 continue; << 786 << 787 xas_pause(&xas); << 788 xas_unlock_irq(&xas); << 789 cond_resched(); << 790 xas_lock_irq(&xas); << 791 } << 792 xas_unlock_irq(&xas); << 793 << 794 return 0; << 795 } << 796 << 797 /* 490 /* 798 * Delete DAX entry at @index from @mapping. !! 491 * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree 799 * to be unlocked before deleting it. !! 492 * entry to get unlocked before deleting it. 
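/*
 * Editorial aside, not part of fs/dax.c: a model of the check made by
 * __dax_invalidate_entry() above.  Truncation always removes the entry;
 * plain invalidation skips entries that are still tagged dirty or queued
 * for writeback.
 */
#include <assert.h>
#include <stdbool.h>

static bool may_remove(bool trunc, bool tag_dirty, bool tag_towrite)
{
        if (trunc)
                return true;                    /* truncate/punch hole: always drop */
        return !tag_dirty && !tag_towrite;      /* invalidate: clean entries only */
}

int main(void)
{
        assert(may_remove(true, true, true));   /* dirty entries still go on truncate */
        assert(!may_remove(false, true, false));
        assert(may_remove(false, false, false));
        return 0;
}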
800 */ 493 */ 801 int dax_delete_mapping_entry(struct address_sp 494 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) 802 { 495 { 803 int ret = __dax_invalidate_entry(mappi !! 496 int ret = __dax_invalidate_mapping_entry(mapping, index, true); 804 497 805 /* 498 /* 806 * This gets called from truncate / pu 499 * This gets called from truncate / punch_hole path. As such, the caller 807 * must hold locks protecting against 500 * must hold locks protecting against concurrent modifications of the 808 * page cache (usually fs-private i_mm !! 501 * radix tree (usually fs-private i_mmap_sem for writing). Since the 809 * caller has seen a DAX entry for thi !! 502 * caller has seen exceptional entry for this index, we better find it 810 * at that index as well... 503 * at that index as well... 811 */ 504 */ 812 WARN_ON_ONCE(!ret); 505 WARN_ON_ONCE(!ret); 813 return ret; 506 return ret; 814 } 507 } 815 508 816 /* 509 /* 817 * Invalidate DAX entry if it is clean. !! 510 * Invalidate exceptional DAX entry if it is clean. 818 */ 511 */ 819 int dax_invalidate_mapping_entry_sync(struct a 512 int dax_invalidate_mapping_entry_sync(struct address_space *mapping, 820 pgoff_t 513 pgoff_t index) 821 { 514 { 822 return __dax_invalidate_entry(mapping, !! 515 return __dax_invalidate_mapping_entry(mapping, index, false); 823 } 516 } 824 517 825 static pgoff_t dax_iomap_pgoff(const struct io !! 518 /* >> 519 * The user has performed a load from a hole in the file. Allocating >> 520 * a new page in the file would cause excessive storage usage for >> 521 * workloads with sparse files. We allocate a page cache page instead. >> 522 * We'll kick it out of the page cache if it's ever written to, >> 523 * otherwise it will simply fall out of the page cache under memory >> 524 * pressure without ever having been dirtied. >> 525 */ >> 526 static int dax_load_hole(struct address_space *mapping, void **entry, >> 527 struct vm_fault *vmf) 826 { 528 { 827 return PHYS_PFN(iomap->addr + (pos & P !! 529 struct page *page; 828 } !! 530 int ret; 829 531 830 static int copy_cow_page_dax(struct vm_fault * !! 532 /* Hole page already exists? Return it... */ 831 { !! 533 if (!radix_tree_exceptional_entry(*entry)) { 832 pgoff_t pgoff = dax_iomap_pgoff(&iter- !! 534 page = *entry; 833 void *vto, *kaddr; !! 535 goto out; 834 long rc; !! 536 } 835 int id; << 836 537 837 id = dax_read_lock(); !! 538 /* This will replace locked radix tree entry with a hole page */ 838 rc = dax_direct_access(iter->iomap.dax !! 539 page = find_or_create_page(mapping, vmf->pgoff, 839 &kaddr, NULL); !! 540 vmf->gfp_mask | __GFP_ZERO); 840 if (rc < 0) { !! 541 if (!page) 841 dax_read_unlock(id); !! 542 return VM_FAULT_OOM; 842 return rc; !! 543 out: >> 544 vmf->page = page; >> 545 ret = finish_fault(vmf); >> 546 vmf->page = NULL; >> 547 *entry = page; >> 548 if (!ret) { >> 549 /* Grab reference for PTE that is now referencing the page */ >> 550 get_page(page); >> 551 return VM_FAULT_NOPAGE; 843 } 552 } 844 vto = kmap_atomic(vmf->cow_page); !! 553 return ret; 845 copy_user_page(vto, kaddr, vmf->addres << 846 kunmap_atomic(vto); << 847 dax_read_unlock(id); << 848 return 0; << 849 } 554 } 850 555 851 /* !! 556 static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size, 852 * MAP_SYNC on a dax mapping guarantees dirty !! 
557 struct page *to, unsigned long vaddr) 853 * flushed on write-faults (non-cow), but not << 854 */ << 855 static bool dax_fault_is_synchronous(const str << 856 struct vm_area_struct *vma) << 857 { 558 { 858 return (iter->flags & IOMAP_WRITE) && !! 559 struct blk_dax_ctl dax = { 859 (iter->iomap.flags & IOMAP_F_D !! 560 .sector = sector, >> 561 .size = size, >> 562 }; >> 563 void *vto; >> 564 >> 565 if (dax_map_atomic(bdev, &dax) < 0) >> 566 return PTR_ERR(dax.addr); >> 567 vto = kmap_atomic(to); >> 568 copy_user_page(vto, (void __force *)dax.addr, vaddr, to); >> 569 kunmap_atomic(vto); >> 570 dax_unmap_atomic(bdev, &dax); >> 571 return 0; 860 } 572 } 861 573 862 /* 574 /* 863 * By this point grab_mapping_entry() has ensu 575 * By this point grab_mapping_entry() has ensured that we have a locked entry 864 * of the appropriate size so we don't have to 576 * of the appropriate size so we don't have to worry about downgrading PMDs to 865 * PTEs. If we happen to be trying to insert 577 * PTEs. If we happen to be trying to insert a PTE and there is a PMD 866 * already in the tree, we will skip the inser 578 * already in the tree, we will skip the insertion and just dirty the PMD as 867 * appropriate. 579 * appropriate. 868 */ 580 */ 869 static void *dax_insert_entry(struct xa_state !! 581 static void *dax_insert_mapping_entry(struct address_space *mapping, 870 const struct iomap_iter *iter, !! 582 struct vm_fault *vmf, 871 unsigned long flags) !! 583 void *entry, sector_t sector, >> 584 unsigned long flags) 872 { 585 { 873 struct address_space *mapping = vmf->v !! 586 struct radix_tree_root *page_tree = &mapping->page_tree; 874 void *new_entry = dax_make_entry(pfn, !! 587 int error = 0; 875 bool write = iter->flags & IOMAP_WRITE !! 588 bool hole_fill = false; 876 bool dirty = write && !dax_fault_is_sy !! 589 void *new_entry; 877 bool shared = iter->iomap.flags & IOMA !! 590 pgoff_t index = vmf->pgoff; 878 591 879 if (dirty) !! 592 if (vmf->flags & FAULT_FLAG_WRITE) 880 __mark_inode_dirty(mapping->ho 593 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 881 594 882 if (shared || (dax_is_zero_entry(entry !! 595 /* Replacing hole page with block mapping? */ 883 unsigned long index = xas->xa_ !! 596 if (!radix_tree_exceptional_entry(entry)) { 884 /* we are replacing a zero pag !! 597 hole_fill = true; 885 if (dax_is_pmd_entry(entry)) << 886 unmap_mapping_pages(ma << 887 PG_PMD << 888 else /* pte entry */ << 889 unmap_mapping_pages(ma << 890 } << 891 << 892 xas_reset(xas); << 893 xas_lock_irq(xas); << 894 if (shared || dax_is_zero_entry(entry) << 895 void *old; << 896 << 897 dax_disassociate_entry(entry, << 898 dax_associate_entry(new_entry, << 899 shared); << 900 /* 598 /* 901 * Only swap our new entry int !! 599 * Unmap the page now before we remove it from page cache below. >> 600 * The page is locked so it cannot be faulted in again. 
>> 601 */ >> 602 unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, >> 603 PAGE_SIZE, 0); >> 604 error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM); >> 605 if (error) >> 606 return ERR_PTR(error); >> 607 } else if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_HZP)) { >> 608 /* replacing huge zero page with PMD block mapping */ >> 609 unmap_mapping_range(mapping, >> 610 (vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0); >> 611 } >> 612 >> 613 spin_lock_irq(&mapping->tree_lock); >> 614 new_entry = dax_radix_locked_entry(sector, flags); >> 615 >> 616 if (hole_fill) { >> 617 __delete_from_page_cache(entry, NULL); >> 618 /* Drop pagecache reference */ >> 619 put_page(entry); >> 620 error = __radix_tree_insert(page_tree, index, >> 621 dax_radix_order(new_entry), new_entry); >> 622 if (error) { >> 623 new_entry = ERR_PTR(error); >> 624 goto unlock; >> 625 } >> 626 mapping->nrexceptional++; >> 627 } else if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { >> 628 /* >> 629 * Only swap our new entry into the radix tree if the current 902 * entry is a zero page or an 630 * entry is a zero page or an empty entry. If a normal PTE or 903 * PMD entry is already in the !! 631 * PMD entry is already in the tree, we leave it alone. This 904 * means that if we are trying 632 * means that if we are trying to insert a PTE and the 905 * existing entry is a PMD, we 633 * existing entry is a PMD, we will just leave the PMD in the 906 * tree and dirty it if necess 634 * tree and dirty it if necessary. 907 */ 635 */ 908 old = dax_lock_entry(xas, new_ !! 636 struct radix_tree_node *node; 909 WARN_ON_ONCE(old != xa_mk_valu !! 637 void **slot; 910 DAX_LO !! 638 void *ret; 911 entry = new_entry; !! 639 912 } else { !! 640 ret = __radix_tree_lookup(page_tree, index, &node, &slot); 913 xas_load(xas); /* Walk the xa !! 641 WARN_ON_ONCE(ret != entry); >> 642 __radix_tree_replace(page_tree, node, slot, >> 643 new_entry, NULL, NULL); 914 } 644 } >> 645 if (vmf->flags & FAULT_FLAG_WRITE) >> 646 radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); >> 647 unlock: >> 648 spin_unlock_irq(&mapping->tree_lock); >> 649 if (hole_fill) { >> 650 radix_tree_preload_end(); >> 651 /* >> 652 * We don't need hole page anymore, it has been replaced with >> 653 * locked radix tree entry now. >> 654 */ >> 655 if (mapping->a_ops->freepage) >> 656 mapping->a_ops->freepage(entry); >> 657 unlock_page(entry); >> 658 put_page(entry); >> 659 } >> 660 return new_entry; >> 661 } 915 662 916 if (dirty) !! 663 static inline unsigned long 917 xas_set_mark(xas, PAGECACHE_TA !! 664 pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma) 918 !! 665 { 919 if (write && shared) !! 666 unsigned long address; 920 xas_set_mark(xas, PAGECACHE_TA << 921 667 922 xas_unlock_irq(xas); !! 668 address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); 923 return entry; !! 669 VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); >> 670 return address; 924 } 671 } 925 672 926 static int dax_writeback_one(struct xa_state * !! 673 /* Walk all mappings of a given index of a file and writeprotect them */ 927 struct address_space *mapping, !! 
674 static void dax_mapping_entry_mkclean(struct address_space *mapping, >> 675 pgoff_t index, unsigned long pfn) 928 { 676 { 929 unsigned long pfn, index, count, end; << 930 long ret = 0; << 931 struct vm_area_struct *vma; 677 struct vm_area_struct *vma; >> 678 pte_t pte, *ptep = NULL; >> 679 pmd_t *pmdp = NULL; >> 680 spinlock_t *ptl; >> 681 bool changed; >> 682 >> 683 i_mmap_lock_read(mapping); >> 684 vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) { >> 685 unsigned long address; >> 686 >> 687 cond_resched(); >> 688 >> 689 if (!(vma->vm_flags & VM_SHARED)) >> 690 continue; >> 691 >> 692 address = pgoff_address(index, vma); >> 693 changed = false; >> 694 if (follow_pte_pmd(vma->vm_mm, address, &ptep, &pmdp, &ptl)) >> 695 continue; >> 696 >> 697 if (pmdp) { >> 698 #ifdef CONFIG_FS_DAX_PMD >> 699 pmd_t pmd; >> 700 >> 701 if (pfn != pmd_pfn(*pmdp)) >> 702 goto unlock_pmd; >> 703 if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp)) >> 704 goto unlock_pmd; >> 705 >> 706 flush_cache_page(vma, address, pfn); >> 707 pmd = pmdp_huge_clear_flush(vma, address, pmdp); >> 708 pmd = pmd_wrprotect(pmd); >> 709 pmd = pmd_mkclean(pmd); >> 710 set_pmd_at(vma->vm_mm, address, pmdp, pmd); >> 711 changed = true; >> 712 unlock_pmd: >> 713 spin_unlock(ptl); >> 714 #endif >> 715 } else { >> 716 if (pfn != pte_pfn(*ptep)) >> 717 goto unlock_pte; >> 718 if (!pte_dirty(*ptep) && !pte_write(*ptep)) >> 719 goto unlock_pte; >> 720 >> 721 flush_cache_page(vma, address, pfn); >> 722 pte = ptep_clear_flush(vma, address, ptep); >> 723 pte = pte_wrprotect(pte); >> 724 pte = pte_mkclean(pte); >> 725 set_pte_at(vma->vm_mm, address, ptep, pte); >> 726 changed = true; >> 727 unlock_pte: >> 728 pte_unmap_unlock(ptep, ptl); >> 729 } >> 730 >> 731 if (changed) >> 732 mmu_notifier_invalidate_page(vma->vm_mm, address); >> 733 } >> 734 i_mmap_unlock_read(mapping); >> 735 } >> 736 >> 737 static int dax_writeback_one(struct block_device *bdev, >> 738 struct address_space *mapping, pgoff_t index, void *entry) >> 739 { >> 740 struct radix_tree_root *page_tree = &mapping->page_tree; >> 741 struct blk_dax_ctl dax; >> 742 void *entry2, **slot; >> 743 int ret = 0; 932 744 933 /* 745 /* 934 * A page got tagged dirty in DAX mapp 746 * A page got tagged dirty in DAX mapping? Something is seriously 935 * wrong. 747 * wrong. 936 */ 748 */ 937 if (WARN_ON(!xa_is_value(entry))) !! 749 if (WARN_ON(!radix_tree_exceptional_entry(entry))) 938 return -EIO; 750 return -EIO; 939 751 940 if (unlikely(dax_is_locked(entry))) { !! 752 spin_lock_irq(&mapping->tree_lock); 941 void *old_entry = entry; !! 753 entry2 = get_unlocked_mapping_entry(mapping, index, &slot); 942 !! 754 /* Entry got punched out / reallocated? */ 943 entry = get_unlocked_entry(xas !! 755 if (!entry2 || !radix_tree_exceptional_entry(entry2)) 944 !! 756 goto put_unlocked; 945 /* Entry got punched out / rea !! 757 /* 946 if (!entry || WARN_ON_ONCE(!xa !! 758 * Entry got reallocated elsewhere? No need to writeback. We have to 947 goto put_unlocked; !! 759 * compare sectors as we must not bail out due to difference in lockbit 948 /* !! 760 * or entry type. 949 * Entry got reallocated elsew !! 761 */ 950 * We have to compare pfns as !! 762 if (dax_radix_sector(entry2) != dax_radix_sector(entry)) 951 * difference in lockbit or en !! 763 goto put_unlocked; 952 */ !! 764 if (WARN_ON_ONCE(dax_is_empty_entry(entry) || 953 if (dax_to_pfn(old_entry) != d !! 765 dax_is_zero_entry(entry))) { 954 goto put_unlocked; !! 766 ret = -EIO; 955 if (WARN_ON_ONCE(dax_is_empty_ !! 
767 goto put_unlocked; 956 dax_is << 957 ret = -EIO; << 958 goto put_unlocked; << 959 } << 960 << 961 /* Another fsync thread may ha << 962 if (!xas_get_mark(xas, PAGECAC << 963 goto put_unlocked; << 964 } 768 } 965 769 >> 770 /* Another fsync thread may have already written back this entry */ >> 771 if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) >> 772 goto put_unlocked; 966 /* Lock the entry to serialize with pa 773 /* Lock the entry to serialize with page faults */ 967 dax_lock_entry(xas, entry); !! 774 entry = lock_slot(mapping, slot); 968 << 969 /* 775 /* 970 * We can clear the tag now but we hav 776 * We can clear the tag now but we have to be careful so that concurrent 971 * dax_writeback_one() calls for the s 777 * dax_writeback_one() calls for the same index cannot finish before we 972 * actually flush the caches. This is 778 * actually flush the caches. This is achieved as the calls will look 973 * at the entry only under the i_pages !! 779 * at the entry only under tree_lock and once they do that they will 974 * they will see the entry locked and !! 780 * see the entry locked and wait for it to unlock. 975 */ 781 */ 976 xas_clear_mark(xas, PAGECACHE_TAG_TOWR !! 782 radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE); 977 xas_unlock_irq(xas); !! 783 spin_unlock_irq(&mapping->tree_lock); 978 784 979 /* 785 /* 980 * If dax_writeback_mapping_range() wa !! 786 * Even if dax_writeback_mapping_range() was given a wbc->range_start 981 * in the middle of a PMD, the 'index' !! 787 * in the middle of a PMD, the 'index' we are given will be aligned to 982 * aligned to the start of the PMD. !! 788 * the start index of the PMD, as will the sector we pull from 983 * This allows us to flush for PMD_SIZ !! 789 * 'entry'. This allows us to flush for PMD_SIZE and not have to 984 * partial PMD writebacks. !! 790 * worry about partial PMD writebacks. 985 */ 791 */ 986 pfn = dax_to_pfn(entry); !! 792 dax.sector = dax_radix_sector(entry); 987 count = 1UL << dax_entry_order(entry); !! 793 dax.size = PAGE_SIZE << dax_radix_order(entry); 988 index = xas->xa_index & ~(count - 1); << 989 end = index + count - 1; << 990 794 991 /* Walk all mappings of a given index !! 795 /* 992 i_mmap_lock_read(mapping); !! 796 * We cannot hold tree_lock while calling dax_map_atomic() because it 993 vma_interval_tree_foreach(vma, &mappin !! 797 * eventually calls cond_resched(). 994 pfn_mkclean_range(pfn, count, !! 798 */ 995 cond_resched(); !! 799 ret = dax_map_atomic(bdev, &dax); >> 800 if (ret < 0) { >> 801 put_locked_mapping_entry(mapping, index, entry); >> 802 return ret; >> 803 } >> 804 >> 805 if (WARN_ON_ONCE(ret < dax.size)) { >> 806 ret = -EIO; >> 807 goto unmap; 996 } 808 } 997 i_mmap_unlock_read(mapping); << 998 809 999 dax_flush(dax_dev, page_address(pfn_to !! 810 dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(dax.pfn)); >> 811 wb_cache_pmem(dax.addr, dax.size); 1000 /* 812 /* 1001 * After we have flushed the cache, w 813 * After we have flushed the cache, we can clear the dirty tag. There 1002 * cannot be new dirty data in the pf 814 * cannot be new dirty data in the pfn after the flush has completed as 1003 * the pfn mappings are writeprotecte 815 * the pfn mappings are writeprotected and fault waits for mapping 1004 * entry lock. 816 * entry lock. 1005 */ 817 */ 1006 xas_reset(xas); !! 818 spin_lock_irq(&mapping->tree_lock); 1007 xas_lock_irq(xas); !! 819 radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY); 1008 xas_store(xas, entry); !! 
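/*
 * Editorial aside, not part of fs/dax.c: a model of the range alignment done
 * by dax_writeback_one() above.  Whatever index the writeback walk stopped
 * at, the flush covers the whole entry, so a PMD entry is always written
 * back as one aligned 2MiB range.  PMD_ORDER of 9 (x86-64) is assumed.
 */
#include <assert.h>

#define PMD_ORDER 9     /* 2MiB PMD / 4KiB pages */

struct wb_range { unsigned long start, end; };  /* inclusive page indices */

static struct wb_range writeback_range(unsigned long index, unsigned int order)
{
        unsigned long count = 1UL << order;
        unsigned long start = index & ~(count - 1);

        return (struct wb_range){ start, start + count - 1 };
}

int main(void)
{
        /* a PTE entry covers exactly its own index */
        assert(writeback_range(1234, 0).start == 1234);
        assert(writeback_range(1234, 0).end == 1234);

        /* an index in the middle of a PMD entry flushes the whole PMD */
        struct wb_range r = writeback_range(0x212, PMD_ORDER);
        assert(r.start == 0x200 && r.end == 0x3ff);
        return 0;
}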
820 spin_unlock_irq(&mapping->tree_lock); 1009 xas_clear_mark(xas, PAGECACHE_TAG_DIR !! 821 unmap: 1010 dax_wake_entry(xas, entry, WAKE_NEXT) !! 822 dax_unmap_atomic(bdev, &dax); 1011 !! 823 put_locked_mapping_entry(mapping, index, entry); 1012 trace_dax_writeback_one(mapping->host << 1013 return ret; 824 return ret; 1014 825 1015 put_unlocked: 826 put_unlocked: 1016 put_unlocked_entry(xas, entry, WAKE_N !! 827 put_unlocked_mapping_entry(mapping, index, entry2); >> 828 spin_unlock_irq(&mapping->tree_lock); 1017 return ret; 829 return ret; 1018 } 830 } 1019 831 1020 /* 832 /* 1021 * Flush the mapping to the persistent domain 833 * Flush the mapping to the persistent domain within the byte range of [start, 1022 * end]. This is required by data integrity o 834 * end]. This is required by data integrity operations to ensure file data is 1023 * on persistent storage prior to completion 835 * on persistent storage prior to completion of the operation. 1024 */ 836 */ 1025 int dax_writeback_mapping_range(struct addres 837 int dax_writeback_mapping_range(struct address_space *mapping, 1026 struct dax_device *dax_dev, s !! 838 struct block_device *bdev, struct writeback_control *wbc) 1027 { 839 { 1028 XA_STATE(xas, &mapping->i_pages, wbc- << 1029 struct inode *inode = mapping->host; 840 struct inode *inode = mapping->host; 1030 pgoff_t end_index = wbc->range_end >> !! 841 pgoff_t start_index, end_index; 1031 void *entry; !! 842 pgoff_t indices[PAGEVEC_SIZE]; 1032 int ret = 0; !! 843 struct pagevec pvec; 1033 unsigned int scanned = 0; !! 844 bool done = false; >> 845 int i, ret = 0; 1034 846 1035 if (WARN_ON_ONCE(inode->i_blkbits != 847 if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT)) 1036 return -EIO; 848 return -EIO; 1037 849 1038 if (mapping_empty(mapping) || wbc->sy !! 850 if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL) 1039 return 0; 851 return 0; 1040 852 1041 trace_dax_writeback_range(inode, xas. !! 853 start_index = wbc->range_start >> PAGE_SHIFT; 1042 !! 854 end_index = wbc->range_end >> PAGE_SHIFT; 1043 tag_pages_for_writeback(mapping, xas. << 1044 << 1045 xas_lock_irq(&xas); << 1046 xas_for_each_marked(&xas, entry, end_ << 1047 ret = dax_writeback_one(&xas, << 1048 if (ret < 0) { << 1049 mapping_set_error(map << 1050 break; << 1051 } << 1052 if (++scanned % XA_CHECK_SCHE << 1053 continue; << 1054 << 1055 xas_pause(&xas); << 1056 xas_unlock_irq(&xas); << 1057 cond_resched(); << 1058 xas_lock_irq(&xas); << 1059 } << 1060 xas_unlock_irq(&xas); << 1061 trace_dax_writeback_range_done(inode, << 1062 return ret; << 1063 } << 1064 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range << 1065 << 1066 static int dax_iomap_direct_access(const stru << 1067 size_t size, void **kaddr, pf << 1068 { << 1069 pgoff_t pgoff = dax_iomap_pgoff(iomap << 1070 int id, rc = 0; << 1071 long length; << 1072 << 1073 id = dax_read_lock(); << 1074 length = dax_direct_access(iomap->dax << 1075 DAX_ACCESS << 1076 if (length < 0) { << 1077 rc = length; << 1078 goto out; << 1079 } << 1080 if (!pfnp) << 1081 goto out_check_addr; << 1082 rc = -EINVAL; << 1083 if (PFN_PHYS(length) < size) << 1084 goto out; << 1085 if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(s << 1086 goto out; << 1087 /* For larger pages we need devmap */ << 1088 if (length > 1 && !pfn_t_devmap(*pfnp << 1089 goto out; << 1090 rc = 0; << 1091 << 1092 out_check_addr: << 1093 if (!kaddr) << 1094 goto out; << 1095 if (!*kaddr) << 1096 rc = -EFAULT; << 1097 out: << 1098 dax_read_unlock(id); << 1099 return rc; << 1100 } << 1101 855 1102 /** !! 
856 tag_pages_for_writeback(mapping, start_index, end_index); 1103 * dax_iomap_copy_around - Prepare for an una << 1104 * by copying the data before and after the r << 1105 * @pos: address to do copy from. << 1106 * @length: size of copy operation. << 1107 * @align_size: aligned w.r.t align_size (eit << 1108 * @srcmap: iomap srcmap << 1109 * @daddr: destination address to copy t << 1110 * << 1111 * This can be called from two places. Either << 1112 * aligned), to copy the length size data to << 1113 * write operation, dax_iomap_iter() might ca << 1114 * start or end unaligned address. In the lat << 1115 * aligned ranges is taken care by dax_iomap_ << 1116 * If the srcmap contains invalid data, such << 1117 * area to make sure no old data remains. << 1118 */ << 1119 static int dax_iomap_copy_around(loff_t pos, << 1120 const struct iomap *srcmap, v << 1121 { << 1122 loff_t head_off = pos & (align_size - << 1123 size_t size = ALIGN(head_off + length << 1124 loff_t end = pos + length; << 1125 loff_t pg_end = round_up(end, align_s << 1126 /* copy_all is usually in page fault << 1127 bool copy_all = head_off == 0 && end << 1128 /* zero the edges if srcmap is a HOLE << 1129 bool zero_edge = srcmap->flags & IOMA << 1130 srcmap->type == IOMA << 1131 void *saddr = NULL; << 1132 int ret = 0; << 1133 857 1134 if (!zero_edge) { !! 858 pagevec_init(&pvec, 0); 1135 ret = dax_iomap_direct_access !! 859 while (!done) { 1136 if (ret) !! 860 pvec.nr = find_get_entries_tag(mapping, start_index, 1137 return dax_mem2blk_er !! 861 PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE, 1138 } !! 862 pvec.pages, indices); 1139 863 1140 if (copy_all) { !! 864 if (pvec.nr == 0) 1141 if (zero_edge) !! 865 break; 1142 memset(daddr, 0, size << 1143 else << 1144 ret = copy_mc_to_kern << 1145 goto out; << 1146 } << 1147 866 1148 /* Copy the head part of the range */ !! 867 for (i = 0; i < pvec.nr; i++) { 1149 if (head_off) { !! 868 if (indices[i] > end_index) { 1150 if (zero_edge) !! 869 done = true; 1151 memset(daddr, 0, head !! 870 break; 1152 else { !! 871 } 1153 ret = copy_mc_to_kern << 1154 if (ret) << 1155 return -EIO; << 1156 } << 1157 } << 1158 872 1159 /* Copy the tail part of the range */ !! 873 ret = dax_writeback_one(bdev, mapping, indices[i], 1160 if (end < pg_end) { !! 874 pvec.pages[i]); 1161 loff_t tail_off = head_off + !! 875 if (ret < 0) 1162 loff_t tail_len = pg_end - en !! 876 return ret; 1163 << 1164 if (zero_edge) << 1165 memset(daddr + tail_o << 1166 else { << 1167 ret = copy_mc_to_kern << 1168 << 1169 if (ret) << 1170 return -EIO; << 1171 } 877 } >> 878 start_index = indices[pvec.nr - 1] + 1; 1172 } 879 } 1173 out: !! 880 return 0; 1174 if (zero_edge) << 1175 dax_flush(srcmap->dax_dev, da << 1176 return ret ? -EIO : 0; << 1177 } 881 } >> 882 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); 1178 883 1179 /* !! 884 static int dax_insert_mapping(struct address_space *mapping, 1180 * The user has performed a load from a hole !! 885 struct block_device *bdev, sector_t sector, size_t size, 1181 * page in the file would cause excessive sto !! 886 void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf) 1182 * sparse files. Instead we insert a read-on << 1183 * If this page is ever written to we will re << 1184 * point to real DAX storage instead. 
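/*
 * Illustrative sketch (not part of fs/dax.c): dax_writeback_mapping_range()
 * above is normally driven from a filesystem's ->writepages method, using the
 * current (dax_device based) signature shown in the left-hand column.  The
 * filesystem type "example_sb_info" and its s_daxdev member are hypothetical
 * placeholders for wherever the filesystem keeps its struct dax_device.
 */
static int example_dax_writepages(struct address_space *mapping,
				  struct writeback_control *wbc)
{
	struct example_sb_info *sbi = mapping->host->i_sb->s_fs_info;

	/* Flush dirty DAX entries in wbc's range out to persistent media. */
	return dax_writeback_mapping_range(mapping, sbi->s_daxdev, wbc);
}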
<< 1185 */ << 1186 static vm_fault_t dax_load_hole(struct xa_sta << 1187 const struct iomap_iter *iter << 1188 { 887 { 1189 struct inode *inode = iter->inode; << 1190 unsigned long vaddr = vmf->address; 888 unsigned long vaddr = vmf->address; 1191 pfn_t pfn = pfn_to_pfn_t(my_zero_pfn( !! 889 struct blk_dax_ctl dax = { 1192 vm_fault_t ret; !! 890 .sector = sector, >> 891 .size = size, >> 892 }; >> 893 void *ret; >> 894 void *entry = *entryp; 1193 895 1194 *entry = dax_insert_entry(xas, vmf, i !! 896 if (dax_map_atomic(bdev, &dax) < 0) >> 897 return PTR_ERR(dax.addr); >> 898 dax_unmap_atomic(bdev, &dax); >> 899 >> 900 ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector, 0); >> 901 if (IS_ERR(ret)) >> 902 return PTR_ERR(ret); >> 903 *entryp = ret; 1195 904 1196 ret = vmf_insert_mixed(vmf->vma, vadd !! 905 return vm_insert_mixed(vma, vaddr, dax.pfn); 1197 trace_dax_load_hole(inode, vmf, ret); << 1198 return ret; << 1199 } 906 } 1200 907 1201 #ifdef CONFIG_FS_DAX_PMD !! 908 /** 1202 static vm_fault_t dax_pmd_load_hole(struct xa !! 909 * dax_pfn_mkwrite - handle first write to DAX page 1203 const struct iomap_iter *iter !! 910 * @vmf: The description of the fault >> 911 */ >> 912 int dax_pfn_mkwrite(struct vm_fault *vmf) 1204 { 913 { 1205 struct address_space *mapping = vmf-> !! 914 struct file *file = vmf->vma->vm_file; 1206 unsigned long pmd_addr = vmf->address !! 915 struct address_space *mapping = file->f_mapping; 1207 struct vm_area_struct *vma = vmf->vma !! 916 void *entry, **slot; 1208 struct inode *inode = mapping->host; !! 917 pgoff_t index = vmf->pgoff; 1209 pgtable_t pgtable = NULL; !! 918 1210 struct folio *zero_folio; !! 919 spin_lock_irq(&mapping->tree_lock); 1211 spinlock_t *ptl; !! 920 entry = get_unlocked_mapping_entry(mapping, index, &slot); 1212 pmd_t pmd_entry; !! 921 if (!entry || !radix_tree_exceptional_entry(entry)) { 1213 pfn_t pfn; !! 922 if (entry) 1214 !! 923 put_unlocked_mapping_entry(mapping, index, entry); 1215 zero_folio = mm_get_huge_zero_folio(v !! 924 spin_unlock_irq(&mapping->tree_lock); 1216 !! 925 return VM_FAULT_NOPAGE; 1217 if (unlikely(!zero_folio)) << 1218 goto fallback; << 1219 << 1220 pfn = page_to_pfn_t(&zero_folio->page << 1221 *entry = dax_insert_entry(xas, vmf, i << 1222 DAX_PMD | D << 1223 << 1224 if (arch_needs_pgtable_deposit()) { << 1225 pgtable = pte_alloc_one(vma-> << 1226 if (!pgtable) << 1227 return VM_FAULT_OOM; << 1228 } << 1229 << 1230 ptl = pmd_lock(vmf->vma->vm_mm, vmf-> << 1231 if (!pmd_none(*(vmf->pmd))) { << 1232 spin_unlock(ptl); << 1233 goto fallback; << 1234 } << 1235 << 1236 if (pgtable) { << 1237 pgtable_trans_huge_deposit(vm << 1238 mm_inc_nr_ptes(vma->vm_mm); << 1239 } 926 } 1240 pmd_entry = mk_pmd(&zero_folio->page, !! 927 radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); 1241 pmd_entry = pmd_mkhuge(pmd_entry); !! 928 entry = lock_slot(mapping, slot); 1242 set_pmd_at(vmf->vma->vm_mm, pmd_addr, !! 
929 spin_unlock_irq(&mapping->tree_lock); 1243 spin_unlock(ptl); << 1244 trace_dax_pmd_load_hole(inode, vmf, z << 1245 return VM_FAULT_NOPAGE; << 1246 << 1247 fallback: << 1248 if (pgtable) << 1249 pte_free(vma->vm_mm, pgtable) << 1250 trace_dax_pmd_load_hole_fallback(inod << 1251 return VM_FAULT_FALLBACK; << 1252 } << 1253 #else << 1254 static vm_fault_t dax_pmd_load_hole(struct xa << 1255 const struct iomap_iter *iter << 1256 { << 1257 return VM_FAULT_FALLBACK; << 1258 } << 1259 #endif /* CONFIG_FS_DAX_PMD */ << 1260 << 1261 static s64 dax_unshare_iter(struct iomap_iter << 1262 { << 1263 struct iomap *iomap = &iter->iomap; << 1264 const struct iomap *srcmap = iomap_it << 1265 loff_t copy_pos = iter->pos; << 1266 u64 copy_len = iomap_length(iter); << 1267 u32 mod; << 1268 int id = 0; << 1269 s64 ret = 0; << 1270 void *daddr = NULL, *saddr = NULL; << 1271 << 1272 if (!iomap_want_unshare_iter(iter)) << 1273 return iomap_length(iter); << 1274 << 1275 /* 930 /* 1276 * Extend the file range to be aligne !! 931 * If we race with somebody updating the PTE and finish_mkwrite_fault() 1277 * we need to copy entire blocks, not !! 932 * fails, we don't care. We need to return VM_FAULT_NOPAGE and retry 1278 * Invalidate the mapping because we' !! 933 * the fault in either case. 1279 */ 934 */ 1280 mod = offset_in_page(copy_pos); !! 935 finish_mkwrite_fault(vmf); 1281 if (mod) { !! 936 put_locked_mapping_entry(mapping, index, entry); 1282 copy_len += mod; !! 937 return VM_FAULT_NOPAGE; 1283 copy_pos -= mod; << 1284 } << 1285 << 1286 mod = offset_in_page(copy_pos + copy_ << 1287 if (mod) << 1288 copy_len += PAGE_SIZE - mod; << 1289 << 1290 invalidate_inode_pages2_range(iter->i << 1291 copy_po << 1292 (copy_p << 1293 << 1294 id = dax_read_lock(); << 1295 ret = dax_iomap_direct_access(iomap, << 1296 if (ret < 0) << 1297 goto out_unlock; << 1298 << 1299 ret = dax_iomap_direct_access(srcmap, << 1300 if (ret < 0) << 1301 goto out_unlock; << 1302 << 1303 if (copy_mc_to_kernel(daddr, saddr, c << 1304 ret = iomap_length(iter); << 1305 else << 1306 ret = -EIO; << 1307 << 1308 out_unlock: << 1309 dax_read_unlock(id); << 1310 return dax_mem2blk_err(ret); << 1311 } << 1312 << 1313 int dax_file_unshare(struct inode *inode, lof << 1314 const struct iomap_ops *ops) << 1315 { << 1316 struct iomap_iter iter = { << 1317 .inode = inode, << 1318 .pos = pos, << 1319 .flags = IOMAP_WRITE << 1320 }; << 1321 loff_t size = i_size_read(inode); << 1322 int ret; << 1323 << 1324 if (pos < 0 || pos >= size) << 1325 return 0; << 1326 << 1327 iter.len = min(len, size - pos); << 1328 while ((ret = iomap_iter(&iter, ops)) << 1329 iter.processed = dax_unshare_ << 1330 return ret; << 1331 } << 1332 EXPORT_SYMBOL_GPL(dax_file_unshare); << 1333 << 1334 static int dax_memzero(struct iomap_iter *ite << 1335 { << 1336 const struct iomap *iomap = &iter->io << 1337 const struct iomap *srcmap = iomap_it << 1338 unsigned offset = offset_in_page(pos) << 1339 pgoff_t pgoff = dax_iomap_pgoff(iomap << 1340 void *kaddr; << 1341 long ret; << 1342 << 1343 ret = dax_direct_access(iomap->dax_de << 1344 NULL); << 1345 if (ret < 0) << 1346 return dax_mem2blk_err(ret); << 1347 << 1348 memset(kaddr + offset, 0, size); << 1349 if (iomap->flags & IOMAP_F_SHARED) << 1350 ret = dax_iomap_copy_around(p << 1351 k << 1352 else << 1353 dax_flush(iomap->dax_dev, kad << 1354 return ret; << 1355 } 938 } >> 939 EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); 1356 940 1357 static s64 dax_zero_iter(struct iomap_iter *i !! 
941 static bool dax_range_is_aligned(struct block_device *bdev, >> 942 unsigned int offset, unsigned int length) 1358 { 943 { 1359 const struct iomap *iomap = &iter->io !! 944 unsigned short sector_size = bdev_logical_block_size(bdev); 1360 const struct iomap *srcmap = iomap_it << 1361 loff_t pos = iter->pos; << 1362 u64 length = iomap_length(iter); << 1363 s64 written = 0; << 1364 << 1365 /* already zeroed? we're done. */ << 1366 if (srcmap->type == IOMAP_HOLE || src << 1367 return length; << 1368 945 1369 /* !! 946 if (!IS_ALIGNED(offset, sector_size)) 1370 * invalidate the pages whose sharing !! 947 return false; 1371 * because of CoW. !! 948 if (!IS_ALIGNED(length, sector_size)) 1372 */ !! 949 return false; 1373 if (iomap->flags & IOMAP_F_SHARED) << 1374 invalidate_inode_pages2_range << 1375 << 1376 << 1377 << 1378 do { << 1379 unsigned offset = offset_in_p << 1380 unsigned size = min_t(u64, PA << 1381 pgoff_t pgoff = dax_iomap_pgo << 1382 long rc; << 1383 int id; << 1384 << 1385 id = dax_read_lock(); << 1386 if (IS_ALIGNED(pos, PAGE_SIZE << 1387 rc = dax_zero_page_ra << 1388 else << 1389 rc = dax_memzero(iter << 1390 dax_read_unlock(id); << 1391 950 1392 if (rc < 0) !! 951 return true; 1393 return rc; << 1394 pos += size; << 1395 length -= size; << 1396 written += size; << 1397 } while (length > 0); << 1398 << 1399 if (did_zero) << 1400 *did_zero = true; << 1401 return written; << 1402 } 952 } 1403 953 1404 int dax_zero_range(struct inode *inode, loff_ !! 954 int __dax_zero_page_range(struct block_device *bdev, sector_t sector, 1405 const struct iomap_ops *ops) !! 955 unsigned int offset, unsigned int length) 1406 { 956 { 1407 struct iomap_iter iter = { !! 957 struct blk_dax_ctl dax = { 1408 .inode = inode, !! 958 .sector = sector, 1409 .pos = pos, !! 959 .size = PAGE_SIZE, 1410 .len = len, << 1411 .flags = IOMAP_DAX | << 1412 }; 960 }; 1413 int ret; << 1414 961 1415 while ((ret = iomap_iter(&iter, ops)) !! 962 if (dax_range_is_aligned(bdev, offset, length)) { 1416 iter.processed = dax_zero_ite !! 963 sector_t start_sector = dax.sector + (offset >> 9); 1417 return ret; !! 964 >> 965 return blkdev_issue_zeroout(bdev, start_sector, >> 966 length >> 9, GFP_NOFS, true); >> 967 } else { >> 968 if (dax_map_atomic(bdev, &dax) < 0) >> 969 return PTR_ERR(dax.addr); >> 970 clear_pmem(dax.addr + offset, length); >> 971 dax_unmap_atomic(bdev, &dax); >> 972 } >> 973 return 0; 1418 } 974 } 1419 EXPORT_SYMBOL_GPL(dax_zero_range); !! 975 EXPORT_SYMBOL_GPL(__dax_zero_page_range); 1420 976 1421 int dax_truncate_page(struct inode *inode, lo !! 977 static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos) 1422 const struct iomap_ops *ops) << 1423 { 978 { 1424 unsigned int blocksize = i_blocksize( !! 979 return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9); 1425 unsigned int off = pos & (blocksize - << 1426 << 1427 /* Block boundary? Nothing to do */ << 1428 if (!off) << 1429 return 0; << 1430 return dax_zero_range(inode, pos, blo << 1431 } 980 } 1432 EXPORT_SYMBOL_GPL(dax_truncate_page); << 1433 981 1434 static loff_t dax_iomap_iter(const struct iom !! 982 static loff_t 1435 struct iov_iter *iter) !! 983 dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, >> 984 struct iomap *iomap) 1436 { 985 { 1437 const struct iomap *iomap = &iomi->io !! 
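/*
 * Illustrative sketch (not part of fs/dax.c): dax_zero_range() and
 * dax_truncate_page() above mirror the buffered iomap zeroing helpers.  A
 * filesystem shrinking a DAX file would typically zero the now-partial block
 * at the new EOF before updating i_size, roughly as below.  The function name
 * and "example_iomap_ops" are hypothetical stand-ins for the filesystem's own
 * truncate path and iomap_ops.
 */
static int example_dax_setsize(struct inode *inode, loff_t newsize)
{
	bool did_zero = false;
	int error;

	/* Zero from the new EOF to the end of that block. */
	error = dax_truncate_page(inode, newsize, &did_zero,
				  &example_iomap_ops);
	if (error)
		return error;

	truncate_setsize(inode, newsize);
	return 0;
}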
986 struct iov_iter *iter = data; 1438 const struct iomap *srcmap = iomap_it << 1439 loff_t length = iomap_length(iomi); << 1440 loff_t pos = iomi->pos; << 1441 struct dax_device *dax_dev = iomap->d << 1442 loff_t end = pos + length, done = 0; 987 loff_t end = pos + length, done = 0; 1443 bool write = iov_iter_rw(iter) == WRI << 1444 bool cow = write && iomap->flags & IO << 1445 ssize_t ret = 0; 988 ssize_t ret = 0; 1446 size_t xfer; << 1447 int id; << 1448 989 1449 if (!write) { !! 990 if (iov_iter_rw(iter) == READ) { 1450 end = min(end, i_size_read(io !! 991 end = min(end, i_size_read(inode)); 1451 if (pos >= end) 992 if (pos >= end) 1452 return 0; 993 return 0; 1453 994 1454 if (iomap->type == IOMAP_HOLE 995 if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) 1455 return iov_iter_zero( 996 return iov_iter_zero(min(length, end - pos), iter); 1456 } 997 } 1457 998 1458 /* !! 999 if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED)) 1459 * In DAX mode, enforce either pure o << 1460 * writes to unwritten extents as par << 1461 */ << 1462 if (WARN_ON_ONCE(iomap->type != IOMAP << 1463 !(iomap->flags & IOMA << 1464 return -EIO; 1000 return -EIO; 1465 1001 1466 /* 1002 /* 1467 * Write can allocate block for an ar 1003 * Write can allocate block for an area which has a hole page mapped 1468 * into page tables. We have to tear 1004 * into page tables. We have to tear down these mappings so that data 1469 * written by write(2) is visible in 1005 * written by write(2) is visible in mmap. 1470 */ 1006 */ 1471 if (iomap->flags & IOMAP_F_NEW || cow !! 1007 if (iomap->flags & IOMAP_F_NEW) { 1472 /* !! 1008 invalidate_inode_pages2_range(inode->i_mapping, 1473 * Filesystem allows CoW on n << 1474 * may have been mmapped with << 1475 * invalidate its dax entries << 1476 * in advance. << 1477 */ << 1478 if (cow) << 1479 __dax_clear_dirty_ran << 1480 << 1481 << 1482 invalidate_inode_pages2_range << 1483 1009 pos >> PAGE_SHIFT, 1484 1010 (end - 1) >> PAGE_SHIFT); 1485 } 1011 } 1486 1012 1487 id = dax_read_lock(); << 1488 while (pos < end) { 1013 while (pos < end) { 1489 unsigned offset = pos & (PAGE 1014 unsigned offset = pos & (PAGE_SIZE - 1); 1490 const size_t size = ALIGN(len !! 1015 struct blk_dax_ctl dax = { 0 }; 1491 pgoff_t pgoff = dax_iomap_pgo << 1492 ssize_t map_len; 1016 ssize_t map_len; 1493 bool recovery = false; << 1494 void *kaddr; << 1495 1017 1496 if (fatal_signal_pending(curr 1018 if (fatal_signal_pending(current)) { 1497 ret = -EINTR; 1019 ret = -EINTR; 1498 break; 1020 break; 1499 } 1021 } 1500 1022 1501 map_len = dax_direct_access(d !! 1023 dax.sector = dax_iomap_sector(iomap, pos); 1502 DAX_ACCESS, & !! 1024 dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK; 1503 if (map_len == -EHWPOISON && !! 1025 map_len = dax_map_atomic(iomap->bdev, &dax); 1504 map_len = dax_direct_ << 1505 PHYS_ << 1506 &kadd << 1507 if (map_len > 0) << 1508 recovery = tr << 1509 } << 1510 if (map_len < 0) { 1026 if (map_len < 0) { 1511 ret = dax_mem2blk_err !! 1027 ret = map_len; 1512 break; 1028 break; 1513 } 1029 } 1514 1030 1515 if (cow) { !! 1031 dax.addr += offset; 1516 ret = dax_iomap_copy_ << 1517 << 1518 if (ret) << 1519 break; << 1520 } << 1521 << 1522 map_len = PFN_PHYS(map_len); << 1523 kaddr += offset; << 1524 map_len -= offset; 1032 map_len -= offset; 1525 if (map_len > end - pos) 1033 if (map_len > end - pos) 1526 map_len = end - pos; 1034 map_len = end - pos; 1527 1035 1528 if (recovery) !! 1036 if (iov_iter_rw(iter) == WRITE) 1529 xfer = dax_recovery_w !! 
1037 map_len = copy_from_iter_pmem(dax.addr, map_len, iter); 1530 map_l << 1531 else if (write) << 1532 xfer = dax_copy_from_ << 1533 map_l << 1534 else 1038 else 1535 xfer = dax_copy_to_it !! 1039 map_len = copy_to_iter(dax.addr, map_len, iter); 1536 map_l !! 1040 dax_unmap_atomic(iomap->bdev, &dax); 1537 !! 1041 if (map_len <= 0) { 1538 pos += xfer; !! 1042 ret = map_len ? map_len : -EFAULT; 1539 length -= xfer; << 1540 done += xfer; << 1541 << 1542 if (xfer == 0) << 1543 ret = -EFAULT; << 1544 if (xfer < map_len) << 1545 break; 1043 break; >> 1044 } >> 1045 >> 1046 pos += map_len; >> 1047 length -= map_len; >> 1048 done += map_len; 1546 } 1049 } 1547 dax_read_unlock(id); << 1548 1050 1549 return done ? done : ret; 1051 return done ? done : ret; 1550 } 1052 } 1551 1053 1552 /** 1054 /** 1553 * dax_iomap_rw - Perform I/O to a DAX file 1055 * dax_iomap_rw - Perform I/O to a DAX file 1554 * @iocb: The control block for this I/ 1056 * @iocb: The control block for this I/O 1555 * @iter: The addresses to do I/O from 1057 * @iter: The addresses to do I/O from or to 1556 * @ops: iomap ops passed from the fil 1058 * @ops: iomap ops passed from the file system 1557 * 1059 * 1558 * This function performs read and write oper 1060 * This function performs read and write operations to directly mapped 1559 * persistent memory. The callers needs to t 1061 * persistent memory. The callers needs to take care of read/write exclusion 1560 * and evicting any page cache pages in the r 1062 * and evicting any page cache pages in the region under I/O. 1561 */ 1063 */ 1562 ssize_t 1064 ssize_t 1563 dax_iomap_rw(struct kiocb *iocb, struct iov_i 1065 dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, 1564 const struct iomap_ops *ops) 1066 const struct iomap_ops *ops) 1565 { 1067 { 1566 struct iomap_iter iomi = { !! 1068 struct address_space *mapping = iocb->ki_filp->f_mapping; 1567 .inode = iocb->ki_fi !! 1069 struct inode *inode = mapping->host; 1568 .pos = iocb->ki_po !! 1070 loff_t pos = iocb->ki_pos, ret = 0, done = 0; 1569 .len = iov_iter_co !! 1071 unsigned flags = 0; 1570 .flags = IOMAP_DAX, << 1571 }; << 1572 loff_t done = 0; << 1573 int ret; << 1574 << 1575 if (!iomi.len) << 1576 return 0; << 1577 1072 1578 if (iov_iter_rw(iter) == WRITE) { 1073 if (iov_iter_rw(iter) == WRITE) { 1579 lockdep_assert_held_write(&io !! 1074 lockdep_assert_held_exclusive(&inode->i_rwsem); 1580 iomi.flags |= IOMAP_WRITE; !! 1075 flags |= IOMAP_WRITE; 1581 } else { 1076 } else { 1582 lockdep_assert_held(&iomi.ino !! 1077 lockdep_assert_held(&inode->i_rwsem); 1583 } 1078 } 1584 1079 1585 if (iocb->ki_flags & IOCB_NOWAIT) !! 1080 while (iov_iter_count(iter)) { 1586 iomi.flags |= IOMAP_NOWAIT; !! 1081 ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops, 1587 !! 1082 iter, dax_iomap_actor); 1588 while ((ret = iomap_iter(&iomi, ops)) !! 1083 if (ret <= 0) 1589 iomi.processed = dax_iomap_it !! 1084 break; >> 1085 pos += ret; >> 1086 done += ret; >> 1087 } 1590 1088 1591 done = iomi.pos - iocb->ki_pos; !! 1089 iocb->ki_pos += done; 1592 iocb->ki_pos = iomi.pos; << 1593 return done ? done : ret; 1090 return done ? done : ret; 1594 } 1091 } 1595 EXPORT_SYMBOL_GPL(dax_iomap_rw); 1092 EXPORT_SYMBOL_GPL(dax_iomap_rw); 1596 1093 1597 static vm_fault_t dax_fault_return(int error) !! 1094 static int dax_fault_return(int error) 1598 { 1095 { 1599 if (error == 0) 1096 if (error == 0) 1600 return VM_FAULT_NOPAGE; 1097 return VM_FAULT_NOPAGE; 1601 return vmf_error(error); !! 1098 if (error == -ENOMEM) 1602 } !! 
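/*
 * Illustrative sketch (not part of fs/dax.c): dax_iomap_rw() above leaves
 * read/write exclusion to the caller, so a minimal ->read_iter for a DAX file
 * takes the inode lock shared around the call.  "example_iomap_ops" is a
 * hypothetical placeholder for the filesystem's iomap_ops.
 */
static ssize_t example_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	if (!iov_iter_count(to))
		return 0;	/* nothing to read, skip the atime update */

	/* Shared i_rwsem satisfies the lockdep assertion in dax_iomap_rw(). */
	inode_lock_shared(inode);
	ret = dax_iomap_rw(iocb, to, &example_iomap_ops);
	inode_unlock_shared(inode);

	file_accessed(iocb->ki_filp);
	return ret;
}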
1099 return VM_FAULT_OOM; 1603 !! 1100 return VM_FAULT_SIGBUS; 1604 /* << 1605 * When handling a synchronous page fault and << 1606 * insert the PTE/PMD into page tables only a << 1607 * insertion for now and return the pfn so th << 1608 * fsync is done. << 1609 */ << 1610 static vm_fault_t dax_fault_synchronous_pfnp( << 1611 { << 1612 if (WARN_ON_ONCE(!pfnp)) << 1613 return VM_FAULT_SIGBUS; << 1614 *pfnp = pfn; << 1615 return VM_FAULT_NEEDDSYNC; << 1616 } 1101 } 1617 1102 1618 static vm_fault_t dax_fault_cow_page(struct v !! 1103 static int dax_iomap_pte_fault(struct vm_fault *vmf, 1619 const struct iomap_iter *iter !! 1104 const struct iomap_ops *ops) 1620 { << 1621 vm_fault_t ret; << 1622 int error = 0; << 1623 << 1624 switch (iter->iomap.type) { << 1625 case IOMAP_HOLE: << 1626 case IOMAP_UNWRITTEN: << 1627 clear_user_highpage(vmf->cow_ << 1628 break; << 1629 case IOMAP_MAPPED: << 1630 error = copy_cow_page_dax(vmf << 1631 break; << 1632 default: << 1633 WARN_ON_ONCE(1); << 1634 error = -EIO; << 1635 break; << 1636 } << 1637 << 1638 if (error) << 1639 return dax_fault_return(error << 1640 << 1641 __SetPageUptodate(vmf->cow_page); << 1642 ret = finish_fault(vmf); << 1643 if (!ret) << 1644 return VM_FAULT_DONE_COW; << 1645 return ret; << 1646 } << 1647 << 1648 /** << 1649 * dax_fault_iter - Common actor to handle pf << 1650 * @vmf: vm fault instance << 1651 * @iter: iomap iter << 1652 * @pfnp: pfn to be returned << 1653 * @xas: the dax mapping tree of a fil << 1654 * @entry: an unlocked dax entry to be i << 1655 * @pmd: distinguish whether it is a p << 1656 */ << 1657 static vm_fault_t dax_fault_iter(struct vm_fa << 1658 const struct iomap_iter *iter << 1659 struct xa_state *xas, void ** << 1660 { << 1661 const struct iomap *iomap = &iter->io << 1662 const struct iomap *srcmap = iomap_it << 1663 size_t size = pmd ? PMD_SIZE : PAGE_S << 1664 loff_t pos = (loff_t)xas->xa_index << << 1665 bool write = iter->flags & IOMAP_WRIT << 1666 unsigned long entry_flags = pmd ? DAX << 1667 int err = 0; << 1668 pfn_t pfn; << 1669 void *kaddr; << 1670 << 1671 if (!pmd && vmf->cow_page) << 1672 return dax_fault_cow_page(vmf << 1673 << 1674 /* if we are reading UNWRITTEN and HO << 1675 if (!write && << 1676 (iomap->type == IOMAP_UNWRITTEN | << 1677 if (!pmd) << 1678 return dax_load_hole( << 1679 return dax_pmd_load_hole(xas, << 1680 } << 1681 << 1682 if (iomap->type != IOMAP_MAPPED && !( << 1683 WARN_ON_ONCE(1); << 1684 return pmd ? VM_FAULT_FALLBAC << 1685 } << 1686 << 1687 err = dax_iomap_direct_access(iomap, << 1688 if (err) << 1689 return pmd ? VM_FAULT_FALLBAC << 1690 << 1691 *entry = dax_insert_entry(xas, vmf, i << 1692 << 1693 if (write && iomap->flags & IOMAP_F_S << 1694 err = dax_iomap_copy_around(p << 1695 if (err) << 1696 return dax_fault_retu << 1697 } << 1698 << 1699 if (dax_fault_is_synchronous(iter, vm << 1700 return dax_fault_synchronous_ << 1701 << 1702 /* insert PMD pfn */ << 1703 if (pmd) << 1704 return vmf_insert_pfn_pmd(vmf << 1705 << 1706 /* insert PTE pfn */ << 1707 if (write) << 1708 return vmf_insert_mixed_mkwri << 1709 return vmf_insert_mixed(vmf->vma, vmf << 1710 } << 1711 << 1712 static vm_fault_t dax_iomap_pte_fault(struct << 1713 int *iomap_err << 1714 { 1105 { 1715 struct address_space *mapping = vmf-> 1106 struct address_space *mapping = vmf->vma->vm_file->f_mapping; 1716 XA_STATE(xas, &mapping->i_pages, vmf- !! 1107 struct inode *inode = mapping->host; 1717 struct iomap_iter iter = { !! 1108 unsigned long vaddr = vmf->address; 1718 .inode = mapping->ho !! 
1109 loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; 1719 .pos = (loff_t)vmf !! 1110 sector_t sector; 1720 .len = PAGE_SIZE, !! 1111 struct iomap iomap = { 0 }; 1721 .flags = IOMAP_DAX | !! 1112 unsigned flags = IOMAP_FAULT; 1722 }; !! 1113 int error, major = 0; 1723 vm_fault_t ret = 0; !! 1114 int vmf_ret = 0; 1724 void *entry; 1115 void *entry; 1725 int error; << 1726 1116 1727 trace_dax_pte_fault(iter.inode, vmf, << 1728 /* 1117 /* 1729 * Check whether offset isn't beyond 1118 * Check whether offset isn't beyond end of file now. Caller is supposed 1730 * to hold locks serializing us with 1119 * to hold locks serializing us with truncate / punch hole so this is 1731 * a reliable test. 1120 * a reliable test. 1732 */ 1121 */ 1733 if (iter.pos >= i_size_read(iter.inod !! 1122 if (pos >= i_size_read(inode)) 1734 ret = VM_FAULT_SIGBUS; !! 1123 return VM_FAULT_SIGBUS; 1735 goto out; << 1736 } << 1737 1124 1738 if ((vmf->flags & FAULT_FLAG_WRITE) & 1125 if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page) 1739 iter.flags |= IOMAP_WRITE; !! 1126 flags |= IOMAP_WRITE; 1740 1127 1741 entry = grab_mapping_entry(&xas, mapp !! 1128 entry = grab_mapping_entry(mapping, vmf->pgoff, 0); 1742 if (xa_is_internal(entry)) { !! 1129 if (IS_ERR(entry)) 1743 ret = xa_to_internal(entry); !! 1130 return dax_fault_return(PTR_ERR(entry)); 1744 goto out; << 1745 } << 1746 1131 1747 /* 1132 /* 1748 * It is possible, particularly with 1133 * It is possible, particularly with mixed reads & writes to private 1749 * mappings, that we have raced with 1134 * mappings, that we have raced with a PMD fault that overlaps with 1750 * the PTE we need to set up. If so 1135 * the PTE we need to set up. If so just return and the fault will be 1751 * retried. 1136 * retried. 1752 */ 1137 */ 1753 if (pmd_trans_huge(*vmf->pmd) || pmd_ 1138 if (pmd_trans_huge(*vmf->pmd) || pmd_devmap(*vmf->pmd)) { 1754 ret = VM_FAULT_NOPAGE; !! 1139 vmf_ret = VM_FAULT_NOPAGE; 1755 goto unlock_entry; 1140 goto unlock_entry; 1756 } 1141 } 1757 1142 1758 while ((error = iomap_iter(&iter, ops !! 1143 /* 1759 if (WARN_ON_ONCE(iomap_length !! 1144 * Note that we don't bother to use iomap_apply here: DAX required 1760 iter.processed = -EIO !! 1145 * the file system block size to be equal the page size, which means 1761 continue; !! 1146 * that we never have to deal with more than a single extent here. >> 1147 */ >> 1148 error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap); >> 1149 if (error) { >> 1150 vmf_ret = dax_fault_return(error); >> 1151 goto unlock_entry; >> 1152 } >> 1153 if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) { >> 1154 error = -EIO; /* fs corruption? */ >> 1155 goto error_finish_iomap; >> 1156 } >> 1157 >> 1158 sector = dax_iomap_sector(&iomap, pos); >> 1159 >> 1160 if (vmf->cow_page) { >> 1161 switch (iomap.type) { >> 1162 case IOMAP_HOLE: >> 1163 case IOMAP_UNWRITTEN: >> 1164 clear_user_highpage(vmf->cow_page, vaddr); >> 1165 break; >> 1166 case IOMAP_MAPPED: >> 1167 error = copy_user_dax(iomap.bdev, sector, PAGE_SIZE, >> 1168 vmf->cow_page, vaddr); >> 1169 break; >> 1170 default: >> 1171 WARN_ON_ONCE(1); >> 1172 error = -EIO; >> 1173 break; 1762 } 1174 } 1763 1175 1764 ret = dax_fault_iter(vmf, &it !! 1176 if (error) 1765 if (ret != VM_FAULT_SIGBUS && !! 1177 goto error_finish_iomap; 1766 (iter.iomap.flags & IOMAP !! 
1178 >> 1179 __SetPageUptodate(vmf->cow_page); >> 1180 vmf_ret = finish_fault(vmf); >> 1181 if (!vmf_ret) >> 1182 vmf_ret = VM_FAULT_DONE_COW; >> 1183 goto finish_iomap; >> 1184 } >> 1185 >> 1186 switch (iomap.type) { >> 1187 case IOMAP_MAPPED: >> 1188 if (iomap.flags & IOMAP_F_NEW) { 1767 count_vm_event(PGMAJF 1189 count_vm_event(PGMAJFAULT); 1768 count_memcg_event_mm( !! 1190 mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT); 1769 ret |= VM_FAULT_MAJOR !! 1191 major = VM_FAULT_MAJOR; 1770 } 1192 } 1771 !! 1193 error = dax_insert_mapping(mapping, iomap.bdev, sector, 1772 if (!(ret & VM_FAULT_ERROR)) !! 1194 PAGE_SIZE, &entry, vmf->vma, vmf); 1773 iter.processed = PAGE !! 1195 /* -EBUSY is fine, somebody else faulted on the same PTE */ >> 1196 if (error == -EBUSY) >> 1197 error = 0; >> 1198 break; >> 1199 case IOMAP_UNWRITTEN: >> 1200 case IOMAP_HOLE: >> 1201 if (!(vmf->flags & FAULT_FLAG_WRITE)) { >> 1202 vmf_ret = dax_load_hole(mapping, &entry, vmf); >> 1203 goto finish_iomap; >> 1204 } >> 1205 /*FALLTHRU*/ >> 1206 default: >> 1207 WARN_ON_ONCE(1); >> 1208 error = -EIO; >> 1209 break; 1774 } 1210 } 1775 1211 1776 if (iomap_errp) !! 1212 error_finish_iomap: 1777 *iomap_errp = error; !! 1213 vmf_ret = dax_fault_return(error) | major; 1778 if (!ret && error) !! 1214 finish_iomap: 1779 ret = dax_fault_return(error) !! 1215 if (ops->iomap_end) { >> 1216 int copied = PAGE_SIZE; 1780 1217 1781 unlock_entry: !! 1218 if (vmf_ret & VM_FAULT_ERROR) 1782 dax_unlock_entry(&xas, entry); !! 1219 copied = 0; 1783 out: !! 1220 /* 1784 trace_dax_pte_fault_done(iter.inode, !! 1221 * The fault is done by now and there's no way back (other 1785 return ret; !! 1222 * thread may be already happily using PTE we have installed). >> 1223 * Just ignore error from ->iomap_end since we cannot do much >> 1224 * with it. >> 1225 */ >> 1226 ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap); >> 1227 } >> 1228 unlock_entry: >> 1229 put_locked_mapping_entry(mapping, vmf->pgoff, entry); >> 1230 return vmf_ret; 1786 } 1231 } 1787 1232 1788 #ifdef CONFIG_FS_DAX_PMD 1233 #ifdef CONFIG_FS_DAX_PMD 1789 static bool dax_fault_check_fallback(struct v !! 1234 /* 1790 pgoff_t max_pgoff) !! 1235 * The 'colour' (ie low bits) within a PMD of a page offset. This comes up >> 1236 * more often than one might expect in the below functions. 
>> 1237 */ >> 1238 #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) >> 1239 >> 1240 static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, >> 1241 loff_t pos, void **entryp) >> 1242 { >> 1243 struct address_space *mapping = vmf->vma->vm_file->f_mapping; >> 1244 struct block_device *bdev = iomap->bdev; >> 1245 struct inode *inode = mapping->host; >> 1246 struct blk_dax_ctl dax = { >> 1247 .sector = dax_iomap_sector(iomap, pos), >> 1248 .size = PMD_SIZE, >> 1249 }; >> 1250 long length = dax_map_atomic(bdev, &dax); >> 1251 void *ret = NULL; >> 1252 >> 1253 if (length < 0) /* dax_map_atomic() failed */ >> 1254 goto fallback; >> 1255 if (length < PMD_SIZE) >> 1256 goto unmap_fallback; >> 1257 if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) >> 1258 goto unmap_fallback; >> 1259 if (!pfn_t_devmap(dax.pfn)) >> 1260 goto unmap_fallback; >> 1261 >> 1262 dax_unmap_atomic(bdev, &dax); >> 1263 >> 1264 ret = dax_insert_mapping_entry(mapping, vmf, *entryp, dax.sector, >> 1265 RADIX_DAX_PMD); >> 1266 if (IS_ERR(ret)) >> 1267 goto fallback; >> 1268 *entryp = ret; >> 1269 >> 1270 trace_dax_pmd_insert_mapping(inode, vmf, length, dax.pfn, ret); >> 1271 return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, >> 1272 dax.pfn, vmf->flags & FAULT_FLAG_WRITE); >> 1273 >> 1274 unmap_fallback: >> 1275 dax_unmap_atomic(bdev, &dax); >> 1276 fallback: >> 1277 trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, >> 1278 dax.pfn, ret); >> 1279 return VM_FAULT_FALLBACK; >> 1280 } >> 1281 >> 1282 static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, >> 1283 void **entryp) 1791 { 1284 { >> 1285 struct address_space *mapping = vmf->vma->vm_file->f_mapping; 1792 unsigned long pmd_addr = vmf->address 1286 unsigned long pmd_addr = vmf->address & PMD_MASK; 1793 bool write = vmf->flags & FAULT_FLAG_ !! 1287 struct inode *inode = mapping->host; >> 1288 struct page *zero_page; >> 1289 void *ret = NULL; >> 1290 spinlock_t *ptl; >> 1291 pmd_t pmd_entry; 1794 1292 1795 /* !! 1293 zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm); 1796 * Make sure that the faulting addres << 1797 * the PMD offset from the start of t << 1798 * that a PMD range in the page table << 1799 * range in the page cache. << 1800 */ << 1801 if ((vmf->pgoff & PG_PMD_COLOUR) != << 1802 ((vmf->address >> PAGE_SHIFT) & P << 1803 return true; << 1804 1294 1805 /* Fall back to PTEs if we're going t !! 1295 if (unlikely(!zero_page)) 1806 if (write && !(vmf->vma->vm_flags & V !! 1296 goto fallback; 1807 return true; << 1808 1297 1809 /* If the PMD would extend outside th !! 1298 ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0, 1810 if (pmd_addr < vmf->vma->vm_start) !! 1299 RADIX_DAX_PMD | RADIX_DAX_HZP); 1811 return true; !! 1300 if (IS_ERR(ret)) 1812 if ((pmd_addr + PMD_SIZE) > vmf->vma- !! 1301 goto fallback; 1813 return true; !! 1302 *entryp = ret; 1814 1303 1815 /* If the PMD would extend beyond the !! 1304 ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); 1816 if ((xas->xa_index | PG_PMD_COLOUR) > !! 1305 if (!pmd_none(*(vmf->pmd))) { 1817 return true; !! 1306 spin_unlock(ptl); >> 1307 goto fallback; >> 1308 } >> 1309 >> 1310 pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot); >> 1311 pmd_entry = pmd_mkhuge(pmd_entry); >> 1312 set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry); >> 1313 spin_unlock(ptl); >> 1314 trace_dax_pmd_load_hole(inode, vmf, zero_page, ret); >> 1315 return VM_FAULT_NOPAGE; 1818 1316 1819 return false; !! 
1317 fallback: >> 1318 trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, ret); >> 1319 return VM_FAULT_FALLBACK; 1820 } 1320 } 1821 1321 1822 static vm_fault_t dax_iomap_pmd_fault(struct !! 1322 static int dax_iomap_pmd_fault(struct vm_fault *vmf, 1823 const struct i 1323 const struct iomap_ops *ops) 1824 { 1324 { 1825 struct address_space *mapping = vmf-> !! 1325 struct vm_area_struct *vma = vmf->vma; 1826 XA_STATE_ORDER(xas, &mapping->i_pages !! 1326 struct address_space *mapping = vma->vm_file->f_mapping; 1827 struct iomap_iter iter = { !! 1327 unsigned long pmd_addr = vmf->address & PMD_MASK; 1828 .inode = mapping->ho !! 1328 bool write = vmf->flags & FAULT_FLAG_WRITE; 1829 .len = PMD_SIZE, !! 1329 unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT; 1830 .flags = IOMAP_DAX | !! 1330 struct inode *inode = mapping->host; 1831 }; !! 1331 int result = VM_FAULT_FALLBACK; 1832 vm_fault_t ret = VM_FAULT_FALLBACK; !! 1332 struct iomap iomap = { 0 }; 1833 pgoff_t max_pgoff; !! 1333 pgoff_t max_pgoff, pgoff; 1834 void *entry; 1334 void *entry; 1835 !! 1335 loff_t pos; 1836 if (vmf->flags & FAULT_FLAG_WRITE) !! 1336 int error; 1837 iter.flags |= IOMAP_WRITE; << 1838 1337 1839 /* 1338 /* 1840 * Check whether offset isn't beyond 1339 * Check whether offset isn't beyond end of file now. Caller is 1841 * supposed to hold locks serializing 1340 * supposed to hold locks serializing us with truncate / punch hole so 1842 * this is a reliable test. 1341 * this is a reliable test. 1843 */ 1342 */ 1844 max_pgoff = DIV_ROUND_UP(i_size_read( !! 1343 pgoff = linear_page_index(vma, pmd_addr); >> 1344 max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT; 1845 1345 1846 trace_dax_pmd_fault(iter.inode, vmf, !! 1346 trace_dax_pmd_fault(inode, vmf, max_pgoff, 0); >> 1347 >> 1348 /* Fall back to PTEs if we're going to COW */ >> 1349 if (write && !(vma->vm_flags & VM_SHARED)) >> 1350 goto fallback; 1847 1351 1848 if (xas.xa_index >= max_pgoff) { !! 1352 /* If the PMD would extend outside the VMA */ 1849 ret = VM_FAULT_SIGBUS; !! 1353 if (pmd_addr < vma->vm_start) >> 1354 goto fallback; >> 1355 if ((pmd_addr + PMD_SIZE) > vma->vm_end) >> 1356 goto fallback; >> 1357 >> 1358 if (pgoff > max_pgoff) { >> 1359 result = VM_FAULT_SIGBUS; 1850 goto out; 1360 goto out; 1851 } 1361 } 1852 1362 1853 if (dax_fault_check_fallback(vmf, &xa !! 1363 /* If the PMD would extend beyond the file size */ >> 1364 if ((pgoff | PG_PMD_COLOUR) > max_pgoff) 1854 goto fallback; 1365 goto fallback; 1855 1366 1856 /* 1367 /* 1857 * grab_mapping_entry() will make sur !! 1368 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX 1858 * a zero PMD entry or a DAX PMD. If !! 1369 * PMD or a HZP entry. If it can't (because a 4k page is already in 1859 * entry is already in the array, for !! 1370 * the tree, for instance), it will return -EEXIST and we just fall 1860 * VM_FAULT_FALLBACK. !! 1371 * back to 4k entries. 1861 */ 1372 */ 1862 entry = grab_mapping_entry(&xas, mapp !! 1373 entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD); 1863 if (xa_is_internal(entry)) { !! 1374 if (IS_ERR(entry)) 1864 ret = xa_to_internal(entry); << 1865 goto fallback; 1375 goto fallback; 1866 } << 1867 1376 1868 /* 1377 /* 1869 * It is possible, particularly with 1378 * It is possible, particularly with mixed reads & writes to private 1870 * mappings, that we have raced with 1379 * mappings, that we have raced with a PTE fault that overlaps with 1871 * the PMD we need to set up. If so 1380 * the PMD we need to set up. 
If so just return and the fault will be 1872 * retried. 1381 * retried. 1873 */ 1382 */ 1874 if (!pmd_none(*vmf->pmd) && !pmd_tran 1383 if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd) && 1875 !pmd_devmap(*vmf->pmd 1384 !pmd_devmap(*vmf->pmd)) { 1876 ret = 0; !! 1385 result = 0; 1877 goto unlock_entry; 1386 goto unlock_entry; 1878 } 1387 } 1879 1388 1880 iter.pos = (loff_t)xas.xa_index << PA !! 1389 /* 1881 while (iomap_iter(&iter, ops) > 0) { !! 1390 * Note that we don't use iomap_apply here. We aren't doing I/O, only 1882 if (iomap_length(&iter) < PMD !! 1391 * setting up a mapping, so really we're using iomap_begin() as a way 1883 continue; /* actually !! 1392 * to look up our filesystem block. 1884 !! 1393 */ 1885 ret = dax_fault_iter(vmf, &it !! 1394 pos = (loff_t)pgoff << PAGE_SHIFT; 1886 if (ret != VM_FAULT_FALLBACK) !! 1395 error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap); 1887 iter.processed = PMD_ !! 1396 if (error) >> 1397 goto unlock_entry; >> 1398 >> 1399 if (iomap.offset + iomap.length < pos + PMD_SIZE) >> 1400 goto finish_iomap; >> 1401 >> 1402 switch (iomap.type) { >> 1403 case IOMAP_MAPPED: >> 1404 result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry); >> 1405 break; >> 1406 case IOMAP_UNWRITTEN: >> 1407 case IOMAP_HOLE: >> 1408 if (WARN_ON_ONCE(write)) >> 1409 break; >> 1410 result = dax_pmd_load_hole(vmf, &iomap, &entry); >> 1411 break; >> 1412 default: >> 1413 WARN_ON_ONCE(1); >> 1414 break; 1888 } 1415 } 1889 1416 1890 unlock_entry: !! 1417 finish_iomap: 1891 dax_unlock_entry(&xas, entry); !! 1418 if (ops->iomap_end) { 1892 fallback: !! 1419 int copied = PMD_SIZE; 1893 if (ret == VM_FAULT_FALLBACK) { !! 1420 1894 split_huge_pmd(vmf->vma, vmf- !! 1421 if (result == VM_FAULT_FALLBACK) >> 1422 copied = 0; >> 1423 /* >> 1424 * The fault is done by now and there's no way back (other >> 1425 * thread may be already happily using PMD we have installed). >> 1426 * Just ignore error from ->iomap_end since we cannot do much >> 1427 * with it. >> 1428 */ >> 1429 ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags, >> 1430 &iomap); >> 1431 } >> 1432 unlock_entry: >> 1433 put_locked_mapping_entry(mapping, pgoff, entry); >> 1434 fallback: >> 1435 if (result == VM_FAULT_FALLBACK) { >> 1436 split_huge_pmd(vma, vmf->pmd, vmf->address); 1895 count_vm_event(THP_FAULT_FALL 1437 count_vm_event(THP_FAULT_FALLBACK); 1896 } 1438 } 1897 out: 1439 out: 1898 trace_dax_pmd_fault_done(iter.inode, !! 1440 trace_dax_pmd_fault_done(inode, vmf, max_pgoff, result); 1899 return ret; !! 1441 return result; 1900 } 1442 } 1901 #else 1443 #else 1902 static vm_fault_t dax_iomap_pmd_fault(struct !! 1444 static int dax_iomap_pmd_fault(struct vm_fault *vmf, 1903 const struct i 1445 const struct iomap_ops *ops) 1904 { 1446 { 1905 return VM_FAULT_FALLBACK; 1447 return VM_FAULT_FALLBACK; 1906 } 1448 } 1907 #endif /* CONFIG_FS_DAX_PMD */ 1449 #endif /* CONFIG_FS_DAX_PMD */ 1908 1450 1909 /** 1451 /** 1910 * dax_iomap_fault - handle a page fault on a 1452 * dax_iomap_fault - handle a page fault on a DAX file 1911 * @vmf: The description of the fault 1453 * @vmf: The description of the fault 1912 * @order: Order of the page to fault in !! 
1454 * @ops: iomap ops passed from the file system 1913 * @pfnp: PFN to insert for synchronous fault << 1914 * @iomap_errp: Storage for detailed error co << 1915 * @ops: Iomap ops passed from the file syste << 1916 * 1455 * 1917 * When a page fault occurs, filesystems may 1456 * When a page fault occurs, filesystems may call this helper in 1918 * their fault handler for DAX files. dax_iom 1457 * their fault handler for DAX files. dax_iomap_fault() assumes the caller 1919 * has done all the necessary locking for pag 1458 * has done all the necessary locking for page fault to proceed 1920 * successfully. 1459 * successfully. 1921 */ 1460 */ 1922 vm_fault_t dax_iomap_fault(struct vm_fault *v !! 1461 int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, 1923 pfn_t *pfnp, int *iomap_e !! 1462 const struct iomap_ops *ops) 1924 { 1463 { 1925 if (order == 0) !! 1464 switch (pe_size) { 1926 return dax_iomap_pte_fault(vm !! 1465 case PE_SIZE_PTE: 1927 else if (order == PMD_ORDER) !! 1466 return dax_iomap_pte_fault(vmf, ops); 1928 return dax_iomap_pmd_fault(vm !! 1467 case PE_SIZE_PMD: 1929 else !! 1468 return dax_iomap_pmd_fault(vmf, ops); >> 1469 default: 1930 return VM_FAULT_FALLBACK; 1470 return VM_FAULT_FALLBACK; 1931 } << 1932 EXPORT_SYMBOL_GPL(dax_iomap_fault); << 1933 << 1934 /* << 1935 * dax_insert_pfn_mkwrite - insert PTE or PMD << 1936 * @vmf: The description of the fault << 1937 * @pfn: PFN to insert << 1938 * @order: Order of entry to insert. << 1939 * << 1940 * This function inserts a writeable PTE or P << 1941 * for an mmaped DAX file. It also marks the << 1942 */ << 1943 static vm_fault_t << 1944 dax_insert_pfn_mkwrite(struct vm_fault *vmf, << 1945 { << 1946 struct address_space *mapping = vmf-> << 1947 XA_STATE_ORDER(xas, &mapping->i_pages << 1948 void *entry; << 1949 vm_fault_t ret; << 1950 << 1951 xas_lock_irq(&xas); << 1952 entry = get_unlocked_entry(&xas, orde << 1953 /* Did we race with someone splitting << 1954 if (!entry || dax_is_conflict(entry) << 1955 (order == 0 && !dax_is_pte_entry( << 1956 put_unlocked_entry(&xas, entr << 1957 xas_unlock_irq(&xas); << 1958 trace_dax_insert_pfn_mkwrite_ << 1959 << 1960 return VM_FAULT_NOPAGE; << 1961 } << 1962 xas_set_mark(&xas, PAGECACHE_TAG_DIRT << 1963 dax_lock_entry(&xas, entry); << 1964 xas_unlock_irq(&xas); << 1965 if (order == 0) << 1966 ret = vmf_insert_mixed_mkwrit << 1967 #ifdef CONFIG_FS_DAX_PMD << 1968 else if (order == PMD_ORDER) << 1969 ret = vmf_insert_pfn_pmd(vmf, << 1970 #endif << 1971 else << 1972 ret = VM_FAULT_FALLBACK; << 1973 dax_unlock_entry(&xas, entry); << 1974 trace_dax_insert_pfn_mkwrite(mapping- << 1975 return ret; << 1976 } << 1977 << 1978 /** << 1979 * dax_finish_sync_fault - finish synchronous << 1980 * @vmf: The description of the fault << 1981 * @order: Order of entry to be inserted << 1982 * @pfn: PFN to insert << 1983 * << 1984 * This function ensures that the file range << 1985 * stored persistently on the media and handl << 1986 * table entry. 
<< 1987 */ << 1988 vm_fault_t dax_finish_sync_fault(struct vm_fa << 1989 pfn_t pfn) << 1990 { << 1991 int err; << 1992 loff_t start = ((loff_t)vmf->pgoff) < << 1993 size_t len = PAGE_SIZE << order; << 1994 << 1995 err = vfs_fsync_range(vmf->vma->vm_fi << 1996 if (err) << 1997 return VM_FAULT_SIGBUS; << 1998 return dax_insert_pfn_mkwrite(vmf, pf << 1999 } << 2000 EXPORT_SYMBOL_GPL(dax_finish_sync_fault); << 2001 << 2002 static loff_t dax_range_compare_iter(struct i << 2003 struct iomap_iter *it_dest, u << 2004 { << 2005 const struct iomap *smap = &it_src->i << 2006 const struct iomap *dmap = &it_dest-> << 2007 loff_t pos1 = it_src->pos, pos2 = it_ << 2008 void *saddr, *daddr; << 2009 int id, ret; << 2010 << 2011 len = min(len, min(smap->length, dmap << 2012 << 2013 if (smap->type == IOMAP_HOLE && dmap- << 2014 *same = true; << 2015 return len; << 2016 } << 2017 << 2018 if (smap->type == IOMAP_HOLE || dmap- << 2019 *same = false; << 2020 return 0; << 2021 } << 2022 << 2023 id = dax_read_lock(); << 2024 ret = dax_iomap_direct_access(smap, p << 2025 &saddr, << 2026 if (ret < 0) << 2027 goto out_unlock; << 2028 << 2029 ret = dax_iomap_direct_access(dmap, p << 2030 &daddr, << 2031 if (ret < 0) << 2032 goto out_unlock; << 2033 << 2034 *same = !memcmp(saddr, daddr, len); << 2035 if (!*same) << 2036 len = 0; << 2037 dax_read_unlock(id); << 2038 return len; << 2039 << 2040 out_unlock: << 2041 dax_read_unlock(id); << 2042 return -EIO; << 2043 } << 2044 << 2045 int dax_dedupe_file_range_compare(struct inod << 2046 struct inode *dst, loff_t dst << 2047 const struct iomap_ops *ops) << 2048 { << 2049 struct iomap_iter src_iter = { << 2050 .inode = src, << 2051 .pos = srcoff, << 2052 .len = len, << 2053 .flags = IOMAP_DAX, << 2054 }; << 2055 struct iomap_iter dst_iter = { << 2056 .inode = dst, << 2057 .pos = dstoff, << 2058 .len = len, << 2059 .flags = IOMAP_DAX, << 2060 }; << 2061 int ret, compared = 0; << 2062 << 2063 while ((ret = iomap_iter(&src_iter, o << 2064 (ret = iomap_iter(&dst_iter, o << 2065 compared = dax_range_compare_ << 2066 min(src_iter. << 2067 if (compared < 0) << 2068 return ret; << 2069 src_iter.processed = dst_iter << 2070 } 1471 } 2071 return ret; << 2072 } 1472 } 2073 !! 1473 EXPORT_SYMBOL_GPL(dax_iomap_fault); 2074 int dax_remap_file_range_prep(struct file *fi << 2075 struct file *fi << 2076 loff_t *len, un << 2077 const struct io << 2078 { << 2079 return __generic_remap_file_range_pre << 2080 << 2081 } << 2082 EXPORT_SYMBOL_GPL(dax_remap_file_range_prep); << 2083 1474
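/*
 * Illustrative sketch (not part of fs/dax.c): the fault entry points above
 * are meant to be called from a filesystem's vm_operations.  A typical
 * ->huge_fault handler serializes against truncate, lets dax_iomap_fault()
 * do the work, and completes synchronous (MAP_SYNC) faults with
 * dax_finish_sync_fault().  "example_iomap_ops" is a hypothetical placeholder
 * for the filesystem's own iomap_ops.
 */
static vm_fault_t example_dax_huge_fault(struct vm_fault *vmf,
					 unsigned int order)
{
	struct inode *inode = file_inode(vmf->vma->vm_file);
	bool write = vmf->flags & FAULT_FLAG_WRITE;
	vm_fault_t ret;
	pfn_t pfn;

	if (write) {
		sb_start_pagefault(inode->i_sb);
		file_update_time(vmf->vma->vm_file);
	}

	/* Keep truncate/hole punch out while the fault is being handled. */
	filemap_invalidate_lock_shared(inode->i_mapping);
	ret = dax_iomap_fault(vmf, order, &pfn, NULL, &example_iomap_ops);
	filemap_invalidate_unlock_shared(inode->i_mapping);

	/*
	 * For a synchronous fault the PTE/PMD was deliberately not installed
	 * above; flush the range to media, then insert the write-enabled entry.
	 */
	if (ret & VM_FAULT_NEEDDSYNC)
		ret = dax_finish_sync_fault(vmf, order, pfn);

	if (write)
		sb_end_pagefault(inode->i_sb);
	return ret;
}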